From 3376bcdb4d81416ecce97fe5628b6adf112ebed2 Mon Sep 17 00:00:00 2001 From: gongchensu Date: Mon, 5 Jan 2026 15:28:13 +0000 Subject: [PATCH 1/3] Issue/887 - Add pow,div,mod,min,max operator with CPU and NVIDIA implementations. --- include/infiniop.h | 3 + include/infiniop/ops/div.h | 26 +++ include/infiniop/ops/max.h | 26 +++ include/infiniop/ops/min.h | 26 +++ include/infiniop/ops/mod.h | 26 +++ include/infiniop/ops/pow.h | 26 +++ src/infiniop/ops/div/cpu/div_cpu.cc | 50 +++++ src/infiniop/ops/div/cpu/div_cpu.h | 19 ++ src/infiniop/ops/div/cuda/kernel.cuh | 23 +++ src/infiniop/ops/div/nvidia/div_nvidia.cu | 57 ++++++ src/infiniop/ops/div/nvidia/div_nvidia.cuh | 8 + src/infiniop/ops/div/operator.cc | 202 +++++++++++++++++++++ src/infiniop/ops/max/cpu/max_cpu.cc | 50 +++++ src/infiniop/ops/max/cpu/max_cpu.h | 20 ++ src/infiniop/ops/max/cuda/kernel.cuh | 23 +++ src/infiniop/ops/max/nvidia/max_nvidia.cu | 57 ++++++ src/infiniop/ops/max/nvidia/max_nvidia.cuh | 8 + src/infiniop/ops/max/operator.cc | 202 +++++++++++++++++++++ src/infiniop/ops/min/cpu/min_cpu.cc | 50 +++++ src/infiniop/ops/min/cpu/min_cpu.h | 20 ++ src/infiniop/ops/min/cuda/kernel.cuh | 23 +++ src/infiniop/ops/min/nvidia/min_nvidia.cu | 57 ++++++ src/infiniop/ops/min/nvidia/min_nvidia.cuh | 8 + src/infiniop/ops/min/operator.cc | 202 +++++++++++++++++++++ src/infiniop/ops/mod/cpu/mod_cpu.cc | 49 +++++ src/infiniop/ops/mod/cpu/mod_cpu.h | 23 +++ src/infiniop/ops/mod/cuda/kernel.cuh | 30 +++ src/infiniop/ops/mod/nvidia/mod_nvidia.cu | 57 ++++++ src/infiniop/ops/mod/nvidia/mod_nvidia.cuh | 8 + src/infiniop/ops/mod/operator.cc | 142 +++++++++++++++ src/infiniop/ops/pow/cpu/pow_cpu.cc | 49 +++++ src/infiniop/ops/pow/cpu/pow_cpu.h | 19 ++ src/infiniop/ops/pow/cuda/kernel.cuh | 40 ++++ src/infiniop/ops/pow/nvidia/pow_nvidia.cu | 57 ++++++ src/infiniop/ops/pow/nvidia/pow_nvidia.cuh | 8 + src/infiniop/ops/pow/operator.cc | 142 +++++++++++++++ test/infiniop/div.py | 192 ++++++++++++++++++++ test/infiniop/libinfiniop/op_register.py | 170 +++++++++++++++++ test/infiniop/max.py | 189 +++++++++++++++++++ test/infiniop/min.py | 189 +++++++++++++++++++ test/infiniop/mod.py | 190 +++++++++++++++++++ test/infiniop/pow.py | 190 +++++++++++++++++++ 42 files changed, 2956 insertions(+) create mode 100644 include/infiniop/ops/div.h create mode 100644 include/infiniop/ops/max.h create mode 100644 include/infiniop/ops/min.h create mode 100644 include/infiniop/ops/mod.h create mode 100644 include/infiniop/ops/pow.h create mode 100644 src/infiniop/ops/div/cpu/div_cpu.cc create mode 100644 src/infiniop/ops/div/cpu/div_cpu.h create mode 100644 src/infiniop/ops/div/cuda/kernel.cuh create mode 100644 src/infiniop/ops/div/nvidia/div_nvidia.cu create mode 100644 src/infiniop/ops/div/nvidia/div_nvidia.cuh create mode 100644 src/infiniop/ops/div/operator.cc create mode 100644 src/infiniop/ops/max/cpu/max_cpu.cc create mode 100644 src/infiniop/ops/max/cpu/max_cpu.h create mode 100644 src/infiniop/ops/max/cuda/kernel.cuh create mode 100644 src/infiniop/ops/max/nvidia/max_nvidia.cu create mode 100644 src/infiniop/ops/max/nvidia/max_nvidia.cuh create mode 100644 src/infiniop/ops/max/operator.cc create mode 100644 src/infiniop/ops/min/cpu/min_cpu.cc create mode 100644 src/infiniop/ops/min/cpu/min_cpu.h create mode 100644 src/infiniop/ops/min/cuda/kernel.cuh create mode 100644 src/infiniop/ops/min/nvidia/min_nvidia.cu create mode 100644 src/infiniop/ops/min/nvidia/min_nvidia.cuh create mode 100644 src/infiniop/ops/min/operator.cc create mode 100644 
src/infiniop/ops/mod/cpu/mod_cpu.cc create mode 100644 src/infiniop/ops/mod/cpu/mod_cpu.h create mode 100644 src/infiniop/ops/mod/cuda/kernel.cuh create mode 100644 src/infiniop/ops/mod/nvidia/mod_nvidia.cu create mode 100644 src/infiniop/ops/mod/nvidia/mod_nvidia.cuh create mode 100644 src/infiniop/ops/mod/operator.cc create mode 100644 src/infiniop/ops/pow/cpu/pow_cpu.cc create mode 100644 src/infiniop/ops/pow/cpu/pow_cpu.h create mode 100644 src/infiniop/ops/pow/cuda/kernel.cuh create mode 100644 src/infiniop/ops/pow/nvidia/pow_nvidia.cu create mode 100644 src/infiniop/ops/pow/nvidia/pow_nvidia.cuh create mode 100644 src/infiniop/ops/pow/operator.cc create mode 100644 test/infiniop/div.py create mode 100644 test/infiniop/max.py create mode 100644 test/infiniop/min.py create mode 100644 test/infiniop/mod.py create mode 100644 test/infiniop/pow.py diff --git a/include/infiniop.h b/include/infiniop.h index c0a09fcb4..cf1688868 100644 --- a/include/infiniop.h +++ b/include/infiniop.h @@ -9,11 +9,14 @@ #include "infiniop/ops/clip.h" #include "infiniop/ops/conv.h" #include "infiniop/ops/dequantize_awq.h" +#include "infiniop/ops/div.h" #include "infiniop/ops/gelu.h" #include "infiniop/ops/gemm.h" #include "infiniop/ops/layer_norm.h" #include "infiniop/ops/logsoftmax.h" #include "infiniop/ops/lp_norm.h" +#include "infiniop/ops/max.h" +#include "infiniop/ops/min.h" #include "infiniop/ops/mul.h" #include "infiniop/ops/ones.h" #include "infiniop/ops/paged_attention.h" diff --git a/include/infiniop/ops/div.h b/include/infiniop/ops/div.h new file mode 100644 index 000000000..e539b440c --- /dev/null +++ b/include/infiniop/ops/div.h @@ -0,0 +1,26 @@ +#ifndef __INFINIOP_DIV_API_H__ +#define __INFINIOP_DIV_API_H__ + +#include "../operator_descriptor.h" + +typedef struct InfiniopDescriptor *infiniopDivDescriptor_t; + +__C __export infiniStatus_t infiniopCreateDivDescriptor(infiniopHandle_t handle, + infiniopDivDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t c, + infiniopTensorDescriptor_t a, + infiniopTensorDescriptor_t b); + +__C __export infiniStatus_t infiniopGetDivWorkspaceSize(infiniopDivDescriptor_t desc, size_t *size); + +__C __export infiniStatus_t infiniopDiv(infiniopDivDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *c, + const void *a, + const void *b, + void *stream); + +__C __export infiniStatus_t infiniopDestroyDivDescriptor(infiniopDivDescriptor_t desc); + +#endif diff --git a/include/infiniop/ops/max.h b/include/infiniop/ops/max.h new file mode 100644 index 000000000..e6f2f5d4c --- /dev/null +++ b/include/infiniop/ops/max.h @@ -0,0 +1,26 @@ +#ifndef __INFINIOP_MAX_API_H__ +#define __INFINIOP_MAX_API_H__ + +#include "../operator_descriptor.h" + +typedef struct InfiniopDescriptor *infiniopMaxDescriptor_t; + +__C __export infiniStatus_t infiniopCreateMaxDescriptor(infiniopHandle_t handle, + infiniopMaxDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t c, + infiniopTensorDescriptor_t a, + infiniopTensorDescriptor_t b); + +__C __export infiniStatus_t infiniopGetMaxWorkspaceSize(infiniopMaxDescriptor_t desc, size_t *size); + +__C __export infiniStatus_t infiniopMax(infiniopMaxDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *c, + const void *a, + const void *b, + void *stream); + +__C __export infiniStatus_t infiniopDestroyMaxDescriptor(infiniopMaxDescriptor_t desc); + +#endif diff --git a/include/infiniop/ops/min.h b/include/infiniop/ops/min.h new file mode 100644 index 000000000..f72f0c4db --- /dev/null +++ b/include/infiniop/ops/min.h @@ -0,0 
+1,26 @@ +#ifndef __INFINIOP_MIN_API_H__ +#define __INFINIOP_MIN_API_H__ + +#include "../operator_descriptor.h" + +typedef struct InfiniopDescriptor *infiniopMinDescriptor_t; + +__C __export infiniStatus_t infiniopCreateMinDescriptor(infiniopHandle_t handle, + infiniopMinDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t c, + infiniopTensorDescriptor_t a, + infiniopTensorDescriptor_t b); + +__C __export infiniStatus_t infiniopGetMinWorkspaceSize(infiniopMinDescriptor_t desc, size_t *size); + +__C __export infiniStatus_t infiniopMin(infiniopMinDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *c, + const void *a, + const void *b, + void *stream); + +__C __export infiniStatus_t infiniopDestroyMinDescriptor(infiniopMinDescriptor_t desc); + +#endif diff --git a/include/infiniop/ops/mod.h b/include/infiniop/ops/mod.h new file mode 100644 index 000000000..5a6cd5bbf --- /dev/null +++ b/include/infiniop/ops/mod.h @@ -0,0 +1,26 @@ +#ifndef __INFINIOP_MOD_API_H__ +#define __INFINIOP_MOD_API_H__ + +#include "../operator_descriptor.h" + +typedef struct InfiniopDescriptor *infiniopModDescriptor_t; + +__C __export infiniStatus_t infiniopCreateModDescriptor(infiniopHandle_t handle, + infiniopModDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t c, + infiniopTensorDescriptor_t a, + infiniopTensorDescriptor_t b); + +__C __export infiniStatus_t infiniopGetModWorkspaceSize(infiniopModDescriptor_t desc, size_t *size); + +__C __export infiniStatus_t infiniopMod(infiniopModDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *c, + const void *a, + const void *b, + void *stream); + +__C __export infiniStatus_t infiniopDestroyModDescriptor(infiniopModDescriptor_t desc); + +#endif diff --git a/include/infiniop/ops/pow.h b/include/infiniop/ops/pow.h new file mode 100644 index 000000000..6449d8622 --- /dev/null +++ b/include/infiniop/ops/pow.h @@ -0,0 +1,26 @@ +#ifndef __INFINIOP_POW_API_H__ +#define __INFINIOP_POW_API_H__ + +#include "../operator_descriptor.h" + +typedef struct InfiniopDescriptor *infiniopPowDescriptor_t; + +__C __export infiniStatus_t infiniopCreatePowDescriptor(infiniopHandle_t handle, + infiniopPowDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t c, + infiniopTensorDescriptor_t a, + infiniopTensorDescriptor_t b); + +__C __export infiniStatus_t infiniopGetPowWorkspaceSize(infiniopPowDescriptor_t desc, size_t *size); + +__C __export infiniStatus_t infiniopPow(infiniopPowDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *c, + const void *a, + const void *b, + void *stream); + +__C __export infiniStatus_t infiniopDestroyPowDescriptor(infiniopPowDescriptor_t desc); + +#endif
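All five public headers follow the same create / get-workspace / run / destroy lifecycle as the existing binary operators such as mul. A minimal host-side sketch of that lifecycle for div, assuming the infiniop handle and F32 tensor descriptors were created elsewhere; CHECK() is a hypothetical status-checking helper, not part of this patch:

    // Sketch only: handle, c_desc/a_desc/b_desc, c/a/b buffers and stream
    // are assumed to exist; CHECK() is illustrative.
    infiniopDivDescriptor_t div_desc;
    CHECK(infiniopCreateDivDescriptor(handle, &div_desc, c_desc, a_desc, b_desc));

    size_t workspace_size = 0;
    CHECK(infiniopGetDivWorkspaceSize(div_desc, &workspace_size));
    void *workspace = /* allocate workspace_size bytes on the target device */ nullptr;

    // c = a / b, element by element; stream may be NULL for the CPU backend.
    CHECK(infiniopDiv(div_desc, workspace, workspace_size, c, a, b, stream));
    CHECK(infiniopDestroyDivDescriptor(div_desc));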
diff --git a/src/infiniop/ops/div/cpu/div_cpu.cc b/src/infiniop/ops/div/cpu/div_cpu.cc new file mode 100644 index 000000000..19e222031 --- /dev/null +++ b/src/infiniop/ops/div/cpu/div_cpu.cc @@ -0,0 +1,50 @@ +#include "div_cpu.h" + +namespace op::div::cpu { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector<infiniopTensorDescriptor_t> input_desc_vec) { + + auto handle = reinterpret_cast<device::cpu::Handle *>(handle_); + auto dtype = out_desc->dtype(); + + const auto &a_desc = input_desc_vec.at(0); + const auto &b_desc = input_desc_vec.at(1); + const auto &c_shape = out_desc->shape(); + const auto &a_shape = a_desc->shape(); + const auto &b_shape = b_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32); + + CHECK_SAME_SHAPE(c_shape, a_shape, b_shape); + + // create CPU elementwise descriptor + CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec); + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector<const void *> inputs, + void *stream) const { + + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate<DivOp, fp16_t>(_info, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate<DivOp, float>(_info, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::div::cpu diff --git a/src/infiniop/ops/div/cpu/div_cpu.h b/src/infiniop/ops/div/cpu/div_cpu.h new file mode 100644 index 000000000..0373b766f --- /dev/null +++ b/src/infiniop/ops/div/cpu/div_cpu.h @@ -0,0 +1,19 @@ +#ifndef __DIV_CPU_H__ +#define __DIV_CPU_H__ + +#include "../../../elementwise/cpu/elementwise_cpu.h" + +ELEMENTWISE_DESCRIPTOR(div, cpu) + +namespace op::div::cpu { +typedef struct DivOp { +public: + static constexpr size_t num_inputs = 2; + template <typename T> + T operator()(const T &a, const T &b) const { + return a / b; + } +} DivOp; +} // namespace op::div::cpu + +#endif // __DIV_CPU_H__ diff --git a/src/infiniop/ops/div/cuda/kernel.cuh b/src/infiniop/ops/div/cuda/kernel.cuh new file mode 100644 index 000000000..a67993da5 --- /dev/null +++ b/src/infiniop/ops/div/cuda/kernel.cuh @@ -0,0 +1,23 @@ +#ifndef __DIV_CUDA_H__ +#define __DIV_CUDA_H__ + +namespace op::div::cuda { +typedef struct DivOp { +public: + static constexpr size_t num_inputs = 2; + template <typename T> + __device__ __forceinline__ T operator()(const T &a, const T &b) const { + if constexpr (std::is_same_v<T, half2>) { + return __h2div(a, b); + } else if constexpr (std::is_same_v<T, half> || std::is_same_v<T, cuda_bfloat16>) { + return a / b; + } else if constexpr (std::is_same_v<T, float>) { + return __fdividef(a, b); + } else { + return a / b; + } + } +} DivOp; +} // namespace op::div::cuda + +#endif // __DIV_CUDA_H__
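Each backend hands these functors to the shared elementwise framework, which relies only on the num_inputs constant and the templated call operator. A self-contained host-side illustration of that contract, independent of the framework; apply_binary is an illustrative stand-in for the real strided iteration, not code from this patch:

    #include <cstddef>
    #include <vector>

    struct DivOp {
        static constexpr size_t num_inputs = 2;
        template <typename T>
        T operator()(const T &a, const T &b) const { return a / b; }
    };

    // Stand-in for the framework's contiguous fast path: c[i] = op(a[i], b[i]).
    template <typename Op, typename T>
    void apply_binary(const Op &op, T *c, const T *a, const T *b, size_t n) {
        static_assert(Op::num_inputs == 2, "binary functor expected");
        for (size_t i = 0; i < n; ++i) {
            c[i] = op(a[i], b[i]);
        }
    }

    int main() {
        std::vector<float> a{1.f, 2.f, 3.f}, b{2.f, 4.f, 8.f}, c(3);
        apply_binary(DivOp{}, c.data(), a.data(), b.data(), a.size());
        // c == {0.5f, 0.5f, 0.375f}
    }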
diff --git a/src/infiniop/ops/div/nvidia/div_nvidia.cu b/src/infiniop/ops/div/nvidia/div_nvidia.cu new file mode 100644 index 000000000..1abffe816 --- /dev/null +++ b/src/infiniop/ops/div/nvidia/div_nvidia.cu @@ -0,0 +1,57 @@ +#include "../../../elementwise/nvidia/elementwise_nvidia.cuh" + +#include "../cuda/kernel.cuh" +#include "div_nvidia.cuh" + +namespace op::div::nvidia { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector<infiniopTensorDescriptor_t> input_desc_vec) { + + auto handle = reinterpret_cast<device::nvidia::Handle *>(handle_); + auto dtype = out_desc->dtype(); + + const auto &a_desc = input_desc_vec.at(0); + const auto &b_desc = input_desc_vec.at(1); + const auto &c_shape = out_desc->shape(); + const auto &a_shape = a_desc->shape(); + const auto &b_shape = b_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32); + + CHECK_SAME_SHAPE(c_shape, a_shape, b_shape); + + // create CUDA elementwise descriptor + CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec) + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector<const void *> inputs, + void *stream) const { + + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate<256, cuda::DivOp, half>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate<256, cuda::DivOp, float>(_info, workspace, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::div::nvidia diff --git a/src/infiniop/ops/div/nvidia/div_nvidia.cuh b/src/infiniop/ops/div/nvidia/div_nvidia.cuh new file mode 100644 index 000000000..1ad8af94e --- /dev/null +++ b/src/infiniop/ops/div/nvidia/div_nvidia.cuh @@ -0,0 +1,8 @@ +#ifndef __DIV_CUDA_API_H__ +#define __DIV_CUDA_API_H__ + +#include "../../../elementwise/nvidia/elementwise_nvidia_api.cuh" + +ELEMENTWISE_DESCRIPTOR(div, nvidia) + +#endif // __DIV_CUDA_API_H__ diff --git a/src/infiniop/ops/div/operator.cc b/src/infiniop/ops/div/operator.cc new file mode 100644 index 000000000..84021a1af --- /dev/null +++ b/src/infiniop/ops/div/operator.cc @@ -0,0 +1,202 @@ +#include "../../operator.h" +#include "../../handle.h" +#include "infiniop/ops/div.h" + +#ifdef ENABLE_CPU_API +#include "cpu/div_cpu.h" +#endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API) +#include "nvidia/div_nvidia.cuh" +#endif +#ifdef ENABLE_METAX_API +#include "metax/div_metax.h" +#endif +#ifdef ENABLE_KUNLUN_API +#include "kunlun/div_kunlun.h" +#endif +#ifdef ENABLE_CAMBRICON_API +#include "bang/div_bang.h" +#endif +#ifdef ENABLE_MOORE_API +#include "moore/div_moore.h" +#endif + +__C infiniStatus_t infiniopCreateDivDescriptor( + infiniopHandle_t handle, + infiniopDivDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t c_desc, + infiniopTensorDescriptor_t a_desc, + infiniopTensorDescriptor_t b_desc) { + +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return op::div::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast<op::div::NAMESPACE::Descriptor **>(desc_ptr), \ + c_desc, \ + {a_desc, \ + b_desc}) + + switch (handle->device) { + +#ifdef ENABLE_CPU_API + CREATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CREATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CREATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + CREATE(INFINI_DEVICE_QY, nvidia); +#endif +#ifdef ENABLE_METAX_API + CREATE(INFINI_DEVICE_METAX, metax); +#endif +#ifdef ENABLE_KUNLUN_API + CREATE(INFINI_DEVICE_KUNLUN, kunlun); +#endif +#ifdef ENABLE_CAMBRICON_API + CREATE(INFINI_DEVICE_CAMBRICON, bang); +#endif +#ifdef ENABLE_MOORE_API + CREATE(INFINI_DEVICE_MOORE, moore); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CREATE +} + +__C infiniStatus_t infiniopGetDivWorkspaceSize(infiniopDivDescriptor_t desc, size_t *size) { + +#define GET(CASE, NAMESPACE) \ + case CASE: \ + *size = reinterpret_cast<op::div::NAMESPACE::Descriptor *>(desc)->workspaceSize(); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + GET(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + GET(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + GET(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + GET(INFINI_DEVICE_QY, nvidia); +#endif +#ifdef ENABLE_METAX_API + GET(INFINI_DEVICE_METAX, metax); +#endif +#ifdef ENABLE_KUNLUN_API + GET(INFINI_DEVICE_KUNLUN, kunlun); +#endif +#ifdef ENABLE_CAMBRICON_API + GET(INFINI_DEVICE_CAMBRICON, bang); +#endif +#ifdef ENABLE_MOORE_API + GET(INFINI_DEVICE_MOORE, moore); +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } +#undef GET + + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} + +__C infiniStatus_t infiniopDiv( + infiniopDivDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *c, + const void *a, + const void *b, + void *stream) { + +#define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ + return reinterpret_cast<const op::div::NAMESPACE::Descriptor *>(desc) \ + ->calculate(workspace, workspace_size, c, {a, b}, stream) + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + CALCULATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + CALCULATE(INFINI_DEVICE_QY, nvidia); +#endif +#ifdef ENABLE_METAX_API + CALCULATE(INFINI_DEVICE_METAX, metax); +#endif +#ifdef ENABLE_KUNLUN_API + CALCULATE(INFINI_DEVICE_KUNLUN, kunlun); +#endif +#ifdef ENABLE_CAMBRICON_API + CALCULATE(INFINI_DEVICE_CAMBRICON, bang); +#endif +#ifdef ENABLE_MOORE_API + CALCULATE(INFINI_DEVICE_MOORE, moore); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CALCULATE +} + +__C infiniStatus_t +infiniopDestroyDivDescriptor(infiniopDivDescriptor_t desc) { + +#define DELETE(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast<op::div::NAMESPACE::Descriptor *>(desc); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + DELETE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + DELETE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + DELETE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + DELETE(INFINI_DEVICE_QY, nvidia); +#endif +#ifdef ENABLE_METAX_API + DELETE(INFINI_DEVICE_METAX, metax); +#endif +#ifdef ENABLE_KUNLUN_API + DELETE(INFINI_DEVICE_KUNLUN, kunlun); +#endif +#ifdef ENABLE_CAMBRICON_API + DELETE(INFINI_DEVICE_CAMBRICON, bang); +#endif +#ifdef ENABLE_MOORE_API + DELETE(INFINI_DEVICE_MOORE, moore); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef DELETE +}
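For one enabled backend the dispatch macro expands to an ordinary switch case; for example, CREATE(INFINI_DEVICE_NVIDIA, nvidia) inside infiniopCreateDivDescriptor is equivalent to:

    case INFINI_DEVICE_NVIDIA:
        return op::div::nvidia::Descriptor::create(
            handle,
            reinterpret_cast<op::div::nvidia::Descriptor **>(desc_ptr),
            c_desc,
            {a_desc, b_desc});

The max, min, mod and pow operator.cc files below repeat this pattern verbatim with their own namespaces.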
diff --git a/src/infiniop/ops/max/cpu/max_cpu.cc b/src/infiniop/ops/max/cpu/max_cpu.cc new file mode 100644 index 000000000..1b30fa4e4 --- /dev/null +++ b/src/infiniop/ops/max/cpu/max_cpu.cc @@ -0,0 +1,50 @@ +#include "max_cpu.h" + +namespace op::max::cpu { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector<infiniopTensorDescriptor_t> input_desc_vec) { + + auto handle = reinterpret_cast<device::cpu::Handle *>(handle_); + auto dtype = out_desc->dtype(); + + const auto &a_desc = input_desc_vec.at(0); + const auto &b_desc = input_desc_vec.at(1); + const auto &c_shape = out_desc->shape(); + const auto &a_shape = a_desc->shape(); + const auto &b_shape = b_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32); + + CHECK_SAME_SHAPE(c_shape, a_shape, b_shape); + + // create CPU elementwise descriptor + CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec); + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector<const void *> inputs, + void *stream) const { + + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate<MaxOp, fp16_t>(_info, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate<MaxOp, float>(_info, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::max::cpu diff --git a/src/infiniop/ops/max/cpu/max_cpu.h b/src/infiniop/ops/max/cpu/max_cpu.h new file mode 100644 index 000000000..4d085ed39 --- /dev/null +++ b/src/infiniop/ops/max/cpu/max_cpu.h @@ -0,0 +1,20 @@ +#ifndef __MAX_CPU_H__ +#define __MAX_CPU_H__ + +#include "../../../elementwise/cpu/elementwise_cpu.h" +#include <algorithm> + +ELEMENTWISE_DESCRIPTOR(max, cpu) + +namespace op::max::cpu { +typedef struct MaxOp { +public: + static constexpr size_t num_inputs = 2; + template <typename T> + T operator()(const T &a, const T &b) const { + return std::max(a, b); + } +} MaxOp; +} // namespace op::max::cpu + +#endif // __MAX_CPU_H__
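One behavioral detail worth noting for max and min: std::max(a, b) returns b only when a < b, and the CUDA kernel below uses the analogous a > b ? a : b, so which operand a NaN reaches determines whether it propagates (all comparisons with NaN are false). fmaxf, by contrast, is defined to prefer the non-NaN operand. A small self-contained demonstration:

    #include <algorithm>
    #include <cmath>
    #include <cstdio>

    int main() {
        float nan = std::nanf("");
        // Comparison-based selection: the result depends on operand order.
        std::printf("std::max(NaN, 1) = %f\n", std::max(nan, 1.0f)); // nan
        std::printf("std::max(1, NaN) = %f\n", std::max(1.0f, nan)); // 1.0
        // IEEE fmax returns the non-NaN operand regardless of order.
        std::printf("fmaxf(NaN, 1)    = %f\n", fmaxf(nan, 1.0f));    // 1.0
    }

Since these kernels take the comparison-based path for F16 (and fmaxf/fminf only for scalar F32), NaN handling is not identical across dtypes; the tests sidestep this with finite random inputs.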
"../../../elementwise/cpu/elementwise_cpu.h" +#include + +ELEMENTWISE_DESCRIPTOR(max, cpu) + +namespace op::max::cpu { +typedef struct MaxOp { +public: + static constexpr size_t num_inputs = 2; + template + T operator()(const T &a, const T &b) const { + return std::max(a, b); + } +} MaxOp; +} // namespace op::max::cpu + +#endif // __MAX_CPU_H__ diff --git a/src/infiniop/ops/max/cuda/kernel.cuh b/src/infiniop/ops/max/cuda/kernel.cuh new file mode 100644 index 000000000..bf3977a31 --- /dev/null +++ b/src/infiniop/ops/max/cuda/kernel.cuh @@ -0,0 +1,23 @@ +#ifndef __MAX_CUDA_H__ +#define __MAX_CUDA_H__ + +namespace op::max::cuda { +typedef struct MaxOp { +public: + static constexpr size_t num_inputs = 2; + template + __device__ __forceinline__ T operator()(const T &a, const T &b) const { + if constexpr (std::is_same_v) { + return __hmax2(a, b); + } else if constexpr (std::is_same_v || std::is_same_v) { + return a > b ? a : b; + } else if constexpr (std::is_same_v) { + return fmaxf(a, b); + } else { + return a > b ? a : b; + } + } +} MaxOp; +} // namespace op::max::cuda + +#endif // __MAX_CUDA_H__ diff --git a/src/infiniop/ops/max/nvidia/max_nvidia.cu b/src/infiniop/ops/max/nvidia/max_nvidia.cu new file mode 100644 index 000000000..5e9fb13f4 --- /dev/null +++ b/src/infiniop/ops/max/nvidia/max_nvidia.cu @@ -0,0 +1,57 @@ +#include "../../../elementwise/nvidia/elementwise_nvidia.cuh" + +#include "../cuda/kernel.cuh" +#include "max_nvidia.cuh" + +namespace op::max::nvidia { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &a_desc = input_desc_vec.at(0); + const auto &b_desc = input_desc_vec.at(1); + const auto &c_shape = out_desc->shape(); + const auto &a_shape = a_desc->shape(); + const auto &b_shape = b_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32); + + CHECK_SAME_SHAPE(c_shape, a_shape, b_shape); + + // create CUDA elementwise descriptor + CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec) + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate<256, cuda::MaxOp, half>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate<256, cuda::MaxOp, float>(_info, workspace, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::max::nvidia diff --git a/src/infiniop/ops/max/nvidia/max_nvidia.cuh b/src/infiniop/ops/max/nvidia/max_nvidia.cuh new file mode 100644 index 000000000..b3b60dd2a --- /dev/null +++ b/src/infiniop/ops/max/nvidia/max_nvidia.cuh @@ -0,0 +1,8 @@ +#ifndef __MAX_CUDA_API_H__ +#define __MAX_CUDA_API_H__ + +#include "../../../elementwise/nvidia/elementwise_nvidia_api.cuh" + +ELEMENTWISE_DESCRIPTOR(max, nvidia) + +#endif // __MAX_CUDA_API_H__ diff --git a/src/infiniop/ops/max/operator.cc b/src/infiniop/ops/max/operator.cc new file mode 100644 index 000000000..e04368533 --- /dev/null +++ b/src/infiniop/ops/max/operator.cc @@ -0,0 +1,202 @@ +#include "../../operator.h" 
+#include "../../handle.h" +#include "infiniop/ops/max.h" + +#ifdef ENABLE_CPU_API +#include "cpu/max_cpu.h" +#endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API) +#include "nvidia/max_nvidia.cuh" +#endif +#ifdef ENABLE_METAX_API +#include "metax/max_metax.h" +#endif +#ifdef ENABLE_KUNLUN_API +#include "kunlun/max_kunlun.h" +#endif +#ifdef ENABLE_CAMBRICON_API +#include "bang/max_bang.h" +#endif +#ifdef ENABLE_MOORE_API +#include "moore/max_moore.h" +#endif + +__C infiniStatus_t infiniopCreateMaxDescriptor( + infiniopHandle_t handle, + infiniopMaxDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t c_desc, + infiniopTensorDescriptor_t a_desc, + infiniopTensorDescriptor_t b_desc) { + +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return op::max::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast(desc_ptr), \ + c_desc, \ + {a_desc, \ + b_desc}) + + switch (handle->device) { + +#ifdef ENABLE_CPU_API + CREATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CREATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CREATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + CREATE(INFINI_DEVICE_QY, nvidia); +#endif +#ifdef ENABLE_METAX_API + CREATE(INFINI_DEVICE_METAX, metax); +#endif +#ifdef ENABLE_KUNLUN_API + CREATE(INFINI_DEVICE_KUNLUN, kunlun); +#endif +#ifdef ENABLE_CAMBRICON_API + CREATE(INFINI_DEVICE_CAMBRICON, bang); +#endif +#ifdef ENABLE_MOORE_API + CREATE(INFINI_DEVICE_MOORE, moore); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CREATE +} + +__C infiniStatus_t infiniopGetMaxWorkspaceSize(infiniopMaxDescriptor_t desc, size_t *size) { + +#define GET(CASE, NAMESPACE) \ + case CASE: \ + *size = reinterpret_cast(desc)->workspaceSize(); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + GET(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + GET(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + GET(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + GET(INFINI_DEVICE_QY, nvidia); +#endif +#ifdef ENABLE_METAX_API + GET(INFINI_DEVICE_METAX, metax); +#endif +#ifdef ENABLE_KUNLUN_API + GET(INFINI_DEVICE_KUNLUN, kunlun); +#endif +#ifdef ENABLE_CAMBRICON_API + GET(INFINI_DEVICE_CAMBRICON, bang); +#endif +#ifdef ENABLE_MOORE_API + GET(INFINI_DEVICE_MOORE, moore); +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } +#undef GET + + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} + +__C infiniStatus_t infiniopMax( + infiniopMaxDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *c, + const void *a, + const void *b, + void *stream) { + +#define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ + return reinterpret_cast(desc) \ + ->calculate(workspace, workspace_size, c, {a, b}, stream) + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + CALCULATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + CALCULATE(INFINI_DEVICE_QY, nvidia); +#endif +#ifdef ENABLE_METAX_API + CALCULATE(INFINI_DEVICE_METAX, metax); +#endif +#ifdef ENABLE_KUNLUN_API + CALCULATE(INFINI_DEVICE_KUNLUN, kunlun); +#endif +#ifdef ENABLE_CAMBRICON_API + CALCULATE(INFINI_DEVICE_CAMBRICON, bang); +#endif +#ifdef ENABLE_MOORE_API + CALCULATE(INFINI_DEVICE_MOORE, moore); +#endif + + default: + return 
INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CALCULATE +} + +__C infiniStatus_t +infiniopDestroyMaxDescriptor(infiniopMaxDescriptor_t desc) { + +#define DELETE(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast<op::max::NAMESPACE::Descriptor *>(desc); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + DELETE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + DELETE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + DELETE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + DELETE(INFINI_DEVICE_QY, nvidia); +#endif +#ifdef ENABLE_METAX_API + DELETE(INFINI_DEVICE_METAX, metax); +#endif +#ifdef ENABLE_KUNLUN_API + DELETE(INFINI_DEVICE_KUNLUN, kunlun); +#endif +#ifdef ENABLE_CAMBRICON_API + DELETE(INFINI_DEVICE_CAMBRICON, bang); +#endif +#ifdef ENABLE_MOORE_API + DELETE(INFINI_DEVICE_MOORE, moore); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef DELETE +} diff --git a/src/infiniop/ops/min/cpu/min_cpu.cc b/src/infiniop/ops/min/cpu/min_cpu.cc new file mode 100644 index 000000000..dc30ee57f --- /dev/null +++ b/src/infiniop/ops/min/cpu/min_cpu.cc @@ -0,0 +1,50 @@ +#include "min_cpu.h" + +namespace op::min::cpu { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector<infiniopTensorDescriptor_t> input_desc_vec) { + + auto handle = reinterpret_cast<device::cpu::Handle *>(handle_); + auto dtype = out_desc->dtype(); + + const auto &a_desc = input_desc_vec.at(0); + const auto &b_desc = input_desc_vec.at(1); + const auto &c_shape = out_desc->shape(); + const auto &a_shape = a_desc->shape(); + const auto &b_shape = b_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32); + + CHECK_SAME_SHAPE(c_shape, a_shape, b_shape); + + // create CPU elementwise descriptor + CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec); + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector<const void *> inputs, + void *stream) const { + + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate<MinOp, fp16_t>(_info, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate<MinOp, float>(_info, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::min::cpu diff --git a/src/infiniop/ops/min/cpu/min_cpu.h b/src/infiniop/ops/min/cpu/min_cpu.h new file mode 100644 index 000000000..1c84d4fca --- /dev/null +++ b/src/infiniop/ops/min/cpu/min_cpu.h @@ -0,0 +1,20 @@ +#ifndef __MIN_CPU_H__ +#define __MIN_CPU_H__ + +#include "../../../elementwise/cpu/elementwise_cpu.h" +#include <algorithm> + +ELEMENTWISE_DESCRIPTOR(min, cpu) + +namespace op::min::cpu { +typedef struct MinOp { +public: + static constexpr size_t num_inputs = 2; + template <typename T> + T operator()(const T &a, const T &b) const { + return std::min(a, b); + } +} MinOp; +} // namespace op::min::cpu + +#endif // __MIN_CPU_H__
diff --git a/src/infiniop/ops/min/cuda/kernel.cuh b/src/infiniop/ops/min/cuda/kernel.cuh new file mode 100644 index 000000000..aac14a0e8 --- /dev/null +++ b/src/infiniop/ops/min/cuda/kernel.cuh @@ -0,0 +1,23 @@ +#ifndef __MIN_CUDA_H__ +#define __MIN_CUDA_H__ + +namespace op::min::cuda { +typedef struct MinOp { +public: + static constexpr size_t num_inputs = 2; + template <typename T> + __device__ __forceinline__ T operator()(const T &a, const T &b) const { + if constexpr (std::is_same_v<T, half2>) { + return __hmin2(a, b); + } else if constexpr (std::is_same_v<T, half> || std::is_same_v<T, cuda_bfloat16>) { + return a < b ? a : b; + } else if constexpr (std::is_same_v<T, float>) { + return fminf(a, b); + } else { + return a < b ? a : b; + } + } +} MinOp; +} // namespace op::min::cuda + +#endif // __MIN_CUDA_H__ diff --git a/src/infiniop/ops/min/nvidia/min_nvidia.cu b/src/infiniop/ops/min/nvidia/min_nvidia.cu new file mode 100644 index 000000000..419655e29 --- /dev/null +++ b/src/infiniop/ops/min/nvidia/min_nvidia.cu @@ -0,0 +1,57 @@ +#include "../../../elementwise/nvidia/elementwise_nvidia.cuh" + +#include "../cuda/kernel.cuh" +#include "min_nvidia.cuh" + +namespace op::min::nvidia { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector<infiniopTensorDescriptor_t> input_desc_vec) { + + auto handle = reinterpret_cast<device::nvidia::Handle *>(handle_); + auto dtype = out_desc->dtype(); + + const auto &a_desc = input_desc_vec.at(0); + const auto &b_desc = input_desc_vec.at(1); + const auto &c_shape = out_desc->shape(); + const auto &a_shape = a_desc->shape(); + const auto &b_shape = b_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32); + + CHECK_SAME_SHAPE(c_shape, a_shape, b_shape); + + // create CUDA elementwise descriptor + CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec) + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector<const void *> inputs, + void *stream) const { + + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate<256, cuda::MinOp, half>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate<256, cuda::MinOp, float>(_info, workspace, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::min::nvidia diff --git a/src/infiniop/ops/min/nvidia/min_nvidia.cuh b/src/infiniop/ops/min/nvidia/min_nvidia.cuh new file mode 100644 index 000000000..ada9a3545 --- /dev/null +++ b/src/infiniop/ops/min/nvidia/min_nvidia.cuh @@ -0,0 +1,8 @@ +#ifndef __MIN_CUDA_API_H__ +#define __MIN_CUDA_API_H__ + +#include "../../../elementwise/nvidia/elementwise_nvidia_api.cuh" + +ELEMENTWISE_DESCRIPTOR(min, nvidia) + +#endif // __MIN_CUDA_API_H__
diff --git a/src/infiniop/ops/min/operator.cc b/src/infiniop/ops/min/operator.cc new file mode 100644 index 000000000..8479feab4 --- /dev/null +++ b/src/infiniop/ops/min/operator.cc @@ -0,0 +1,202 @@ +#include "../../operator.h" +#include "../../handle.h" +#include "infiniop/ops/min.h" + +#ifdef ENABLE_CPU_API +#include "cpu/min_cpu.h" +#endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API) +#include "nvidia/min_nvidia.cuh" +#endif +#ifdef ENABLE_METAX_API +#include "metax/min_metax.h" +#endif +#ifdef ENABLE_KUNLUN_API +#include "kunlun/min_kunlun.h" +#endif +#ifdef ENABLE_CAMBRICON_API +#include "bang/min_bang.h" +#endif +#ifdef ENABLE_MOORE_API +#include "moore/min_moore.h" +#endif + +__C infiniStatus_t infiniopCreateMinDescriptor( + infiniopHandle_t handle, + infiniopMinDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t c_desc, + infiniopTensorDescriptor_t a_desc, + infiniopTensorDescriptor_t b_desc) { + +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return op::min::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast<op::min::NAMESPACE::Descriptor **>(desc_ptr), \ + c_desc, \ + {a_desc, \ + b_desc}) + + switch (handle->device) { + +#ifdef ENABLE_CPU_API + CREATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CREATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CREATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + CREATE(INFINI_DEVICE_QY, nvidia); +#endif +#ifdef ENABLE_METAX_API + CREATE(INFINI_DEVICE_METAX, metax); +#endif +#ifdef ENABLE_KUNLUN_API + CREATE(INFINI_DEVICE_KUNLUN, kunlun); +#endif +#ifdef ENABLE_CAMBRICON_API + CREATE(INFINI_DEVICE_CAMBRICON, bang); +#endif +#ifdef ENABLE_MOORE_API + CREATE(INFINI_DEVICE_MOORE, moore); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CREATE +} + +__C infiniStatus_t infiniopGetMinWorkspaceSize(infiniopMinDescriptor_t desc, size_t *size) { + +#define GET(CASE, NAMESPACE) \ + case CASE: \ + *size = reinterpret_cast<op::min::NAMESPACE::Descriptor *>(desc)->workspaceSize(); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + GET(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + GET(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + GET(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + GET(INFINI_DEVICE_QY, nvidia); +#endif +#ifdef ENABLE_METAX_API + GET(INFINI_DEVICE_METAX, metax); +#endif +#ifdef ENABLE_KUNLUN_API + GET(INFINI_DEVICE_KUNLUN, kunlun); +#endif +#ifdef ENABLE_CAMBRICON_API + GET(INFINI_DEVICE_CAMBRICON, bang); +#endif +#ifdef ENABLE_MOORE_API + GET(INFINI_DEVICE_MOORE, moore); +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } +#undef GET + + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} + +__C infiniStatus_t infiniopMin( + infiniopMinDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *c, + const void *a, + const void *b, + void *stream) { + +#define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ + return reinterpret_cast<const op::min::NAMESPACE::Descriptor *>(desc) \ + ->calculate(workspace, workspace_size, c, {a, b}, stream) + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + CALCULATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + CALCULATE(INFINI_DEVICE_QY, nvidia); +#endif +#ifdef ENABLE_METAX_API + CALCULATE(INFINI_DEVICE_METAX, metax); +#endif +#ifdef ENABLE_KUNLUN_API + CALCULATE(INFINI_DEVICE_KUNLUN, kunlun); +#endif +#ifdef ENABLE_CAMBRICON_API + CALCULATE(INFINI_DEVICE_CAMBRICON, bang); +#endif +#ifdef ENABLE_MOORE_API + CALCULATE(INFINI_DEVICE_MOORE, moore); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CALCULATE +} + +__C infiniStatus_t +infiniopDestroyMinDescriptor(infiniopMinDescriptor_t desc) { + +#define DELETE(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast<op::min::NAMESPACE::Descriptor *>(desc); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + DELETE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + DELETE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + DELETE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + DELETE(INFINI_DEVICE_QY, nvidia); +#endif +#ifdef ENABLE_METAX_API + DELETE(INFINI_DEVICE_METAX, metax); +#endif +#ifdef ENABLE_KUNLUN_API + DELETE(INFINI_DEVICE_KUNLUN, kunlun); +#endif +#ifdef ENABLE_CAMBRICON_API + DELETE(INFINI_DEVICE_CAMBRICON, bang); +#endif +#ifdef ENABLE_MOORE_API + DELETE(INFINI_DEVICE_MOORE, moore); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef DELETE +}
diff --git a/src/infiniop/ops/mod/cpu/mod_cpu.cc b/src/infiniop/ops/mod/cpu/mod_cpu.cc new file mode 100644 index 000000000..907d05166 --- /dev/null +++ b/src/infiniop/ops/mod/cpu/mod_cpu.cc @@ -0,0 +1,49 @@ +#include "mod_cpu.h" + +namespace op::mod::cpu { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector<infiniopTensorDescriptor_t> input_desc_vec) { + + auto handle = reinterpret_cast<device::cpu::Handle *>(handle_); + auto dtype = out_desc->dtype(); + + const auto &a_desc = input_desc_vec.at(0); + const auto &b_desc = input_desc_vec.at(1); + const auto &out_shape = out_desc->shape(); + const auto &a_shape = a_desc->shape(); + const auto &b_shape = b_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32); + + CHECK_SAME_SHAPE(out_shape, a_shape, b_shape); + + // create CPU elementwise descriptor + CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec); + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector<const void *> inputs, + void *stream) const { + + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate<ModOp, fp16_t>(_info, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate<ModOp, float>(_info, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + return INFINI_STATUS_SUCCESS; +} +} // namespace op::mod::cpu diff --git a/src/infiniop/ops/mod/cpu/mod_cpu.h b/src/infiniop/ops/mod/cpu/mod_cpu.h new file mode 100644 index 000000000..9e78adca6 --- /dev/null +++ b/src/infiniop/ops/mod/cpu/mod_cpu.h @@ -0,0 +1,23 @@ +#ifndef __MOD_CPU_H__ +#define __MOD_CPU_H__ + +#include "../../../elementwise/cpu/elementwise_cpu.h" + +ELEMENTWISE_DESCRIPTOR(mod, cpu) + +namespace op::mod::cpu { +typedef struct ModOp { +public: + static constexpr size_t num_inputs = 2; + template <typename T> + T operator()(const T &a, const T &b) const { + if constexpr (std::is_floating_point_v<T>) { + return std::fmod(a, b); + } else { + return a % b; + } + } +} ModOp; +} // namespace op::mod::cpu + +#endif // __MOD_CPU_H__ diff --git a/src/infiniop/ops/mod/cuda/kernel.cuh b/src/infiniop/ops/mod/cuda/kernel.cuh new file mode 100644 index 000000000..0dcb54136 --- /dev/null +++ b/src/infiniop/ops/mod/cuda/kernel.cuh @@ -0,0 +1,30 @@ +#ifndef __MOD_CUDA_H__ +#define __MOD_CUDA_H__ + +#include <cuda_fp16.h> +#include <cmath> + +namespace op::mod::cuda { +typedef struct ModOp { +public: + static constexpr size_t num_inputs = 2; + template <typename T> + __device__ __forceinline__ T operator()(const T &a, const T &b) const { + if constexpr (std::is_same_v<T, half2>) { + float2 a_f2 = __half22float2(a); + float2 b_f2 = __half22float2(b); + return __float22half2_rn(make_float2(std::fmod(a_f2.x, b_f2.x), std::fmod(a_f2.y, b_f2.y))); + } else if constexpr (std::is_same_v<T, half>) { + float a_ = __half2float(a); + float b_ = __half2float(b); + return __float2half(std::fmod(a_, b_)); + } else if constexpr (std::is_floating_point_v<T>) { + return std::fmod(a, b); + } else { + return a % b; + } + } +} ModOp; +} // namespace op::mod::cuda + +#endif // __MOD_CUDA_H__
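Both mod backends implement C-style fmod semantics for floating-point input: the result takes the sign of the dividend (truncated division). That matches torch.fmod, but differs from Python's % operator and torch.remainder, which use floored division. A quick self-contained check of the distinction:

    #include <cmath>
    #include <cstdio>

    int main() {
        // Truncated remainder (fmod): the sign follows the dividend.
        std::printf("fmod(-7, 3) = %f\n", std::fmod(-7.0f, 3.0f)); // -1.0
        std::printf("fmod(7, -3) = %f\n", std::fmod(7.0f, -3.0f)); //  1.0
        // A floored remainder (Python %, torch.remainder) would give 2 and -2.
    }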
"../../../elementwise/nvidia/elementwise_nvidia.cuh" + +#include "../cuda/kernel.cuh" +#include "mod_nvidia.cuh" + +namespace op::mod::nvidia { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &a_desc = input_desc_vec.at(0); + const auto &b_desc = input_desc_vec.at(1); + const auto &c_shape = out_desc->shape(); + const auto &a_shape = a_desc->shape(); + const auto &b_shape = b_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32); + + CHECK_SAME_SHAPE(c_shape, a_shape, b_shape); + + // create CUDA elementwise descriptor + CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec) + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate<256, cuda::ModOp, half>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate<256, cuda::ModOp, float>(_info, workspace, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::mod::nvidia diff --git a/src/infiniop/ops/mod/nvidia/mod_nvidia.cuh b/src/infiniop/ops/mod/nvidia/mod_nvidia.cuh new file mode 100644 index 000000000..31788cfd2 --- /dev/null +++ b/src/infiniop/ops/mod/nvidia/mod_nvidia.cuh @@ -0,0 +1,8 @@ +#ifndef __MOD_CUDA_API_H__ +#define __MOD_CUDA_API_H__ + +#include "../../../elementwise/nvidia/elementwise_nvidia_api.cuh" + +ELEMENTWISE_DESCRIPTOR(mod, nvidia) + +#endif // __MOD_CUDA_API_H__ diff --git a/src/infiniop/ops/mod/operator.cc b/src/infiniop/ops/mod/operator.cc new file mode 100644 index 000000000..85810e794 --- /dev/null +++ b/src/infiniop/ops/mod/operator.cc @@ -0,0 +1,142 @@ +#include "../../operator.h" +#include "../../handle.h" +#include "infiniop/ops/mod.h" + +#ifdef ENABLE_CPU_API +#include "cpu/mod_cpu.h" +#endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API) +#include "nvidia/mod_nvidia.cuh" +#endif + +__C infiniStatus_t infiniopCreateModDescriptor( + infiniopHandle_t handle, + infiniopModDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t c_desc, + infiniopTensorDescriptor_t a_desc, + infiniopTensorDescriptor_t b_desc) { + +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return op::mod::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast(desc_ptr), \ + c_desc, \ + {a_desc, \ + b_desc}) + + switch (handle->device) { + +#ifdef ENABLE_CPU_API + CREATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CREATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CREATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + CREATE(INFINI_DEVICE_QY, nvidia); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CREATE +} + +__C infiniStatus_t infiniopGetModWorkspaceSize(infiniopModDescriptor_t desc, size_t *size) { + +#define GET(CASE, NAMESPACE) \ + case CASE: \ + *size = reinterpret_cast(desc)->workspaceSize(); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + 
diff --git a/src/infiniop/ops/pow/cpu/pow_cpu.cc b/src/infiniop/ops/pow/cpu/pow_cpu.cc new file mode 100644 index 000000000..0c6fda0f7 --- /dev/null +++ b/src/infiniop/ops/pow/cpu/pow_cpu.cc @@ -0,0 +1,49 @@ +#include "pow_cpu.h" + +namespace op::pow::cpu { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector<infiniopTensorDescriptor_t> input_desc_vec) { + + auto handle = reinterpret_cast<device::cpu::Handle *>(handle_); + auto dtype = out_desc->dtype(); + + const auto &a_desc = input_desc_vec.at(0); + const auto &b_desc = input_desc_vec.at(1); + const auto &out_shape = out_desc->shape(); + const auto &a_shape = a_desc->shape(); + const auto &b_shape = b_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32); + + CHECK_SAME_SHAPE(out_shape, a_shape, b_shape); + + // create CPU elementwise descriptor + CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec); + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector<const void *> inputs, + void *stream) const { + + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate<PowOp, fp16_t>(_info, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate<PowOp, float>(_info, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + return INFINI_STATUS_SUCCESS; +} +} // namespace op::pow::cpu diff --git a/src/infiniop/ops/pow/cpu/pow_cpu.h b/src/infiniop/ops/pow/cpu/pow_cpu.h new file mode 100644 index 000000000..21d9bb897 --- /dev/null +++ b/src/infiniop/ops/pow/cpu/pow_cpu.h @@ -0,0 +1,19 @@ +#ifndef __POW_CPU_H__ +#define __POW_CPU_H__ + +#include "../../../elementwise/cpu/elementwise_cpu.h" +
+ELEMENTWISE_DESCRIPTOR(pow, cpu) + +namespace op::pow::cpu { +typedef struct PowOp { +public: + static constexpr size_t num_inputs = 2; + template <typename T> + T operator()(const T &a, const T &b) const { + return std::pow(a, b); + } +} PowOp; +} // namespace op::pow::cpu + +#endif // __POW_CPU_H__ diff --git a/src/infiniop/ops/pow/cuda/kernel.cuh b/src/infiniop/ops/pow/cuda/kernel.cuh new file mode 100644 index 000000000..e8b5324a0 --- /dev/null +++ b/src/infiniop/ops/pow/cuda/kernel.cuh @@ -0,0 +1,40 @@ +#ifndef __POW_CUDA_H__ +#define __POW_CUDA_H__ + +#include <cuda_bf16.h> +#include <cuda_fp16.h> +#include <cmath> + +namespace op::pow::cuda { +typedef struct PowOp { + static constexpr size_t num_inputs = 2; + template <typename T> + __device__ __forceinline__ T operator()(const T &a, const T &b) const { + if constexpr (std::is_same_v<T, half2>) { + float2 a_f2 = __half22float2(a); + float2 b_f2 = __half22float2(b); + return __float22half2_rn(make_float2(__powf(a_f2.x, b_f2.x), __powf(a_f2.y, b_f2.y))); + } else if constexpr (std::is_same_v<T, half>) { + float a_ = __half2float(a); + float b_ = __half2float(b); + float ans_f = __powf(a_, b_); + return __float2half(isnan(ans_f) ? std::pow(a_, b_) : ans_f); + } else if constexpr (std::is_same_v<T, cuda_bfloat162>) { + float2 a_f2 = __bfloat1622float2(a); + float2 b_f2 = __bfloat1622float2(b); + return __floats2bfloat162_rn(__powf(a_f2.x, b_f2.x), __powf(a_f2.y, b_f2.y)); + } else if constexpr (std::is_same_v<T, cuda_bfloat16>) { + float a_ = __bfloat162float(a); + float b_ = __bfloat162float(b); + return __float2bfloat16_rn(__powf(a_, b_)); + } else if constexpr (std::is_same_v<T, float>) { + return __powf(a, b); + } else { + return std::pow(a, b); + } + } +} PowOp; + +} // namespace op::pow::cuda + +#endif // __POW_CUDA_H__ diff --git a/src/infiniop/ops/pow/nvidia/pow_nvidia.cu b/src/infiniop/ops/pow/nvidia/pow_nvidia.cu new file mode 100644 index 000000000..3cfd0cd2f --- /dev/null +++ b/src/infiniop/ops/pow/nvidia/pow_nvidia.cu @@ -0,0 +1,57 @@ +#include "../../../elementwise/nvidia/elementwise_nvidia.cuh" + +#include "../cuda/kernel.cuh" +#include "pow_nvidia.cuh" + +namespace op::pow::nvidia { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector<infiniopTensorDescriptor_t> input_desc_vec) { + + auto handle = reinterpret_cast<device::nvidia::Handle *>(handle_); + auto dtype = out_desc->dtype(); + + const auto &a_desc = input_desc_vec.at(0); + const auto &b_desc = input_desc_vec.at(1); + const auto &c_shape = out_desc->shape(); + const auto &a_shape = a_desc->shape(); + const auto &b_shape = b_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32); + + CHECK_SAME_SHAPE(c_shape, a_shape, b_shape); + + // create CUDA elementwise descriptor + CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec) + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector<const void *> inputs, + void *stream) const { + + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate<256, cuda::PowOp, half>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate<256, cuda::PowOp, float>(_info, workspace, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::pow::nvidia
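The half path in the kernel above falls back to std::pow whenever __powf returns NaN. __powf is CUDA's fast-math intrinsic, documented as __exp2f(b * __log2f(a)), so it yields NaN for negative bases even when the exact result is well defined, e.g. (-2)^2. A host-side analogue of that guard, using exp2/log2 in place of the device-only intrinsic purely for illustration:

    #include <cmath>
    #include <cstdio>

    // Mirrors the kernel's guard: try a fast approximate pow first, and fall
    // back to the exact library routine when it produces NaN.
    static float pow_with_fallback(float a, float b) {
        float fast = std::exp2(b * std::log2(a)); // NaN whenever a < 0
        return std::isnan(fast) ? std::pow(a, b) : fast;
    }

    int main() {
        std::printf("%f\n", pow_with_fallback(2.0f, 3.0f));  // 8.0 via the fast path
        std::printf("%f\n", pow_with_fallback(-2.0f, 2.0f)); // 4.0 via the std::pow fallback
    }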
diff --git a/src/infiniop/ops/pow/nvidia/pow_nvidia.cuh b/src/infiniop/ops/pow/nvidia/pow_nvidia.cuh new file mode 100644 index 000000000..5bbb2fb8c --- /dev/null +++ b/src/infiniop/ops/pow/nvidia/pow_nvidia.cuh @@ -0,0 +1,8 @@ +#ifndef __POW_CUDA_API_H__ +#define __POW_CUDA_API_H__ + +#include "../../../elementwise/nvidia/elementwise_nvidia_api.cuh" + +ELEMENTWISE_DESCRIPTOR(pow, nvidia) + +#endif // __POW_CUDA_API_H__ diff --git a/src/infiniop/ops/pow/operator.cc b/src/infiniop/ops/pow/operator.cc new file mode 100644 index 000000000..e90639f67 --- /dev/null +++ b/src/infiniop/ops/pow/operator.cc @@ -0,0 +1,142 @@ +#include "../../operator.h" +#include "../../handle.h" +#include "infiniop/ops/pow.h" + +#ifdef ENABLE_CPU_API +#include "cpu/pow_cpu.h" +#endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API) +#include "nvidia/pow_nvidia.cuh" +#endif + +__C infiniStatus_t infiniopCreatePowDescriptor( + infiniopHandle_t handle, + infiniopPowDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t c_desc, + infiniopTensorDescriptor_t a_desc, + infiniopTensorDescriptor_t b_desc) { + +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return op::pow::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast<op::pow::NAMESPACE::Descriptor **>(desc_ptr), \ + c_desc, \ + {a_desc, \ + b_desc}) + + switch (handle->device) { + +#ifdef ENABLE_CPU_API + CREATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CREATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CREATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + CREATE(INFINI_DEVICE_QY, nvidia); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CREATE +} + +__C infiniStatus_t infiniopGetPowWorkspaceSize(infiniopPowDescriptor_t desc, size_t *size) { + +#define GET(CASE, NAMESPACE) \ + case CASE: \ + *size = reinterpret_cast<op::pow::NAMESPACE::Descriptor *>(desc)->workspaceSize(); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + GET(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + GET(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + GET(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + GET(INFINI_DEVICE_QY, nvidia); +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } +#undef GET + + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} + +__C infiniStatus_t infiniopPow( + infiniopPowDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *c, + const void *a, + const void *b, + void *stream) { + +#define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ + return reinterpret_cast<const op::pow::NAMESPACE::Descriptor *>(desc) \ + ->calculate(workspace, workspace_size, c, {a, b}, stream) + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + CALCULATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + CALCULATE(INFINI_DEVICE_QY, nvidia); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CALCULATE +} + +__C infiniStatus_t +infiniopDestroyPowDescriptor(infiniopPowDescriptor_t desc) { + +#define DELETE(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast<op::pow::NAMESPACE::Descriptor *>(desc); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + DELETE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + DELETE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + DELETE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API +
DELETE(INFINI_DEVICE_QY, nvidia); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef DELETE +} diff --git a/test/infiniop/div.py b/test/infiniop/div.py new file mode 100644 index 000000000..17b22b2e5 --- /dev/null +++ b/test/infiniop/div.py @@ -0,0 +1,192 @@ +import torch +import ctypes +from ctypes import c_uint64 +from libinfiniop import ( + LIBINFINIOP, + TestTensor, + get_test_devices, + check_error, + test_operator, + get_args, + debug, + get_tolerance, + profile_operation, + TestWorkspace, + InfiniDtype, + InfiniDtypeNames, + InfiniDeviceNames, + infiniopOperatorDescriptor_t, +) +from enum import Enum, auto + +# ============================================================================== +# Configuration (Internal Use Only) +# ============================================================================== +# These are not meant to be imported from other modules +_TEST_CASES_ = [ + # shape, a_stride, b_stride, c_stride + ((13, 4), None, None, None), + ((13, 4), (10, 1), (10, 1), (10, 1)), + ((13, 4), (0, 1), None, None), + ((13, 4, 4), None, None, None), + ((13, 4, 4), (20, 4, 1), (20, 4, 1), (20, 4, 1)), + ((13, 4, 4), (4, 0, 1), (0, 4, 1), None), + ((16, 5632), None, None, None), + ((16, 5632), (13312, 1), (13312, 1), (13312, 1)), + ((13, 16, 2), (128, 4, 1), (0, 2, 1), (64, 4, 1)), + ((13, 16, 2), (128, 4, 1), (2, 0, 1), (64, 4, 1)), + ((4, 4, 5632), None, None, None), + ((4, 4, 5632), (45056, 5632, 1), (45056, 5632, 1), (45056, 5632, 1)), +] + + +class Inplace(Enum): + OUT_OF_PLACE = auto() + INPLACE_A = auto() + INPLACE_B = auto() + + +# Inplace options applied for each test case in _TEST_CASES_ +_INPLACE = [ + Inplace.OUT_OF_PLACE, + Inplace.INPLACE_A, + Inplace.INPLACE_B, +] + +# Form the test cases by appending each element of _INPLACE to each tuple in _TEST_CASES_ +_TEST_CASES = [ + test_case + (inplace_item,) + for test_case in _TEST_CASES_ + for inplace_item in _INPLACE +] + +# Data types used for testing (matching old operators library: only F16 and F32) +_TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.F32] + +# Tolerance map for different data types +# Note: F32 tolerance is relaxed compared to theoretical precision due to: +# - Old operators library uses vectorized operations (pack_size=4) with vecN +# - InfiniCore uses elementwise operations, which can cause 1 ULP differences +# - This is acceptable as it's within floating-point precision limits +_TOLERANCE_MAP = { + InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3}, + InfiniDtype.F32: {"atol": 1e-6, "rtol": 1e-6}, # Relaxed from 1e-7 to accommodate vectorization differences +} + +DEBUG = False +PROFILE = False +NUM_PRERUN = 10 +NUM_ITERATIONS = 1000 + + +def div(c, a, b): + # Only support F16 and F32 (matching old operators library) + torch.div(a, b, out=c) + + +def test( + handle, + device, + shape, + a_stride=None, + b_stride=None, + c_stride=None, + inplace=Inplace.OUT_OF_PLACE, + dtype=InfiniDtype.F16, + sync=None, +): + a = TestTensor(shape, a_stride, dtype, device) + # For division, ensure b doesn't contain zeros to avoid division by zero + # Similar to old test: b = torch.rand(...) 
* 2, which gives range [0, 2) + # Use scale=2 to ensure values are in [0, 2) range, then add small bias to avoid zero + b = TestTensor(shape, b_stride, dtype, device, scale=2, bias=0.1) + + if inplace == Inplace.INPLACE_A: + if c_stride is not None and c_stride != a_stride: + return + c = a + elif inplace == Inplace.INPLACE_B: + if c_stride is not None and c_stride != b_stride: + return + c = b + else: + c = TestTensor(shape, c_stride, dtype, device) + + if c.is_broadcast(): + return + + print( + f"Testing Div on {InfiniDeviceNames[device]} with shape:{shape} a_stride:{a_stride} b_stride:{b_stride} c_stride:{c_stride} " + f"dtype:{InfiniDtypeNames[dtype]} inplace:{inplace}" + ) + div(c.torch_tensor(), a.torch_tensor(), b.torch_tensor()) + + if sync is not None: + sync() + + descriptor = infiniopOperatorDescriptor_t() + check_error( + LIBINFINIOP.infiniopCreateDivDescriptor( + handle, + ctypes.byref(descriptor), + c.descriptor, + a.descriptor, + b.descriptor, + ) + ) + + # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel + for tensor in [a, b, c]: + tensor.destroy_desc() + + workspace_size = c_uint64(0) + check_error( + LIBINFINIOP.infiniopGetDivWorkspaceSize( + descriptor, ctypes.byref(workspace_size) + ) + ) + workspace = TestWorkspace(workspace_size.value, device) + + def lib_div(): + check_error( + LIBINFINIOP.infiniopDiv( + descriptor, + workspace.data(), + workspace_size.value, + c.data(), + a.data(), + b.data(), + None, + ) + ) + + lib_div() + if sync is not None: + sync() + + atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype) + if DEBUG: + debug(c.actual_tensor(), c.torch_tensor(), atol=atol, rtol=rtol) + assert torch.allclose(c.actual_tensor(), c.torch_tensor(), atol=atol, rtol=rtol, equal_nan=True) + + # Profiling workflow + if PROFILE: + # fmt: off + profile_operation("PyTorch", lambda: div(c.torch_tensor(), a.torch_tensor(), b.torch_tensor()), device, NUM_PRERUN, NUM_ITERATIONS) + profile_operation(" lib", lambda: lib_div(), device, NUM_PRERUN, NUM_ITERATIONS) + # fmt: on + check_error(LIBINFINIOP.infiniopDestroyDivDescriptor(descriptor)) + + +if __name__ == "__main__": + args = get_args() + # Configure testing options + DEBUG = args.debug + PROFILE = args.profile + NUM_PRERUN = args.num_prerun + NUM_ITERATIONS = args.num_iterations + + for device in get_test_devices(args): + test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES) + + print("\033[92mTest passed!\033[0m") diff --git a/test/infiniop/libinfiniop/op_register.py b/test/infiniop/libinfiniop/op_register.py index 618be2b05..a61cea018 100644 --- a/test/infiniop/libinfiniop/op_register.py +++ b/test/infiniop/libinfiniop/op_register.py @@ -269,6 +269,176 @@ def mul_(lib): ] +@OpRegister.operator +def pow_(lib): + lib.infiniopCreatePowDescriptor.restype = c_int32 + lib.infiniopCreatePowDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopOperatorDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + ] + + lib.infiniopGetPowWorkspaceSize.restype = c_int32 + lib.infiniopGetPowWorkspaceSize.argtypes = [ + infiniopOperatorDescriptor_t, + POINTER(c_size_t), + ] + + lib.infiniopPow.restype = c_int32 + lib.infiniopPow.argtypes = [ + infiniopOperatorDescriptor_t, + c_void_p, + c_size_t, + c_void_p, + c_void_p, + c_void_p, + c_void_p, + ] + + lib.infiniopDestroyPowDescriptor.restype = c_int32 + lib.infiniopDestroyPowDescriptor.argtypes = [ + infiniopOperatorDescriptor_t, + ] + + +@OpRegister.operator +def div_(lib): 
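+    # Same create/workspace/run/destroy quartet as pow_ above; the argtypes +    # mirror the C prototypes declared in include/infiniop/ops/div.h.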
+ lib.infiniopCreateDivDescriptor.restype = c_int32 + lib.infiniopCreateDivDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopOperatorDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + ] + + lib.infiniopGetDivWorkspaceSize.restype = c_int32 + lib.infiniopGetDivWorkspaceSize.argtypes = [ + infiniopOperatorDescriptor_t, + POINTER(c_size_t), + ] + + lib.infiniopDiv.restype = c_int32 + lib.infiniopDiv.argtypes = [ + infiniopOperatorDescriptor_t, + c_void_p, + c_size_t, + c_void_p, + c_void_p, + c_void_p, + c_void_p, + ] + + lib.infiniopDestroyDivDescriptor.restype = c_int32 + lib.infiniopDestroyDivDescriptor.argtypes = [ + infiniopOperatorDescriptor_t, + ] + + +@OpRegister.operator +def mod_(lib): + lib.infiniopCreateModDescriptor.restype = c_int32 + lib.infiniopCreateModDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopOperatorDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + ] + + lib.infiniopGetModWorkspaceSize.restype = c_int32 + lib.infiniopGetModWorkspaceSize.argtypes = [ + infiniopOperatorDescriptor_t, + POINTER(c_size_t), + ] + + lib.infiniopMod.restype = c_int32 + lib.infiniopMod.argtypes = [ + infiniopOperatorDescriptor_t, + c_void_p, + c_size_t, + c_void_p, + c_void_p, + c_void_p, + c_void_p, + ] + + lib.infiniopDestroyModDescriptor.restype = c_int32 + lib.infiniopDestroyModDescriptor.argtypes = [ + infiniopOperatorDescriptor_t, + ] + + +@OpRegister.operator +def max_(lib): + lib.infiniopCreateMaxDescriptor.restype = c_int32 + lib.infiniopCreateMaxDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopOperatorDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + ] + + lib.infiniopGetMaxWorkspaceSize.restype = c_int32 + lib.infiniopGetMaxWorkspaceSize.argtypes = [ + infiniopOperatorDescriptor_t, + POINTER(c_size_t), + ] + + lib.infiniopMax.restype = c_int32 + lib.infiniopMax.argtypes = [ + infiniopOperatorDescriptor_t, + c_void_p, + c_size_t, + c_void_p, + c_void_p, + c_void_p, + c_void_p, + ] + + lib.infiniopDestroyMaxDescriptor.restype = c_int32 + lib.infiniopDestroyMaxDescriptor.argtypes = [ + infiniopOperatorDescriptor_t, + ] + + +@OpRegister.operator +def min_(lib): + lib.infiniopCreateMinDescriptor.restype = c_int32 + lib.infiniopCreateMinDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopOperatorDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + ] + + lib.infiniopGetMinWorkspaceSize.restype = c_int32 + lib.infiniopGetMinWorkspaceSize.argtypes = [ + infiniopOperatorDescriptor_t, + POINTER(c_size_t), + ] + + lib.infiniopMin.restype = c_int32 + lib.infiniopMin.argtypes = [ + infiniopOperatorDescriptor_t, + c_void_p, + c_size_t, + c_void_p, + c_void_p, + c_void_p, + c_void_p, + ] + + lib.infiniopDestroyMinDescriptor.restype = c_int32 + lib.infiniopDestroyMinDescriptor.argtypes = [ + infiniopOperatorDescriptor_t, + ] + + @OpRegister.operator def random_sample_(lib): lib.infiniopCreateRandomSampleDescriptor.restype = c_int32 diff --git a/test/infiniop/max.py b/test/infiniop/max.py new file mode 100644 index 000000000..e4221cf3e --- /dev/null +++ b/test/infiniop/max.py @@ -0,0 +1,189 @@ +import torch +import ctypes +from ctypes import c_uint64 +from libinfiniop import ( + LIBINFINIOP, + TestTensor, + get_test_devices, + check_error, + test_operator, + get_args, + debug, + get_tolerance, + profile_operation, + TestWorkspace, + 
InfiniDtype, + InfiniDtypeNames, + InfiniDeviceNames, + infiniopOperatorDescriptor_t, +) +from enum import Enum, auto + +# ============================================================================== +# Configuration (Internal Use Only) +# ============================================================================== +# These are not meant to be imported from other modules +_TEST_CASES_ = [ + # shape, a_stride, b_stride, c_stride + ((13, 4), None, None, None), + ((13, 4), (10, 1), (10, 1), (10, 1)), + ((13, 4), (0, 1), None, None), + ((13, 4, 4), None, None, None), + ((13, 4, 4), (20, 4, 1), (20, 4, 1), (20, 4, 1)), + ((13, 4, 4), (4, 0, 1), (0, 4, 1), None), + ((16, 5632), None, None, None), + ((16, 5632), (13312, 1), (13312, 1), (13312, 1)), + ((13, 16, 2), (128, 4, 1), (0, 2, 1), (64, 4, 1)), + ((13, 16, 2), (128, 4, 1), (2, 0, 1), (64, 4, 1)), + ((4, 4, 5632), None, None, None), + ((4, 4, 5632), (45056, 5632, 1), (45056, 5632, 1), (45056, 5632, 1)), +] + + +class Inplace(Enum): + OUT_OF_PLACE = auto() + INPLACE_A = auto() + INPLACE_B = auto() + + +# Inplace options applied for each test case in _TEST_CASES_ +_INPLACE = [ + Inplace.OUT_OF_PLACE, + Inplace.INPLACE_A, + Inplace.INPLACE_B, +] + +# Form the test cases by appending each element of _INPLACE to each tuple in _TEST_CASES_ +_TEST_CASES = [ + test_case + (inplace_item,) + for test_case in _TEST_CASES_ + for inplace_item in _INPLACE +] + +# Data types used for testing (matching old operators library: only F16 and F32) +_TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.F32] + +# Tolerance map for different data types +# Note: F32 tolerance is relaxed compared to theoretical precision due to: +# - Old operators library uses vectorized operations (pack_size=4) with vecN +# - InfiniCore uses elementwise operations, which can cause 1 ULP differences +# - This is acceptable as it's within floating-point precision limits +_TOLERANCE_MAP = { + InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3}, + InfiniDtype.F32: {"atol": 1e-6, "rtol": 1e-6}, # Relaxed from 1e-7 to accommodate vectorization differences +} + +DEBUG = False +PROFILE = False +NUM_PRERUN = 10 +NUM_ITERATIONS = 1000 + + +def max(c, a, b): + # Only support F16 and F32 (matching old operators library) + torch.maximum(a, b, out=c) + + +def test( + handle, + device, + shape, + a_stride=None, + b_stride=None, + c_stride=None, + inplace=Inplace.OUT_OF_PLACE, + dtype=InfiniDtype.F16, + sync=None, +): + a = TestTensor(shape, a_stride, dtype, device) + b = TestTensor(shape, b_stride, dtype, device) + + if inplace == Inplace.INPLACE_A: + if c_stride is not None and c_stride != a_stride: + return + c = a + elif inplace == Inplace.INPLACE_B: + if c_stride is not None and c_stride != b_stride: + return + c = b + else: + c = TestTensor(shape, c_stride, dtype, device) + + if c.is_broadcast(): + return + + print( + f"Testing Max on {InfiniDeviceNames[device]} with shape:{shape} a_stride:{a_stride} b_stride:{b_stride} c_stride:{c_stride} " + f"dtype:{InfiniDtypeNames[dtype]} inplace:{inplace}" + ) + max(c.torch_tensor(), a.torch_tensor(), b.torch_tensor()) + + if sync is not None: + sync() + + descriptor = infiniopOperatorDescriptor_t() + check_error( + LIBINFINIOP.infiniopCreateMaxDescriptor( + handle, + ctypes.byref(descriptor), + c.descriptor, + a.descriptor, + b.descriptor, + ) + ) + + # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel + for tensor in [a, b, c]: + tensor.destroy_desc() + + workspace_size = c_uint64(0) + check_error( + 
LIBINFINIOP.infiniopGetMaxWorkspaceSize( + descriptor, ctypes.byref(workspace_size) + ) + ) + workspace = TestWorkspace(workspace_size.value, device) + + def lib_max(): + check_error( + LIBINFINIOP.infiniopMax( + descriptor, + workspace.data(), + workspace_size.value, + c.data(), + a.data(), + b.data(), + None, + ) + ) + + lib_max() + if sync is not None: + sync() + + atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype) + if DEBUG: + debug(c.actual_tensor(), c.torch_tensor(), atol=atol, rtol=rtol) + assert torch.allclose(c.actual_tensor(), c.torch_tensor(), atol=atol, rtol=rtol, equal_nan=True) + + # Profiling workflow + if PROFILE: + # fmt: off + profile_operation("PyTorch", lambda: max(c.torch_tensor(), a.torch_tensor(), b.torch_tensor()), device, NUM_PRERUN, NUM_ITERATIONS) + profile_operation(" lib", lambda: lib_max(), device, NUM_PRERUN, NUM_ITERATIONS) + # fmt: on + check_error(LIBINFINIOP.infiniopDestroyMaxDescriptor(descriptor)) + + +if __name__ == "__main__": + args = get_args() + # Configure testing options + DEBUG = args.debug + PROFILE = args.profile + NUM_PRERUN = args.num_prerun + NUM_ITERATIONS = args.num_iterations + + for device in get_test_devices(args): + test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES) + + print("\033[92mTest passed!\033[0m") diff --git a/test/infiniop/min.py b/test/infiniop/min.py new file mode 100644 index 000000000..19f52a334 --- /dev/null +++ b/test/infiniop/min.py @@ -0,0 +1,189 @@ +import torch +import ctypes +from ctypes import c_uint64 +from libinfiniop import ( + LIBINFINIOP, + TestTensor, + get_test_devices, + check_error, + test_operator, + get_args, + debug, + get_tolerance, + profile_operation, + TestWorkspace, + InfiniDtype, + InfiniDtypeNames, + InfiniDeviceNames, + infiniopOperatorDescriptor_t, +) +from enum import Enum, auto + +# ============================================================================== +# Configuration (Internal Use Only) +# ============================================================================== +# These are not meant to be imported from other modules +_TEST_CASES_ = [ + # shape, a_stride, b_stride, c_stride + ((13, 4), None, None, None), + ((13, 4), (10, 1), (10, 1), (10, 1)), + ((13, 4), (0, 1), None, None), + ((13, 4, 4), None, None, None), + ((13, 4, 4), (20, 4, 1), (20, 4, 1), (20, 4, 1)), + ((13, 4, 4), (4, 0, 1), (0, 4, 1), None), + ((16, 5632), None, None, None), + ((16, 5632), (13312, 1), (13312, 1), (13312, 1)), + ((13, 16, 2), (128, 4, 1), (0, 2, 1), (64, 4, 1)), + ((13, 16, 2), (128, 4, 1), (2, 0, 1), (64, 4, 1)), + ((4, 4, 5632), None, None, None), + ((4, 4, 5632), (45056, 5632, 1), (45056, 5632, 1), (45056, 5632, 1)), +] + + +class Inplace(Enum): + OUT_OF_PLACE = auto() + INPLACE_A = auto() + INPLACE_B = auto() + + +# Inplace options applied for each test case in _TEST_CASES_ +_INPLACE = [ + Inplace.OUT_OF_PLACE, + Inplace.INPLACE_A, + Inplace.INPLACE_B, +] + +# Form the test cases by appending each element of _INPLACE to each tuple in _TEST_CASES_ +_TEST_CASES = [ + test_case + (inplace_item,) + for test_case in _TEST_CASES_ + for inplace_item in _INPLACE +] + +# Data types used for testing (matching old operators library: only F16 and F32) +_TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.F32] + +# Tolerance map for different data types +# Note: F32 tolerance is relaxed compared to theoretical precision due to: +# - Old operators library uses vectorized operations (pack_size=4) with vecN +# - InfiniCore uses elementwise operations, which can cause 1 ULP differences +# - This is acceptable 
as it's within floating-point precision limits +_TOLERANCE_MAP = { + InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3}, + InfiniDtype.F32: {"atol": 1e-6, "rtol": 1e-6}, # Relaxed from 1e-7 to accommodate vectorization differences +} + +DEBUG = False +PROFILE = False +NUM_PRERUN = 10 +NUM_ITERATIONS = 1000 + + +def min(c, a, b): + # Only support F16 and F32 (matching old operators library) + torch.minimum(a, b, out=c) + + +def test( + handle, + device, + shape, + a_stride=None, + b_stride=None, + c_stride=None, + inplace=Inplace.OUT_OF_PLACE, + dtype=InfiniDtype.F16, + sync=None, +): + a = TestTensor(shape, a_stride, dtype, device) + b = TestTensor(shape, b_stride, dtype, device) + + if inplace == Inplace.INPLACE_A: + if c_stride is not None and c_stride != a_stride: + return + c = a + elif inplace == Inplace.INPLACE_B: + if c_stride is not None and c_stride != b_stride: + return + c = b + else: + c = TestTensor(shape, c_stride, dtype, device) + + if c.is_broadcast(): + return + + print( + f"Testing Min on {InfiniDeviceNames[device]} with shape:{shape} a_stride:{a_stride} b_stride:{b_stride} c_stride:{c_stride} " + f"dtype:{InfiniDtypeNames[dtype]} inplace:{inplace}" + ) + min(c.torch_tensor(), a.torch_tensor(), b.torch_tensor()) + + if sync is not None: + sync() + + descriptor = infiniopOperatorDescriptor_t() + check_error( + LIBINFINIOP.infiniopCreateMinDescriptor( + handle, + ctypes.byref(descriptor), + c.descriptor, + a.descriptor, + b.descriptor, + ) + ) + + # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel + for tensor in [a, b, c]: + tensor.destroy_desc() + + workspace_size = c_uint64(0) + check_error( + LIBINFINIOP.infiniopGetMinWorkspaceSize( + descriptor, ctypes.byref(workspace_size) + ) + ) + workspace = TestWorkspace(workspace_size.value, device) + + def lib_min(): + check_error( + LIBINFINIOP.infiniopMin( + descriptor, + workspace.data(), + workspace_size.value, + c.data(), + a.data(), + b.data(), + None, + ) + ) + + lib_min() + if sync is not None: + sync() + + atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype) + if DEBUG: + debug(c.actual_tensor(), c.torch_tensor(), atol=atol, rtol=rtol) + assert torch.allclose(c.actual_tensor(), c.torch_tensor(), atol=atol, rtol=rtol, equal_nan=True) + + # Profiling workflow + if PROFILE: + # fmt: off + profile_operation("PyTorch", lambda: min(c.torch_tensor(), a.torch_tensor(), b.torch_tensor()), device, NUM_PRERUN, NUM_ITERATIONS) + profile_operation(" lib", lambda: lib_min(), device, NUM_PRERUN, NUM_ITERATIONS) + # fmt: on + check_error(LIBINFINIOP.infiniopDestroyMinDescriptor(descriptor)) + + +if __name__ == "__main__": + args = get_args() + # Configure testing options + DEBUG = args.debug + PROFILE = args.profile + NUM_PRERUN = args.num_prerun + NUM_ITERATIONS = args.num_iterations + + for device in get_test_devices(args): + test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES) + + print("\033[92mTest passed!\033[0m") diff --git a/test/infiniop/mod.py b/test/infiniop/mod.py new file mode 100644 index 000000000..298f3137f --- /dev/null +++ b/test/infiniop/mod.py @@ -0,0 +1,190 @@ +import torch +import ctypes +from ctypes import c_uint64 +from libinfiniop import ( + LIBINFINIOP, + TestTensor, + get_test_devices, + check_error, + test_operator, + get_args, + debug, + get_tolerance, + profile_operation, + TestWorkspace, + InfiniDtype, + InfiniDtypeNames, + InfiniDeviceNames, + infiniopOperatorDescriptor_t, +) +from enum import Enum, auto + +# 
============================================================================== +# Configuration (Internal Use Only) +# ============================================================================== +# These are not meant to be imported from other modules +_TEST_CASES_ = [ + # shape, a_stride, b_stride, c_stride + ((13, 4), None, None, None), + ((13, 4), (10, 1), (10, 1), (10, 1)), + ((13, 4), (0, 1), None, None), + ((13, 4, 4), None, None, None), + ((13, 4, 4), (20, 4, 1), (20, 4, 1), (20, 4, 1)), + ((13, 4, 4), (4, 0, 1), (0, 4, 1), None), + ((16, 5632), None, None, None), + ((16, 5632), (13312, 1), (13312, 1), (13312, 1)), + ((13, 16, 2), (128, 4, 1), (0, 2, 1), (64, 4, 1)), + ((13, 16, 2), (128, 4, 1), (2, 0, 1), (64, 4, 1)), + ((4, 4, 5632), None, None, None), + ((4, 4, 5632), (45056, 5632, 1), (45056, 5632, 1), (45056, 5632, 1)), +] + + +class Inplace(Enum): + OUT_OF_PLACE = auto() + INPLACE_A = auto() + INPLACE_B = auto() + + +# Inplace options applied for each test case in _TEST_CASES_ +_INPLACE = [ + Inplace.OUT_OF_PLACE, + Inplace.INPLACE_A, + Inplace.INPLACE_B, +] + +# Form the test cases by appending each element of _INPLACE to each tuple in _TEST_CASES_ +_TEST_CASES = [ + test_case + (inplace_item,) + for test_case in _TEST_CASES_ + for inplace_item in _INPLACE +] + +# Data types used for testing (matching old operators library: only F16 and F32) +_TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.F32] + +# Tolerance map for different data types +# Note: mod operation uses fmod for floating point, which should be exact +_TOLERANCE_MAP = { + InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3}, + InfiniDtype.F32: {"atol": 1e-7, "rtol": 1e-7}, +} + +DEBUG = False +PROFILE = False +NUM_PRERUN = 10 +NUM_ITERATIONS = 1000 + + +def mod_op(c, a, b): + torch.fmod(a, b, out=c) + + +def test( + handle, + device, + shape, + a_stride=None, + b_stride=None, + c_stride=None, + inplace=Inplace.OUT_OF_PLACE, + dtype=InfiniDtype.F16, + sync=None, +): + # Generate test tensors with values in a reasonable range for mod operation + # Use scale=10 to get values in [0, 10) range, similar to old test + a = TestTensor(shape, a_stride, dtype, device, mode="random", scale=10.0) + # Ensure b doesn't contain zeros to avoid division by zero in mod + b = TestTensor(shape, b_stride, dtype, device, mode="random", scale=10.0, bias=0.1) + + if inplace == Inplace.INPLACE_A: + if c_stride is not None and c_stride != a_stride: + return + c = a + elif inplace == Inplace.INPLACE_B: + if c_stride is not None and c_stride != b_stride: + return + c = b + else: + c = TestTensor(shape, c_stride, dtype, device) + + if c.is_broadcast(): + return + + print( + f"Testing Mod on {InfiniDeviceNames[device]} with shape:{shape} a_stride:{a_stride} b_stride:{b_stride} c_stride:{c_stride} " + f"dtype:{InfiniDtypeNames[dtype]} inplace:{inplace}" + ) + + mod_op(c.torch_tensor(), a.torch_tensor(), b.torch_tensor()) + + if sync is not None: + sync() + + descriptor = infiniopOperatorDescriptor_t() + check_error( + LIBINFINIOP.infiniopCreateModDescriptor( + handle, + ctypes.byref(descriptor), + c.descriptor, + a.descriptor, + b.descriptor, + ) + ) + + # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel + for tensor in [a, b, c]: + tensor.destroy_desc() + + workspace_size = c_uint64(0) + check_error( + LIBINFINIOP.infiniopGetModWorkspaceSize( + descriptor, ctypes.byref(workspace_size) + ) + ) + workspace = TestWorkspace(workspace_size.value, device) + + def lib_mod(): + check_error( + 
LIBINFINIOP.infiniopMod( + descriptor, + workspace.data(), + workspace_size.value, + c.data(), + a.data(), + b.data(), + None, + ) + ) + + lib_mod() + if sync is not None: + sync() + + atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype) + if DEBUG: + debug(c.actual_tensor(), c.torch_tensor(), atol=atol, rtol=rtol, equal_nan=True) + assert torch.allclose(c.actual_tensor(), c.torch_tensor(), atol=atol, rtol=rtol, equal_nan=True) + + # Profiling workflow + if PROFILE: + # fmt: off + profile_operation("PyTorch", lambda: mod_op(c.torch_tensor(), a.torch_tensor(), b.torch_tensor()), device, NUM_PRERUN, NUM_ITERATIONS) + profile_operation(" lib", lambda: lib_mod(), device, NUM_PRERUN, NUM_ITERATIONS) + # fmt: on + check_error(LIBINFINIOP.infiniopDestroyModDescriptor(descriptor)) + + +if __name__ == "__main__": + args = get_args() + + # Configure testing options + DEBUG = args.debug + PROFILE = args.profile + NUM_PRERUN = args.num_prerun + NUM_ITERATIONS = args.num_iterations + + for device in get_test_devices(args): + test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES) + + print("\033[92mTest passed!\033[0m") diff --git a/test/infiniop/pow.py b/test/infiniop/pow.py new file mode 100644 index 000000000..f437c4229 --- /dev/null +++ b/test/infiniop/pow.py @@ -0,0 +1,190 @@ +import torch +import ctypes +from ctypes import c_uint64 +from libinfiniop import ( + LIBINFINIOP, + TestTensor, + get_test_devices, + check_error, + test_operator, + get_args, + debug, + get_tolerance, + profile_operation, + TestWorkspace, + InfiniDtype, + InfiniDtypeNames, + InfiniDeviceNames, + infiniopOperatorDescriptor_t, +) +from enum import Enum, auto + +# ============================================================================== +# Configuration (Internal Use Only) +# ============================================================================== +# These are not meant to be imported from other modules +_TEST_CASES_ = [ + # shape, a_stride, b_stride, c_stride + ((13, 4), None, None, None), + ((13, 4), (10, 1), (10, 1), (10, 1)), + ((13, 4), (0, 1), None, None), + ((13, 4, 4), None, None, None), + ((13, 4, 4), (20, 4, 1), (20, 4, 1), (20, 4, 1)), + ((13, 4, 4), (4, 0, 1), (0, 4, 1), None), + ((16, 5632), None, None, None), + ((16, 5632), (13312, 1), (13312, 1), (13312, 1)), + ((4, 4, 5632), None, None, None), + ((4, 4, 5632), (45056, 5632, 1), (45056, 5632, 1), (45056, 5632, 1)), +] + + +class Inplace(Enum): + OUT_OF_PLACE = auto() + INPLACE_A = auto() + INPLACE_B = auto() + + +# Inplace options applied for each test case in _TEST_CASES_ +_INPLACE = [ + Inplace.OUT_OF_PLACE, + Inplace.INPLACE_A, + Inplace.INPLACE_B, +] + +# Form the test cases by appending each element of _INPLACE to each tuple in _TEST_CASES_ +_TEST_CASES = [ + test_case + (inplace_item,) + for test_case in _TEST_CASES_ + for inplace_item in _INPLACE +] + +# Data types used for testing +# Note: Only F16 and F32 are supported, matching the old repository's binary operator +_TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.F32] + +# Tolerance map for different data types +# Note: pow operation may have larger numerical errors, especially for F16 +_TOLERANCE_MAP = { + InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3}, + InfiniDtype.F32: {"atol": 1e-7, "rtol": 1e-3}, +} + +DEBUG = False +PROFILE = False +NUM_PRERUN = 10 +NUM_ITERATIONS = 1000 + + +def pow_op(c, a, b): + torch.pow(a, b, out=c) + + +def test( + handle, + device, + shape, + a_stride=None, + b_stride=None, + c_stride=None, + inplace=Inplace.OUT_OF_PLACE, + dtype=InfiniDtype.F16, + sync=None, +): + 
# Generate test tensors with values in a reasonable range for pow operation + # Avoid negative bases and very large exponents to prevent numerical issues + a = TestTensor(shape, a_stride, dtype, device, mode="random", scale=5.0, bias=0.1) + b = TestTensor(shape, b_stride, dtype, device, mode="random", scale=3.0, bias=0.1) + + if inplace == Inplace.INPLACE_A: + if c_stride is not None and c_stride != a_stride: + return + c = a + elif inplace == Inplace.INPLACE_B: + if c_stride is not None and c_stride != b_stride: + return + c = b + else: + c = TestTensor(shape, c_stride, dtype, device) + + if c.is_broadcast(): + return + + print( + f"Testing Pow on {InfiniDeviceNames[device]} with shape:{shape} a_stride:{a_stride} b_stride:{b_stride} c_stride:{c_stride} " + f"dtype:{InfiniDtypeNames[dtype]} inplace:{inplace}" + ) + + pow_op(c.torch_tensor(), a.torch_tensor(), b.torch_tensor()) + + if sync is not None: + sync() + + descriptor = infiniopOperatorDescriptor_t() + check_error( + LIBINFINIOP.infiniopCreatePowDescriptor( + handle, + ctypes.byref(descriptor), + c.descriptor, + a.descriptor, + b.descriptor, + ) + ) + + # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel + for tensor in [a, b, c]: + tensor.destroy_desc() + + workspace_size = c_uint64(0) + check_error( + LIBINFINIOP.infiniopGetPowWorkspaceSize( + descriptor, ctypes.byref(workspace_size) + ) + ) + workspace = TestWorkspace(workspace_size.value, c.device) + + def lib_pow(): + check_error( + LIBINFINIOP.infiniopPow( + descriptor, + workspace.data(), + workspace_size.value, + c.data(), + a.data(), + b.data(), + None, + ) + ) + + lib_pow() + + if sync is not None: + sync() + + atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype) + if DEBUG: + debug(c.actual_tensor(), c.torch_tensor(), atol=atol, rtol=rtol, equal_nan=True) + # Use equal_nan=True to handle NaN cases in pow operation + assert torch.allclose(c.actual_tensor(), c.torch_tensor(), atol=atol, rtol=rtol, equal_nan=True) + + # Profiling workflow + if PROFILE: + # fmt: off + profile_operation("PyTorch", lambda: pow_op(c.torch_tensor(), a.torch_tensor(), b.torch_tensor()), device, NUM_PRERUN, NUM_ITERATIONS) + profile_operation(" lib", lambda: lib_pow(), device, NUM_PRERUN, NUM_ITERATIONS) + # fmt: on + check_error(LIBINFINIOP.infiniopDestroyPowDescriptor(descriptor)) + + +if __name__ == "__main__": + args = get_args() + + # Configure testing options + DEBUG = args.debug + PROFILE = args.profile + NUM_PRERUN = args.num_prerun + NUM_ITERATIONS = args.num_iterations + + for device in get_test_devices(args): + test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES) + + print("\033[92mTest passed!\033[0m") From 547c2bc3a79aacc9c94222b9e26c967a4c2c6364 Mon Sep 17 00:00:00 2001 From: gongchensu Date: Wed, 7 Jan 2026 02:34:11 +0000 Subject: [PATCH 2/3] Issue/887 - Add abs,acos,acosh,asin,asinh,atan,atanh,ceil,cos,cosh,erf,floor,log,neg,reciprocal,round,sign,sinh,sqrt,tan operator with CPU and NVIDIA implementations. 
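The twenty new unary operators reuse the same elementwise scaffolding as the binary operators earlier in this series: one C descriptor API per op, a CPU functor, a CUDA functor dispatched per device, a ctypes registration, and a PyTorch-backed test. For illustration only, a minimal sketch of driving one of the new entry points (Abs) through the existing test helpers; run_abs and its default arguments are hypothetical, while the library calls are the ones declared in include/infiniop/ops/abs.h and registered by this series' op_register.py changes:

    import ctypes
    from ctypes import c_uint64
    from libinfiniop import (
        LIBINFINIOP, TestTensor, TestWorkspace, check_error,
        infiniopOperatorDescriptor_t, InfiniDtype,
    )

    def run_abs(handle, device, shape=(13, 4), dtype=InfiniDtype.F16):
        # run_abs, shape, and dtype defaults are illustrative, not part of the patch.
        x = TestTensor(shape, None, dtype, device)
        y = TestTensor(shape, None, dtype, device)
        desc = infiniopOperatorDescriptor_t()
        # Same create -> workspace query -> execute -> destroy flow as the binary tests.
        check_error(LIBINFINIOP.infiniopCreateAbsDescriptor(
            handle, ctypes.byref(desc), y.descriptor, x.descriptor))
        size = c_uint64(0)
        check_error(LIBINFINIOP.infiniopGetAbsWorkspaceSize(desc, ctypes.byref(size)))
        workspace = TestWorkspace(size.value, device)
        check_error(LIBINFINIOP.infiniopAbs(
            desc, workspace.data(), size.value, y.data(), x.data(), None))
        check_error(LIBINFINIOP.infiniopDestroyAbsDescriptor(desc))
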
--- include/infiniop.h | 20 + include/infiniop/ops/abs.h | 24 + include/infiniop/ops/acos.h | 24 + include/infiniop/ops/acosh.h | 24 + include/infiniop/ops/asin.h | 24 + include/infiniop/ops/asinh.h | 24 + include/infiniop/ops/atan.h | 24 + include/infiniop/ops/atanh.h | 24 + include/infiniop/ops/ceil.h | 24 + include/infiniop/ops/cos.h | 24 + include/infiniop/ops/cosh.h | 24 + include/infiniop/ops/erf.h | 24 + include/infiniop/ops/floor.h | 24 + include/infiniop/ops/log.h | 24 + include/infiniop/ops/neg.h | 24 + include/infiniop/ops/reciprocal.h | 24 + include/infiniop/ops/round.h | 24 + include/infiniop/ops/sign.h | 24 + include/infiniop/ops/sinh.h | 24 + include/infiniop/ops/sqrt.h | 24 + include/infiniop/ops/tan.h | 24 + src/infiniop/ops/abs/cpu/abs_cpu.cc | 48 ++ src/infiniop/ops/abs/cpu/abs_cpu.h | 26 + src/infiniop/ops/abs/cuda/kernel.cuh | 26 + src/infiniop/ops/abs/nvidia/abs_nvidia.cu | 54 ++ src/infiniop/ops/abs/nvidia/abs_nvidia.cuh | 8 + src/infiniop/ops/abs/operator.cc | 139 +++++ src/infiniop/ops/acos/cpu/acos_cpu.cc | 48 ++ src/infiniop/ops/acos/cpu/acos_cpu.h | 22 + src/infiniop/ops/acos/cuda/kernel.cuh | 32 + src/infiniop/ops/acos/nvidia/acos_nvidia.cu | 54 ++ src/infiniop/ops/acos/nvidia/acos_nvidia.cuh | 8 + src/infiniop/ops/acos/operator.cc | 139 +++++ src/infiniop/ops/acosh/cpu/acosh_cpu.cc | 48 ++ src/infiniop/ops/acosh/cpu/acosh_cpu.h | 22 + src/infiniop/ops/acosh/cuda/kernel.cuh | 32 + src/infiniop/ops/acosh/nvidia/acosh_nvidia.cu | 54 ++ .../ops/acosh/nvidia/acosh_nvidia.cuh | 8 + src/infiniop/ops/acosh/operator.cc | 139 +++++ src/infiniop/ops/asin/cpu/asin_cpu.cc | 48 ++ src/infiniop/ops/asin/cpu/asin_cpu.h | 22 + src/infiniop/ops/asin/cuda/kernel.cuh | 32 + src/infiniop/ops/asin/nvidia/asin_nvidia.cu | 54 ++ src/infiniop/ops/asin/nvidia/asin_nvidia.cuh | 8 + src/infiniop/ops/asin/operator.cc | 139 +++++ src/infiniop/ops/asinh/cpu/asinh_cpu.cc | 48 ++ src/infiniop/ops/asinh/cpu/asinh_cpu.h | 22 + src/infiniop/ops/asinh/cuda/kernel.cuh | 32 + src/infiniop/ops/asinh/nvidia/asinh_nvidia.cu | 54 ++ .../ops/asinh/nvidia/asinh_nvidia.cuh | 8 + src/infiniop/ops/asinh/operator.cc | 139 +++++ src/infiniop/ops/atan/cpu/atan_cpu.cc | 48 ++ src/infiniop/ops/atan/cpu/atan_cpu.h | 22 + src/infiniop/ops/atan/cuda/kernel.cuh | 32 + src/infiniop/ops/atan/nvidia/atan_nvidia.cu | 54 ++ src/infiniop/ops/atan/nvidia/atan_nvidia.cuh | 8 + src/infiniop/ops/atan/operator.cc | 139 +++++ src/infiniop/ops/atanh/cpu/atanh_cpu.cc | 48 ++ src/infiniop/ops/atanh/cpu/atanh_cpu.h | 22 + src/infiniop/ops/atanh/cuda/kernel.cuh | 32 + src/infiniop/ops/atanh/nvidia/atanh_nvidia.cu | 54 ++ .../ops/atanh/nvidia/atanh_nvidia.cuh | 8 + src/infiniop/ops/atanh/operator.cc | 139 +++++ src/infiniop/ops/ceil/cpu/ceil_cpu.cc | 48 ++ src/infiniop/ops/ceil/cpu/ceil_cpu.h | 26 + src/infiniop/ops/ceil/cuda/kernel.cuh | 34 + src/infiniop/ops/ceil/nvidia/ceil_nvidia.cu | 54 ++ src/infiniop/ops/ceil/nvidia/ceil_nvidia.cuh | 8 + src/infiniop/ops/ceil/operator.cc | 139 +++++ src/infiniop/ops/cos/cpu/cos_cpu.cc | 48 ++ src/infiniop/ops/cos/cpu/cos_cpu.h | 22 + src/infiniop/ops/cos/cuda/kernel.cuh | 32 + src/infiniop/ops/cos/nvidia/cos_nvidia.cu | 54 ++ src/infiniop/ops/cos/nvidia/cos_nvidia.cuh | 8 + src/infiniop/ops/cos/operator.cc | 139 +++++ src/infiniop/ops/cosh/cpu/cosh_cpu.cc | 48 ++ src/infiniop/ops/cosh/cpu/cosh_cpu.h | 22 + src/infiniop/ops/cosh/cuda/kernel.cuh | 32 + src/infiniop/ops/cosh/nvidia/cosh_nvidia.cu | 54 ++ src/infiniop/ops/cosh/nvidia/cosh_nvidia.cuh | 8 + src/infiniop/ops/cosh/operator.cc | 139 
+++++ src/infiniop/ops/erf/cpu/erf_cpu.cc | 48 ++ src/infiniop/ops/erf/cpu/erf_cpu.h | 22 + src/infiniop/ops/erf/cuda/kernel.cuh | 32 + src/infiniop/ops/erf/nvidia/erf_nvidia.cu | 54 ++ src/infiniop/ops/erf/nvidia/erf_nvidia.cuh | 8 + src/infiniop/ops/erf/operator.cc | 139 +++++ src/infiniop/ops/floor/cpu/floor_cpu.cc | 48 ++ src/infiniop/ops/floor/cpu/floor_cpu.h | 26 + src/infiniop/ops/floor/cuda/kernel.cuh | 34 + src/infiniop/ops/floor/nvidia/floor_nvidia.cu | 54 ++ .../ops/floor/nvidia/floor_nvidia.cuh | 8 + src/infiniop/ops/floor/operator.cc | 139 +++++ src/infiniop/ops/log/cpu/log_cpu.cc | 48 ++ src/infiniop/ops/log/cpu/log_cpu.h | 22 + src/infiniop/ops/log/cuda/kernel.cuh | 32 + src/infiniop/ops/log/nvidia/log_nvidia.cu | 54 ++ src/infiniop/ops/log/nvidia/log_nvidia.cuh | 8 + src/infiniop/ops/log/operator.cc | 139 +++++ src/infiniop/ops/neg/cpu/neg_cpu.cc | 48 ++ src/infiniop/ops/neg/cpu/neg_cpu.h | 20 + src/infiniop/ops/neg/cuda/kernel.cuh | 23 + src/infiniop/ops/neg/nvidia/neg_nvidia.cu | 54 ++ src/infiniop/ops/neg/nvidia/neg_nvidia.cuh | 8 + src/infiniop/ops/neg/operator.cc | 139 +++++ src/infiniop/ops/pow/cuda/kernel.cuh | 2 +- .../ops/reciprocal/cpu/reciprocal_cpu.cc | 48 ++ .../ops/reciprocal/cpu/reciprocal_cpu.h | 20 + src/infiniop/ops/reciprocal/cuda/kernel.cuh | 32 + .../reciprocal/nvidia/reciprocal_nvidia.cu | 54 ++ .../reciprocal/nvidia/reciprocal_nvidia.cuh | 8 + src/infiniop/ops/reciprocal/operator.cc | 139 +++++ src/infiniop/ops/round/cpu/round_cpu.cc | 48 ++ src/infiniop/ops/round/cpu/round_cpu.h | 25 + src/infiniop/ops/round/cuda/kernel.cuh | 34 + src/infiniop/ops/round/nvidia/round_nvidia.cu | 54 ++ .../ops/round/nvidia/round_nvidia.cuh | 8 + src/infiniop/ops/round/operator.cc | 139 +++++ src/infiniop/ops/sign/cpu/sign_cpu.cc | 48 ++ src/infiniop/ops/sign/cpu/sign_cpu.h | 20 + src/infiniop/ops/sign/cuda/kernel.cuh | 25 + src/infiniop/ops/sign/nvidia/sign_nvidia.cu | 54 ++ src/infiniop/ops/sign/nvidia/sign_nvidia.cuh | 8 + src/infiniop/ops/sign/operator.cc | 139 +++++ src/infiniop/ops/sinh/cpu/sinh_cpu.cc | 48 ++ src/infiniop/ops/sinh/cpu/sinh_cpu.h | 22 + src/infiniop/ops/sinh/cuda/kernel.cuh | 32 + src/infiniop/ops/sinh/nvidia/sinh_nvidia.cu | 54 ++ src/infiniop/ops/sinh/nvidia/sinh_nvidia.cuh | 8 + src/infiniop/ops/sinh/operator.cc | 139 +++++ src/infiniop/ops/sqrt/cpu/sqrt_cpu.cc | 48 ++ src/infiniop/ops/sqrt/cpu/sqrt_cpu.h | 22 + src/infiniop/ops/sqrt/cuda/kernel.cuh | 32 + src/infiniop/ops/sqrt/nvidia/sqrt_nvidia.cu | 54 ++ src/infiniop/ops/sqrt/nvidia/sqrt_nvidia.cuh | 8 + src/infiniop/ops/sqrt/operator.cc | 139 +++++ src/infiniop/ops/tan/cpu/tan_cpu.cc | 48 ++ src/infiniop/ops/tan/cpu/tan_cpu.h | 22 + src/infiniop/ops/tan/cuda/kernel.cuh | 55 ++ src/infiniop/ops/tan/nvidia/tan_nvidia.cu | 54 ++ src/infiniop/ops/tan/nvidia/tan_nvidia.cuh | 8 + src/infiniop/ops/tan/operator.cc | 139 +++++ test/infiniop/abs.py | 164 +++++ test/infiniop/acos.py | 165 +++++ test/infiniop/acosh.py | 165 +++++ test/infiniop/asin.py | 165 +++++ test/infiniop/asinh.py | 165 +++++ test/infiniop/atan.py | 164 +++++ test/infiniop/atanh.py | 165 +++++ test/infiniop/ceil.py | 165 +++++ test/infiniop/cos.py | 166 +++++ test/infiniop/cosh.py | 165 +++++ test/infiniop/erf.py | 165 +++++ test/infiniop/floor.py | 165 +++++ test/infiniop/libinfiniop/op_register.py | 583 ++++++++++++++++++ test/infiniop/log.py | 166 +++++ test/infiniop/neg.py | 165 +++++ test/infiniop/reciprocal.py | 168 +++++ test/infiniop/round.py | 165 +++++ test/infiniop/sign.py | 166 +++++ test/infiniop/sinh.py | 166 +++++ 
test/infiniop/sqrt.py | 166 +++++ test/infiniop/tan.py | 167 +++++ 163 files changed, 10468 insertions(+), 1 deletion(-) create mode 100644 include/infiniop/ops/abs.h create mode 100644 include/infiniop/ops/acos.h create mode 100644 include/infiniop/ops/acosh.h create mode 100644 include/infiniop/ops/asin.h create mode 100644 include/infiniop/ops/asinh.h create mode 100644 include/infiniop/ops/atan.h create mode 100644 include/infiniop/ops/atanh.h create mode 100644 include/infiniop/ops/ceil.h create mode 100644 include/infiniop/ops/cos.h create mode 100644 include/infiniop/ops/cosh.h create mode 100644 include/infiniop/ops/erf.h create mode 100644 include/infiniop/ops/floor.h create mode 100644 include/infiniop/ops/log.h create mode 100644 include/infiniop/ops/neg.h create mode 100644 include/infiniop/ops/reciprocal.h create mode 100644 include/infiniop/ops/round.h create mode 100644 include/infiniop/ops/sign.h create mode 100644 include/infiniop/ops/sinh.h create mode 100644 include/infiniop/ops/sqrt.h create mode 100644 include/infiniop/ops/tan.h create mode 100644 src/infiniop/ops/abs/cpu/abs_cpu.cc create mode 100644 src/infiniop/ops/abs/cpu/abs_cpu.h create mode 100644 src/infiniop/ops/abs/cuda/kernel.cuh create mode 100644 src/infiniop/ops/abs/nvidia/abs_nvidia.cu create mode 100644 src/infiniop/ops/abs/nvidia/abs_nvidia.cuh create mode 100644 src/infiniop/ops/abs/operator.cc create mode 100644 src/infiniop/ops/acos/cpu/acos_cpu.cc create mode 100644 src/infiniop/ops/acos/cpu/acos_cpu.h create mode 100644 src/infiniop/ops/acos/cuda/kernel.cuh create mode 100644 src/infiniop/ops/acos/nvidia/acos_nvidia.cu create mode 100644 src/infiniop/ops/acos/nvidia/acos_nvidia.cuh create mode 100644 src/infiniop/ops/acos/operator.cc create mode 100644 src/infiniop/ops/acosh/cpu/acosh_cpu.cc create mode 100644 src/infiniop/ops/acosh/cpu/acosh_cpu.h create mode 100644 src/infiniop/ops/acosh/cuda/kernel.cuh create mode 100644 src/infiniop/ops/acosh/nvidia/acosh_nvidia.cu create mode 100644 src/infiniop/ops/acosh/nvidia/acosh_nvidia.cuh create mode 100644 src/infiniop/ops/acosh/operator.cc create mode 100644 src/infiniop/ops/asin/cpu/asin_cpu.cc create mode 100644 src/infiniop/ops/asin/cpu/asin_cpu.h create mode 100644 src/infiniop/ops/asin/cuda/kernel.cuh create mode 100644 src/infiniop/ops/asin/nvidia/asin_nvidia.cu create mode 100644 src/infiniop/ops/asin/nvidia/asin_nvidia.cuh create mode 100644 src/infiniop/ops/asin/operator.cc create mode 100644 src/infiniop/ops/asinh/cpu/asinh_cpu.cc create mode 100644 src/infiniop/ops/asinh/cpu/asinh_cpu.h create mode 100644 src/infiniop/ops/asinh/cuda/kernel.cuh create mode 100644 src/infiniop/ops/asinh/nvidia/asinh_nvidia.cu create mode 100644 src/infiniop/ops/asinh/nvidia/asinh_nvidia.cuh create mode 100644 src/infiniop/ops/asinh/operator.cc create mode 100644 src/infiniop/ops/atan/cpu/atan_cpu.cc create mode 100644 src/infiniop/ops/atan/cpu/atan_cpu.h create mode 100644 src/infiniop/ops/atan/cuda/kernel.cuh create mode 100644 src/infiniop/ops/atan/nvidia/atan_nvidia.cu create mode 100644 src/infiniop/ops/atan/nvidia/atan_nvidia.cuh create mode 100644 src/infiniop/ops/atan/operator.cc create mode 100644 src/infiniop/ops/atanh/cpu/atanh_cpu.cc create mode 100644 src/infiniop/ops/atanh/cpu/atanh_cpu.h create mode 100644 src/infiniop/ops/atanh/cuda/kernel.cuh create mode 100644 src/infiniop/ops/atanh/nvidia/atanh_nvidia.cu create mode 100644 src/infiniop/ops/atanh/nvidia/atanh_nvidia.cuh create mode 100644 src/infiniop/ops/atanh/operator.cc create mode 100644 
src/infiniop/ops/ceil/cpu/ceil_cpu.cc create mode 100644 src/infiniop/ops/ceil/cpu/ceil_cpu.h create mode 100644 src/infiniop/ops/ceil/cuda/kernel.cuh create mode 100644 src/infiniop/ops/ceil/nvidia/ceil_nvidia.cu create mode 100644 src/infiniop/ops/ceil/nvidia/ceil_nvidia.cuh create mode 100644 src/infiniop/ops/ceil/operator.cc create mode 100644 src/infiniop/ops/cos/cpu/cos_cpu.cc create mode 100644 src/infiniop/ops/cos/cpu/cos_cpu.h create mode 100644 src/infiniop/ops/cos/cuda/kernel.cuh create mode 100644 src/infiniop/ops/cos/nvidia/cos_nvidia.cu create mode 100644 src/infiniop/ops/cos/nvidia/cos_nvidia.cuh create mode 100644 src/infiniop/ops/cos/operator.cc create mode 100644 src/infiniop/ops/cosh/cpu/cosh_cpu.cc create mode 100644 src/infiniop/ops/cosh/cpu/cosh_cpu.h create mode 100644 src/infiniop/ops/cosh/cuda/kernel.cuh create mode 100644 src/infiniop/ops/cosh/nvidia/cosh_nvidia.cu create mode 100644 src/infiniop/ops/cosh/nvidia/cosh_nvidia.cuh create mode 100644 src/infiniop/ops/cosh/operator.cc create mode 100644 src/infiniop/ops/erf/cpu/erf_cpu.cc create mode 100644 src/infiniop/ops/erf/cpu/erf_cpu.h create mode 100644 src/infiniop/ops/erf/cuda/kernel.cuh create mode 100644 src/infiniop/ops/erf/nvidia/erf_nvidia.cu create mode 100644 src/infiniop/ops/erf/nvidia/erf_nvidia.cuh create mode 100644 src/infiniop/ops/erf/operator.cc create mode 100644 src/infiniop/ops/floor/cpu/floor_cpu.cc create mode 100644 src/infiniop/ops/floor/cpu/floor_cpu.h create mode 100644 src/infiniop/ops/floor/cuda/kernel.cuh create mode 100644 src/infiniop/ops/floor/nvidia/floor_nvidia.cu create mode 100644 src/infiniop/ops/floor/nvidia/floor_nvidia.cuh create mode 100644 src/infiniop/ops/floor/operator.cc create mode 100644 src/infiniop/ops/log/cpu/log_cpu.cc create mode 100644 src/infiniop/ops/log/cpu/log_cpu.h create mode 100644 src/infiniop/ops/log/cuda/kernel.cuh create mode 100644 src/infiniop/ops/log/nvidia/log_nvidia.cu create mode 100644 src/infiniop/ops/log/nvidia/log_nvidia.cuh create mode 100644 src/infiniop/ops/log/operator.cc create mode 100644 src/infiniop/ops/neg/cpu/neg_cpu.cc create mode 100644 src/infiniop/ops/neg/cpu/neg_cpu.h create mode 100644 src/infiniop/ops/neg/cuda/kernel.cuh create mode 100644 src/infiniop/ops/neg/nvidia/neg_nvidia.cu create mode 100644 src/infiniop/ops/neg/nvidia/neg_nvidia.cuh create mode 100644 src/infiniop/ops/neg/operator.cc create mode 100644 src/infiniop/ops/reciprocal/cpu/reciprocal_cpu.cc create mode 100644 src/infiniop/ops/reciprocal/cpu/reciprocal_cpu.h create mode 100644 src/infiniop/ops/reciprocal/cuda/kernel.cuh create mode 100644 src/infiniop/ops/reciprocal/nvidia/reciprocal_nvidia.cu create mode 100644 src/infiniop/ops/reciprocal/nvidia/reciprocal_nvidia.cuh create mode 100644 src/infiniop/ops/reciprocal/operator.cc create mode 100644 src/infiniop/ops/round/cpu/round_cpu.cc create mode 100644 src/infiniop/ops/round/cpu/round_cpu.h create mode 100644 src/infiniop/ops/round/cuda/kernel.cuh create mode 100644 src/infiniop/ops/round/nvidia/round_nvidia.cu create mode 100644 src/infiniop/ops/round/nvidia/round_nvidia.cuh create mode 100644 src/infiniop/ops/round/operator.cc create mode 100644 src/infiniop/ops/sign/cpu/sign_cpu.cc create mode 100644 src/infiniop/ops/sign/cpu/sign_cpu.h create mode 100644 src/infiniop/ops/sign/cuda/kernel.cuh create mode 100644 src/infiniop/ops/sign/nvidia/sign_nvidia.cu create mode 100644 src/infiniop/ops/sign/nvidia/sign_nvidia.cuh create mode 100644 src/infiniop/ops/sign/operator.cc create mode 100644 
src/infiniop/ops/sinh/cpu/sinh_cpu.cc create mode 100644 src/infiniop/ops/sinh/cpu/sinh_cpu.h create mode 100644 src/infiniop/ops/sinh/cuda/kernel.cuh create mode 100644 src/infiniop/ops/sinh/nvidia/sinh_nvidia.cu create mode 100644 src/infiniop/ops/sinh/nvidia/sinh_nvidia.cuh create mode 100644 src/infiniop/ops/sinh/operator.cc create mode 100644 src/infiniop/ops/sqrt/cpu/sqrt_cpu.cc create mode 100644 src/infiniop/ops/sqrt/cpu/sqrt_cpu.h create mode 100644 src/infiniop/ops/sqrt/cuda/kernel.cuh create mode 100644 src/infiniop/ops/sqrt/nvidia/sqrt_nvidia.cu create mode 100644 src/infiniop/ops/sqrt/nvidia/sqrt_nvidia.cuh create mode 100644 src/infiniop/ops/sqrt/operator.cc create mode 100644 src/infiniop/ops/tan/cpu/tan_cpu.cc create mode 100644 src/infiniop/ops/tan/cpu/tan_cpu.h create mode 100644 src/infiniop/ops/tan/cuda/kernel.cuh create mode 100644 src/infiniop/ops/tan/nvidia/tan_nvidia.cu create mode 100644 src/infiniop/ops/tan/nvidia/tan_nvidia.cuh create mode 100644 src/infiniop/ops/tan/operator.cc create mode 100644 test/infiniop/abs.py create mode 100644 test/infiniop/acos.py create mode 100644 test/infiniop/acosh.py create mode 100644 test/infiniop/asin.py create mode 100644 test/infiniop/asinh.py create mode 100644 test/infiniop/atan.py create mode 100644 test/infiniop/atanh.py create mode 100644 test/infiniop/ceil.py create mode 100644 test/infiniop/cos.py create mode 100644 test/infiniop/cosh.py create mode 100644 test/infiniop/erf.py create mode 100644 test/infiniop/floor.py create mode 100644 test/infiniop/log.py create mode 100644 test/infiniop/neg.py create mode 100644 test/infiniop/reciprocal.py create mode 100644 test/infiniop/round.py create mode 100644 test/infiniop/sign.py create mode 100644 test/infiniop/sinh.py create mode 100644 test/infiniop/sqrt.py create mode 100644 test/infiniop/tan.py diff --git a/include/infiniop.h b/include/infiniop.h index cf1688868..4778fce90 100644 --- a/include/infiniop.h +++ b/include/infiniop.h @@ -2,9 +2,21 @@ #define __INFINIOP_API_H__ #include "infiniop/handle.h" +#include "infiniop/ops/abs.h" +#include "infiniop/ops/acos.h" +#include "infiniop/ops/acosh.h" #include "infiniop/ops/add.h" #include "infiniop/ops/add_rms_norm.h" +#include "infiniop/ops/asin.h" +#include "infiniop/ops/asinh.h" +#include "infiniop/ops/atan.h" +#include "infiniop/ops/atanh.h" #include "infiniop/ops/attention.h" +#include "infiniop/ops/ceil.h" +#include "infiniop/ops/cos.h" +#include "infiniop/ops/cosh.h" +#include "infiniop/ops/erf.h" +#include "infiniop/ops/floor.h" #include "infiniop/ops/causal_softmax.h" #include "infiniop/ops/clip.h" #include "infiniop/ops/conv.h" @@ -13,17 +25,24 @@ #include "infiniop/ops/gelu.h" #include "infiniop/ops/gemm.h" #include "infiniop/ops/layer_norm.h" +#include "infiniop/ops/log.h" #include "infiniop/ops/logsoftmax.h" #include "infiniop/ops/lp_norm.h" #include "infiniop/ops/max.h" #include "infiniop/ops/min.h" #include "infiniop/ops/mul.h" +#include "infiniop/ops/neg.h" #include "infiniop/ops/ones.h" #include "infiniop/ops/paged_attention.h" #include "infiniop/ops/paged_attention_prefill.h" #include "infiniop/ops/paged_caching.h" #include "infiniop/ops/random_sample.h" +#include "infiniop/ops/reciprocal.h" #include "infiniop/ops/rearrange.h" +#include "infiniop/ops/round.h" +#include "infiniop/ops/sign.h" +#include "infiniop/ops/sinh.h" +#include "infiniop/ops/sqrt.h" #include "infiniop/ops/relu.h" #include "infiniop/ops/rms_norm.h" #include "infiniop/ops/rope.h" @@ -33,6 +52,7 @@ #include "infiniop/ops/softplus.h" 
#include "infiniop/ops/sub.h" #include "infiniop/ops/swiglu.h" +#include "infiniop/ops/tan.h" #include "infiniop/ops/tanh.h" #include "infiniop/ops/topkrouter.h" #include "infiniop/ops/topksoftmax.h" diff --git a/include/infiniop/ops/abs.h b/include/infiniop/ops/abs.h new file mode 100644 index 000000000..7b5872657 --- /dev/null +++ b/include/infiniop/ops/abs.h @@ -0,0 +1,24 @@ +#ifndef __INFINIOP_ABS_API_H__ +#define __INFINIOP_ABS_API_H__ + +#include "../operator_descriptor.h" + +typedef struct InfiniopDescriptor *infiniopAbsDescriptor_t; + +__C __export infiniStatus_t infiniopCreateAbsDescriptor(infiniopHandle_t handle, + infiniopAbsDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y, + infiniopTensorDescriptor_t x); + +__C __export infiniStatus_t infiniopGetAbsWorkspaceSize(infiniopAbsDescriptor_t desc, size_t *size); + +__C __export infiniStatus_t infiniopAbs(infiniopAbsDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *y, + const void *x, + void *stream); + +__C __export infiniStatus_t infiniopDestroyAbsDescriptor(infiniopAbsDescriptor_t desc); + +#endif diff --git a/include/infiniop/ops/acos.h b/include/infiniop/ops/acos.h new file mode 100644 index 000000000..fe6af01ed --- /dev/null +++ b/include/infiniop/ops/acos.h @@ -0,0 +1,24 @@ +#ifndef __INFINIOP_ACOS_API_H__ +#define __INFINIOP_ACOS_API_H__ + +#include "../operator_descriptor.h" + +typedef struct InfiniopDescriptor *infiniopAcosDescriptor_t; + +__C __export infiniStatus_t infiniopCreateAcosDescriptor(infiniopHandle_t handle, + infiniopAcosDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y, + infiniopTensorDescriptor_t x); + +__C __export infiniStatus_t infiniopGetAcosWorkspaceSize(infiniopAcosDescriptor_t desc, size_t *size); + +__C __export infiniStatus_t infiniopAcos(infiniopAcosDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *y, + const void *x, + void *stream); + +__C __export infiniStatus_t infiniopDestroyAcosDescriptor(infiniopAcosDescriptor_t desc); + +#endif diff --git a/include/infiniop/ops/acosh.h b/include/infiniop/ops/acosh.h new file mode 100644 index 000000000..be28918bb --- /dev/null +++ b/include/infiniop/ops/acosh.h @@ -0,0 +1,24 @@ +#ifndef __INFINIOP_ACOSH_API_H__ +#define __INFINIOP_ACOSH_API_H__ + +#include "../operator_descriptor.h" + +typedef struct InfiniopDescriptor *infiniopAcoshDescriptor_t; + +__C __export infiniStatus_t infiniopCreateAcoshDescriptor(infiniopHandle_t handle, + infiniopAcoshDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y, + infiniopTensorDescriptor_t x); + +__C __export infiniStatus_t infiniopGetAcoshWorkspaceSize(infiniopAcoshDescriptor_t desc, size_t *size); + +__C __export infiniStatus_t infiniopAcosh(infiniopAcoshDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *y, + const void *x, + void *stream); + +__C __export infiniStatus_t infiniopDestroyAcoshDescriptor(infiniopAcoshDescriptor_t desc); + +#endif diff --git a/include/infiniop/ops/asin.h b/include/infiniop/ops/asin.h new file mode 100644 index 000000000..2aac6d1e1 --- /dev/null +++ b/include/infiniop/ops/asin.h @@ -0,0 +1,24 @@ +#ifndef __INFINIOP_ASIN_API_H__ +#define __INFINIOP_ASIN_API_H__ + +#include "../operator_descriptor.h" + +typedef struct InfiniopDescriptor *infiniopAsinDescriptor_t; + +__C __export infiniStatus_t infiniopCreateAsinDescriptor(infiniopHandle_t handle, + infiniopAsinDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y, + infiniopTensorDescriptor_t x); + +__C __export infiniStatus_t 
infiniopGetAsinWorkspaceSize(infiniopAsinDescriptor_t desc, size_t *size); + +__C __export infiniStatus_t infiniopAsin(infiniopAsinDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *y, + const void *x, + void *stream); + +__C __export infiniStatus_t infiniopDestroyAsinDescriptor(infiniopAsinDescriptor_t desc); + +#endif diff --git a/include/infiniop/ops/asinh.h b/include/infiniop/ops/asinh.h new file mode 100644 index 000000000..d1385fc01 --- /dev/null +++ b/include/infiniop/ops/asinh.h @@ -0,0 +1,24 @@ +#ifndef __INFINIOP_ASINH_API_H__ +#define __INFINIOP_ASINH_API_H__ + +#include "../operator_descriptor.h" + +typedef struct InfiniopDescriptor *infiniopAsinhDescriptor_t; + +__C __export infiniStatus_t infiniopCreateAsinhDescriptor(infiniopHandle_t handle, + infiniopAsinhDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y, + infiniopTensorDescriptor_t x); + +__C __export infiniStatus_t infiniopGetAsinhWorkspaceSize(infiniopAsinhDescriptor_t desc, size_t *size); + +__C __export infiniStatus_t infiniopAsinh(infiniopAsinhDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *y, + const void *x, + void *stream); + +__C __export infiniStatus_t infiniopDestroyAsinhDescriptor(infiniopAsinhDescriptor_t desc); + +#endif diff --git a/include/infiniop/ops/atan.h b/include/infiniop/ops/atan.h new file mode 100644 index 000000000..3b1a5bde3 --- /dev/null +++ b/include/infiniop/ops/atan.h @@ -0,0 +1,24 @@ +#ifndef __INFINIOP_ATAN_API_H__ +#define __INFINIOP_ATAN_API_H__ + +#include "../operator_descriptor.h" + +typedef struct InfiniopDescriptor *infiniopAtanDescriptor_t; + +__C __export infiniStatus_t infiniopCreateAtanDescriptor(infiniopHandle_t handle, + infiniopAtanDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y, + infiniopTensorDescriptor_t x); + +__C __export infiniStatus_t infiniopGetAtanWorkspaceSize(infiniopAtanDescriptor_t desc, size_t *size); + +__C __export infiniStatus_t infiniopAtan(infiniopAtanDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *y, + const void *x, + void *stream); + +__C __export infiniStatus_t infiniopDestroyAtanDescriptor(infiniopAtanDescriptor_t desc); + +#endif diff --git a/include/infiniop/ops/atanh.h b/include/infiniop/ops/atanh.h new file mode 100644 index 000000000..800afd5d5 --- /dev/null +++ b/include/infiniop/ops/atanh.h @@ -0,0 +1,24 @@ +#ifndef __INFINIOP_ATANH_API_H__ +#define __INFINIOP_ATANH_API_H__ + +#include "../operator_descriptor.h" + +typedef struct InfiniopDescriptor *infiniopAtanhDescriptor_t; + +__C __export infiniStatus_t infiniopCreateAtanhDescriptor(infiniopHandle_t handle, + infiniopAtanhDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y, + infiniopTensorDescriptor_t x); + +__C __export infiniStatus_t infiniopGetAtanhWorkspaceSize(infiniopAtanhDescriptor_t desc, size_t *size); + +__C __export infiniStatus_t infiniopAtanh(infiniopAtanhDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *y, + const void *x, + void *stream); + +__C __export infiniStatus_t infiniopDestroyAtanhDescriptor(infiniopAtanhDescriptor_t desc); + +#endif diff --git a/include/infiniop/ops/ceil.h b/include/infiniop/ops/ceil.h new file mode 100644 index 000000000..4539d77fd --- /dev/null +++ b/include/infiniop/ops/ceil.h @@ -0,0 +1,24 @@ +#ifndef __INFINIOP_CEIL_API_H__ +#define __INFINIOP_CEIL_API_H__ + +#include "../operator_descriptor.h" + +typedef struct InfiniopDescriptor *infiniopCeilDescriptor_t; + +__C __export infiniStatus_t infiniopCreateCeilDescriptor(infiniopHandle_t handle, + 
infiniopCeilDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y, + infiniopTensorDescriptor_t x); + +__C __export infiniStatus_t infiniopGetCeilWorkspaceSize(infiniopCeilDescriptor_t desc, size_t *size); + +__C __export infiniStatus_t infiniopCeil(infiniopCeilDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *y, + const void *x, + void *stream); + +__C __export infiniStatus_t infiniopDestroyCeilDescriptor(infiniopCeilDescriptor_t desc); + +#endif diff --git a/include/infiniop/ops/cos.h b/include/infiniop/ops/cos.h new file mode 100644 index 000000000..8f0b6eeb7 --- /dev/null +++ b/include/infiniop/ops/cos.h @@ -0,0 +1,24 @@ +#ifndef __INFINIOP_COS_API_H__ +#define __INFINIOP_COS_API_H__ + +#include "../operator_descriptor.h" + +typedef struct InfiniopDescriptor *infiniopCosDescriptor_t; + +__C __export infiniStatus_t infiniopCreateCosDescriptor(infiniopHandle_t handle, + infiniopCosDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y, + infiniopTensorDescriptor_t x); + +__C __export infiniStatus_t infiniopGetCosWorkspaceSize(infiniopCosDescriptor_t desc, size_t *size); + +__C __export infiniStatus_t infiniopCos(infiniopCosDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *y, + const void *x, + void *stream); + +__C __export infiniStatus_t infiniopDestroyCosDescriptor(infiniopCosDescriptor_t desc); + +#endif diff --git a/include/infiniop/ops/cosh.h b/include/infiniop/ops/cosh.h new file mode 100644 index 000000000..3328151ad --- /dev/null +++ b/include/infiniop/ops/cosh.h @@ -0,0 +1,24 @@ +#ifndef __INFINIOP_COSH_API_H__ +#define __INFINIOP_COSH_API_H__ + +#include "../operator_descriptor.h" + +typedef struct InfiniopDescriptor *infiniopCoshDescriptor_t; + +__C __export infiniStatus_t infiniopCreateCoshDescriptor(infiniopHandle_t handle, + infiniopCoshDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y, + infiniopTensorDescriptor_t x); + +__C __export infiniStatus_t infiniopGetCoshWorkspaceSize(infiniopCoshDescriptor_t desc, size_t *size); + +__C __export infiniStatus_t infiniopCosh(infiniopCoshDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *y, + const void *x, + void *stream); + +__C __export infiniStatus_t infiniopDestroyCoshDescriptor(infiniopCoshDescriptor_t desc); + +#endif diff --git a/include/infiniop/ops/erf.h b/include/infiniop/ops/erf.h new file mode 100644 index 000000000..8cbb8fb74 --- /dev/null +++ b/include/infiniop/ops/erf.h @@ -0,0 +1,24 @@ +#ifndef __INFINIOP_ERF_API_H__ +#define __INFINIOP_ERF_API_H__ + +#include "../operator_descriptor.h" + +typedef struct InfiniopDescriptor *infiniopErfDescriptor_t; + +__C __export infiniStatus_t infiniopCreateErfDescriptor(infiniopHandle_t handle, + infiniopErfDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y, + infiniopTensorDescriptor_t x); + +__C __export infiniStatus_t infiniopGetErfWorkspaceSize(infiniopErfDescriptor_t desc, size_t *size); + +__C __export infiniStatus_t infiniopErf(infiniopErfDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *y, + const void *x, + void *stream); + +__C __export infiniStatus_t infiniopDestroyErfDescriptor(infiniopErfDescriptor_t desc); + +#endif diff --git a/include/infiniop/ops/floor.h b/include/infiniop/ops/floor.h new file mode 100644 index 000000000..2f65f8f4a --- /dev/null +++ b/include/infiniop/ops/floor.h @@ -0,0 +1,24 @@ +#ifndef __INFINIOP_FLOOR_API_H__ +#define __INFINIOP_FLOOR_API_H__ + +#include "../operator_descriptor.h" + +typedef struct InfiniopDescriptor *infiniopFloorDescriptor_t; + +__C 
__export infiniStatus_t infiniopCreateFloorDescriptor(infiniopHandle_t handle, + infiniopFloorDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y, + infiniopTensorDescriptor_t x); + +__C __export infiniStatus_t infiniopGetFloorWorkspaceSize(infiniopFloorDescriptor_t desc, size_t *size); + +__C __export infiniStatus_t infiniopFloor(infiniopFloorDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *y, + const void *x, + void *stream); + +__C __export infiniStatus_t infiniopDestroyFloorDescriptor(infiniopFloorDescriptor_t desc); + +#endif diff --git a/include/infiniop/ops/log.h b/include/infiniop/ops/log.h new file mode 100644 index 000000000..f5bec4382 --- /dev/null +++ b/include/infiniop/ops/log.h @@ -0,0 +1,24 @@ +#ifndef __INFINIOP_LOG_API_H__ +#define __INFINIOP_LOG_API_H__ + +#include "../operator_descriptor.h" + +typedef struct InfiniopDescriptor *infiniopLogDescriptor_t; + +__C __export infiniStatus_t infiniopCreateLogDescriptor(infiniopHandle_t handle, + infiniopLogDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y, + infiniopTensorDescriptor_t x); + +__C __export infiniStatus_t infiniopGetLogWorkspaceSize(infiniopLogDescriptor_t desc, size_t *size); + +__C __export infiniStatus_t infiniopLog(infiniopLogDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *y, + const void *x, + void *stream); + +__C __export infiniStatus_t infiniopDestroyLogDescriptor(infiniopLogDescriptor_t desc); + +#endif diff --git a/include/infiniop/ops/neg.h b/include/infiniop/ops/neg.h new file mode 100644 index 000000000..4d3b06e21 --- /dev/null +++ b/include/infiniop/ops/neg.h @@ -0,0 +1,24 @@ +#ifndef __INFINIOP_NEG_API_H__ +#define __INFINIOP_NEG_API_H__ + +#include "../operator_descriptor.h" + +typedef struct InfiniopDescriptor *infiniopNegDescriptor_t; + +__C __export infiniStatus_t infiniopCreateNegDescriptor(infiniopHandle_t handle, + infiniopNegDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y, + infiniopTensorDescriptor_t x); + +__C __export infiniStatus_t infiniopGetNegWorkspaceSize(infiniopNegDescriptor_t desc, size_t *size); + +__C __export infiniStatus_t infiniopNeg(infiniopNegDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *y, + const void *x, + void *stream); + +__C __export infiniStatus_t infiniopDestroyNegDescriptor(infiniopNegDescriptor_t desc); + +#endif diff --git a/include/infiniop/ops/reciprocal.h b/include/infiniop/ops/reciprocal.h new file mode 100644 index 000000000..73836fea4 --- /dev/null +++ b/include/infiniop/ops/reciprocal.h @@ -0,0 +1,24 @@ +#ifndef __INFINIOP_RECIPROCAL_API_H__ +#define __INFINIOP_RECIPROCAL_API_H__ + +#include "../operator_descriptor.h" + +typedef struct InfiniopDescriptor *infiniopReciprocalDescriptor_t; + +__C __export infiniStatus_t infiniopCreateReciprocalDescriptor(infiniopHandle_t handle, + infiniopReciprocalDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y, + infiniopTensorDescriptor_t x); + +__C __export infiniStatus_t infiniopGetReciprocalWorkspaceSize(infiniopReciprocalDescriptor_t desc, size_t *size); + +__C __export infiniStatus_t infiniopReciprocal(infiniopReciprocalDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *y, + const void *x, + void *stream); + +__C __export infiniStatus_t infiniopDestroyReciprocalDescriptor(infiniopReciprocalDescriptor_t desc); + +#endif diff --git a/include/infiniop/ops/round.h b/include/infiniop/ops/round.h new file mode 100644 index 000000000..18c7fe44e --- /dev/null +++ b/include/infiniop/ops/round.h @@ -0,0 +1,24 @@ 
+#ifndef __INFINIOP_ROUND_API_H__ +#define __INFINIOP_ROUND_API_H__ + +#include "../operator_descriptor.h" + +typedef struct InfiniopDescriptor *infiniopRoundDescriptor_t; + +__C __export infiniStatus_t infiniopCreateRoundDescriptor(infiniopHandle_t handle, + infiniopRoundDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y, + infiniopTensorDescriptor_t x); + +__C __export infiniStatus_t infiniopGetRoundWorkspaceSize(infiniopRoundDescriptor_t desc, size_t *size); + +__C __export infiniStatus_t infiniopRound(infiniopRoundDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *y, + const void *x, + void *stream); + +__C __export infiniStatus_t infiniopDestroyRoundDescriptor(infiniopRoundDescriptor_t desc); + +#endif diff --git a/include/infiniop/ops/sign.h b/include/infiniop/ops/sign.h new file mode 100644 index 000000000..fe47c7190 --- /dev/null +++ b/include/infiniop/ops/sign.h @@ -0,0 +1,24 @@ +#ifndef __INFINIOP_SIGN_API_H__ +#define __INFINIOP_SIGN_API_H__ + +#include "../operator_descriptor.h" + +typedef struct InfiniopDescriptor *infiniopSignDescriptor_t; + +__C __export infiniStatus_t infiniopCreateSignDescriptor(infiniopHandle_t handle, + infiniopSignDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y, + infiniopTensorDescriptor_t x); + +__C __export infiniStatus_t infiniopGetSignWorkspaceSize(infiniopSignDescriptor_t desc, size_t *size); + +__C __export infiniStatus_t infiniopSign(infiniopSignDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *y, + const void *x, + void *stream); + +__C __export infiniStatus_t infiniopDestroySignDescriptor(infiniopSignDescriptor_t desc); + +#endif diff --git a/include/infiniop/ops/sinh.h b/include/infiniop/ops/sinh.h new file mode 100644 index 000000000..a5325fb81 --- /dev/null +++ b/include/infiniop/ops/sinh.h @@ -0,0 +1,24 @@ +#ifndef __INFINIOP_SINH_API_H__ +#define __INFINIOP_SINH_API_H__ + +#include "../operator_descriptor.h" + +typedef struct InfiniopDescriptor *infiniopSinhDescriptor_t; + +__C __export infiniStatus_t infiniopCreateSinhDescriptor(infiniopHandle_t handle, + infiniopSinhDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y, + infiniopTensorDescriptor_t x); + +__C __export infiniStatus_t infiniopGetSinhWorkspaceSize(infiniopSinhDescriptor_t desc, size_t *size); + +__C __export infiniStatus_t infiniopSinh(infiniopSinhDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *y, + const void *x, + void *stream); + +__C __export infiniStatus_t infiniopDestroySinhDescriptor(infiniopSinhDescriptor_t desc); + +#endif diff --git a/include/infiniop/ops/sqrt.h b/include/infiniop/ops/sqrt.h new file mode 100644 index 000000000..db04ec8bc --- /dev/null +++ b/include/infiniop/ops/sqrt.h @@ -0,0 +1,24 @@ +#ifndef __INFINIOP_SQRT_API_H__ +#define __INFINIOP_SQRT_API_H__ + +#include "../operator_descriptor.h" + +typedef struct InfiniopDescriptor *infiniopSqrtDescriptor_t; + +__C __export infiniStatus_t infiniopCreateSqrtDescriptor(infiniopHandle_t handle, + infiniopSqrtDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y, + infiniopTensorDescriptor_t x); + +__C __export infiniStatus_t infiniopGetSqrtWorkspaceSize(infiniopSqrtDescriptor_t desc, size_t *size); + +__C __export infiniStatus_t infiniopSqrt(infiniopSqrtDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *y, + const void *x, + void *stream); + +__C __export infiniStatus_t infiniopDestroySqrtDescriptor(infiniopSqrtDescriptor_t desc); + +#endif diff --git a/include/infiniop/ops/tan.h b/include/infiniop/ops/tan.h 
new file mode 100644 index 000000000..69fc47bf1 --- /dev/null +++ b/include/infiniop/ops/tan.h @@ -0,0 +1,24 @@ +#ifndef __INFINIOP_TAN_API_H__ +#define __INFINIOP_TAN_API_H__ + +#include "../operator_descriptor.h" + +typedef struct InfiniopDescriptor *infiniopTanDescriptor_t; + +__C __export infiniStatus_t infiniopCreateTanDescriptor(infiniopHandle_t handle, + infiniopTanDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y, + infiniopTensorDescriptor_t x); + +__C __export infiniStatus_t infiniopGetTanWorkspaceSize(infiniopTanDescriptor_t desc, size_t *size); + +__C __export infiniStatus_t infiniopTan(infiniopTanDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *y, + const void *x, + void *stream); + +__C __export infiniStatus_t infiniopDestroyTanDescriptor(infiniopTanDescriptor_t desc); + +#endif
diff --git a/src/infiniop/ops/abs/cpu/abs_cpu.cc b/src/infiniop/ops/abs/cpu/abs_cpu.cc new file mode 100644 index 000000000..7d6e81d04 --- /dev/null +++ b/src/infiniop/ops/abs/cpu/abs_cpu.cc @@ -0,0 +1,48 @@ +#include "abs_cpu.h" + +namespace op::abs::cpu { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector<infiniopTensorDescriptor_t> input_desc_vec) { + + auto handle = reinterpret_cast<device::cpu::Handle *>(handle_); + auto dtype = out_desc->dtype(); + + const auto &x_desc = input_desc_vec.at(0); + const auto &y_shape = out_desc->shape(); + const auto &x_shape = x_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32); + + CHECK_SAME_SHAPE(y_shape, x_shape); + + // create CPU elementwise descriptor + CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec); + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector<const void *> inputs, + void *stream) const { + + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate<AbsOp, fp16_t>(_info, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate<AbsOp, float>(_info, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::abs::cpu
diff --git a/src/infiniop/ops/abs/cpu/abs_cpu.h b/src/infiniop/ops/abs/cpu/abs_cpu.h new file mode 100644 index 000000000..5b9773298 --- /dev/null +++ b/src/infiniop/ops/abs/cpu/abs_cpu.h @@ -0,0 +1,26 @@ +#ifndef __ABS_CPU_H__ +#define __ABS_CPU_H__ + +#include <cmath> + +#include "../../../elementwise/cpu/elementwise_cpu.h" + +ELEMENTWISE_DESCRIPTOR(abs, cpu) + +namespace op::abs::cpu { +typedef struct AbsOp { +public: + static constexpr size_t num_inputs = 1; + + template <typename T> + T operator()(const T &x) const { + if constexpr (std::is_floating_point_v<T>) { + return std::fabs(x); + } else { + return std::abs(x); + } + } +} AbsOp; +} // namespace op::abs::cpu + +#endif // __ABS_CPU_H__
diff --git a/src/infiniop/ops/abs/cuda/kernel.cuh b/src/infiniop/ops/abs/cuda/kernel.cuh new file mode 100644 index 000000000..d7ff2db12 --- /dev/null +++ b/src/infiniop/ops/abs/cuda/kernel.cuh @@ -0,0 +1,26 @@ +#ifndef __ABS_CUDA_H__ +#define __ABS_CUDA_H__ + +#include <cuda_fp16.h> +#include <type_traits> + +namespace op::abs::cuda { +typedef struct AbsOp { +public: + static constexpr size_t num_inputs = 1; + template <typename T> + __device__ __forceinline__ T operator()(const T &x) const { + if constexpr (std::is_same_v<T, half2>) { + return __habs2(x); + } else if constexpr (std::is_same_v<T, half>) { + return __habs(x); + } else if constexpr (std::is_floating_point_v<T>) { + return std::fabs(x); + } else { + return std::abs(x); + } + } +} AbsOp; +} // namespace op::abs::cuda + +#endif // __ABS_CUDA_H__
diff --git a/src/infiniop/ops/abs/nvidia/abs_nvidia.cu b/src/infiniop/ops/abs/nvidia/abs_nvidia.cu new file mode 100644 index 000000000..485f0406a --- /dev/null +++ b/src/infiniop/ops/abs/nvidia/abs_nvidia.cu @@ -0,0 +1,54 @@ +#include "../../../elementwise/nvidia/elementwise_nvidia.cuh" + +#include "../cuda/kernel.cuh" +#include "abs_nvidia.cuh" + +namespace op::abs::nvidia { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector<infiniopTensorDescriptor_t> input_desc_vec) { + + auto handle = reinterpret_cast<device::nvidia::Handle *>(handle_); + auto dtype = out_desc->dtype(); + + const auto &x_desc = input_desc_vec.at(0); + const auto &y_shape = out_desc->shape(); + const auto &x_shape = x_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32); + + CHECK_SAME_SHAPE(y_shape, x_shape); + + // create CUDA elementwise descriptor + CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec) + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector<const void *> inputs, + void *stream) const { + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate<256, cuda::AbsOp, half>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate<256, cuda::AbsOp, float>(_info, workspace, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::abs::nvidia
diff --git a/src/infiniop/ops/abs/nvidia/abs_nvidia.cuh b/src/infiniop/ops/abs/nvidia/abs_nvidia.cuh new file mode 100644 index 000000000..db1751e26 --- /dev/null +++ b/src/infiniop/ops/abs/nvidia/abs_nvidia.cuh @@ -0,0 +1,8 @@ +#ifndef __ABS_NVIDIA_API_H__ +#define __ABS_NVIDIA_API_H__ + +#include "../../../elementwise/nvidia/elementwise_nvidia_api.cuh" + +ELEMENTWISE_DESCRIPTOR(abs, nvidia) + +#endif // __ABS_NVIDIA_API_H__
diff --git a/src/infiniop/ops/abs/operator.cc b/src/infiniop/ops/abs/operator.cc new file mode 100644 index 000000000..b6820079d --- /dev/null +++ b/src/infiniop/ops/abs/operator.cc @@ -0,0 +1,139 @@ +#include "../../operator.h" +#include "../../handle.h" +#include "infiniop/ops/abs.h" + +#ifdef ENABLE_CPU_API +#include "cpu/abs_cpu.h" +#endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API) +#include "nvidia/abs_nvidia.cuh" +#endif + +__C infiniStatus_t infiniopCreateAbsDescriptor( + infiniopHandle_t handle, + infiniopAbsDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y_desc, + infiniopTensorDescriptor_t x_desc) { + +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return op::abs::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast<op::abs::NAMESPACE::Descriptor **>(desc_ptr), \ + y_desc, \ + {x_desc}) + + switch (handle->device) { + +#ifdef ENABLE_CPU_API + CREATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CREATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CREATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + CREATE(INFINI_DEVICE_QY, nvidia); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CREATE +} + +__C infiniStatus_t infiniopGetAbsWorkspaceSize(infiniopAbsDescriptor_t desc, size_t *size) { + +#define GET(CASE, NAMESPACE) \ + case CASE: \ + *size = reinterpret_cast<op::abs::NAMESPACE::Descriptor *>(desc)->workspaceSize(); \ + return INFINI_STATUS_SUCCESS; + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + GET(INFINI_DEVICE_CPU, cpu) +#endif +#ifdef ENABLE_NVIDIA_API + GET(INFINI_DEVICE_NVIDIA, nvidia) +#endif +#ifdef ENABLE_ILUVATAR_API + GET(INFINI_DEVICE_ILUVATAR, nvidia) +#endif +#ifdef ENABLE_QY_API + GET(INFINI_DEVICE_QY, nvidia) +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } +#undef GET + + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} + +__C infiniStatus_t infiniopAbs( + infiniopAbsDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *y, + const void *x, + void *stream) { + +#define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ + return reinterpret_cast<const op::abs::NAMESPACE::Descriptor *>(desc) \ + ->calculate(workspace, workspace_size, y, {x}, stream) + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + CALCULATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + CALCULATE(INFINI_DEVICE_QY, nvidia); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CALCULATE +} + +__C infiniStatus_t +infiniopDestroyAbsDescriptor(infiniopAbsDescriptor_t desc) { + +#define DELETE(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast<op::abs::NAMESPACE::Descriptor *>(desc); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + DELETE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + DELETE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + DELETE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + DELETE(INFINI_DEVICE_QY, nvidia); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef DELETE +}
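A quick usage sketch of the new C API, taking abs as the example; every unary operator in this patch follows the same create/workspace/calculate/destroy sequence. The handle and tensor-descriptor creation calls below are assumptions based on the existing infiniop API surface and are not part of this patch; error checking is omitted for brevity.

    // Hypothetical host-side driver for the CPU backend (C++).
    #include <infiniop.h>
    #include <cstdlib>

    int run_abs(const float *x, float *y, size_t n) {
        infiniopHandle_t handle;
        infiniopCreateHandle(&handle);  // assumed creation entry point

        size_t shape[1] = {n};
        infiniopTensorDescriptor_t x_desc, y_desc;
        // assumed signature: (desc_ptr, ndim, shape, strides, dtype); NULL strides = contiguous
        infiniopCreateTensorDescriptor(&x_desc, 1, shape, NULL, INFINI_DTYPE_F32);
        infiniopCreateTensorDescriptor(&y_desc, 1, shape, NULL, INFINI_DTYPE_F32);

        infiniopAbsDescriptor_t op;
        infiniopCreateAbsDescriptor(handle, &op, y_desc, x_desc);

        size_t ws_size = 0;
        infiniopGetAbsWorkspaceSize(op, &ws_size);
        void *ws = ws_size ? std::malloc(ws_size) : nullptr;

        // NULL stream: synchronous CPU path
        infiniopAbs(op, ws, ws_size, y, x, nullptr);

        std::free(ws);
        infiniopDestroyAbsDescriptor(op);
        return 0;
    }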
diff --git a/src/infiniop/ops/acos/cpu/acos_cpu.cc b/src/infiniop/ops/acos/cpu/acos_cpu.cc new file mode 100644 index 000000000..1accb6752 --- /dev/null +++ b/src/infiniop/ops/acos/cpu/acos_cpu.cc @@ -0,0 +1,48 @@ +#include "acos_cpu.h" + +namespace op::acos::cpu { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector<infiniopTensorDescriptor_t> input_desc_vec) { + + auto handle = reinterpret_cast<device::cpu::Handle *>(handle_); + auto dtype = out_desc->dtype(); + + const auto &x_desc = input_desc_vec.at(0); + const auto &y_shape = out_desc->shape(); + const auto &x_shape = x_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32); + + CHECK_SAME_SHAPE(y_shape, x_shape); + + // create CPU elementwise descriptor + CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec); + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector<const void *> inputs, + void *stream) const { + + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate<AcosOp, fp16_t>(_info, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate<AcosOp, float>(_info, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::acos::cpu
diff --git a/src/infiniop/ops/acos/cpu/acos_cpu.h b/src/infiniop/ops/acos/cpu/acos_cpu.h new file mode 100644 index 000000000..14e74b75c --- /dev/null +++ b/src/infiniop/ops/acos/cpu/acos_cpu.h @@ -0,0 +1,22 @@ +#ifndef __ACOS_CPU_H__ +#define __ACOS_CPU_H__ + +#include <cmath> + +#include "../../../elementwise/cpu/elementwise_cpu.h" + +ELEMENTWISE_DESCRIPTOR(acos, cpu) + +namespace op::acos::cpu { +typedef struct AcosOp { +public: + static constexpr size_t num_inputs = 1; + + template <typename T> + T operator()(const T &x) const { + return std::acos(x); + } +} AcosOp; +} // namespace op::acos::cpu + +#endif // __ACOS_CPU_H__
diff --git a/src/infiniop/ops/acos/cuda/kernel.cuh b/src/infiniop/ops/acos/cuda/kernel.cuh new file mode 100644 index 000000000..c3281c7e3 --- /dev/null +++ b/src/infiniop/ops/acos/cuda/kernel.cuh @@ -0,0 +1,32 @@ +#ifndef __ACOS_CUDA_H__ +#define __ACOS_CUDA_H__ + +#include <cuda_bf16.h> +#include <cuda_fp16.h> + +namespace op::acos::cuda { +typedef struct AcosOp { +public: + static constexpr size_t num_inputs = 1; + template <typename T> + __device__ __forceinline__ T operator()(const T &x) const { + if constexpr (std::is_same_v<T, half2>) { + return __floats2half2_rn(acosf(__half2float(__low2half(x))), acosf(__half2float(__high2half(x)))); + } else if constexpr (std::is_same_v<T, half>) { + return __float2half(acosf(__half2float(x))); + } else if constexpr (std::is_same_v<T, __nv_bfloat162>) { + float x0 = __bfloat162float(__low2bfloat16(x)); + float x1 = __bfloat162float(__high2bfloat16(x)); + return __floats2bfloat162_rn(acosf(x0), acosf(x1)); + } else if constexpr (std::is_same_v<T, __nv_bfloat16>) { + return __float2bfloat16_rn(acosf(__bfloat162float(x))); + } else if constexpr (std::is_same_v<T, float>) { + return acosf(x); + } else { + return std::acos(x); + } + } +} AcosOp; +} // namespace op::acos::cuda + +#endif // __ACOS_CUDA_H__
diff --git a/src/infiniop/ops/acos/nvidia/acos_nvidia.cu b/src/infiniop/ops/acos/nvidia/acos_nvidia.cu new file mode 100644 index 000000000..8480219bc --- /dev/null +++ b/src/infiniop/ops/acos/nvidia/acos_nvidia.cu @@ -0,0 +1,54 @@ +#include "../../../elementwise/nvidia/elementwise_nvidia.cuh" + +#include "../cuda/kernel.cuh" +#include "acos_nvidia.cuh" + +namespace op::acos::nvidia { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector<infiniopTensorDescriptor_t> input_desc_vec) { + + auto handle = reinterpret_cast<device::nvidia::Handle *>(handle_); + auto dtype = out_desc->dtype(); + + const auto &x_desc = input_desc_vec.at(0); + const auto &y_shape = out_desc->shape(); + const auto &x_shape = x_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32); + + CHECK_SAME_SHAPE(y_shape, x_shape); + + // create CUDA elementwise descriptor + CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec) + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector<const void *> inputs, + void *stream) const { + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate<256, cuda::AcosOp, half>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate<256, cuda::AcosOp, float>(_info, workspace, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::acos::nvidia
diff --git a/src/infiniop/ops/acos/nvidia/acos_nvidia.cuh b/src/infiniop/ops/acos/nvidia/acos_nvidia.cuh new file mode 100644 index 000000000..a7ac7e190 --- /dev/null +++ b/src/infiniop/ops/acos/nvidia/acos_nvidia.cuh @@ -0,0 +1,8 @@ +#ifndef __ACOS_NVIDIA_API_H__ +#define __ACOS_NVIDIA_API_H__ + +#include "../../../elementwise/nvidia/elementwise_nvidia_api.cuh" + +ELEMENTWISE_DESCRIPTOR(acos, nvidia) + +#endif // __ACOS_NVIDIA_API_H__
diff --git a/src/infiniop/ops/acos/operator.cc b/src/infiniop/ops/acos/operator.cc new file mode 100644 index 000000000..e775a005a --- /dev/null +++ b/src/infiniop/ops/acos/operator.cc @@ -0,0 +1,139 @@ +#include "../../operator.h" +#include "../../handle.h" +#include "infiniop/ops/acos.h" + +#ifdef ENABLE_CPU_API +#include "cpu/acos_cpu.h" +#endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API) +#include "nvidia/acos_nvidia.cuh" +#endif + +__C infiniStatus_t infiniopCreateAcosDescriptor( + infiniopHandle_t handle, + infiniopAcosDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y_desc, + infiniopTensorDescriptor_t x_desc) { + +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return op::acos::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast<op::acos::NAMESPACE::Descriptor **>(desc_ptr), \ + y_desc, \ + {x_desc}) + + switch (handle->device) { + +#ifdef ENABLE_CPU_API + CREATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CREATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CREATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + CREATE(INFINI_DEVICE_QY, nvidia); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CREATE +} + +__C infiniStatus_t infiniopGetAcosWorkspaceSize(infiniopAcosDescriptor_t desc, size_t *size) { + +#define GET(CASE, NAMESPACE) \ + case CASE: \ + *size = reinterpret_cast<op::acos::NAMESPACE::Descriptor *>(desc)->workspaceSize(); \ + return INFINI_STATUS_SUCCESS; + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + GET(INFINI_DEVICE_CPU, cpu) +#endif +#ifdef ENABLE_NVIDIA_API + GET(INFINI_DEVICE_NVIDIA, nvidia) +#endif +#ifdef ENABLE_ILUVATAR_API + GET(INFINI_DEVICE_ILUVATAR, nvidia) +#endif +#ifdef ENABLE_QY_API + GET(INFINI_DEVICE_QY, nvidia) +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } +#undef GET + + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} + +__C infiniStatus_t infiniopAcos( + infiniopAcosDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *y, + const void *x, + void *stream) { + +#define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ + return reinterpret_cast<const op::acos::NAMESPACE::Descriptor *>(desc) \ + ->calculate(workspace, workspace_size, y, {x}, stream) + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + CALCULATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + CALCULATE(INFINI_DEVICE_QY, nvidia); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CALCULATE +} + +__C infiniStatus_t +infiniopDestroyAcosDescriptor(infiniopAcosDescriptor_t desc) { + +#define DELETE(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast<op::acos::NAMESPACE::Descriptor *>(desc); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + DELETE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + DELETE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + DELETE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + DELETE(INFINI_DEVICE_QY, nvidia); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef DELETE
+}
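Since the operator.cc dispatch boilerplate repeats verbatim for every operator, it helps to see what one macro instance expands to. For the CPU case of acos, CREATE(INFINI_DEVICE_CPU, cpu) expands to:

    case INFINI_DEVICE_CPU:
        return op::acos::cpu::Descriptor::create(
            handle,
            reinterpret_cast<op::acos::cpu::Descriptor **>(desc_ptr),
            y_desc,
            {x_desc});

That is, the type-erased infiniopAcosDescriptor_t is simply a reinterpret_cast view of the backend-specific Descriptor, which is why the same opaque pointer can later be routed through GET, CALCULATE, and DELETE.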
diff --git a/src/infiniop/ops/acosh/cpu/acosh_cpu.cc b/src/infiniop/ops/acosh/cpu/acosh_cpu.cc new file mode 100644 index 000000000..005463679 --- /dev/null +++ b/src/infiniop/ops/acosh/cpu/acosh_cpu.cc @@ -0,0 +1,48 @@ +#include "acosh_cpu.h" + +namespace op::acosh::cpu { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector<infiniopTensorDescriptor_t> input_desc_vec) { + + auto handle = reinterpret_cast<device::cpu::Handle *>(handle_); + auto dtype = out_desc->dtype(); + + const auto &x_desc = input_desc_vec.at(0); + const auto &y_shape = out_desc->shape(); + const auto &x_shape = x_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32); + + CHECK_SAME_SHAPE(y_shape, x_shape); + + // create CPU elementwise descriptor + CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec); + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector<const void *> inputs, + void *stream) const { + + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate<AcoshOp, fp16_t>(_info, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate<AcoshOp, float>(_info, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::acosh::cpu
diff --git a/src/infiniop/ops/acosh/cpu/acosh_cpu.h b/src/infiniop/ops/acosh/cpu/acosh_cpu.h new file mode 100644 index 000000000..b4b710ed5 --- /dev/null +++ b/src/infiniop/ops/acosh/cpu/acosh_cpu.h @@ -0,0 +1,22 @@ +#ifndef __ACOSH_CPU_H__ +#define __ACOSH_CPU_H__ + +#include <cmath> + +#include "../../../elementwise/cpu/elementwise_cpu.h" + +ELEMENTWISE_DESCRIPTOR(acosh, cpu) + +namespace op::acosh::cpu { +typedef struct AcoshOp { +public: + static constexpr size_t num_inputs = 1; + + template <typename T> + T operator()(const T &x) const { + return std::acosh(x); + } +} AcoshOp; +} // namespace op::acosh::cpu + +#endif // __ACOSH_CPU_H__
diff --git a/src/infiniop/ops/acosh/cuda/kernel.cuh b/src/infiniop/ops/acosh/cuda/kernel.cuh new file mode 100644 index 000000000..fe444b1b4 --- /dev/null +++ b/src/infiniop/ops/acosh/cuda/kernel.cuh @@ -0,0 +1,32 @@ +#ifndef __ACOSH_CUDA_H__ +#define __ACOSH_CUDA_H__ + +#include <cuda_bf16.h> +#include <cuda_fp16.h> + +namespace op::acosh::cuda { +typedef struct AcoshOp { +public: + static constexpr size_t num_inputs = 1; + template <typename T> + __device__ __forceinline__ T operator()(const T &x) const { + if constexpr (std::is_same_v<T, half2>) { + return __floats2half2_rn(acoshf(__half2float(__low2half(x))), acoshf(__half2float(__high2half(x)))); + } else if constexpr (std::is_same_v<T, half>) { + return __float2half(acoshf(__half2float(x))); + } else if constexpr (std::is_same_v<T, __nv_bfloat162>) { + float x0 = __bfloat162float(__low2bfloat16(x)); + float x1 = __bfloat162float(__high2bfloat16(x)); + return __floats2bfloat162_rn(acoshf(x0), acoshf(x1)); + } else if constexpr (std::is_same_v<T, __nv_bfloat16>) { + return __float2bfloat16_rn(acoshf(__bfloat162float(x))); + } else if constexpr (std::is_same_v<T, float>) { + return acoshf(x); + } else { + return std::acosh(x); + } + } +} AcoshOp; +} // namespace op::acosh::cuda + +#endif // __ACOSH_CUDA_H__
diff --git a/src/infiniop/ops/acosh/nvidia/acosh_nvidia.cu b/src/infiniop/ops/acosh/nvidia/acosh_nvidia.cu new file mode 100644 index 000000000..fc06590a7 --- /dev/null +++ b/src/infiniop/ops/acosh/nvidia/acosh_nvidia.cu @@ -0,0 +1,54 @@ +#include "../../../elementwise/nvidia/elementwise_nvidia.cuh" + +#include "../cuda/kernel.cuh" +#include "acosh_nvidia.cuh" + +namespace op::acosh::nvidia { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector<infiniopTensorDescriptor_t> input_desc_vec) { + + auto handle = reinterpret_cast<device::nvidia::Handle *>(handle_); + auto dtype = out_desc->dtype(); + + const auto &x_desc = input_desc_vec.at(0); + const auto &y_shape = out_desc->shape(); + const auto &x_shape = x_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32); + + CHECK_SAME_SHAPE(y_shape, x_shape); + + // create CUDA elementwise descriptor + CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec) + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector<const void *> inputs, + void *stream) const { + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate<256, cuda::AcoshOp, half>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate<256, cuda::AcoshOp, float>(_info, workspace, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::acosh::nvidia
diff --git a/src/infiniop/ops/acosh/nvidia/acosh_nvidia.cuh b/src/infiniop/ops/acosh/nvidia/acosh_nvidia.cuh new file mode 100644 index 000000000..b13332431 --- /dev/null +++ b/src/infiniop/ops/acosh/nvidia/acosh_nvidia.cuh @@ -0,0 +1,8 @@ +#ifndef __ACOSH_NVIDIA_API_H__ +#define __ACOSH_NVIDIA_API_H__ + +#include "../../../elementwise/nvidia/elementwise_nvidia_api.cuh" + +ELEMENTWISE_DESCRIPTOR(acosh, nvidia) + +#endif // __ACOSH_NVIDIA_API_H__
diff --git a/src/infiniop/ops/acosh/operator.cc b/src/infiniop/ops/acosh/operator.cc new file mode 100644 index 000000000..9bba3389a --- /dev/null +++ b/src/infiniop/ops/acosh/operator.cc @@ -0,0 +1,139 @@ +#include "../../operator.h" +#include "../../handle.h" +#include "infiniop/ops/acosh.h" + +#ifdef ENABLE_CPU_API +#include "cpu/acosh_cpu.h" +#endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API) +#include "nvidia/acosh_nvidia.cuh" +#endif + +__C infiniStatus_t infiniopCreateAcoshDescriptor( + infiniopHandle_t handle, + infiniopAcoshDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y_desc, + infiniopTensorDescriptor_t x_desc) { + +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return op::acosh::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast<op::acosh::NAMESPACE::Descriptor **>(desc_ptr), \ + y_desc, \ + {x_desc}) + + switch (handle->device) { + +#ifdef ENABLE_CPU_API + CREATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CREATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CREATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + CREATE(INFINI_DEVICE_QY, nvidia); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CREATE +} + +__C infiniStatus_t infiniopGetAcoshWorkspaceSize(infiniopAcoshDescriptor_t desc, size_t *size) { + +#define GET(CASE, NAMESPACE) \ + case CASE: \ + *size = reinterpret_cast<op::acosh::NAMESPACE::Descriptor *>(desc)->workspaceSize(); \ + return INFINI_STATUS_SUCCESS; + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + GET(INFINI_DEVICE_CPU, cpu) +#endif +#ifdef ENABLE_NVIDIA_API + GET(INFINI_DEVICE_NVIDIA, nvidia) +#endif +#ifdef ENABLE_ILUVATAR_API + GET(INFINI_DEVICE_ILUVATAR, nvidia) +#endif +#ifdef ENABLE_QY_API + GET(INFINI_DEVICE_QY, nvidia) +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } +#undef GET + + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} + +__C infiniStatus_t infiniopAcosh( + infiniopAcoshDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *y, + const void *x, + void *stream) { + +#define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ + return reinterpret_cast<const op::acosh::NAMESPACE::Descriptor *>(desc) \ + ->calculate(workspace, workspace_size, y, {x}, stream) + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + CALCULATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + CALCULATE(INFINI_DEVICE_QY, nvidia); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CALCULATE +} + +__C infiniStatus_t +infiniopDestroyAcoshDescriptor(infiniopAcoshDescriptor_t desc) { + +#define DELETE(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast<op::acosh::NAMESPACE::Descriptor *>(desc); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + DELETE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + DELETE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + DELETE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + DELETE(INFINI_DEVICE_QY, nvidia); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef DELETE
+}
diff --git a/src/infiniop/ops/asin/cpu/asin_cpu.cc b/src/infiniop/ops/asin/cpu/asin_cpu.cc new file mode 100644 index 000000000..e149044f1 --- /dev/null +++ b/src/infiniop/ops/asin/cpu/asin_cpu.cc @@ -0,0 +1,48 @@ +#include "asin_cpu.h" + +namespace op::asin::cpu { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector<infiniopTensorDescriptor_t> input_desc_vec) { + + auto handle = reinterpret_cast<device::cpu::Handle *>(handle_); + auto dtype = out_desc->dtype(); + + const auto &x_desc = input_desc_vec.at(0); + const auto &y_shape = out_desc->shape(); + const auto &x_shape = x_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32); + + CHECK_SAME_SHAPE(y_shape, x_shape); + + // create CPU elementwise descriptor + CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec); + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector<const void *> inputs, + void *stream) const { + + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate<AsinOp, fp16_t>(_info, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate<AsinOp, float>(_info, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::asin::cpu
diff --git a/src/infiniop/ops/asin/cpu/asin_cpu.h b/src/infiniop/ops/asin/cpu/asin_cpu.h new file mode 100644 index 000000000..22bcba337 --- /dev/null +++ b/src/infiniop/ops/asin/cpu/asin_cpu.h @@ -0,0 +1,22 @@ +#ifndef __ASIN_CPU_H__ +#define __ASIN_CPU_H__ + +#include <cmath> + +#include "../../../elementwise/cpu/elementwise_cpu.h" + +ELEMENTWISE_DESCRIPTOR(asin, cpu) + +namespace op::asin::cpu { +typedef struct AsinOp { +public: + static constexpr size_t num_inputs = 1; + + template <typename T> + T operator()(const T &x) const { + return std::asin(x); + } +} AsinOp; +} // namespace op::asin::cpu + +#endif // __ASIN_CPU_H__
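The CPU functors above (AcoshOp, AsinOp, ...) carry no state; the elementwise framework pulled in via elementwise_cpu.h is what walks the tensors and applies them. That framework is not part of this patch, but for a contiguous tensor its job conceptually reduces to the following sketch (hypothetical, simplified to a single dtype with no strides or broadcasting):

    #include <cstddef>

    // Op is any of the unary functor structs in this patch (Op::num_inputs == 1).
    template <typename Op, typename T>
    void apply_unary(T *out, const T *in, std::size_t n) {
        Op op;  // stateless functor
        for (std::size_t i = 0; i < n; ++i) {
            out[i] = op(in[i]);  // e.g. std::asin(in[i]) for AsinOp
        }
    }

The real implementation additionally handles strides/broadcasting and the fp16_t-to-float conversions selected by the calculate<Op, fp16_t>(...) call in each *_cpu.cc.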
diff --git a/src/infiniop/ops/asin/cuda/kernel.cuh b/src/infiniop/ops/asin/cuda/kernel.cuh new file mode 100644 index 000000000..3e8d11a07 --- /dev/null +++ b/src/infiniop/ops/asin/cuda/kernel.cuh @@ -0,0 +1,32 @@ +#ifndef __ASIN_CUDA_H__ +#define __ASIN_CUDA_H__ + +#include <cuda_bf16.h> +#include <cuda_fp16.h> + +namespace op::asin::cuda { +typedef struct AsinOp { +public: + static constexpr size_t num_inputs = 1; + template <typename T> + __device__ __forceinline__ T operator()(const T &x) const { + if constexpr (std::is_same_v<T, half2>) { + return __floats2half2_rn(asinf(__half2float(__low2half(x))), asinf(__half2float(__high2half(x)))); + } else if constexpr (std::is_same_v<T, half>) { + return __float2half(asinf(__half2float(x))); + } else if constexpr (std::is_same_v<T, __nv_bfloat162>) { + float x0 = __bfloat162float(__low2bfloat16(x)); + float x1 = __bfloat162float(__high2bfloat16(x)); + return __floats2bfloat162_rn(asinf(x0), asinf(x1)); + } else if constexpr (std::is_same_v<T, __nv_bfloat16>) { + return __float2bfloat16_rn(asinf(__bfloat162float(x))); + } else if constexpr (std::is_same_v<T, float>) { + return asinf(x); + } else { + return std::asin(x); + } + } +} AsinOp; +} // namespace op::asin::cuda + +#endif // __ASIN_CUDA_H__
diff --git a/src/infiniop/ops/asin/nvidia/asin_nvidia.cu b/src/infiniop/ops/asin/nvidia/asin_nvidia.cu new file mode 100644 index 000000000..714d2b1b3 --- /dev/null +++ b/src/infiniop/ops/asin/nvidia/asin_nvidia.cu @@ -0,0 +1,54 @@ +#include "../../../elementwise/nvidia/elementwise_nvidia.cuh" + +#include "../cuda/kernel.cuh" +#include "asin_nvidia.cuh" + +namespace op::asin::nvidia { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector<infiniopTensorDescriptor_t> input_desc_vec) { + + auto handle = reinterpret_cast<device::nvidia::Handle *>(handle_); + auto dtype = out_desc->dtype(); + + const auto &x_desc = input_desc_vec.at(0); + const auto &y_shape = out_desc->shape(); + const auto &x_shape = x_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32); + + CHECK_SAME_SHAPE(y_shape, x_shape); + + // create CUDA elementwise descriptor + CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec) + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector<const void *> inputs, + void *stream) const { + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate<256, cuda::AsinOp, half>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate<256, cuda::AsinOp, float>(_info, workspace, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::asin::nvidia
diff --git a/src/infiniop/ops/asin/nvidia/asin_nvidia.cuh b/src/infiniop/ops/asin/nvidia/asin_nvidia.cuh new file mode 100644 index 000000000..46e168ede --- /dev/null +++ b/src/infiniop/ops/asin/nvidia/asin_nvidia.cuh @@ -0,0 +1,8 @@ +#ifndef __ASIN_NVIDIA_API_H__ +#define __ASIN_NVIDIA_API_H__ + +#include "../../../elementwise/nvidia/elementwise_nvidia_api.cuh" + +ELEMENTWISE_DESCRIPTOR(asin, nvidia) + +#endif // __ASIN_NVIDIA_API_H__
diff --git a/src/infiniop/ops/asin/operator.cc b/src/infiniop/ops/asin/operator.cc new file mode 100644 index 000000000..c4973e9f5 --- /dev/null +++ b/src/infiniop/ops/asin/operator.cc @@ -0,0 +1,139 @@ +#include "../../operator.h" +#include "../../handle.h" +#include "infiniop/ops/asin.h" + +#ifdef ENABLE_CPU_API +#include "cpu/asin_cpu.h" +#endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API) +#include "nvidia/asin_nvidia.cuh" +#endif + +__C infiniStatus_t infiniopCreateAsinDescriptor( + infiniopHandle_t handle, + infiniopAsinDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y_desc, + infiniopTensorDescriptor_t x_desc) { + +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return op::asin::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast<op::asin::NAMESPACE::Descriptor **>(desc_ptr), \ + y_desc, \ + {x_desc}) + + switch (handle->device) { + +#ifdef ENABLE_CPU_API + CREATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CREATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CREATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + CREATE(INFINI_DEVICE_QY, nvidia); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CREATE +} + +__C infiniStatus_t infiniopGetAsinWorkspaceSize(infiniopAsinDescriptor_t desc, size_t *size) { + +#define GET(CASE, NAMESPACE) \ + case CASE: \ + *size = reinterpret_cast<op::asin::NAMESPACE::Descriptor *>(desc)->workspaceSize(); \ + return INFINI_STATUS_SUCCESS; + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + GET(INFINI_DEVICE_CPU, cpu) +#endif +#ifdef ENABLE_NVIDIA_API + GET(INFINI_DEVICE_NVIDIA, nvidia) +#endif +#ifdef ENABLE_ILUVATAR_API + GET(INFINI_DEVICE_ILUVATAR, nvidia) +#endif +#ifdef ENABLE_QY_API + GET(INFINI_DEVICE_QY, nvidia) +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } +#undef GET + + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} + +__C infiniStatus_t infiniopAsin( + infiniopAsinDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *y, + const void *x, + void *stream) { + +#define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ + return reinterpret_cast<const op::asin::NAMESPACE::Descriptor *>(desc) \ + ->calculate(workspace, workspace_size, y, {x}, stream) + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + CALCULATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + CALCULATE(INFINI_DEVICE_QY, nvidia); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CALCULATE +} + +__C infiniStatus_t +infiniopDestroyAsinDescriptor(infiniopAsinDescriptor_t desc) { + +#define DELETE(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast<op::asin::NAMESPACE::Descriptor *>(desc); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + DELETE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + DELETE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + DELETE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + DELETE(INFINI_DEVICE_QY, nvidia); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef DELETE
+}
diff --git a/src/infiniop/ops/asinh/cpu/asinh_cpu.cc b/src/infiniop/ops/asinh/cpu/asinh_cpu.cc new file mode 100644 index 000000000..e0d5b749a --- /dev/null +++ b/src/infiniop/ops/asinh/cpu/asinh_cpu.cc @@ -0,0 +1,48 @@ +#include "asinh_cpu.h" + +namespace op::asinh::cpu { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector<infiniopTensorDescriptor_t> input_desc_vec) { + + auto handle = reinterpret_cast<device::cpu::Handle *>(handle_); + auto dtype = out_desc->dtype(); + + const auto &x_desc = input_desc_vec.at(0); + const auto &y_shape = out_desc->shape(); + const auto &x_shape = x_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32); + + CHECK_SAME_SHAPE(y_shape, x_shape); + + // create CPU elementwise descriptor + CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec); + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector<const void *> inputs, + void *stream) const { + + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate<AsinhOp, fp16_t>(_info, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate<AsinhOp, float>(_info, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::asinh::cpu
diff --git a/src/infiniop/ops/asinh/cpu/asinh_cpu.h b/src/infiniop/ops/asinh/cpu/asinh_cpu.h new file mode 100644 index 000000000..0a999b63b --- /dev/null +++ b/src/infiniop/ops/asinh/cpu/asinh_cpu.h @@ -0,0 +1,22 @@ +#ifndef __ASINH_CPU_H__ +#define __ASINH_CPU_H__ + +#include <cmath> + +#include "../../../elementwise/cpu/elementwise_cpu.h" + +ELEMENTWISE_DESCRIPTOR(asinh, cpu) + +namespace op::asinh::cpu { +typedef struct AsinhOp { +public: + static constexpr size_t num_inputs = 1; + + template <typename T> + T operator()(const T &x) const { + return std::asinh(x); + } +} AsinhOp; +} // namespace op::asinh::cpu + +#endif // __ASINH_CPU_H__
diff --git a/src/infiniop/ops/asinh/cuda/kernel.cuh b/src/infiniop/ops/asinh/cuda/kernel.cuh new file mode 100644 index 000000000..7cb018c8a --- /dev/null +++ b/src/infiniop/ops/asinh/cuda/kernel.cuh @@ -0,0 +1,32 @@ +#ifndef __ASINH_CUDA_H__ +#define __ASINH_CUDA_H__ + +#include <cuda_bf16.h> +#include <cuda_fp16.h> + +namespace op::asinh::cuda { +typedef struct AsinhOp { +public: + static constexpr size_t num_inputs = 1; + template <typename T> + __device__ __forceinline__ T operator()(const T &x) const { + if constexpr (std::is_same_v<T, half2>) { + return __floats2half2_rn(asinhf(__half2float(__low2half(x))), asinhf(__half2float(__high2half(x)))); + } else if constexpr (std::is_same_v<T, half>) { + return __float2half(asinhf(__half2float(x))); + } else if constexpr (std::is_same_v<T, __nv_bfloat162>) { + float x0 = __bfloat162float(__low2bfloat16(x)); + float x1 = __bfloat162float(__high2bfloat16(x)); + return __floats2bfloat162_rn(asinhf(x0), asinhf(x1)); + } else if constexpr (std::is_same_v<T, __nv_bfloat16>) { + return __float2bfloat16_rn(asinhf(__bfloat162float(x))); + } else if constexpr (std::is_same_v<T, float>) { + return asinhf(x); + } else { + return std::asinh(x); + } + } +} AsinhOp; +} // namespace op::asinh::cuda + +#endif // __ASINH_CUDA_H__
diff --git a/src/infiniop/ops/asinh/nvidia/asinh_nvidia.cu b/src/infiniop/ops/asinh/nvidia/asinh_nvidia.cu new file mode 100644 index 000000000..203008b81 --- /dev/null +++ b/src/infiniop/ops/asinh/nvidia/asinh_nvidia.cu @@ -0,0 +1,54 @@ +#include "../../../elementwise/nvidia/elementwise_nvidia.cuh" + +#include "../cuda/kernel.cuh" +#include "asinh_nvidia.cuh" + +namespace op::asinh::nvidia { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector<infiniopTensorDescriptor_t> input_desc_vec) { + + auto handle = reinterpret_cast<device::nvidia::Handle *>(handle_); + auto dtype = out_desc->dtype(); + + const auto &x_desc = input_desc_vec.at(0); + const auto &y_shape = out_desc->shape(); + const auto &x_shape = x_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32); + + CHECK_SAME_SHAPE(y_shape, x_shape); + + // create CUDA elementwise descriptor + CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec) + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector<const void *> inputs, + void *stream) const { + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate<256, cuda::AsinhOp, half>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate<256, cuda::AsinhOp, float>(_info, workspace, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::asinh::nvidia
diff --git a/src/infiniop/ops/asinh/nvidia/asinh_nvidia.cuh b/src/infiniop/ops/asinh/nvidia/asinh_nvidia.cuh new file mode 100644 index 000000000..d1dcb4287 --- /dev/null +++ b/src/infiniop/ops/asinh/nvidia/asinh_nvidia.cuh @@ -0,0 +1,8 @@ +#ifndef __ASINH_NVIDIA_API_H__ +#define __ASINH_NVIDIA_API_H__ + +#include "../../../elementwise/nvidia/elementwise_nvidia_api.cuh" + +ELEMENTWISE_DESCRIPTOR(asinh, nvidia) + +#endif // __ASINH_NVIDIA_API_H__
diff --git a/src/infiniop/ops/asinh/operator.cc b/src/infiniop/ops/asinh/operator.cc new file mode 100644 index 000000000..d9ff5beda --- /dev/null +++ b/src/infiniop/ops/asinh/operator.cc @@ -0,0 +1,139 @@ +#include "../../operator.h" +#include "../../handle.h" +#include "infiniop/ops/asinh.h" + +#ifdef ENABLE_CPU_API +#include "cpu/asinh_cpu.h" +#endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API) +#include "nvidia/asinh_nvidia.cuh" +#endif + +__C infiniStatus_t infiniopCreateAsinhDescriptor( + infiniopHandle_t handle, + infiniopAsinhDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y_desc, + infiniopTensorDescriptor_t x_desc) { + +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return op::asinh::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast<op::asinh::NAMESPACE::Descriptor **>(desc_ptr), \ + y_desc, \ + {x_desc}) + + switch (handle->device) { + +#ifdef ENABLE_CPU_API + CREATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CREATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CREATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + CREATE(INFINI_DEVICE_QY, nvidia); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CREATE +} + +__C infiniStatus_t infiniopGetAsinhWorkspaceSize(infiniopAsinhDescriptor_t desc, size_t *size) { + +#define GET(CASE, NAMESPACE) \ + case CASE: \ + *size = reinterpret_cast<op::asinh::NAMESPACE::Descriptor *>(desc)->workspaceSize(); \ + return INFINI_STATUS_SUCCESS; + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + GET(INFINI_DEVICE_CPU, cpu) +#endif +#ifdef ENABLE_NVIDIA_API + GET(INFINI_DEVICE_NVIDIA, nvidia) +#endif +#ifdef ENABLE_ILUVATAR_API + GET(INFINI_DEVICE_ILUVATAR, nvidia) +#endif +#ifdef ENABLE_QY_API + GET(INFINI_DEVICE_QY, nvidia) +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } +#undef GET + + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} + +__C infiniStatus_t infiniopAsinh( + infiniopAsinhDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *y, + const void *x, + void *stream) { + +#define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ + return reinterpret_cast<const op::asinh::NAMESPACE::Descriptor *>(desc) \ + ->calculate(workspace, workspace_size, y, {x}, stream) + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + CALCULATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + CALCULATE(INFINI_DEVICE_QY, nvidia); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CALCULATE +} + +__C infiniStatus_t +infiniopDestroyAsinhDescriptor(infiniopAsinhDescriptor_t desc) { + +#define DELETE(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast<op::asinh::NAMESPACE::Descriptor *>(desc); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + DELETE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + DELETE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + DELETE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + DELETE(INFINI_DEVICE_QY, nvidia); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef DELETE
+}
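On the GPU path the Descriptor rejects any call whose workspace is smaller than _workspace_size with INFINI_STATUS_INSUFFICIENT_WORKSPACE, so callers should query first. A minimal sketch for asinh, assuming device buffers d_x/d_y and a valid stream already exist (allocation via plain cudaMalloc here; any device allocator would do):

    size_t ws_size = 0;
    infiniopGetAsinhWorkspaceSize(desc, &ws_size);
    void *ws = nullptr;
    if (ws_size > 0) {
        cudaMalloc(&ws, ws_size);  // device-side scratch for the elementwise framework
    }
    infiniStatus_t st = infiniopAsinh(desc, ws, ws_size, d_y, d_x, stream);
    if (ws != nullptr) {
        cudaFree(ws);
    }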
diff --git a/src/infiniop/ops/atan/cpu/atan_cpu.cc b/src/infiniop/ops/atan/cpu/atan_cpu.cc new file mode 100644 index 000000000..a8c613d1e --- /dev/null +++ b/src/infiniop/ops/atan/cpu/atan_cpu.cc @@ -0,0 +1,48 @@ +#include "atan_cpu.h" + +namespace op::atan::cpu { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector<infiniopTensorDescriptor_t> input_desc_vec) { + + auto handle = reinterpret_cast<device::cpu::Handle *>(handle_); + auto dtype = out_desc->dtype(); + + const auto &x_desc = input_desc_vec.at(0); + const auto &y_shape = out_desc->shape(); + const auto &x_shape = x_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32); + + CHECK_SAME_SHAPE(y_shape, x_shape); + + // create CPU elementwise descriptor + CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec); + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector<const void *> inputs, + void *stream) const { + + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate<AtanOp, fp16_t>(_info, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate<AtanOp, float>(_info, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::atan::cpu
diff --git a/src/infiniop/ops/atan/cpu/atan_cpu.h b/src/infiniop/ops/atan/cpu/atan_cpu.h new file mode 100644 index 000000000..ac2a1bc0c --- /dev/null +++ b/src/infiniop/ops/atan/cpu/atan_cpu.h @@ -0,0 +1,22 @@ +#ifndef __ATAN_CPU_H__ +#define __ATAN_CPU_H__ + +#include <cmath> + +#include "../../../elementwise/cpu/elementwise_cpu.h" + +ELEMENTWISE_DESCRIPTOR(atan, cpu) + +namespace op::atan::cpu { +typedef struct AtanOp { +public: + static constexpr size_t num_inputs = 1; + + template <typename T> + T operator()(const T &x) const { + return std::atan(x); + } +} AtanOp; +} // namespace op::atan::cpu + +#endif // __ATAN_CPU_H__
diff --git a/src/infiniop/ops/atan/cuda/kernel.cuh b/src/infiniop/ops/atan/cuda/kernel.cuh new file mode 100644 index 000000000..0c7745196 --- /dev/null +++ b/src/infiniop/ops/atan/cuda/kernel.cuh @@ -0,0 +1,32 @@ +#ifndef __ATAN_CUDA_H__ +#define __ATAN_CUDA_H__ + +#include <cuda_bf16.h> +#include <cuda_fp16.h> + +namespace op::atan::cuda { +typedef struct AtanOp { +public: + static constexpr size_t num_inputs = 1; + template <typename T> + __device__ __forceinline__ T operator()(const T &x) const { + if constexpr (std::is_same_v<T, half2>) { + return __floats2half2_rn(atanf(__half2float(__low2half(x))), atanf(__half2float(__high2half(x)))); + } else if constexpr (std::is_same_v<T, half>) { + return __float2half(atanf(__half2float(x))); + } else if constexpr (std::is_same_v<T, __nv_bfloat162>) { + float x0 = __bfloat162float(__low2bfloat16(x)); + float x1 = __bfloat162float(__high2bfloat16(x)); + return __floats2bfloat162_rn(atanf(x0), atanf(x1)); + } else if constexpr (std::is_same_v<T, __nv_bfloat16>) { + return __float2bfloat16_rn(atanf(__bfloat162float(x))); + } else if constexpr (std::is_same_v<T, float>) { + return atanf(x); + } else { + return std::atan(x); + } + } +} AtanOp; +} // namespace op::atan::cuda + +#endif // __ATAN_CUDA_H__
diff --git a/src/infiniop/ops/atan/nvidia/atan_nvidia.cu b/src/infiniop/ops/atan/nvidia/atan_nvidia.cu new file mode 100644 index 000000000..2c6cf53d4 --- /dev/null +++ b/src/infiniop/ops/atan/nvidia/atan_nvidia.cu @@ -0,0 +1,54 @@ +#include "../../../elementwise/nvidia/elementwise_nvidia.cuh" + +#include "../cuda/kernel.cuh" +#include "atan_nvidia.cuh" + +namespace op::atan::nvidia { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector<infiniopTensorDescriptor_t> input_desc_vec) { + + auto handle = reinterpret_cast<device::nvidia::Handle *>(handle_); + auto dtype = out_desc->dtype(); + + const auto &x_desc = input_desc_vec.at(0); + const auto &y_shape = out_desc->shape(); + const auto &x_shape = x_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32); + + CHECK_SAME_SHAPE(y_shape, x_shape); + + // create CUDA elementwise descriptor + CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec) + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector<const void *> inputs, + void *stream) const { + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate<256, cuda::AtanOp, half>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate<256, cuda::AtanOp, float>(_info, workspace, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::atan::nvidia
diff --git a/src/infiniop/ops/atan/nvidia/atan_nvidia.cuh b/src/infiniop/ops/atan/nvidia/atan_nvidia.cuh new file mode 100644 index 000000000..2aaee1ad9 --- /dev/null +++ b/src/infiniop/ops/atan/nvidia/atan_nvidia.cuh @@ -0,0 +1,8 @@ +#ifndef __ATAN_NVIDIA_API_H__ +#define __ATAN_NVIDIA_API_H__ + +#include "../../../elementwise/nvidia/elementwise_nvidia_api.cuh" + +ELEMENTWISE_DESCRIPTOR(atan, nvidia) + +#endif // __ATAN_NVIDIA_API_H__
diff --git a/src/infiniop/ops/atan/operator.cc b/src/infiniop/ops/atan/operator.cc new file mode 100644 index 000000000..c56e101d2 --- /dev/null +++ b/src/infiniop/ops/atan/operator.cc @@ -0,0 +1,139 @@ +#include "../../operator.h" +#include "../../handle.h" +#include "infiniop/ops/atan.h" + +#ifdef ENABLE_CPU_API +#include "cpu/atan_cpu.h" +#endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API) +#include "nvidia/atan_nvidia.cuh" +#endif + +__C infiniStatus_t infiniopCreateAtanDescriptor( + infiniopHandle_t handle, + infiniopAtanDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y_desc, + infiniopTensorDescriptor_t x_desc) { + +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return op::atan::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast<op::atan::NAMESPACE::Descriptor **>(desc_ptr), \ + y_desc, \ + {x_desc}) + + switch (handle->device) { + +#ifdef ENABLE_CPU_API + CREATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CREATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CREATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + CREATE(INFINI_DEVICE_QY, nvidia); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CREATE +} + +__C infiniStatus_t infiniopGetAtanWorkspaceSize(infiniopAtanDescriptor_t desc, size_t *size) { + +#define GET(CASE, NAMESPACE) \ + case CASE: \ + *size = reinterpret_cast<op::atan::NAMESPACE::Descriptor *>(desc)->workspaceSize(); \ + return INFINI_STATUS_SUCCESS; + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + GET(INFINI_DEVICE_CPU, cpu) +#endif +#ifdef ENABLE_NVIDIA_API + GET(INFINI_DEVICE_NVIDIA, nvidia) +#endif +#ifdef ENABLE_ILUVATAR_API + GET(INFINI_DEVICE_ILUVATAR, nvidia) +#endif +#ifdef ENABLE_QY_API + GET(INFINI_DEVICE_QY, nvidia) +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } +#undef GET + + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} + +__C infiniStatus_t infiniopAtan( + infiniopAtanDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *y, + const void *x, + void *stream) { + +#define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ + return reinterpret_cast<const op::atan::NAMESPACE::Descriptor *>(desc) \ + ->calculate(workspace, workspace_size, y, {x}, stream) + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + CALCULATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + CALCULATE(INFINI_DEVICE_QY, nvidia); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CALCULATE +} + +__C infiniStatus_t +infiniopDestroyAtanDescriptor(infiniopAtanDescriptor_t desc) { + +#define DELETE(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast<op::atan::NAMESPACE::Descriptor *>(desc); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + DELETE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + DELETE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + DELETE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + DELETE(INFINI_DEVICE_QY, nvidia); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef DELETE
+}
diff --git a/src/infiniop/ops/atanh/cpu/atanh_cpu.cc b/src/infiniop/ops/atanh/cpu/atanh_cpu.cc new file mode 100644 index 000000000..66ef4b1df --- /dev/null +++ b/src/infiniop/ops/atanh/cpu/atanh_cpu.cc @@ -0,0 +1,48 @@ +#include "atanh_cpu.h" + +namespace op::atanh::cpu { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector<infiniopTensorDescriptor_t> input_desc_vec) { + + auto handle = reinterpret_cast<device::cpu::Handle *>(handle_); + auto dtype = out_desc->dtype(); + + const auto &x_desc = input_desc_vec.at(0); + const auto &y_shape = out_desc->shape(); + const auto &x_shape = x_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32); + + CHECK_SAME_SHAPE(y_shape, x_shape); + + // create CPU elementwise descriptor + CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec); + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector<const void *> inputs, + void *stream) const { + + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate<AtanhOp, fp16_t>(_info, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate<AtanhOp, float>(_info, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::atanh::cpu
diff --git a/src/infiniop/ops/atanh/cpu/atanh_cpu.h b/src/infiniop/ops/atanh/cpu/atanh_cpu.h new file mode 100644 index 000000000..8c2b04755 --- /dev/null +++ b/src/infiniop/ops/atanh/cpu/atanh_cpu.h @@ -0,0 +1,22 @@ +#ifndef __ATANH_CPU_H__ +#define __ATANH_CPU_H__ + +#include <cmath> + +#include "../../../elementwise/cpu/elementwise_cpu.h" + +ELEMENTWISE_DESCRIPTOR(atanh, cpu) + +namespace op::atanh::cpu { +typedef struct AtanhOp { +public: + static constexpr size_t num_inputs = 1; + + template <typename T> + T operator()(const T &x) const { + return std::atanh(x); + } +} AtanhOp; +} // namespace op::atanh::cpu + +#endif // __ATANH_CPU_H__
diff --git a/src/infiniop/ops/atanh/cuda/kernel.cuh b/src/infiniop/ops/atanh/cuda/kernel.cuh new file mode 100644 index 000000000..5337d8243 --- /dev/null +++ b/src/infiniop/ops/atanh/cuda/kernel.cuh @@ -0,0 +1,32 @@ +#ifndef __ATANH_CUDA_H__ +#define __ATANH_CUDA_H__ + +#include <cuda_bf16.h> +#include <cuda_fp16.h> + +namespace op::atanh::cuda { +typedef struct AtanhOp { +public: + static constexpr size_t num_inputs = 1; + template <typename T> + __device__ __forceinline__ T operator()(const T &x) const { + if constexpr (std::is_same_v<T, half2>) { + return __floats2half2_rn(atanhf(__half2float(__low2half(x))), atanhf(__half2float(__high2half(x)))); + } else if constexpr (std::is_same_v<T, half>) { + return __float2half(atanhf(__half2float(x))); + } else if constexpr (std::is_same_v<T, __nv_bfloat162>) { + float x0 = __bfloat162float(__low2bfloat16(x)); + float x1 = __bfloat162float(__high2bfloat16(x)); + return __floats2bfloat162_rn(atanhf(x0), atanhf(x1)); + } else if constexpr (std::is_same_v<T, __nv_bfloat16>) { + return __float2bfloat16_rn(atanhf(__bfloat162float(x))); + } else if constexpr (std::is_same_v<T, float>) { + return atanhf(x); + } else { + return std::atanh(x); + } + } +} AtanhOp; +} // namespace op::atanh::cuda + +#endif // __ATANH_CUDA_H__
diff --git a/src/infiniop/ops/atanh/nvidia/atanh_nvidia.cu b/src/infiniop/ops/atanh/nvidia/atanh_nvidia.cu new file mode 100644 index 000000000..cb5a1ff03 --- /dev/null +++ b/src/infiniop/ops/atanh/nvidia/atanh_nvidia.cu @@ -0,0 +1,54 @@ +#include "../../../elementwise/nvidia/elementwise_nvidia.cuh" + +#include "../cuda/kernel.cuh" +#include "atanh_nvidia.cuh" + +namespace op::atanh::nvidia { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector<infiniopTensorDescriptor_t> input_desc_vec) { + + auto handle = reinterpret_cast<device::nvidia::Handle *>(handle_); + auto dtype = out_desc->dtype(); + + const auto &x_desc = input_desc_vec.at(0); + const auto &y_shape = out_desc->shape(); + const auto &x_shape = x_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32); + + CHECK_SAME_SHAPE(y_shape, x_shape); + + // create CUDA elementwise descriptor + CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec) + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector<const void *> inputs, + void *stream) const { + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate<256, cuda::AtanhOp, half>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate<256, cuda::AtanhOp, float>(_info, workspace, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::atanh::nvidia
diff --git a/src/infiniop/ops/atanh/nvidia/atanh_nvidia.cuh b/src/infiniop/ops/atanh/nvidia/atanh_nvidia.cuh new file mode 100644 index 000000000..da73cfa99 --- /dev/null +++ b/src/infiniop/ops/atanh/nvidia/atanh_nvidia.cuh @@ -0,0 +1,8 @@ +#ifndef __ATANH_NVIDIA_API_H__ +#define __ATANH_NVIDIA_API_H__ + +#include "../../../elementwise/nvidia/elementwise_nvidia_api.cuh" + +ELEMENTWISE_DESCRIPTOR(atanh, nvidia) + +#endif // __ATANH_NVIDIA_API_H__
diff --git a/src/infiniop/ops/atanh/operator.cc b/src/infiniop/ops/atanh/operator.cc new file mode 100644 index 000000000..a73adcb23 --- /dev/null +++ b/src/infiniop/ops/atanh/operator.cc @@ -0,0 +1,139 @@ +#include "../../operator.h" +#include "../../handle.h" +#include "infiniop/ops/atanh.h" + +#ifdef ENABLE_CPU_API +#include "cpu/atanh_cpu.h" +#endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API) +#include "nvidia/atanh_nvidia.cuh" +#endif + +__C infiniStatus_t infiniopCreateAtanhDescriptor( + infiniopHandle_t handle, + infiniopAtanhDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y_desc, + infiniopTensorDescriptor_t x_desc) { + +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return op::atanh::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast<op::atanh::NAMESPACE::Descriptor **>(desc_ptr), \ + y_desc, \ + {x_desc}) + + switch (handle->device) { + +#ifdef ENABLE_CPU_API + CREATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CREATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CREATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + CREATE(INFINI_DEVICE_QY, nvidia); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CREATE +} + +__C infiniStatus_t infiniopGetAtanhWorkspaceSize(infiniopAtanhDescriptor_t desc, size_t *size) { + +#define GET(CASE, NAMESPACE) \ + case CASE: \ + *size = reinterpret_cast<op::atanh::NAMESPACE::Descriptor *>(desc)->workspaceSize(); \ + return INFINI_STATUS_SUCCESS; + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + GET(INFINI_DEVICE_CPU, cpu) +#endif +#ifdef ENABLE_NVIDIA_API + GET(INFINI_DEVICE_NVIDIA, nvidia) +#endif +#ifdef ENABLE_ILUVATAR_API + GET(INFINI_DEVICE_ILUVATAR, nvidia) +#endif +#ifdef ENABLE_QY_API + GET(INFINI_DEVICE_QY, nvidia) +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } +#undef GET + + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} + +__C infiniStatus_t infiniopAtanh( + infiniopAtanhDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *y, + const void *x, + void *stream) { + +#define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ + return reinterpret_cast<const op::atanh::NAMESPACE::Descriptor *>(desc) \ + ->calculate(workspace, workspace_size, y, {x}, stream) + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + CALCULATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + CALCULATE(INFINI_DEVICE_QY, nvidia); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CALCULATE +} + +__C infiniStatus_t +infiniopDestroyAtanhDescriptor(infiniopAtanhDescriptor_t desc) { + +#define DELETE(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast<op::atanh::NAMESPACE::Descriptor *>(desc); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + DELETE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + DELETE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + DELETE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + DELETE(INFINI_DEVICE_QY, nvidia); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef DELETE
+}
diff --git a/src/infiniop/ops/ceil/cpu/ceil_cpu.cc b/src/infiniop/ops/ceil/cpu/ceil_cpu.cc new file mode 100644 index 000000000..17b3ec888 --- /dev/null +++ b/src/infiniop/ops/ceil/cpu/ceil_cpu.cc @@ -0,0 +1,48 @@ +#include "ceil_cpu.h" + +namespace op::ceil::cpu { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector<infiniopTensorDescriptor_t> input_desc_vec) { + + auto handle = reinterpret_cast<device::cpu::Handle *>(handle_); + auto dtype = out_desc->dtype(); + + const auto &x_desc = input_desc_vec.at(0); + const auto &y_shape = out_desc->shape(); + const auto &x_shape = x_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32); + + CHECK_SAME_SHAPE(y_shape, x_shape); + + // create CPU elementwise descriptor + CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec); + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector<const void *> inputs, + void *stream) const { + + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate<CeilOp, fp16_t>(_info, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate<CeilOp, float>(_info, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::ceil::cpu
diff --git a/src/infiniop/ops/ceil/cpu/ceil_cpu.h b/src/infiniop/ops/ceil/cpu/ceil_cpu.h new file mode 100644 index 000000000..c3ca8e441 --- /dev/null +++ b/src/infiniop/ops/ceil/cpu/ceil_cpu.h @@ -0,0 +1,26 @@ +#ifndef __CEIL_CPU_H__ +#define __CEIL_CPU_H__ + +#include <cmath> + +#include "../../../elementwise/cpu/elementwise_cpu.h" + +ELEMENTWISE_DESCRIPTOR(ceil, cpu) + +namespace op::ceil::cpu { +typedef struct CeilOp { +public: + static constexpr size_t num_inputs = 1; + + template <typename T> + T operator()(const T &x) const { + if constexpr (std::is_integral_v<T>) { + return x; + } else { + return std::ceil(x); + } + } +} CeilOp; +} // namespace op::ceil::cpu + +#endif // __CEIL_CPU_H__
diff --git a/src/infiniop/ops/ceil/cuda/kernel.cuh b/src/infiniop/ops/ceil/cuda/kernel.cuh new file mode 100644 index 000000000..a2d2e7fb5 --- /dev/null +++ b/src/infiniop/ops/ceil/cuda/kernel.cuh @@ -0,0 +1,34 @@ +#ifndef __CEIL_CUDA_H__ +#define __CEIL_CUDA_H__ + +#include "../../../elementwise/nvidia/elementwise_nvidia.cuh" +#include <cuda_bf16.h> + +namespace op::ceil::cuda { +typedef struct CeilOp { +public: + static constexpr size_t num_inputs = 1; + template <typename T> + __device__ __forceinline__ T operator()(const T &x) const { + if constexpr (std::is_same_v<T, half2>) { + return h2ceil(x); + } else if constexpr (std::is_same_v<T, half>) { + return hceil(x); + } else if constexpr (std::is_same_v<T, __nv_bfloat162>) { + float x0 = __bfloat162float(__low2bfloat16(x)); + float x1 = __bfloat162float(__high2bfloat16(x)); + return __floats2bfloat162_rn(ceilf(x0), ceilf(x1)); + } else if constexpr (std::is_same_v<T, __nv_bfloat16>) { + return __float2bfloat16_rn(ceilf(__bfloat162float(x))); + } else if constexpr (std::is_same_v<T, float>) { + return ceilf(x); + } else if constexpr (std::is_integral_v<T>) { + return
x; + } else { + return std::ceil(x); + } + } +} CeilOp; +} // namespace op::ceil::cuda + +#endif // __CEIL_CUDA_H__ diff --git a/src/infiniop/ops/ceil/nvidia/ceil_nvidia.cu b/src/infiniop/ops/ceil/nvidia/ceil_nvidia.cu new file mode 100644 index 000000000..c7ad2ee5b --- /dev/null +++ b/src/infiniop/ops/ceil/nvidia/ceil_nvidia.cu @@ -0,0 +1,54 @@ +#include "../../../elementwise/nvidia/elementwise_nvidia.cuh" + +#include "../cuda/kernel.cuh" +#include "ceil_nvidia.cuh" + +namespace op::ceil::nvidia { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &x_desc = input_desc_vec.at(0); + const auto &y_shape = out_desc->shape(); + const auto &x_shape = x_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32); + + CHECK_SAME_SHAPE(y_shape, x_shape); + + // create CUDA elementwise descriptor + CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec) + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate<256, cuda::CeilOp, half>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate<256, cuda::CeilOp, float>(_info, workspace, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::ceil::nvidia diff --git a/src/infiniop/ops/ceil/nvidia/ceil_nvidia.cuh b/src/infiniop/ops/ceil/nvidia/ceil_nvidia.cuh new file mode 100644 index 000000000..9bada334d --- /dev/null +++ b/src/infiniop/ops/ceil/nvidia/ceil_nvidia.cuh @@ -0,0 +1,8 @@ +#ifndef __CEIL_NVIDIA_API_H__ +#define __CEIL_NVIDIA_API_H__ + +#include "../../../elementwise/nvidia/elementwise_nvidia_api.cuh" + +ELEMENTWISE_DESCRIPTOR(ceil, nvidia) + +#endif // __CEIL_NVIDIA_API_H__ diff --git a/src/infiniop/ops/ceil/operator.cc b/src/infiniop/ops/ceil/operator.cc new file mode 100644 index 000000000..4e5ee7800 --- /dev/null +++ b/src/infiniop/ops/ceil/operator.cc @@ -0,0 +1,139 @@ +#include "../../operator.h" +#include "../../handle.h" +#include "infiniop/ops/ceil.h" + +#ifdef ENABLE_CPU_API +#include "cpu/ceil_cpu.h" +#endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API) +#include "nvidia/ceil_nvidia.cuh" +#endif + +__C infiniStatus_t infiniopCreateCeilDescriptor( + infiniopHandle_t handle, + infiniopCeilDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y_desc, + infiniopTensorDescriptor_t x_desc) { + +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return op::ceil::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast(desc_ptr), \ + y_desc, \ + {x_desc}) + + switch (handle->device) { + +#ifdef ENABLE_CPU_API + CREATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CREATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CREATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + CREATE(INFINI_DEVICE_QY, nvidia); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CREATE +} + +__C infiniStatus_t 
infiniopGetCeilWorkspaceSize(infiniopCeilDescriptor_t desc, size_t *size) { + +#define GET(CASE, NAMESPACE) \ + case CASE: \ + *size = reinterpret_cast(desc)->workspaceSize(); \ + return INFINI_STATUS_SUCCESS; + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + GET(INFINI_DEVICE_CPU, cpu) +#endif +#ifdef ENABLE_NVIDIA_API + GET(INFINI_DEVICE_NVIDIA, nvidia) +#endif +#ifdef ENABLE_ILUVATAR_API + GET(INFINI_DEVICE_ILUVATAR, nvidia) +#endif +#ifdef ENABLE_QY_API + GET(INFINI_DEVICE_QY, nvidia) +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } +#undef GET + + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} + +__C infiniStatus_t infiniopCeil( + infiniopCeilDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *y, + const void *x, + void *stream) { + +#define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ + return reinterpret_cast(desc) \ + ->calculate(workspace, workspace_size, y, {x}, stream) + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + CALCULATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + CALCULATE(INFINI_DEVICE_QY, nvidia); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CALCULATE +} + +__C infiniStatus_t +infiniopDestroyCeilDescriptor(infiniopCeilDescriptor_t desc) { + +#define DELETE(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast(desc); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + DELETE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + DELETE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + DELETE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + DELETE(INFINI_DEVICE_QY, nvidia); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef DELETE +} diff --git a/src/infiniop/ops/cos/cpu/cos_cpu.cc b/src/infiniop/ops/cos/cpu/cos_cpu.cc new file mode 100644 index 000000000..9dc68d327 --- /dev/null +++ b/src/infiniop/ops/cos/cpu/cos_cpu.cc @@ -0,0 +1,48 @@ +#include "cos_cpu.h" + +namespace op::cos::cpu { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &x_desc = input_desc_vec.at(0); + const auto &y_shape = out_desc->shape(); + const auto &x_shape = x_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32); + + CHECK_SAME_SHAPE(y_shape, x_shape); + + // create CPU elementwise descriptor + CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec); + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate(_info, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::cos::cpu diff --git a/src/infiniop/ops/cos/cpu/cos_cpu.h b/src/infiniop/ops/cos/cpu/cos_cpu.h new file mode 100644 index 000000000..9b4236fc2 --- /dev/null +++ 
b/src/infiniop/ops/cos/cpu/cos_cpu.h
@@ -0,0 +1,22 @@
+#ifndef __COS_CPU_H__
+#define __COS_CPU_H__
+
+#include <cmath>
+
+#include "../../../elementwise/cpu/elementwise_cpu.h"
+
+ELEMENTWISE_DESCRIPTOR(cos, cpu)
+
+namespace op::cos::cpu {
+typedef struct CosOp {
+public:
+    static constexpr size_t num_inputs = 1;
+
+    template <typename T>
+    T operator()(const T &x) const {
+        return std::cos(x);
+    }
+} CosOp;
+} // namespace op::cos::cpu
+
+#endif // __COS_CPU_H__
diff --git a/src/infiniop/ops/cos/cuda/kernel.cuh b/src/infiniop/ops/cos/cuda/kernel.cuh
new file mode 100644
index 000000000..b0dabb340
--- /dev/null
+++ b/src/infiniop/ops/cos/cuda/kernel.cuh
@@ -0,0 +1,32 @@
+#ifndef __COS_CUDA_H__
+#define __COS_CUDA_H__
+
+#include "../../../elementwise/nvidia/elementwise_nvidia.cuh"
+#include <cuda_bf16.h>
+
+namespace op::cos::cuda {
+typedef struct CosOp {
+public:
+    static constexpr size_t num_inputs = 1;
+    template <typename T>
+    __device__ __forceinline__ T operator()(const T &x) const {
+        if constexpr (std::is_same_v<T, half2>) {
+            return h2cos(x);
+        } else if constexpr (std::is_same_v<T, half>) {
+            return hcos(x);
+        } else if constexpr (std::is_same_v<T, cuda_bfloat162>) {
+            float x0 = __bfloat162float(__low2bfloat16(x));
+            float x1 = __bfloat162float(__high2bfloat16(x));
+            return __floats2bfloat162_rn(cosf(x0), cosf(x1));
+        } else if constexpr (std::is_same_v<T, cuda_bfloat16>) {
+            return __float2bfloat16_rn(cosf(__bfloat162float(x)));
+        } else if constexpr (std::is_same_v<T, float>) {
+            return __cosf(x);
+        } else {
+            return std::cos(x);
+        }
+    }
+} CosOp;
+} // namespace op::cos::cuda
+
+#endif // __COS_CUDA_H__
diff --git a/src/infiniop/ops/cos/nvidia/cos_nvidia.cu b/src/infiniop/ops/cos/nvidia/cos_nvidia.cu
new file mode 100644
index 000000000..044c59ca0
--- /dev/null
+++ b/src/infiniop/ops/cos/nvidia/cos_nvidia.cu
@@ -0,0 +1,54 @@
+#include "../../../elementwise/nvidia/elementwise_nvidia.cuh"
+
+#include "../cuda/kernel.cuh"
+#include "cos_nvidia.cuh"
+
+namespace op::cos::nvidia {
+
+Descriptor::~Descriptor() = default;
+
+infiniStatus_t Descriptor::create(
+    infiniopHandle_t handle_,
+    Descriptor **desc_ptr,
+    infiniopTensorDescriptor_t out_desc,
+    std::vector<infiniopTensorDescriptor_t> input_desc_vec) {
+
+    auto handle = reinterpret_cast<device::nvidia::Handle *>(handle_);
+    auto dtype = out_desc->dtype();
+
+    const auto &x_desc = input_desc_vec.at(0);
+    const auto &y_shape = out_desc->shape();
+    const auto &x_shape = x_desc->shape();
+
+    CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32);
+
+    CHECK_SAME_SHAPE(y_shape, x_shape);
+
+    // create CUDA elementwise descriptor
+    CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec)
+
+    return INFINI_STATUS_SUCCESS;
+}
+
+infiniStatus_t Descriptor::calculate(
+    void *workspace,
+    size_t workspace_size,
+    void *output,
+    std::vector<const void *> inputs,
+    void *stream) const {
+    if (workspace_size < _workspace_size) {
+        return INFINI_STATUS_INSUFFICIENT_WORKSPACE;
+    }
+
+    switch (_dtype) {
+    case INFINI_DTYPE_F16:
+        return _device_info->calculate<256, cuda::CosOp, half>(_info, workspace, output, inputs, stream);
+    case INFINI_DTYPE_F32:
+        return _device_info->calculate<256, cuda::CosOp, float>(_info, workspace, output, inputs, stream);
+    default:
+        return INFINI_STATUS_BAD_TENSOR_DTYPE;
+    }
+
+    return INFINI_STATUS_SUCCESS;
+}
+} // namespace op::cos::nvidia
diff --git a/src/infiniop/ops/cos/nvidia/cos_nvidia.cuh b/src/infiniop/ops/cos/nvidia/cos_nvidia.cuh
new file mode 100644
index 000000000..a9866e4d2
--- /dev/null
+++ b/src/infiniop/ops/cos/nvidia/cos_nvidia.cuh
@@ -0,0 +1,8 @@
+#ifndef __COS_NVIDIA_API_H__
+#define __COS_NVIDIA_API_H__
+
+#include
"../../../elementwise/nvidia/elementwise_nvidia_api.cuh" + +ELEMENTWISE_DESCRIPTOR(cos, nvidia) + +#endif // __COS_NVIDIA_API_H__ diff --git a/src/infiniop/ops/cos/operator.cc b/src/infiniop/ops/cos/operator.cc new file mode 100644 index 000000000..5c464ad60 --- /dev/null +++ b/src/infiniop/ops/cos/operator.cc @@ -0,0 +1,139 @@ +#include "../../operator.h" +#include "../../handle.h" +#include "infiniop/ops/cos.h" + +#ifdef ENABLE_CPU_API +#include "cpu/cos_cpu.h" +#endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API) +#include "nvidia/cos_nvidia.cuh" +#endif + +__C infiniStatus_t infiniopCreateCosDescriptor( + infiniopHandle_t handle, + infiniopCosDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y_desc, + infiniopTensorDescriptor_t x_desc) { + +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return op::cos::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast(desc_ptr), \ + y_desc, \ + {x_desc}) + + switch (handle->device) { + +#ifdef ENABLE_CPU_API + CREATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CREATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CREATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + CREATE(INFINI_DEVICE_QY, nvidia); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CREATE +} + +__C infiniStatus_t infiniopGetCosWorkspaceSize(infiniopCosDescriptor_t desc, size_t *size) { + +#define GET(CASE, NAMESPACE) \ + case CASE: \ + *size = reinterpret_cast(desc)->workspaceSize(); \ + return INFINI_STATUS_SUCCESS; + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + GET(INFINI_DEVICE_CPU, cpu) +#endif +#ifdef ENABLE_NVIDIA_API + GET(INFINI_DEVICE_NVIDIA, nvidia) +#endif +#ifdef ENABLE_ILUVATAR_API + GET(INFINI_DEVICE_ILUVATAR, nvidia) +#endif +#ifdef ENABLE_QY_API + GET(INFINI_DEVICE_QY, nvidia) +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } +#undef GET + + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} + +__C infiniStatus_t infiniopCos( + infiniopCosDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *y, + const void *x, + void *stream) { + +#define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ + return reinterpret_cast(desc) \ + ->calculate(workspace, workspace_size, y, {x}, stream) + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + CALCULATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + CALCULATE(INFINI_DEVICE_QY, nvidia); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CALCULATE +} + +__C infiniStatus_t +infiniopDestroyCosDescriptor(infiniopCosDescriptor_t desc) { + +#define DELETE(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast(desc); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + DELETE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + DELETE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + DELETE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + DELETE(INFINI_DEVICE_QY, nvidia); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef DELETE +} diff --git a/src/infiniop/ops/cosh/cpu/cosh_cpu.cc b/src/infiniop/ops/cosh/cpu/cosh_cpu.cc new file mode 100644 index 000000000..9ed8e33da --- /dev/null +++ 
b/src/infiniop/ops/cosh/cpu/cosh_cpu.cc @@ -0,0 +1,48 @@ +#include "cosh_cpu.h" + +namespace op::cosh::cpu { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &x_desc = input_desc_vec.at(0); + const auto &y_shape = out_desc->shape(); + const auto &x_shape = x_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32); + + CHECK_SAME_SHAPE(y_shape, x_shape); + + // create CPU elementwise descriptor + CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec); + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate(_info, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::cosh::cpu diff --git a/src/infiniop/ops/cosh/cpu/cosh_cpu.h b/src/infiniop/ops/cosh/cpu/cosh_cpu.h new file mode 100644 index 000000000..aea359ef2 --- /dev/null +++ b/src/infiniop/ops/cosh/cpu/cosh_cpu.h @@ -0,0 +1,22 @@ +#ifndef __COSH_CPU_H__ +#define __COSH_CPU_H__ + +#include + +#include "../../../elementwise/cpu/elementwise_cpu.h" + +ELEMENTWISE_DESCRIPTOR(cosh, cpu) + +namespace op::cosh::cpu { +typedef struct CoshOp { +public: + static constexpr size_t num_inputs = 1; + + template + T operator()(const T &x) const { + return std::cosh(x); + } +} CoshOp; +} // namespace op::cosh::cpu + +#endif // __COSH_CPU_H__ diff --git a/src/infiniop/ops/cosh/cuda/kernel.cuh b/src/infiniop/ops/cosh/cuda/kernel.cuh new file mode 100644 index 000000000..ce6806433 --- /dev/null +++ b/src/infiniop/ops/cosh/cuda/kernel.cuh @@ -0,0 +1,32 @@ +#ifndef __COSH_CUDA_H__ +#define __COSH_CUDA_H__ + +#include +#include + +namespace op::cosh::cuda { +typedef struct CoshOp { +public: + static constexpr size_t num_inputs = 1; + template + __device__ __forceinline__ T operator()(const T &x) const { + if constexpr (std::is_same_v) { + return __floats2half2_rn(coshf(__half2float(__low2half(x))), coshf(__half2float(__high2half(x)))); + } else if constexpr (std::is_same_v) { + return __float2half(coshf(__half2float(x))); + } else if constexpr (std::is_same_v) { + float x0 = __bfloat162float(__low2bfloat16(x)); + float x1 = __bfloat162float(__high2bfloat16(x)); + return __floats2bfloat162_rn(coshf(x0), coshf(x1)); + } else if constexpr (std::is_same_v) { + return __float2bfloat16_rn(coshf(__bfloat162float(x))); + } else if constexpr (std::is_same_v) { + return coshf(x); + } else { + return std::cosh(x); + } + } +} CoshOp; +} // namespace op::cosh::cuda + +#endif // __COSH_CUDA_H__ diff --git a/src/infiniop/ops/cosh/nvidia/cosh_nvidia.cu b/src/infiniop/ops/cosh/nvidia/cosh_nvidia.cu new file mode 100644 index 000000000..a5e1442ce --- /dev/null +++ b/src/infiniop/ops/cosh/nvidia/cosh_nvidia.cu @@ -0,0 +1,54 @@ +#include "../../../elementwise/nvidia/elementwise_nvidia.cuh" + +#include "../cuda/kernel.cuh" +#include "cosh_nvidia.cuh" + +namespace op::cosh::nvidia { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + 
infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &x_desc = input_desc_vec.at(0); + const auto &y_shape = out_desc->shape(); + const auto &x_shape = x_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32); + + CHECK_SAME_SHAPE(y_shape, x_shape); + + // create CUDA elementwise descriptor + CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec) + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate<256, cuda::CoshOp, half>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate<256, cuda::CoshOp, float>(_info, workspace, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::cosh::nvidia diff --git a/src/infiniop/ops/cosh/nvidia/cosh_nvidia.cuh b/src/infiniop/ops/cosh/nvidia/cosh_nvidia.cuh new file mode 100644 index 000000000..6a032b0bb --- /dev/null +++ b/src/infiniop/ops/cosh/nvidia/cosh_nvidia.cuh @@ -0,0 +1,8 @@ +#ifndef __COSH_NVIDIA_API_H__ +#define __COSH_NVIDIA_API_H__ + +#include "../../../elementwise/nvidia/elementwise_nvidia_api.cuh" + +ELEMENTWISE_DESCRIPTOR(cosh, nvidia) + +#endif // __COSH_NVIDIA_API_H__ diff --git a/src/infiniop/ops/cosh/operator.cc b/src/infiniop/ops/cosh/operator.cc new file mode 100644 index 000000000..75aac0c91 --- /dev/null +++ b/src/infiniop/ops/cosh/operator.cc @@ -0,0 +1,139 @@ +#include "../../operator.h" +#include "../../handle.h" +#include "infiniop/ops/cosh.h" + +#ifdef ENABLE_CPU_API +#include "cpu/cosh_cpu.h" +#endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API) +#include "nvidia/cosh_nvidia.cuh" +#endif + +__C infiniStatus_t infiniopCreateCoshDescriptor( + infiniopHandle_t handle, + infiniopCoshDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y_desc, + infiniopTensorDescriptor_t x_desc) { + +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return op::cosh::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast(desc_ptr), \ + y_desc, \ + {x_desc}) + + switch (handle->device) { + +#ifdef ENABLE_CPU_API + CREATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CREATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CREATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + CREATE(INFINI_DEVICE_QY, nvidia); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CREATE +} + +__C infiniStatus_t infiniopGetCoshWorkspaceSize(infiniopCoshDescriptor_t desc, size_t *size) { + +#define GET(CASE, NAMESPACE) \ + case CASE: \ + *size = reinterpret_cast(desc)->workspaceSize(); \ + return INFINI_STATUS_SUCCESS; + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + GET(INFINI_DEVICE_CPU, cpu) +#endif +#ifdef ENABLE_NVIDIA_API + GET(INFINI_DEVICE_NVIDIA, nvidia) +#endif +#ifdef ENABLE_ILUVATAR_API + GET(INFINI_DEVICE_ILUVATAR, nvidia) +#endif +#ifdef ENABLE_QY_API + GET(INFINI_DEVICE_QY, nvidia) +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } +#undef GET + + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} + +__C infiniStatus_t 
infiniopCosh( + infiniopCoshDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *y, + const void *x, + void *stream) { + +#define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ + return reinterpret_cast(desc) \ + ->calculate(workspace, workspace_size, y, {x}, stream) + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + CALCULATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + CALCULATE(INFINI_DEVICE_QY, nvidia); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CALCULATE +} + +__C infiniStatus_t +infiniopDestroyCoshDescriptor(infiniopCoshDescriptor_t desc) { + +#define DELETE(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast(desc); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + DELETE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + DELETE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + DELETE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + DELETE(INFINI_DEVICE_QY, nvidia); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef DELETE +} diff --git a/src/infiniop/ops/erf/cpu/erf_cpu.cc b/src/infiniop/ops/erf/cpu/erf_cpu.cc new file mode 100644 index 000000000..00b1897d1 --- /dev/null +++ b/src/infiniop/ops/erf/cpu/erf_cpu.cc @@ -0,0 +1,48 @@ +#include "erf_cpu.h" + +namespace op::erf::cpu { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &x_desc = input_desc_vec.at(0); + const auto &y_shape = out_desc->shape(); + const auto &x_shape = x_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32); + + CHECK_SAME_SHAPE(y_shape, x_shape); + + // create CPU elementwise descriptor + CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec); + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate(_info, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::erf::cpu diff --git a/src/infiniop/ops/erf/cpu/erf_cpu.h b/src/infiniop/ops/erf/cpu/erf_cpu.h new file mode 100644 index 000000000..c26f519cf --- /dev/null +++ b/src/infiniop/ops/erf/cpu/erf_cpu.h @@ -0,0 +1,22 @@ +#ifndef __ERF_CPU_H__ +#define __ERF_CPU_H__ + +#include + +#include "../../../elementwise/cpu/elementwise_cpu.h" + +ELEMENTWISE_DESCRIPTOR(erf, cpu) + +namespace op::erf::cpu { +typedef struct ErfOp { +public: + static constexpr size_t num_inputs = 1; + + template + T operator()(const T &x) const { + return std::erf(x); + } +} ErfOp; +} // namespace op::erf::cpu + +#endif // __ERF_CPU_H__ diff --git a/src/infiniop/ops/erf/cuda/kernel.cuh b/src/infiniop/ops/erf/cuda/kernel.cuh new file mode 100644 index 000000000..820c10b19 --- /dev/null +++ b/src/infiniop/ops/erf/cuda/kernel.cuh @@ -0,0 +1,32 @@ +#ifndef __ERF_CUDA_H__ +#define 
__ERF_CUDA_H__ + +#include +#include + +namespace op::erf::cuda { +typedef struct ErfOp { +public: + static constexpr size_t num_inputs = 1; + template + __device__ __forceinline__ T operator()(const T &x) const { + if constexpr (std::is_same_v) { + return __floats2half2_rn(erff(__half2float(__low2half(x))), erff(__half2float(__high2half(x)))); + } else if constexpr (std::is_same_v) { + return __float2half(erff(__half2float(x))); + } else if constexpr (std::is_same_v) { + float x0 = __bfloat162float(__low2bfloat16(x)); + float x1 = __bfloat162float(__high2bfloat16(x)); + return __floats2bfloat162_rn(erff(x0), erff(x1)); + } else if constexpr (std::is_same_v) { + return __float2bfloat16_rn(erff(__bfloat162float(x))); + } else if constexpr (std::is_same_v) { + return erff(x); + } else { + return std::erf(x); + } + } +} ErfOp; +} // namespace op::erf::cuda + +#endif // __ERF_CUDA_H__ diff --git a/src/infiniop/ops/erf/nvidia/erf_nvidia.cu b/src/infiniop/ops/erf/nvidia/erf_nvidia.cu new file mode 100644 index 000000000..9080593de --- /dev/null +++ b/src/infiniop/ops/erf/nvidia/erf_nvidia.cu @@ -0,0 +1,54 @@ +#include "../../../elementwise/nvidia/elementwise_nvidia.cuh" + +#include "../cuda/kernel.cuh" +#include "erf_nvidia.cuh" + +namespace op::erf::nvidia { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &x_desc = input_desc_vec.at(0); + const auto &y_shape = out_desc->shape(); + const auto &x_shape = x_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32); + + CHECK_SAME_SHAPE(y_shape, x_shape); + + // create CUDA elementwise descriptor + CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec) + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate<256, cuda::ErfOp, half>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate<256, cuda::ErfOp, float>(_info, workspace, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::erf::nvidia diff --git a/src/infiniop/ops/erf/nvidia/erf_nvidia.cuh b/src/infiniop/ops/erf/nvidia/erf_nvidia.cuh new file mode 100644 index 000000000..0621150fa --- /dev/null +++ b/src/infiniop/ops/erf/nvidia/erf_nvidia.cuh @@ -0,0 +1,8 @@ +#ifndef __ERF_NVIDIA_API_H__ +#define __ERF_NVIDIA_API_H__ + +#include "../../../elementwise/nvidia/elementwise_nvidia_api.cuh" + +ELEMENTWISE_DESCRIPTOR(erf, nvidia) + +#endif // __ERF_NVIDIA_API_H__ diff --git a/src/infiniop/ops/erf/operator.cc b/src/infiniop/ops/erf/operator.cc new file mode 100644 index 000000000..1491cfa9a --- /dev/null +++ b/src/infiniop/ops/erf/operator.cc @@ -0,0 +1,139 @@ +#include "../../operator.h" +#include "../../handle.h" +#include "infiniop/ops/erf.h" + +#ifdef ENABLE_CPU_API +#include "cpu/erf_cpu.h" +#endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API) +#include "nvidia/erf_nvidia.cuh" +#endif + +__C infiniStatus_t infiniopCreateErfDescriptor( + 
infiniopHandle_t handle, + infiniopErfDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y_desc, + infiniopTensorDescriptor_t x_desc) { + +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return op::erf::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast(desc_ptr), \ + y_desc, \ + {x_desc}) + + switch (handle->device) { + +#ifdef ENABLE_CPU_API + CREATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CREATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CREATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + CREATE(INFINI_DEVICE_QY, nvidia); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CREATE +} + +__C infiniStatus_t infiniopGetErfWorkspaceSize(infiniopErfDescriptor_t desc, size_t *size) { + +#define GET(CASE, NAMESPACE) \ + case CASE: \ + *size = reinterpret_cast(desc)->workspaceSize(); \ + return INFINI_STATUS_SUCCESS; + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + GET(INFINI_DEVICE_CPU, cpu) +#endif +#ifdef ENABLE_NVIDIA_API + GET(INFINI_DEVICE_NVIDIA, nvidia) +#endif +#ifdef ENABLE_ILUVATAR_API + GET(INFINI_DEVICE_ILUVATAR, nvidia) +#endif +#ifdef ENABLE_QY_API + GET(INFINI_DEVICE_QY, nvidia) +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } +#undef GET + + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} + +__C infiniStatus_t infiniopErf( + infiniopErfDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *y, + const void *x, + void *stream) { + +#define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ + return reinterpret_cast(desc) \ + ->calculate(workspace, workspace_size, y, {x}, stream) + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + CALCULATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + CALCULATE(INFINI_DEVICE_QY, nvidia); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CALCULATE +} + +__C infiniStatus_t +infiniopDestroyErfDescriptor(infiniopErfDescriptor_t desc) { + +#define DELETE(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast(desc); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + DELETE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + DELETE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + DELETE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + DELETE(INFINI_DEVICE_QY, nvidia); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef DELETE +} diff --git a/src/infiniop/ops/floor/cpu/floor_cpu.cc b/src/infiniop/ops/floor/cpu/floor_cpu.cc new file mode 100644 index 000000000..e809a02e2 --- /dev/null +++ b/src/infiniop/ops/floor/cpu/floor_cpu.cc @@ -0,0 +1,48 @@ +#include "floor_cpu.h" + +namespace op::floor::cpu { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &x_desc = input_desc_vec.at(0); + const auto &y_shape = out_desc->shape(); + const auto &x_shape = x_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32); + + CHECK_SAME_SHAPE(y_shape, x_shape); + + // create CPU elementwise descriptor + 
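+    // Editorial note (hedged): CREATE_ELEMENTWISE_CPU_DESCRIPTOR is assumed
+    // to mirror the CUDA variant used by the nvidia backends in this patch:
+    // it validates the descriptors, packs the output/input shapes and strides
+    // into the op's _info, records _dtype, and allocates *desc_ptr. On CPU no
+    // workspace is reserved, which is presumably why calculate() performs no
+    // workspace-size check.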
CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec); + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate(_info, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::floor::cpu diff --git a/src/infiniop/ops/floor/cpu/floor_cpu.h b/src/infiniop/ops/floor/cpu/floor_cpu.h new file mode 100644 index 000000000..91508a384 --- /dev/null +++ b/src/infiniop/ops/floor/cpu/floor_cpu.h @@ -0,0 +1,26 @@ +#ifndef __FLOOR_CPU_H__ +#define __FLOOR_CPU_H__ + +#include + +#include "../../../elementwise/cpu/elementwise_cpu.h" + +ELEMENTWISE_DESCRIPTOR(floor, cpu) + +namespace op::floor::cpu { +typedef struct FloorOp { +public: + static constexpr size_t num_inputs = 1; + + template + T operator()(const T &x) const { + if constexpr (std::is_integral_v) { + return x; + } else { + return std::floor(x); + } + } +} FloorOp; +} // namespace op::floor::cpu + +#endif // __FLOOR_CPU_H__ diff --git a/src/infiniop/ops/floor/cuda/kernel.cuh b/src/infiniop/ops/floor/cuda/kernel.cuh new file mode 100644 index 000000000..c89ce34f4 --- /dev/null +++ b/src/infiniop/ops/floor/cuda/kernel.cuh @@ -0,0 +1,34 @@ +#ifndef __FLOOR_CUDA_H__ +#define __FLOOR_CUDA_H__ + +#include "../../../elementwise/nvidia/elementwise_nvidia.cuh" +#include + +namespace op::floor::cuda { +typedef struct FloorOp { +public: + static constexpr size_t num_inputs = 1; + template + __device__ __forceinline__ T operator()(const T &x) const { + if constexpr (std::is_same_v) { + return h2floor(x); + } else if constexpr (std::is_same_v) { + return hfloor(x); + } else if constexpr (std::is_same_v) { + float x0 = __bfloat162float(__low2bfloat16(x)); + float x1 = __bfloat162float(__high2bfloat16(x)); + return __floats2bfloat162_rn(floorf(x0), floorf(x1)); + } else if constexpr (std::is_same_v) { + return __float2bfloat16_rn(floorf(__bfloat162float(x))); + } else if constexpr (std::is_same_v) { + return floorf(x); + } else if constexpr (std::is_integral_v) { + return x; + } else { + return std::floor(x); + } + } +} FloorOp; +} // namespace op::floor::cuda + +#endif // __FLOOR_CUDA_H__ diff --git a/src/infiniop/ops/floor/nvidia/floor_nvidia.cu b/src/infiniop/ops/floor/nvidia/floor_nvidia.cu new file mode 100644 index 000000000..08305048a --- /dev/null +++ b/src/infiniop/ops/floor/nvidia/floor_nvidia.cu @@ -0,0 +1,54 @@ +#include "../../../elementwise/nvidia/elementwise_nvidia.cuh" + +#include "../cuda/kernel.cuh" +#include "floor_nvidia.cuh" + +namespace op::floor::nvidia { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &x_desc = input_desc_vec.at(0); + const auto &y_shape = out_desc->shape(); + const auto &x_shape = x_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32); + + CHECK_SAME_SHAPE(y_shape, x_shape); + + // create CUDA elementwise descriptor + CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec) + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t 
Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate<256, cuda::FloorOp, half>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate<256, cuda::FloorOp, float>(_info, workspace, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::floor::nvidia diff --git a/src/infiniop/ops/floor/nvidia/floor_nvidia.cuh b/src/infiniop/ops/floor/nvidia/floor_nvidia.cuh new file mode 100644 index 000000000..7a3c2f5c7 --- /dev/null +++ b/src/infiniop/ops/floor/nvidia/floor_nvidia.cuh @@ -0,0 +1,8 @@ +#ifndef __FLOOR_NVIDIA_API_H__ +#define __FLOOR_NVIDIA_API_H__ + +#include "../../../elementwise/nvidia/elementwise_nvidia_api.cuh" + +ELEMENTWISE_DESCRIPTOR(floor, nvidia) + +#endif // __FLOOR_NVIDIA_API_H__ diff --git a/src/infiniop/ops/floor/operator.cc b/src/infiniop/ops/floor/operator.cc new file mode 100644 index 000000000..4e4ed2b5a --- /dev/null +++ b/src/infiniop/ops/floor/operator.cc @@ -0,0 +1,139 @@ +#include "../../operator.h" +#include "../../handle.h" +#include "infiniop/ops/floor.h" + +#ifdef ENABLE_CPU_API +#include "cpu/floor_cpu.h" +#endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API) +#include "nvidia/floor_nvidia.cuh" +#endif + +__C infiniStatus_t infiniopCreateFloorDescriptor( + infiniopHandle_t handle, + infiniopFloorDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y_desc, + infiniopTensorDescriptor_t x_desc) { + +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return op::floor::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast(desc_ptr), \ + y_desc, \ + {x_desc}) + + switch (handle->device) { + +#ifdef ENABLE_CPU_API + CREATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CREATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CREATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + CREATE(INFINI_DEVICE_QY, nvidia); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CREATE +} + +__C infiniStatus_t infiniopGetFloorWorkspaceSize(infiniopFloorDescriptor_t desc, size_t *size) { + +#define GET(CASE, NAMESPACE) \ + case CASE: \ + *size = reinterpret_cast(desc)->workspaceSize(); \ + return INFINI_STATUS_SUCCESS; + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + GET(INFINI_DEVICE_CPU, cpu) +#endif +#ifdef ENABLE_NVIDIA_API + GET(INFINI_DEVICE_NVIDIA, nvidia) +#endif +#ifdef ENABLE_ILUVATAR_API + GET(INFINI_DEVICE_ILUVATAR, nvidia) +#endif +#ifdef ENABLE_QY_API + GET(INFINI_DEVICE_QY, nvidia) +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } +#undef GET + + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} + +__C infiniStatus_t infiniopFloor( + infiniopFloorDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *y, + const void *x, + void *stream) { + +#define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ + return reinterpret_cast(desc) \ + ->calculate(workspace, workspace_size, y, {x}, stream) + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + CALCULATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CALCULATE(INFINI_DEVICE_ILUVATAR, 
nvidia); +#endif +#ifdef ENABLE_QY_API + CALCULATE(INFINI_DEVICE_QY, nvidia); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CALCULATE +} + +__C infiniStatus_t +infiniopDestroyFloorDescriptor(infiniopFloorDescriptor_t desc) { + +#define DELETE(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast(desc); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + DELETE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + DELETE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + DELETE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + DELETE(INFINI_DEVICE_QY, nvidia); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef DELETE +} diff --git a/src/infiniop/ops/log/cpu/log_cpu.cc b/src/infiniop/ops/log/cpu/log_cpu.cc new file mode 100644 index 000000000..e7314c319 --- /dev/null +++ b/src/infiniop/ops/log/cpu/log_cpu.cc @@ -0,0 +1,48 @@ +#include "log_cpu.h" + +namespace op::log::cpu { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &x_desc = input_desc_vec.at(0); + const auto &y_shape = out_desc->shape(); + const auto &x_shape = x_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32); + + CHECK_SAME_SHAPE(y_shape, x_shape); + + // create CPU elementwise descriptor + CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec); + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate(_info, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::log::cpu diff --git a/src/infiniop/ops/log/cpu/log_cpu.h b/src/infiniop/ops/log/cpu/log_cpu.h new file mode 100644 index 000000000..535e681d3 --- /dev/null +++ b/src/infiniop/ops/log/cpu/log_cpu.h @@ -0,0 +1,22 @@ +#ifndef __LOG_CPU_H__ +#define __LOG_CPU_H__ + +#include + +#include "../../../elementwise/cpu/elementwise_cpu.h" + +ELEMENTWISE_DESCRIPTOR(log, cpu) + +namespace op::log::cpu { +typedef struct LogOp { +public: + static constexpr size_t num_inputs = 1; + + template + T operator()(const T &x) const { + return std::log(x); + } +} LogOp; +} // namespace op::log::cpu + +#endif // __LOG_CPU_H__ diff --git a/src/infiniop/ops/log/cuda/kernel.cuh b/src/infiniop/ops/log/cuda/kernel.cuh new file mode 100644 index 000000000..b1e46873c --- /dev/null +++ b/src/infiniop/ops/log/cuda/kernel.cuh @@ -0,0 +1,32 @@ +#ifndef __LOG_CUDA_H__ +#define __LOG_CUDA_H__ + +#include "../../../elementwise/nvidia/elementwise_nvidia.cuh" +#include + +namespace op::log::cuda { +typedef struct LogOp { +public: + static constexpr size_t num_inputs = 1; + template + __device__ __forceinline__ T operator()(const T &x) const { + if constexpr (std::is_same_v) { + return h2log(x); + } else if constexpr (std::is_same_v) { + return __float2half(__logf(__half2float(x))); + } else if constexpr (std::is_same_v) { + float x0 = __bfloat162float(__low2bfloat16(x)); + float x1 = 
__bfloat162float(__high2bfloat16(x)); + return __floats2bfloat162_rn(logf(x0), logf(x1)); + } else if constexpr (std::is_same_v) { + return __float2bfloat16_rn(logf(__bfloat162float(x))); + } else if constexpr (std::is_same_v) { + return __logf(x); + } else { + return std::log(x); + } + } +} LogOp; +} // namespace op::log::cuda + +#endif // __LOG_CUDA_H__ diff --git a/src/infiniop/ops/log/nvidia/log_nvidia.cu b/src/infiniop/ops/log/nvidia/log_nvidia.cu new file mode 100644 index 000000000..9e7bcafc4 --- /dev/null +++ b/src/infiniop/ops/log/nvidia/log_nvidia.cu @@ -0,0 +1,54 @@ +#include "../../../elementwise/nvidia/elementwise_nvidia.cuh" + +#include "../cuda/kernel.cuh" +#include "log_nvidia.cuh" + +namespace op::log::nvidia { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &x_desc = input_desc_vec.at(0); + const auto &y_shape = out_desc->shape(); + const auto &x_shape = x_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32); + + CHECK_SAME_SHAPE(y_shape, x_shape); + + // create CUDA elementwise descriptor + CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec) + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate<256, cuda::LogOp, half>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate<256, cuda::LogOp, float>(_info, workspace, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::log::nvidia diff --git a/src/infiniop/ops/log/nvidia/log_nvidia.cuh b/src/infiniop/ops/log/nvidia/log_nvidia.cuh new file mode 100644 index 000000000..c48841622 --- /dev/null +++ b/src/infiniop/ops/log/nvidia/log_nvidia.cuh @@ -0,0 +1,8 @@ +#ifndef __LOG_NVIDIA_API_H__ +#define __LOG_NVIDIA_API_H__ + +#include "../../../elementwise/nvidia/elementwise_nvidia_api.cuh" + +ELEMENTWISE_DESCRIPTOR(log, nvidia) + +#endif // __LOG_NVIDIA_API_H__ diff --git a/src/infiniop/ops/log/operator.cc b/src/infiniop/ops/log/operator.cc new file mode 100644 index 000000000..8f2add408 --- /dev/null +++ b/src/infiniop/ops/log/operator.cc @@ -0,0 +1,139 @@ +#include "../../operator.h" +#include "../../handle.h" +#include "infiniop/ops/log.h" + +#ifdef ENABLE_CPU_API +#include "cpu/log_cpu.h" +#endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API) +#include "nvidia/log_nvidia.cuh" +#endif + +__C infiniStatus_t infiniopCreateLogDescriptor( + infiniopHandle_t handle, + infiniopLogDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y_desc, + infiniopTensorDescriptor_t x_desc) { + +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return op::log::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast(desc_ptr), \ + y_desc, \ + {x_desc}) + + switch (handle->device) { + +#ifdef ENABLE_CPU_API + CREATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CREATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CREATE(INFINI_DEVICE_ILUVATAR, 
nvidia); +#endif +#ifdef ENABLE_QY_API + CREATE(INFINI_DEVICE_QY, nvidia); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CREATE +} + +__C infiniStatus_t infiniopGetLogWorkspaceSize(infiniopLogDescriptor_t desc, size_t *size) { + +#define GET(CASE, NAMESPACE) \ + case CASE: \ + *size = reinterpret_cast(desc)->workspaceSize(); \ + return INFINI_STATUS_SUCCESS; + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + GET(INFINI_DEVICE_CPU, cpu) +#endif +#ifdef ENABLE_NVIDIA_API + GET(INFINI_DEVICE_NVIDIA, nvidia) +#endif +#ifdef ENABLE_ILUVATAR_API + GET(INFINI_DEVICE_ILUVATAR, nvidia) +#endif +#ifdef ENABLE_QY_API + GET(INFINI_DEVICE_QY, nvidia) +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } +#undef GET + + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} + +__C infiniStatus_t infiniopLog( + infiniopLogDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *y, + const void *x, + void *stream) { + +#define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ + return reinterpret_cast(desc) \ + ->calculate(workspace, workspace_size, y, {x}, stream) + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + CALCULATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + CALCULATE(INFINI_DEVICE_QY, nvidia); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CALCULATE +} + +__C infiniStatus_t +infiniopDestroyLogDescriptor(infiniopLogDescriptor_t desc) { + +#define DELETE(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast(desc); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + DELETE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + DELETE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + DELETE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + DELETE(INFINI_DEVICE_QY, nvidia); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef DELETE +} diff --git a/src/infiniop/ops/neg/cpu/neg_cpu.cc b/src/infiniop/ops/neg/cpu/neg_cpu.cc new file mode 100644 index 000000000..5da2ae4c3 --- /dev/null +++ b/src/infiniop/ops/neg/cpu/neg_cpu.cc @@ -0,0 +1,48 @@ +#include "neg_cpu.h" + +namespace op::neg::cpu { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &x_desc = input_desc_vec.at(0); + const auto &y_shape = out_desc->shape(); + const auto &x_shape = x_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32); + + CHECK_SAME_SHAPE(y_shape, x_shape); + + // create CPU elementwise descriptor + CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec); + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate(_info, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace 
op::neg::cpu
diff --git a/src/infiniop/ops/neg/cpu/neg_cpu.h b/src/infiniop/ops/neg/cpu/neg_cpu.h
new file mode 100644
index 000000000..ea45989b3
--- /dev/null
+++ b/src/infiniop/ops/neg/cpu/neg_cpu.h
@@ -0,0 +1,20 @@
+#ifndef __NEG_CPU_H__
+#define __NEG_CPU_H__
+
+#include "../../../elementwise/cpu/elementwise_cpu.h"
+
+ELEMENTWISE_DESCRIPTOR(neg, cpu)
+
+namespace op::neg::cpu {
+typedef struct NegOp {
+public:
+    static constexpr size_t num_inputs = 1;
+
+    template <typename T>
+    T operator()(const T &x) const {
+        return -x;
+    }
+} NegOp;
+} // namespace op::neg::cpu
+
+#endif // __NEG_CPU_H__
diff --git a/src/infiniop/ops/neg/cuda/kernel.cuh b/src/infiniop/ops/neg/cuda/kernel.cuh
new file mode 100644
index 000000000..57904b3df
--- /dev/null
+++ b/src/infiniop/ops/neg/cuda/kernel.cuh
@@ -0,0 +1,23 @@
+#ifndef __NEG_CUDA_H__
+#define __NEG_CUDA_H__
+
+#include <cuda_fp16.h>
+
+namespace op::neg::cuda {
+typedef struct NegOp {
+public:
+    static constexpr size_t num_inputs = 1;
+    template <typename T>
+    __device__ __forceinline__ T operator()(const T &x) const {
+        if constexpr (std::is_same_v<T, half2>) {
+            return __hneg2(x);
+        } else if constexpr (std::is_same_v<T, half>) {
+            return __hneg(x);
+        } else {
+            return -x;
+        }
+    }
+} NegOp;
+} // namespace op::neg::cuda
+
+#endif // __NEG_CUDA_H__
diff --git a/src/infiniop/ops/neg/nvidia/neg_nvidia.cu b/src/infiniop/ops/neg/nvidia/neg_nvidia.cu
new file mode 100644
index 000000000..d18b8bf25
--- /dev/null
+++ b/src/infiniop/ops/neg/nvidia/neg_nvidia.cu
@@ -0,0 +1,54 @@
+#include "../../../elementwise/nvidia/elementwise_nvidia.cuh"
+
+#include "../cuda/kernel.cuh"
+#include "neg_nvidia.cuh"
+
+namespace op::neg::nvidia {
+
+Descriptor::~Descriptor() = default;
+
+infiniStatus_t Descriptor::create(
+    infiniopHandle_t handle_,
+    Descriptor **desc_ptr,
+    infiniopTensorDescriptor_t out_desc,
+    std::vector<infiniopTensorDescriptor_t> input_desc_vec) {
+
+    auto handle = reinterpret_cast<device::nvidia::Handle *>(handle_);
+    auto dtype = out_desc->dtype();
+
+    const auto &x_desc = input_desc_vec.at(0);
+    const auto &y_shape = out_desc->shape();
+    const auto &x_shape = x_desc->shape();
+
+    CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32);
+
+    CHECK_SAME_SHAPE(y_shape, x_shape);
+
+    // create CUDA elementwise descriptor
+    CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec)
+
+    return INFINI_STATUS_SUCCESS;
+}
+
+infiniStatus_t Descriptor::calculate(
+    void *workspace,
+    size_t workspace_size,
+    void *output,
+    std::vector<const void *> inputs,
+    void *stream) const {
+    if (workspace_size < _workspace_size) {
+        return INFINI_STATUS_INSUFFICIENT_WORKSPACE;
+    }
+
+    switch (_dtype) {
+    case INFINI_DTYPE_F16:
+        return _device_info->calculate<256, cuda::NegOp, half>(_info, workspace, output, inputs, stream);
+    case INFINI_DTYPE_F32:
+        return _device_info->calculate<256, cuda::NegOp, float>(_info, workspace, output, inputs, stream);
+    default:
+        return INFINI_STATUS_BAD_TENSOR_DTYPE;
+    }
+
+    return INFINI_STATUS_SUCCESS;
+}
+} // namespace op::neg::nvidia
diff --git a/src/infiniop/ops/neg/nvidia/neg_nvidia.cuh b/src/infiniop/ops/neg/nvidia/neg_nvidia.cuh
new file mode 100644
index 000000000..1265cd3df
--- /dev/null
+++ b/src/infiniop/ops/neg/nvidia/neg_nvidia.cuh
@@ -0,0 +1,8 @@
+#ifndef __NEG_NVIDIA_API_H__
+#define __NEG_NVIDIA_API_H__
+
+#include "../../../elementwise/nvidia/elementwise_nvidia_api.cuh"
+
+ELEMENTWISE_DESCRIPTOR(neg, nvidia)
+
+#endif // __NEG_NVIDIA_API_H__
diff --git a/src/infiniop/ops/neg/operator.cc b/src/infiniop/ops/neg/operator.cc
new file mode 100644
index 000000000..d4134df3e
--- /dev/null
+++ b/src/infiniop/ops/neg/operator.cc
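// Editorial sketch (hedged): the operator.cc that follows uses the same
// macro-generated dispatch as every other operator in this patch. For one
// backend, the CREATE macro expands to roughly:
//
//     case INFINI_DEVICE_CPU:
//         return op::neg::cpu::Descriptor::create(
//             handle,
//             reinterpret_cast<op::neg::cpu::Descriptor **>(desc_ptr),
//             y_desc,
//             {x_desc});
//
// so supporting a new device is one CREATE/GET/CALCULATE/DELETE line in each
// switch, guarded by the matching ENABLE_*_API define.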
@@ -0,0 +1,139 @@ +#include "../../operator.h" +#include "../../handle.h" +#include "infiniop/ops/neg.h" + +#ifdef ENABLE_CPU_API +#include "cpu/neg_cpu.h" +#endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API) +#include "nvidia/neg_nvidia.cuh" +#endif + +__C infiniStatus_t infiniopCreateNegDescriptor( + infiniopHandle_t handle, + infiniopNegDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y_desc, + infiniopTensorDescriptor_t x_desc) { + +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return op::neg::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast(desc_ptr), \ + y_desc, \ + {x_desc}) + + switch (handle->device) { + +#ifdef ENABLE_CPU_API + CREATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CREATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CREATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + CREATE(INFINI_DEVICE_QY, nvidia); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CREATE +} + +__C infiniStatus_t infiniopGetNegWorkspaceSize(infiniopNegDescriptor_t desc, size_t *size) { + +#define GET(CASE, NAMESPACE) \ + case CASE: \ + *size = reinterpret_cast(desc)->workspaceSize(); \ + return INFINI_STATUS_SUCCESS; + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + GET(INFINI_DEVICE_CPU, cpu) +#endif +#ifdef ENABLE_NVIDIA_API + GET(INFINI_DEVICE_NVIDIA, nvidia) +#endif +#ifdef ENABLE_ILUVATAR_API + GET(INFINI_DEVICE_ILUVATAR, nvidia) +#endif +#ifdef ENABLE_QY_API + GET(INFINI_DEVICE_QY, nvidia) +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } +#undef GET + + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} + +__C infiniStatus_t infiniopNeg( + infiniopNegDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *y, + const void *x, + void *stream) { + +#define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ + return reinterpret_cast(desc) \ + ->calculate(workspace, workspace_size, y, {x}, stream) + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + CALCULATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + CALCULATE(INFINI_DEVICE_QY, nvidia); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CALCULATE +} + +__C infiniStatus_t +infiniopDestroyNegDescriptor(infiniopNegDescriptor_t desc) { + +#define DELETE(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast(desc); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + DELETE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + DELETE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + DELETE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + DELETE(INFINI_DEVICE_QY, nvidia); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef DELETE +} diff --git a/src/infiniop/ops/pow/cuda/kernel.cuh b/src/infiniop/ops/pow/cuda/kernel.cuh index e8b5324a0..3786e7a52 100644 --- a/src/infiniop/ops/pow/cuda/kernel.cuh +++ b/src/infiniop/ops/pow/cuda/kernel.cuh @@ -2,8 +2,8 @@ #define __POW_CUDA_H__ #include -#include #include +#include namespace op::pow::cuda { typedef struct PowOp { diff --git a/src/infiniop/ops/reciprocal/cpu/reciprocal_cpu.cc b/src/infiniop/ops/reciprocal/cpu/reciprocal_cpu.cc new file mode 100644 index 000000000..52874c8b3 --- 
/dev/null
+++ b/src/infiniop/ops/reciprocal/cpu/reciprocal_cpu.cc
@@ -0,0 +1,48 @@
+#include "reciprocal_cpu.h"
+
+namespace op::reciprocal::cpu {
+
+Descriptor::~Descriptor() = default;
+
+infiniStatus_t Descriptor::create(
+    infiniopHandle_t handle_,
+    Descriptor **desc_ptr,
+    infiniopTensorDescriptor_t out_desc,
+    std::vector<infiniopTensorDescriptor_t> input_desc_vec) {
+
+    auto handle = reinterpret_cast<device::cpu::Handle *>(handle_);
+    auto dtype = out_desc->dtype();
+
+    const auto &x_desc = input_desc_vec.at(0);
+    const auto &y_shape = out_desc->shape();
+    const auto &x_shape = x_desc->shape();
+
+    CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32);
+
+    CHECK_SAME_SHAPE(y_shape, x_shape);
+
+    // create CPU elementwise descriptor
+    CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec);
+
+    return INFINI_STATUS_SUCCESS;
+}
+
+infiniStatus_t Descriptor::calculate(
+    void *workspace,
+    size_t workspace_size,
+    void *output,
+    std::vector<const void *> inputs,
+    void *stream) const {
+
+    switch (_dtype) {
+    case INFINI_DTYPE_F16:
+        return _device_info->calculate<ReciprocalOp, fp16_t>(_info, output, inputs, stream);
+    case INFINI_DTYPE_F32:
+        return _device_info->calculate<ReciprocalOp, float>(_info, output, inputs, stream);
+    default:
+        return INFINI_STATUS_BAD_TENSOR_DTYPE;
+    }
+
+    return INFINI_STATUS_SUCCESS;
+}
+} // namespace op::reciprocal::cpu
diff --git a/src/infiniop/ops/reciprocal/cpu/reciprocal_cpu.h b/src/infiniop/ops/reciprocal/cpu/reciprocal_cpu.h
new file mode 100644
index 000000000..0a0f223f0
--- /dev/null
+++ b/src/infiniop/ops/reciprocal/cpu/reciprocal_cpu.h
@@ -0,0 +1,20 @@
+#ifndef __RECIPROCAL_CPU_H__
+#define __RECIPROCAL_CPU_H__
+
+#include "../../../elementwise/cpu/elementwise_cpu.h"
+
+ELEMENTWISE_DESCRIPTOR(reciprocal, cpu)
+
+namespace op::reciprocal::cpu {
+typedef struct ReciprocalOp {
+public:
+    static constexpr size_t num_inputs = 1;
+
+    template <typename T>
+    T operator()(const T &x) const {
+        return T(1) / x;
+    }
+} ReciprocalOp;
+} // namespace op::reciprocal::cpu
+
+#endif // __RECIPROCAL_CPU_H__
diff --git a/src/infiniop/ops/reciprocal/cuda/kernel.cuh b/src/infiniop/ops/reciprocal/cuda/kernel.cuh
new file mode 100644
index 000000000..94c71de90
--- /dev/null
+++ b/src/infiniop/ops/reciprocal/cuda/kernel.cuh
@@ -0,0 +1,32 @@
+#ifndef __RECIPROCAL_CUDA_H__
+#define __RECIPROCAL_CUDA_H__
+
+#include "../../../elementwise/nvidia/elementwise_nvidia.cuh"
+#include <cuda_bf16.h>
+
+namespace op::reciprocal::cuda {
+typedef struct ReciprocalOp {
+public:
+    static constexpr size_t num_inputs = 1;
+    template <typename T>
+    __device__ __forceinline__ T operator()(const T &x) const {
+        if constexpr (std::is_same_v<T, half2>) {
+            return h2rcp(x);
+        } else if constexpr (std::is_same_v<T, half>) {
+            return hrcp(x);
+        } else if constexpr (std::is_same_v<T, cuda_bfloat162>) {
+            float x0 = __bfloat162float(__low2bfloat16(x));
+            float x1 = __bfloat162float(__high2bfloat16(x));
+            return __floats2bfloat162_rn(__frcp_rn(x0), __frcp_rn(x1));
+        } else if constexpr (std::is_same_v<T, cuda_bfloat16>) {
+            return __float2bfloat16_rn(__frcp_rn(__bfloat162float(x)));
+        } else if constexpr (std::is_same_v<T, float>) {
+            return __frcp_rn(x);
+        } else {
+            return T(1) / x;
+        }
+    }
+} ReciprocalOp;
+} // namespace op::reciprocal::cuda
+
+#endif // __RECIPROCAL_CUDA_H__
diff --git a/src/infiniop/ops/reciprocal/nvidia/reciprocal_nvidia.cu b/src/infiniop/ops/reciprocal/nvidia/reciprocal_nvidia.cu
new file mode 100644
index 000000000..45b74e25e
--- /dev/null
+++ b/src/infiniop/ops/reciprocal/nvidia/reciprocal_nvidia.cu
@@ -0,0 +1,54 @@
+#include "../../../elementwise/nvidia/elementwise_nvidia.cuh"
+
+#include "../cuda/kernel.cuh"
+#include "reciprocal_nvidia.cuh"
+
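The functors above are the entire per-operator surface area: the shared elementwise framework only requires a `num_inputs` constant and a templated call operator. The following minimal host-side sketch illustrates that contract; `apply_elementwise` is a hypothetical stand-in for the repository's actual dispatch machinery, shown only to make the functor protocol concrete.

#include <cstddef>
#include <vector>

// Hypothetical stand-in for the real elementwise dispatch: applies any
// functor following the NegOp/ReciprocalOp contract (a num_inputs constant
// plus a templated operator()) element by element on the host.
template <typename Op, typename T>
void apply_elementwise(const Op &op, const T *x, T *y, std::size_t n) {
    for (std::size_t i = 0; i < n; ++i) {
        y[i] = op(x[i]); // unary case: Op::num_inputs == 1
    }
}

int main() {
    struct ReciprocalOp { // same shape as the CPU functor above
        static constexpr std::size_t num_inputs = 1;
        template <typename T>
        T operator()(const T &x) const { return T(1) / x; }
    };
    std::vector<float> x{1.0f, 2.0f, 4.0f}, y(3);
    apply_elementwise(ReciprocalOp{}, x.data(), y.data(), x.size());
    // y now holds {1.0f, 0.5f, 0.25f}
    return 0;
}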
+namespace op::reciprocal::nvidia { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &x_desc = input_desc_vec.at(0); + const auto &y_shape = out_desc->shape(); + const auto &x_shape = x_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32); + + CHECK_SAME_SHAPE(y_shape, x_shape); + + // create CUDA elementwise descriptor + CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec) + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate<256, cuda::ReciprocalOp, half>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate<256, cuda::ReciprocalOp, float>(_info, workspace, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::reciprocal::nvidia diff --git a/src/infiniop/ops/reciprocal/nvidia/reciprocal_nvidia.cuh b/src/infiniop/ops/reciprocal/nvidia/reciprocal_nvidia.cuh new file mode 100644 index 000000000..d98c8f4c2 --- /dev/null +++ b/src/infiniop/ops/reciprocal/nvidia/reciprocal_nvidia.cuh @@ -0,0 +1,8 @@ +#ifndef __RECIPROCAL_NVIDIA_API_H__ +#define __RECIPROCAL_NVIDIA_API_H__ + +#include "../../../elementwise/nvidia/elementwise_nvidia_api.cuh" + +ELEMENTWISE_DESCRIPTOR(reciprocal, nvidia) + +#endif // __RECIPROCAL_NVIDIA_API_H__ diff --git a/src/infiniop/ops/reciprocal/operator.cc b/src/infiniop/ops/reciprocal/operator.cc new file mode 100644 index 000000000..033286024 --- /dev/null +++ b/src/infiniop/ops/reciprocal/operator.cc @@ -0,0 +1,139 @@ +#include "../../operator.h" +#include "../../handle.h" +#include "infiniop/ops/reciprocal.h" + +#ifdef ENABLE_CPU_API +#include "cpu/reciprocal_cpu.h" +#endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API) +#include "nvidia/reciprocal_nvidia.cuh" +#endif + +__C infiniStatus_t infiniopCreateReciprocalDescriptor( + infiniopHandle_t handle, + infiniopReciprocalDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y_desc, + infiniopTensorDescriptor_t x_desc) { + +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return op::reciprocal::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast(desc_ptr), \ + y_desc, \ + {x_desc}) + + switch (handle->device) { + +#ifdef ENABLE_CPU_API + CREATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CREATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CREATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + CREATE(INFINI_DEVICE_QY, nvidia); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CREATE +} + +__C infiniStatus_t infiniopGetReciprocalWorkspaceSize(infiniopReciprocalDescriptor_t desc, size_t *size) { + +#define GET(CASE, NAMESPACE) \ + case CASE: \ + *size = reinterpret_cast(desc)->workspaceSize(); \ + return INFINI_STATUS_SUCCESS; + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + GET(INFINI_DEVICE_CPU, cpu) +#endif +#ifdef ENABLE_NVIDIA_API + 
GET(INFINI_DEVICE_NVIDIA, nvidia) +#endif +#ifdef ENABLE_ILUVATAR_API + GET(INFINI_DEVICE_ILUVATAR, nvidia) +#endif +#ifdef ENABLE_QY_API + GET(INFINI_DEVICE_QY, nvidia) +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } +#undef GET + + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} + +__C infiniStatus_t infiniopReciprocal( + infiniopReciprocalDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *y, + const void *x, + void *stream) { + +#define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ + return reinterpret_cast(desc) \ + ->calculate(workspace, workspace_size, y, {x}, stream) + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + CALCULATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + CALCULATE(INFINI_DEVICE_QY, nvidia); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CALCULATE +} + +__C infiniStatus_t +infiniopDestroyReciprocalDescriptor(infiniopReciprocalDescriptor_t desc) { + +#define DELETE(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast(desc); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + DELETE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + DELETE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + DELETE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + DELETE(INFINI_DEVICE_QY, nvidia); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef DELETE +} diff --git a/src/infiniop/ops/round/cpu/round_cpu.cc b/src/infiniop/ops/round/cpu/round_cpu.cc new file mode 100644 index 000000000..0b0cea7b7 --- /dev/null +++ b/src/infiniop/ops/round/cpu/round_cpu.cc @@ -0,0 +1,48 @@ +#include "round_cpu.h" + +namespace op::round::cpu { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &x_desc = input_desc_vec.at(0); + const auto &y_shape = out_desc->shape(); + const auto &x_shape = x_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32); + + CHECK_SAME_SHAPE(y_shape, x_shape); + + // create CPU elementwise descriptor + CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec); + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate(_info, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::round::cpu diff --git a/src/infiniop/ops/round/cpu/round_cpu.h b/src/infiniop/ops/round/cpu/round_cpu.h new file mode 100644 index 000000000..eccd6df0f --- /dev/null +++ b/src/infiniop/ops/round/cpu/round_cpu.h @@ -0,0 +1,25 @@ +#ifndef __ROUND_CPU_H__ +#define __ROUND_CPU_H__ + +#include "../../../elementwise/cpu/elementwise_cpu.h" +#include + +ELEMENTWISE_DESCRIPTOR(round, cpu) + +namespace op::round::cpu { +typedef struct RoundOp { +public: + static constexpr size_t 
num_inputs = 1;
+
+    template <typename T>
+    T operator()(const T &x) const {
+        if constexpr (std::is_integral_v<T>) {
+            return x;
+        } else {
+            return std::nearbyint(x);
+        }
+    }
+} RoundOp;
+} // namespace op::round::cpu
+
+#endif // __ROUND_CPU_H__
diff --git a/src/infiniop/ops/round/cuda/kernel.cuh b/src/infiniop/ops/round/cuda/kernel.cuh
new file mode 100644
index 000000000..c52a10716
--- /dev/null
+++ b/src/infiniop/ops/round/cuda/kernel.cuh
@@ -0,0 +1,34 @@
+#ifndef __ROUND_CUDA_H__
+#define __ROUND_CUDA_H__
+
+#include "../../../elementwise/nvidia/elementwise_nvidia.cuh"
+#include <cuda_bf16.h>
+
+namespace op::round::cuda {
+typedef struct RoundOp {
+public:
+    static constexpr size_t num_inputs = 1;
+    template <typename T>
+    __device__ __forceinline__ T operator()(const T &x) const {
+        if constexpr (std::is_same_v<T, half2>) {
+            return h2rint(x);
+        } else if constexpr (std::is_same_v<T, half>) {
+            return hrint(x);
+        } else if constexpr (std::is_same_v<T, cuda_bfloat162>) {
+            float x0 = __bfloat162float(__low2bfloat16(x));
+            float x1 = __bfloat162float(__high2bfloat16(x));
+            return __floats2bfloat162_rn(rintf(x0), rintf(x1));
+        } else if constexpr (std::is_same_v<T, cuda_bfloat16>) {
+            return __float2bfloat16_rn(rintf(__bfloat162float(x)));
+        } else if constexpr (std::is_same_v<T, float>) {
+            return rintf(x);
+        } else if constexpr (std::is_integral_v<T>) {
+            return x;
+        } else {
+            return std::nearbyint(x);
+        }
+    }
+} RoundOp;
+} // namespace op::round::cuda
+
+#endif // __ROUND_CUDA_H__
diff --git a/src/infiniop/ops/round/nvidia/round_nvidia.cu b/src/infiniop/ops/round/nvidia/round_nvidia.cu
new file mode 100644
index 000000000..c1fabc885
--- /dev/null
+++ b/src/infiniop/ops/round/nvidia/round_nvidia.cu
@@ -0,0 +1,54 @@
+#include "../../../elementwise/nvidia/elementwise_nvidia.cuh"
+
+#include "../cuda/kernel.cuh"
+#include "round_nvidia.cuh"
+
+namespace op::round::nvidia {
+
+Descriptor::~Descriptor() = default;
+
+infiniStatus_t Descriptor::create(
+    infiniopHandle_t handle_,
+    Descriptor **desc_ptr,
+    infiniopTensorDescriptor_t out_desc,
+    std::vector<infiniopTensorDescriptor_t> input_desc_vec) {
+
+    auto handle = reinterpret_cast<device::nvidia::Handle *>(handle_);
+    auto dtype = out_desc->dtype();
+
+    const auto &x_desc = input_desc_vec.at(0);
+    const auto &y_shape = out_desc->shape();
+    const auto &x_shape = x_desc->shape();
+
+    CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32);
+
+    CHECK_SAME_SHAPE(y_shape, x_shape);
+
+    // create CUDA elementwise descriptor
+    CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec)
+
+    return INFINI_STATUS_SUCCESS;
+}
+
+infiniStatus_t Descriptor::calculate(
+    void *workspace,
+    size_t workspace_size,
+    void *output,
+    std::vector<const void *> inputs,
+    void *stream) const {
+    if (workspace_size < _workspace_size) {
+        return INFINI_STATUS_INSUFFICIENT_WORKSPACE;
+    }
+
+    switch (_dtype) {
+    case INFINI_DTYPE_F16:
+        return _device_info->calculate<256, cuda::RoundOp, half>(_info, workspace, output, inputs, stream);
+    case INFINI_DTYPE_F32:
+        return _device_info->calculate<256, cuda::RoundOp, float>(_info, workspace, output, inputs, stream);
+    default:
+        return INFINI_STATUS_BAD_TENSOR_DTYPE;
+    }
+
+    return INFINI_STATUS_SUCCESS;
+}
+} // namespace op::round::nvidia
diff --git a/src/infiniop/ops/round/nvidia/round_nvidia.cuh b/src/infiniop/ops/round/nvidia/round_nvidia.cuh
new file mode 100644
index 000000000..65bb38566
--- /dev/null
+++ b/src/infiniop/ops/round/nvidia/round_nvidia.cuh
@@ -0,0 +1,8 @@
+#ifndef __ROUND_NVIDIA_API_H__
+#define __ROUND_NVIDIA_API_H__
+
+#include "../../../elementwise/nvidia/elementwise_nvidia_api.cuh"
+
+ELEMENTWISE_DESCRIPTOR(round, nvidia)
+
+#endif //
__ROUND_NVIDIA_API_H__ diff --git a/src/infiniop/ops/round/operator.cc b/src/infiniop/ops/round/operator.cc new file mode 100644 index 000000000..9468803c8 --- /dev/null +++ b/src/infiniop/ops/round/operator.cc @@ -0,0 +1,139 @@ +#include "../../operator.h" +#include "../../handle.h" +#include "infiniop/ops/round.h" + +#ifdef ENABLE_CPU_API +#include "cpu/round_cpu.h" +#endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API) +#include "nvidia/round_nvidia.cuh" +#endif + +__C infiniStatus_t infiniopCreateRoundDescriptor( + infiniopHandle_t handle, + infiniopRoundDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y_desc, + infiniopTensorDescriptor_t x_desc) { + +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return op::round::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast(desc_ptr), \ + y_desc, \ + {x_desc}) + + switch (handle->device) { + +#ifdef ENABLE_CPU_API + CREATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CREATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CREATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + CREATE(INFINI_DEVICE_QY, nvidia); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CREATE +} + +__C infiniStatus_t infiniopGetRoundWorkspaceSize(infiniopRoundDescriptor_t desc, size_t *size) { + +#define GET(CASE, NAMESPACE) \ + case CASE: \ + *size = reinterpret_cast(desc)->workspaceSize(); \ + return INFINI_STATUS_SUCCESS; + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + GET(INFINI_DEVICE_CPU, cpu) +#endif +#ifdef ENABLE_NVIDIA_API + GET(INFINI_DEVICE_NVIDIA, nvidia) +#endif +#ifdef ENABLE_ILUVATAR_API + GET(INFINI_DEVICE_ILUVATAR, nvidia) +#endif +#ifdef ENABLE_QY_API + GET(INFINI_DEVICE_QY, nvidia) +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } +#undef GET + + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} + +__C infiniStatus_t infiniopRound( + infiniopRoundDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *y, + const void *x, + void *stream) { + +#define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ + return reinterpret_cast(desc) \ + ->calculate(workspace, workspace_size, y, {x}, stream) + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + CALCULATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + CALCULATE(INFINI_DEVICE_QY, nvidia); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CALCULATE +} + +__C infiniStatus_t +infiniopDestroyRoundDescriptor(infiniopRoundDescriptor_t desc) { + +#define DELETE(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast(desc); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + DELETE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + DELETE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + DELETE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + DELETE(INFINI_DEVICE_QY, nvidia); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef DELETE +} diff --git a/src/infiniop/ops/sign/cpu/sign_cpu.cc b/src/infiniop/ops/sign/cpu/sign_cpu.cc new file mode 100644 index 000000000..1f3430e73 --- /dev/null +++ b/src/infiniop/ops/sign/cpu/sign_cpu.cc @@ -0,0 +1,48 @@ +#include "sign_cpu.h" + +namespace op::sign::cpu { + 
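One behavioral note on the RoundOp added above: `std::nearbyint` on the CPU and the `hrint`/`h2rint`/`rintf` intrinsics in the CUDA kernel all follow the current floating-point rounding mode, which defaults to round-half-to-even, the same convention as `torch.round`. Plain `std::round` would instead round halves away from zero. A small standalone check of the difference:

#include <cmath>
#include <cstdio>

int main() {
    // nearbyint() uses the current rounding mode (round-to-nearest-even by
    // default), which is what RoundOp and torch.round implement; round()
    // always rounds halves away from zero.
    for (double v : {0.5, 1.5, 2.5, -0.5, -1.5}) {
        std::printf("v=%+.1f  nearbyint=%+.1f  round=%+.1f\n",
                    v, std::nearbyint(v), std::round(v));
    }
    // nearbyint: 0.5 -> 0, 1.5 -> 2, 2.5 -> 2, -0.5 -> -0, -1.5 -> -2
    // round:     0.5 -> 1, 1.5 -> 2, 2.5 -> 3, -0.5 -> -1, -1.5 -> -2
    return 0;
}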
+Descriptor::~Descriptor() = default;
+
+infiniStatus_t Descriptor::create(
+    infiniopHandle_t handle_,
+    Descriptor **desc_ptr,
+    infiniopTensorDescriptor_t out_desc,
+    std::vector<infiniopTensorDescriptor_t> input_desc_vec) {
+
+    auto handle = reinterpret_cast<device::cpu::Handle *>(handle_);
+    auto dtype = out_desc->dtype();
+
+    const auto &x_desc = input_desc_vec.at(0);
+    const auto &y_shape = out_desc->shape();
+    const auto &x_shape = x_desc->shape();
+
+    CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32);
+
+    CHECK_SAME_SHAPE(y_shape, x_shape);
+
+    // create CPU elementwise descriptor
+    CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec);
+
+    return INFINI_STATUS_SUCCESS;
+}
+
+infiniStatus_t Descriptor::calculate(
+    void *workspace,
+    size_t workspace_size,
+    void *output,
+    std::vector<const void *> inputs,
+    void *stream) const {
+
+    switch (_dtype) {
+    case INFINI_DTYPE_F16:
+        return _device_info->calculate<SignOp, fp16_t>(_info, output, inputs, stream);
+    case INFINI_DTYPE_F32:
+        return _device_info->calculate<SignOp, float>(_info, output, inputs, stream);
+    default:
+        return INFINI_STATUS_BAD_TENSOR_DTYPE;
+    }
+
+    return INFINI_STATUS_SUCCESS;
+}
+} // namespace op::sign::cpu
diff --git a/src/infiniop/ops/sign/cpu/sign_cpu.h b/src/infiniop/ops/sign/cpu/sign_cpu.h
new file mode 100644
index 000000000..505194c85
--- /dev/null
+++ b/src/infiniop/ops/sign/cpu/sign_cpu.h
@@ -0,0 +1,20 @@
+#ifndef __SIGN_CPU_H__
+#define __SIGN_CPU_H__
+
+#include "../../../elementwise/cpu/elementwise_cpu.h"
+
+ELEMENTWISE_DESCRIPTOR(sign, cpu)
+
+namespace op::sign::cpu {
+typedef struct SignOp {
+public:
+    static constexpr size_t num_inputs = 1;
+
+    template <typename T>
+    T operator()(const T &x) const {
+        return x > T(0) ? T(1) : (x == T(0) ? T(0) : T(-1));
+    }
+} SignOp;
+} // namespace op::sign::cpu
+
+#endif // __SIGN_CPU_H__
diff --git a/src/infiniop/ops/sign/cuda/kernel.cuh b/src/infiniop/ops/sign/cuda/kernel.cuh
new file mode 100644
index 000000000..3737282b0
--- /dev/null
+++ b/src/infiniop/ops/sign/cuda/kernel.cuh
@@ -0,0 +1,27 @@
+#ifndef __SIGN_CUDA_H__
+#define __SIGN_CUDA_H__
+
+#include "../../../elementwise/nvidia/elementwise_nvidia.cuh"
+#include <cuda_fp16.h>
+
+namespace op::sign::cuda {
+typedef struct SignOp {
+public:
+    static constexpr size_t num_inputs = 1;
+    template <typename T>
+    __device__ __forceinline__ T operator()(const T &x) const {
+        if constexpr (std::is_same_v<T, half2>) {
+            // sign(x) = (x > 0) - (x < 0); the comparison intrinsics return
+            // 1.0 or 0.0 per lane, so sign(0) == 0 as in the scalar branches
+            const auto zero = __floats2half2_rn(0.0f, 0.0f);
+            return __hsub2(__hgt2(x, zero), __hlt2(x, zero));
+        } else if constexpr (std::is_same_v<T, half>) {
+            return x > half(0) ? half(1) : (x == half(0) ? half(0) : half(-1));
+        } else {
+            return x > T(0) ? T(1) : (x == T(0) ? T(0) : T(-1));
+        }
+    }
+} SignOp;
+} // namespace op::sign::cuda
+
+#endif // __SIGN_CUDA_H__
diff --git a/src/infiniop/ops/sign/nvidia/sign_nvidia.cu b/src/infiniop/ops/sign/nvidia/sign_nvidia.cu
new file mode 100644
index 000000000..6a3152e41
--- /dev/null
+++ b/src/infiniop/ops/sign/nvidia/sign_nvidia.cu
@@ -0,0 +1,54 @@
+#include "../../../elementwise/nvidia/elementwise_nvidia.cuh"
+
+#include "../cuda/kernel.cuh"
+#include "sign_nvidia.cuh"
+
+namespace op::sign::nvidia {
+
+Descriptor::~Descriptor() = default;
+
+infiniStatus_t Descriptor::create(
+    infiniopHandle_t handle_,
+    Descriptor **desc_ptr,
+    infiniopTensorDescriptor_t out_desc,
+    std::vector<infiniopTensorDescriptor_t> input_desc_vec) {
+
+    auto handle = reinterpret_cast<device::nvidia::Handle *>(handle_);
+    auto dtype = out_desc->dtype();
+
+    const auto &x_desc = input_desc_vec.at(0);
+    const auto &y_shape = out_desc->shape();
+    const auto &x_shape = x_desc->shape();
+
+    CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32);
+
+    CHECK_SAME_SHAPE(y_shape, x_shape);
+
+    // create CUDA elementwise descriptor
+    CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec)
+
+    return INFINI_STATUS_SUCCESS;
+}
+
+infiniStatus_t Descriptor::calculate(
+    void *workspace,
+    size_t workspace_size,
+    void *output,
+    std::vector<const void *> inputs,
+    void *stream) const {
+    if (workspace_size < _workspace_size) {
+        return INFINI_STATUS_INSUFFICIENT_WORKSPACE;
+    }
+
+    switch (_dtype) {
+    case INFINI_DTYPE_F16:
+        return _device_info->calculate<256, cuda::SignOp, half>(_info, workspace, output, inputs, stream);
+    case INFINI_DTYPE_F32:
+        return _device_info->calculate<256, cuda::SignOp, float>(_info, workspace, output, inputs, stream);
+    default:
+        return INFINI_STATUS_BAD_TENSOR_DTYPE;
+    }
+
+    return INFINI_STATUS_SUCCESS;
+}
+} // namespace op::sign::nvidia
diff --git a/src/infiniop/ops/sign/nvidia/sign_nvidia.cuh b/src/infiniop/ops/sign/nvidia/sign_nvidia.cuh
new file mode 100644
index 000000000..d5f2540a3
--- /dev/null
+++ b/src/infiniop/ops/sign/nvidia/sign_nvidia.cuh
@@ -0,0 +1,8 @@
+#ifndef __SIGN_NVIDIA_API_H__
+#define __SIGN_NVIDIA_API_H__
+
+#include "../../../elementwise/nvidia/elementwise_nvidia_api.cuh"
+
+ELEMENTWISE_DESCRIPTOR(sign, nvidia)
+
+#endif // __SIGN_NVIDIA_API_H__
diff --git a/src/infiniop/ops/sign/operator.cc b/src/infiniop/ops/sign/operator.cc
new file mode 100644
index 000000000..8f658a9b3
--- /dev/null
+++ b/src/infiniop/ops/sign/operator.cc
@@ -0,0 +1,139 @@
+#include "../../operator.h"
+#include "../../handle.h"
+#include "infiniop/ops/sign.h"
+
+#ifdef ENABLE_CPU_API
+#include "cpu/sign_cpu.h"
+#endif
+#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API)
+#include "nvidia/sign_nvidia.cuh"
+#endif
+
+__C infiniStatus_t infiniopCreateSignDescriptor(
+    infiniopHandle_t handle,
+    infiniopSignDescriptor_t *desc_ptr,
+    infiniopTensorDescriptor_t y_desc,
+    infiniopTensorDescriptor_t x_desc) {
+
+#define CREATE(CASE, NAMESPACE)                                              \
+    case CASE:                                                               \
+        return op::sign::NAMESPACE::Descriptor::create(                      \
+            handle,                                                          \
+            reinterpret_cast<op::sign::NAMESPACE::Descriptor **>(desc_ptr),  \
+            y_desc,                                                          \
+            {x_desc})
+
+    switch (handle->device) {
+
+#ifdef ENABLE_CPU_API
+        CREATE(INFINI_DEVICE_CPU, cpu);
+#endif
+#ifdef ENABLE_NVIDIA_API
+        CREATE(INFINI_DEVICE_NVIDIA, nvidia);
+#endif
+#ifdef ENABLE_ILUVATAR_API
+        CREATE(INFINI_DEVICE_ILUVATAR, nvidia);
+#endif
+#ifdef ENABLE_QY_API
+        CREATE(INFINI_DEVICE_QY, nvidia);
+#endif
+
+    default:
+        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
+    }
+
+#undef CREATE
+}
+
+__C infiniStatus_t
infiniopGetSignWorkspaceSize(infiniopSignDescriptor_t desc, size_t *size) { + +#define GET(CASE, NAMESPACE) \ + case CASE: \ + *size = reinterpret_cast(desc)->workspaceSize(); \ + return INFINI_STATUS_SUCCESS; + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + GET(INFINI_DEVICE_CPU, cpu) +#endif +#ifdef ENABLE_NVIDIA_API + GET(INFINI_DEVICE_NVIDIA, nvidia) +#endif +#ifdef ENABLE_ILUVATAR_API + GET(INFINI_DEVICE_ILUVATAR, nvidia) +#endif +#ifdef ENABLE_QY_API + GET(INFINI_DEVICE_QY, nvidia) +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } +#undef GET + + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} + +__C infiniStatus_t infiniopSign( + infiniopSignDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *y, + const void *x, + void *stream) { + +#define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ + return reinterpret_cast(desc) \ + ->calculate(workspace, workspace_size, y, {x}, stream) + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + CALCULATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + CALCULATE(INFINI_DEVICE_QY, nvidia); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CALCULATE +} + +__C infiniStatus_t +infiniopDestroySignDescriptor(infiniopSignDescriptor_t desc) { + +#define DELETE(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast(desc); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + DELETE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + DELETE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + DELETE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + DELETE(INFINI_DEVICE_QY, nvidia); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef DELETE +} diff --git a/src/infiniop/ops/sinh/cpu/sinh_cpu.cc b/src/infiniop/ops/sinh/cpu/sinh_cpu.cc new file mode 100644 index 000000000..40685847d --- /dev/null +++ b/src/infiniop/ops/sinh/cpu/sinh_cpu.cc @@ -0,0 +1,48 @@ +#include "sinh_cpu.h" + +namespace op::sinh::cpu { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &x_desc = input_desc_vec.at(0); + const auto &y_shape = out_desc->shape(); + const auto &x_shape = x_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32); + + CHECK_SAME_SHAPE(y_shape, x_shape); + + // create CPU elementwise descriptor + CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec); + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate(_info, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::sinh::cpu diff --git a/src/infiniop/ops/sinh/cpu/sinh_cpu.h b/src/infiniop/ops/sinh/cpu/sinh_cpu.h new file mode 100644 index 000000000..dbc8f3c7e --- /dev/null +++ 
b/src/infiniop/ops/sinh/cpu/sinh_cpu.h
@@ -0,0 +1,22 @@
+#ifndef __SINH_CPU_H__
+#define __SINH_CPU_H__
+
+#include <cmath>
+
+#include "../../../elementwise/cpu/elementwise_cpu.h"
+
+ELEMENTWISE_DESCRIPTOR(sinh, cpu)
+
+namespace op::sinh::cpu {
+typedef struct SinhOp {
+public:
+    static constexpr size_t num_inputs = 1;
+
+    template <typename T>
+    T operator()(const T &x) const {
+        return std::sinh(x);
+    }
+} SinhOp;
+} // namespace op::sinh::cpu
+
+#endif // __SINH_CPU_H__
diff --git a/src/infiniop/ops/sinh/cuda/kernel.cuh b/src/infiniop/ops/sinh/cuda/kernel.cuh
new file mode 100644
index 000000000..c09150666
--- /dev/null
+++ b/src/infiniop/ops/sinh/cuda/kernel.cuh
@@ -0,0 +1,32 @@
+#ifndef __SINH_CUDA_H__
+#define __SINH_CUDA_H__
+
+#include <cuda_bf16.h>
+#include <cuda_fp16.h>
+
+namespace op::sinh::cuda {
+typedef struct SinhOp {
+public:
+    static constexpr size_t num_inputs = 1;
+    template <typename T>
+    __device__ __forceinline__ T operator()(const T &x) const {
+        if constexpr (std::is_same_v<T, half2>) {
+            return __floats2half2_rn(sinhf(__half2float(__low2half(x))), sinhf(__half2float(__high2half(x))));
+        } else if constexpr (std::is_same_v<T, half>) {
+            return __float2half(sinhf(__half2float(x)));
+        } else if constexpr (std::is_same_v<T, cuda_bfloat162>) {
+            float x0 = __bfloat162float(__low2bfloat16(x));
+            float x1 = __bfloat162float(__high2bfloat16(x));
+            return __floats2bfloat162_rn(sinhf(x0), sinhf(x1));
+        } else if constexpr (std::is_same_v<T, cuda_bfloat16>) {
+            return __float2bfloat16_rn(sinhf(__bfloat162float(x)));
+        } else if constexpr (std::is_same_v<T, float>) {
+            return sinhf(x);
+        } else {
+            return std::sinh(x);
+        }
+    }
+} SinhOp;
+} // namespace op::sinh::cuda
+
+#endif // __SINH_CUDA_H__
diff --git a/src/infiniop/ops/sinh/nvidia/sinh_nvidia.cu b/src/infiniop/ops/sinh/nvidia/sinh_nvidia.cu
new file mode 100644
index 000000000..d4c3fd165
--- /dev/null
+++ b/src/infiniop/ops/sinh/nvidia/sinh_nvidia.cu
@@ -0,0 +1,54 @@
+#include "../../../elementwise/nvidia/elementwise_nvidia.cuh"
+
+#include "../cuda/kernel.cuh"
+#include "sinh_nvidia.cuh"
+
+namespace op::sinh::nvidia {
+
+Descriptor::~Descriptor() = default;
+
+infiniStatus_t Descriptor::create(
+    infiniopHandle_t handle_,
+    Descriptor **desc_ptr,
+    infiniopTensorDescriptor_t out_desc,
+    std::vector<infiniopTensorDescriptor_t> input_desc_vec) {
+
+    auto handle = reinterpret_cast<device::nvidia::Handle *>(handle_);
+    auto dtype = out_desc->dtype();
+
+    const auto &x_desc = input_desc_vec.at(0);
+    const auto &y_shape = out_desc->shape();
+    const auto &x_shape = x_desc->shape();
+
+    CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32);
+
+    CHECK_SAME_SHAPE(y_shape, x_shape);
+
+    // create CUDA elementwise descriptor
+    CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec)
+
+    return INFINI_STATUS_SUCCESS;
+}
+
+infiniStatus_t Descriptor::calculate(
+    void *workspace,
+    size_t workspace_size,
+    void *output,
+    std::vector<const void *> inputs,
+    void *stream) const {
+    if (workspace_size < _workspace_size) {
+        return INFINI_STATUS_INSUFFICIENT_WORKSPACE;
+    }
+
+    switch (_dtype) {
+    case INFINI_DTYPE_F16:
+        return _device_info->calculate<256, cuda::SinhOp, half>(_info, workspace, output, inputs, stream);
+    case INFINI_DTYPE_F32:
+        return _device_info->calculate<256, cuda::SinhOp, float>(_info, workspace, output, inputs, stream);
+    default:
+        return INFINI_STATUS_BAD_TENSOR_DTYPE;
+    }
+
+    return INFINI_STATUS_SUCCESS;
+}
+} // namespace op::sinh::nvidia
diff --git a/src/infiniop/ops/sinh/nvidia/sinh_nvidia.cuh b/src/infiniop/ops/sinh/nvidia/sinh_nvidia.cuh
new file mode 100644
index 000000000..66e3e3e67
--- /dev/null
+++ b/src/infiniop/ops/sinh/nvidia/sinh_nvidia.cuh
@@ -0,0 +1,8 @@
+#ifndef __SINH_NVIDIA_API_H__ +#define __SINH_NVIDIA_API_H__ + +#include "../../../elementwise/nvidia/elementwise_nvidia_api.cuh" + +ELEMENTWISE_DESCRIPTOR(sinh, nvidia) + +#endif // __SINH_NVIDIA_API_H__ diff --git a/src/infiniop/ops/sinh/operator.cc b/src/infiniop/ops/sinh/operator.cc new file mode 100644 index 000000000..1636ce2c8 --- /dev/null +++ b/src/infiniop/ops/sinh/operator.cc @@ -0,0 +1,139 @@ +#include "../../operator.h" +#include "../../handle.h" +#include "infiniop/ops/sinh.h" + +#ifdef ENABLE_CPU_API +#include "cpu/sinh_cpu.h" +#endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API) +#include "nvidia/sinh_nvidia.cuh" +#endif + +__C infiniStatus_t infiniopCreateSinhDescriptor( + infiniopHandle_t handle, + infiniopSinhDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y_desc, + infiniopTensorDescriptor_t x_desc) { + +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return op::sinh::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast(desc_ptr), \ + y_desc, \ + {x_desc}) + + switch (handle->device) { + +#ifdef ENABLE_CPU_API + CREATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CREATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CREATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + CREATE(INFINI_DEVICE_QY, nvidia); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CREATE +} + +__C infiniStatus_t infiniopGetSinhWorkspaceSize(infiniopSinhDescriptor_t desc, size_t *size) { + +#define GET(CASE, NAMESPACE) \ + case CASE: \ + *size = reinterpret_cast(desc)->workspaceSize(); \ + return INFINI_STATUS_SUCCESS; + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + GET(INFINI_DEVICE_CPU, cpu) +#endif +#ifdef ENABLE_NVIDIA_API + GET(INFINI_DEVICE_NVIDIA, nvidia) +#endif +#ifdef ENABLE_ILUVATAR_API + GET(INFINI_DEVICE_ILUVATAR, nvidia) +#endif +#ifdef ENABLE_QY_API + GET(INFINI_DEVICE_QY, nvidia) +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } +#undef GET + + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} + +__C infiniStatus_t infiniopSinh( + infiniopSinhDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *y, + const void *x, + void *stream) { + +#define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ + return reinterpret_cast(desc) \ + ->calculate(workspace, workspace_size, y, {x}, stream) + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + CALCULATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + CALCULATE(INFINI_DEVICE_QY, nvidia); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CALCULATE +} + +__C infiniStatus_t +infiniopDestroySinhDescriptor(infiniopSinhDescriptor_t desc) { + +#define DELETE(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast(desc); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + DELETE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + DELETE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + DELETE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + DELETE(INFINI_DEVICE_QY, nvidia); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef DELETE +} diff --git a/src/infiniop/ops/sqrt/cpu/sqrt_cpu.cc b/src/infiniop/ops/sqrt/cpu/sqrt_cpu.cc new file 
mode 100644
index 000000000..99e723126
--- /dev/null
+++ b/src/infiniop/ops/sqrt/cpu/sqrt_cpu.cc
@@ -0,0 +1,48 @@
+#include "sqrt_cpu.h"
+
+namespace op::sqrt::cpu {
+
+Descriptor::~Descriptor() = default;
+
+infiniStatus_t Descriptor::create(
+    infiniopHandle_t handle_,
+    Descriptor **desc_ptr,
+    infiniopTensorDescriptor_t out_desc,
+    std::vector<infiniopTensorDescriptor_t> input_desc_vec) {
+
+    auto handle = reinterpret_cast<device::cpu::Handle *>(handle_);
+    auto dtype = out_desc->dtype();
+
+    const auto &x_desc = input_desc_vec.at(0);
+    const auto &y_shape = out_desc->shape();
+    const auto &x_shape = x_desc->shape();
+
+    CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32);
+
+    CHECK_SAME_SHAPE(y_shape, x_shape);
+
+    // create CPU elementwise descriptor
+    CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec);
+
+    return INFINI_STATUS_SUCCESS;
+}
+
+infiniStatus_t Descriptor::calculate(
+    void *workspace,
+    size_t workspace_size,
+    void *output,
+    std::vector<const void *> inputs,
+    void *stream) const {
+
+    switch (_dtype) {
+    case INFINI_DTYPE_F16:
+        return _device_info->calculate<SqrtOp, fp16_t>(_info, output, inputs, stream);
+    case INFINI_DTYPE_F32:
+        return _device_info->calculate<SqrtOp, float>(_info, output, inputs, stream);
+    default:
+        return INFINI_STATUS_BAD_TENSOR_DTYPE;
+    }
+
+    return INFINI_STATUS_SUCCESS;
+}
+} // namespace op::sqrt::cpu
diff --git a/src/infiniop/ops/sqrt/cpu/sqrt_cpu.h b/src/infiniop/ops/sqrt/cpu/sqrt_cpu.h
new file mode 100644
index 000000000..3d026cf63
--- /dev/null
+++ b/src/infiniop/ops/sqrt/cpu/sqrt_cpu.h
@@ -0,0 +1,22 @@
+#ifndef __SQRT_CPU_H__
+#define __SQRT_CPU_H__
+
+#include <cmath>
+
+#include "../../../elementwise/cpu/elementwise_cpu.h"
+
+ELEMENTWISE_DESCRIPTOR(sqrt, cpu)
+
+namespace op::sqrt::cpu {
+typedef struct SqrtOp {
+public:
+    static constexpr size_t num_inputs = 1;
+
+    template <typename T>
+    T operator()(const T &x) const {
+        return std::sqrt(x);
+    }
+} SqrtOp;
+} // namespace op::sqrt::cpu
+
+#endif // __SQRT_CPU_H__
diff --git a/src/infiniop/ops/sqrt/cuda/kernel.cuh b/src/infiniop/ops/sqrt/cuda/kernel.cuh
new file mode 100644
index 000000000..c82cd7dd5
--- /dev/null
+++ b/src/infiniop/ops/sqrt/cuda/kernel.cuh
@@ -0,0 +1,32 @@
+#ifndef __SQRT_CUDA_H__
+#define __SQRT_CUDA_H__
+
+#include "../../../elementwise/nvidia/elementwise_nvidia.cuh"
+#include <cuda_bf16.h>
+
+namespace op::sqrt::cuda {
+typedef struct SqrtOp {
+public:
+    static constexpr size_t num_inputs = 1;
+    template <typename T>
+    __device__ __forceinline__ T operator()(const T &x) const {
+        if constexpr (std::is_same_v<T, half2>) {
+            return h2sqrt(x);
+        } else if constexpr (std::is_same_v<T, half>) {
+            return hsqrt(x);
+        } else if constexpr (std::is_same_v<T, cuda_bfloat162>) {
+            float x0 = __bfloat162float(__low2bfloat16(x));
+            float x1 = __bfloat162float(__high2bfloat16(x));
+            return __floats2bfloat162_rn(__fsqrt_rn(x0), __fsqrt_rn(x1));
+        } else if constexpr (std::is_same_v<T, cuda_bfloat16>) {
+            return __float2bfloat16_rn(__fsqrt_rn(__bfloat162float(x)));
+        } else if constexpr (std::is_same_v<T, float>) {
+            return __fsqrt_rn(x);
+        } else {
+            return std::sqrt(x);
+        }
+    }
+} SqrtOp;
+} // namespace op::sqrt::cuda
+
+#endif // __SQRT_CUDA_H__
diff --git a/src/infiniop/ops/sqrt/nvidia/sqrt_nvidia.cu b/src/infiniop/ops/sqrt/nvidia/sqrt_nvidia.cu
new file mode 100644
index 000000000..519d06e89
--- /dev/null
+++ b/src/infiniop/ops/sqrt/nvidia/sqrt_nvidia.cu
@@ -0,0 +1,54 @@
+#include "../../../elementwise/nvidia/elementwise_nvidia.cuh"
+
+#include "../cuda/kernel.cuh"
+#include "sqrt_nvidia.cuh"
+
+namespace op::sqrt::nvidia {
+
+Descriptor::~Descriptor() = default;
+
+infiniStatus_t Descriptor::create(
+    infiniopHandle_t handle_,
+    Descriptor
**desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &x_desc = input_desc_vec.at(0); + const auto &y_shape = out_desc->shape(); + const auto &x_shape = x_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32); + + CHECK_SAME_SHAPE(y_shape, x_shape); + + // create CUDA elementwise descriptor + CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec) + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate<256, cuda::SqrtOp, half>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate<256, cuda::SqrtOp, float>(_info, workspace, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::sqrt::nvidia diff --git a/src/infiniop/ops/sqrt/nvidia/sqrt_nvidia.cuh b/src/infiniop/ops/sqrt/nvidia/sqrt_nvidia.cuh new file mode 100644 index 000000000..6cd98c814 --- /dev/null +++ b/src/infiniop/ops/sqrt/nvidia/sqrt_nvidia.cuh @@ -0,0 +1,8 @@ +#ifndef __SQRT_NVIDIA_API_H__ +#define __SQRT_NVIDIA_API_H__ + +#include "../../../elementwise/nvidia/elementwise_nvidia_api.cuh" + +ELEMENTWISE_DESCRIPTOR(sqrt, nvidia) + +#endif // __SQRT_NVIDIA_API_H__ diff --git a/src/infiniop/ops/sqrt/operator.cc b/src/infiniop/ops/sqrt/operator.cc new file mode 100644 index 000000000..b11c8a4b5 --- /dev/null +++ b/src/infiniop/ops/sqrt/operator.cc @@ -0,0 +1,139 @@ +#include "../../operator.h" +#include "../../handle.h" +#include "infiniop/ops/sqrt.h" + +#ifdef ENABLE_CPU_API +#include "cpu/sqrt_cpu.h" +#endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API) +#include "nvidia/sqrt_nvidia.cuh" +#endif + +__C infiniStatus_t infiniopCreateSqrtDescriptor( + infiniopHandle_t handle, + infiniopSqrtDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y_desc, + infiniopTensorDescriptor_t x_desc) { + +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return op::sqrt::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast(desc_ptr), \ + y_desc, \ + {x_desc}) + + switch (handle->device) { + +#ifdef ENABLE_CPU_API + CREATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CREATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CREATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + CREATE(INFINI_DEVICE_QY, nvidia); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CREATE +} + +__C infiniStatus_t infiniopGetSqrtWorkspaceSize(infiniopSqrtDescriptor_t desc, size_t *size) { + +#define GET(CASE, NAMESPACE) \ + case CASE: \ + *size = reinterpret_cast(desc)->workspaceSize(); \ + return INFINI_STATUS_SUCCESS; + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + GET(INFINI_DEVICE_CPU, cpu) +#endif +#ifdef ENABLE_NVIDIA_API + GET(INFINI_DEVICE_NVIDIA, nvidia) +#endif +#ifdef ENABLE_ILUVATAR_API + GET(INFINI_DEVICE_ILUVATAR, nvidia) +#endif +#ifdef ENABLE_QY_API + GET(INFINI_DEVICE_QY, nvidia) +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } +#undef GET + + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} + +__C 
infiniStatus_t infiniopSqrt( + infiniopSqrtDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *y, + const void *x, + void *stream) { + +#define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ + return reinterpret_cast(desc) \ + ->calculate(workspace, workspace_size, y, {x}, stream) + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + CALCULATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + CALCULATE(INFINI_DEVICE_QY, nvidia); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CALCULATE +} + +__C infiniStatus_t +infiniopDestroySqrtDescriptor(infiniopSqrtDescriptor_t desc) { + +#define DELETE(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast(desc); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + DELETE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + DELETE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + DELETE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + DELETE(INFINI_DEVICE_QY, nvidia); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef DELETE +} diff --git a/src/infiniop/ops/tan/cpu/tan_cpu.cc b/src/infiniop/ops/tan/cpu/tan_cpu.cc new file mode 100644 index 000000000..2947dfc5e --- /dev/null +++ b/src/infiniop/ops/tan/cpu/tan_cpu.cc @@ -0,0 +1,48 @@ +#include "tan_cpu.h" + +namespace op::tan::cpu { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &x_desc = input_desc_vec.at(0); + const auto &y_shape = out_desc->shape(); + const auto &x_shape = x_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32); + + CHECK_SAME_SHAPE(y_shape, x_shape); + + // create CPU elementwise descriptor + CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec); + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate(_info, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::tan::cpu diff --git a/src/infiniop/ops/tan/cpu/tan_cpu.h b/src/infiniop/ops/tan/cpu/tan_cpu.h new file mode 100644 index 000000000..c3a22456c --- /dev/null +++ b/src/infiniop/ops/tan/cpu/tan_cpu.h @@ -0,0 +1,22 @@ +#ifndef __TAN_CPU_H__ +#define __TAN_CPU_H__ + +#include + +#include "../../../elementwise/cpu/elementwise_cpu.h" + +ELEMENTWISE_DESCRIPTOR(tan, cpu) + +namespace op::tan::cpu { +typedef struct TanOp { +public: + static constexpr size_t num_inputs = 1; + + template + T operator()(const T &x) const { + return std::tan(x); + } +} TanOp; +} // namespace op::tan::cpu + +#endif // __TAN_CPU_H__ diff --git a/src/infiniop/ops/tan/cuda/kernel.cuh b/src/infiniop/ops/tan/cuda/kernel.cuh new file mode 100644 index 000000000..bbd8facaa --- /dev/null +++ b/src/infiniop/ops/tan/cuda/kernel.cuh @@ -0,0 +1,55 @@ +#ifndef 
__TAN_CUDA_H__
+#define __TAN_CUDA_H__
+
+#include "../../../elementwise/nvidia/elementwise_nvidia.cuh"
+#include <cuda_bf16.h>
+#include <cuda_fp16.h>
+
+#define TAN_THRESHOLD 15000
+
+namespace op::tan::cuda {
+typedef struct TanOp {
+public:
+    static constexpr size_t num_inputs = 1;
+    template <typename T>
+    __device__ __forceinline__ T operator()(const T &x) const {
+        if constexpr (std::is_same_v<T, half2>) {
+            return h2sin(x) / h2cos(x);
+        } else if constexpr (std::is_same_v<T, half>) {
+            float tan_f = __tanf(__half2float(x));
+            if (std::fabs(tan_f) > TAN_THRESHOLD) {
+                return __float2half(tanf(__half2float(x)));
+            }
+            return __float2half(tan_f);
+        } else if constexpr (std::is_same_v<T, cuda_bfloat162>) {
+            float x0 = __bfloat162float(__low2bfloat16(x));
+            float x1 = __bfloat162float(__high2bfloat16(x));
+            float tan_f0 = __tanf(x0);
+            float tan_f1 = __tanf(x1);
+            if (std::fabs(tan_f0) > TAN_THRESHOLD) {
+                tan_f0 = tanf(x0);
+            }
+            if (std::fabs(tan_f1) > TAN_THRESHOLD) {
+                tan_f1 = tanf(x1);
+            }
+            return __floats2bfloat162_rn(tan_f0, tan_f1);
+        } else if constexpr (std::is_same_v<T, cuda_bfloat16>) {
+            float tan_f = __tanf(__bfloat162float(x));
+            if (std::fabs(tan_f) > TAN_THRESHOLD) {
+                return __float2bfloat16_rn(tanf(__bfloat162float(x)));
+            }
+            return __float2bfloat16_rn(tan_f);
+        } else if constexpr (std::is_same_v<T, float>) {
+            // __tanf is a fast approximate intrinsic; fall back to the
+            // accurate tanf near the poles, where its error blows up
+            float tan_f = __tanf(x);
+            if (std::fabs(tan_f) > TAN_THRESHOLD) {
+                return tanf(x);
+            }
+            return tan_f;
+        } else {
+            return std::tan(x);
+        }
+    }
+} TanOp;
+} // namespace op::tan::cuda
+
+#endif // __TAN_CUDA_H__
diff --git a/src/infiniop/ops/tan/nvidia/tan_nvidia.cu b/src/infiniop/ops/tan/nvidia/tan_nvidia.cu
new file mode 100644
index 000000000..b4c24e2fe
--- /dev/null
+++ b/src/infiniop/ops/tan/nvidia/tan_nvidia.cu
@@ -0,0 +1,54 @@
+#include "../../../elementwise/nvidia/elementwise_nvidia.cuh"
+
+#include "../cuda/kernel.cuh"
+#include "tan_nvidia.cuh"
+
+namespace op::tan::nvidia {
+
+Descriptor::~Descriptor() = default;
+
+infiniStatus_t Descriptor::create(
+    infiniopHandle_t handle_,
+    Descriptor **desc_ptr,
+    infiniopTensorDescriptor_t out_desc,
+    std::vector<infiniopTensorDescriptor_t> input_desc_vec) {
+
+    auto handle = reinterpret_cast<device::nvidia::Handle *>(handle_);
+    auto dtype = out_desc->dtype();
+
+    const auto &x_desc = input_desc_vec.at(0);
+    const auto &y_shape = out_desc->shape();
+    const auto &x_shape = x_desc->shape();
+
+    CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32);
+
+    CHECK_SAME_SHAPE(y_shape, x_shape);
+
+    // create CUDA elementwise descriptor
+    CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec)
+
+    return INFINI_STATUS_SUCCESS;
+}
+
+infiniStatus_t Descriptor::calculate(
+    void *workspace,
+    size_t workspace_size,
+    void *output,
+    std::vector<const void *> inputs,
+    void *stream) const {
+    if (workspace_size < _workspace_size) {
+        return INFINI_STATUS_INSUFFICIENT_WORKSPACE;
+    }
+
+    switch (_dtype) {
+    case INFINI_DTYPE_F16:
+        return _device_info->calculate<256, cuda::TanOp, half>(_info, workspace, output, inputs, stream);
+    case INFINI_DTYPE_F32:
+        return _device_info->calculate<256, cuda::TanOp, float>(_info, workspace, output, inputs, stream);
+    default:
+        return INFINI_STATUS_BAD_TENSOR_DTYPE;
+    }
+
+    return INFINI_STATUS_SUCCESS;
+}
+} // namespace op::tan::nvidia
diff --git a/src/infiniop/ops/tan/nvidia/tan_nvidia.cuh b/src/infiniop/ops/tan/nvidia/tan_nvidia.cuh
new file mode 100644
index 000000000..ec620cbeb
--- /dev/null
+++ b/src/infiniop/ops/tan/nvidia/tan_nvidia.cuh
@@ -0,0 +1,8 @@
+#ifndef __TAN_NVIDIA_API_H__
+#define __TAN_NVIDIA_API_H__
+
+#include "../../../elementwise/nvidia/elementwise_nvidia_api.cuh"
+
+ELEMENTWISE_DESCRIPTOR(tan, nvidia)
+
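Each operator in this patch exposes the same four C entry points: create descriptor, query workspace size, launch, destroy descriptor. The sketch below shows that call sequence for tan; it assumes the handle, tensor descriptors, device buffers, and the `CHECK`/`device_malloc`/`device_free` helpers are provided by the caller (none of those are part of this patch).

#include <infiniop.h>

// Hedged sketch of the shared four-call lifecycle, using the tan entry
// points declared in tan/operator.cc below. CHECK is a hypothetical
// error-handling macro; device_malloc/device_free are hypothetical
// allocators standing in for whatever the caller uses.
infiniStatus_t run_tan(infiniopHandle_t handle,
                       infiniopTensorDescriptor_t y_desc,
                       infiniopTensorDescriptor_t x_desc,
                       void *y_data, const void *x_data, void *stream) {
    infiniopTanDescriptor_t desc;
    CHECK(infiniopCreateTanDescriptor(handle, &desc, y_desc, x_desc));

    size_t workspace_size = 0;
    CHECK(infiniopGetTanWorkspaceSize(desc, &workspace_size));
    void *workspace = device_malloc(workspace_size);

    CHECK(infiniopTan(desc, workspace, workspace_size, y_data, x_data, stream));

    device_free(workspace);
    return infiniopDestroyTanDescriptor(desc);
}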
+#endif // __TAN_NVIDIA_API_H__ diff --git a/src/infiniop/ops/tan/operator.cc b/src/infiniop/ops/tan/operator.cc new file mode 100644 index 000000000..48ae8d48e --- /dev/null +++ b/src/infiniop/ops/tan/operator.cc @@ -0,0 +1,139 @@ +#include "../../operator.h" +#include "../../handle.h" +#include "infiniop/ops/tan.h" + +#ifdef ENABLE_CPU_API +#include "cpu/tan_cpu.h" +#endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API) +#include "nvidia/tan_nvidia.cuh" +#endif + +__C infiniStatus_t infiniopCreateTanDescriptor( + infiniopHandle_t handle, + infiniopTanDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y_desc, + infiniopTensorDescriptor_t x_desc) { + +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return op::tan::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast(desc_ptr), \ + y_desc, \ + {x_desc}) + + switch (handle->device) { + +#ifdef ENABLE_CPU_API + CREATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CREATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CREATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + CREATE(INFINI_DEVICE_QY, nvidia); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CREATE +} + +__C infiniStatus_t infiniopGetTanWorkspaceSize(infiniopTanDescriptor_t desc, size_t *size) { + +#define GET(CASE, NAMESPACE) \ + case CASE: \ + *size = reinterpret_cast(desc)->workspaceSize(); \ + return INFINI_STATUS_SUCCESS; + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + GET(INFINI_DEVICE_CPU, cpu) +#endif +#ifdef ENABLE_NVIDIA_API + GET(INFINI_DEVICE_NVIDIA, nvidia) +#endif +#ifdef ENABLE_ILUVATAR_API + GET(INFINI_DEVICE_ILUVATAR, nvidia) +#endif +#ifdef ENABLE_QY_API + GET(INFINI_DEVICE_QY, nvidia) +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } +#undef GET + + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} + +__C infiniStatus_t infiniopTan( + infiniopTanDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *y, + const void *x, + void *stream) { + +#define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ + return reinterpret_cast(desc) \ + ->calculate(workspace, workspace_size, y, {x}, stream) + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + CALCULATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + CALCULATE(INFINI_DEVICE_QY, nvidia); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CALCULATE +} + +__C infiniStatus_t +infiniopDestroyTanDescriptor(infiniopTanDescriptor_t desc) { + +#define DELETE(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast(desc); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + DELETE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + DELETE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + DELETE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + DELETE(INFINI_DEVICE_QY, nvidia); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef DELETE +} diff --git a/test/infiniop/abs.py b/test/infiniop/abs.py new file mode 100644 index 000000000..df8748a97 --- /dev/null +++ b/test/infiniop/abs.py @@ -0,0 +1,164 @@ +import ctypes +from ctypes import c_uint64 +from enum import Enum, auto + +import torch +from libinfiniop import ( + 
LIBINFINIOP, + InfiniDeviceNames, + InfiniDtype, + InfiniDtypeNames, + TestTensor, + TestWorkspace, + check_error, + debug, + get_args, + get_test_devices, + get_tolerance, + infiniopOperatorDescriptor_t, + profile_operation, + test_operator, +) + +# ============================================================================== +# Configuration (Internal Use Only) +# ============================================================================== +# These are not meant to be imported from other modules +_TEST_CASES_ = [ + # tensor_shape, inplace + ((1, 3),), + ((3, 3),), + ((32, 20, 512),), + ((33, 333, 333),), + ((32, 256, 112, 112),), + ((3, 3, 13, 9, 17),), +] + + +class Inplace(Enum): + OUT_OF_PLACE = auto() + INPLACE_X = auto() + + +# Inplace options applied for each test case in _TEST_CASES_ +_INPLACE = [ + Inplace.OUT_OF_PLACE, + Inplace.INPLACE_X, +] + +# Form the test cases by appending each element of _INPLACE to each tuple in _TEST_CASES_ +_TEST_CASES = [ + test_case + (inplace_item,) + for test_case in _TEST_CASES_ + for inplace_item in _INPLACE +] + +# Data types used for testing (matching old operators library: only F16 and F32) +_TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.F32] + +# Tolerance map for different data types +_TOLERANCE_MAP = { + InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3}, + InfiniDtype.F32: {"atol": 1e-7, "rtol": 1e-7}, +} + +DEBUG = False +PROFILE = False +NUM_PRERUN = 10 +NUM_ITERATIONS = 1000 + + +def abs_op(x): + return torch.abs(x).to(x.dtype) + + +def test( + handle, device, shape, inplace=Inplace.OUT_OF_PLACE, dtype=InfiniDtype.F16, sync=None +): + # Generate test tensors with values in range [-1, 1) for abs operation + x_torch_tensor = torch.rand(shape) * 2 - 1 + + x = TestTensor( + shape, + x_torch_tensor.stride(), + dtype, + device, + mode="manual", + set_tensor=x_torch_tensor, + ) + + if inplace == Inplace.INPLACE_X: + y = x + else: + y = TestTensor(shape, None, dtype, device) + + if y.is_broadcast(): + return + + print( + f"Testing Abs on {InfiniDeviceNames[device]} with shape:{shape} dtype:{InfiniDtypeNames[dtype]} inplace: {inplace}" + ) + + ans = abs_op(x.torch_tensor()) + + if sync is not None: + sync() + + descriptor = infiniopOperatorDescriptor_t() + check_error( + LIBINFINIOP.infiniopCreateAbsDescriptor( + handle, ctypes.byref(descriptor), y.descriptor, x.descriptor + ) + ) + + # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel + for tensor in [x, y]: + tensor.destroy_desc() + + workspace_size = c_uint64(0) + check_error( + LIBINFINIOP.infiniopGetAbsWorkspaceSize( + descriptor, ctypes.byref(workspace_size) + ) + ) + workspace = TestWorkspace(workspace_size.value, y.device) + + def lib_abs(): + check_error( + LIBINFINIOP.infiniopAbs( + descriptor, workspace.data(), workspace_size.value, y.data(), x.data(), None + ) + ) + + lib_abs() + if sync is not None: + sync() + + atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype) + if DEBUG: + debug(y.actual_tensor(), ans, atol=atol, rtol=rtol) + assert torch.allclose(y.actual_tensor(), ans, atol=atol, rtol=rtol) + + # Profiling workflow + if PROFILE: + # fmt: off + profile_operation("PyTorch", lambda: abs_op(x.torch_tensor()), device, NUM_PRERUN, NUM_ITERATIONS) + profile_operation(" lib", lambda: lib_abs(), device, NUM_PRERUN, NUM_ITERATIONS) + # fmt: on + + check_error(LIBINFINIOP.infiniopDestroyAbsDescriptor(descriptor)) + + +if __name__ == "__main__": + args = get_args() + + # Configure testing options + DEBUG = args.debug + PROFILE = 
args.profile + NUM_PRERUN = args.num_prerun + NUM_ITERATIONS = args.num_iterations + + for device in get_test_devices(args): + test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES) + + print("\033[92mTest passed!\033[0m") diff --git a/test/infiniop/acos.py b/test/infiniop/acos.py new file mode 100644 index 000000000..d39e966c4 --- /dev/null +++ b/test/infiniop/acos.py @@ -0,0 +1,165 @@ +import ctypes +from ctypes import c_uint64 +from enum import Enum, auto + +import torch +from libinfiniop import ( + LIBINFINIOP, + InfiniDeviceNames, + InfiniDtype, + InfiniDtypeNames, + TestTensor, + TestWorkspace, + check_error, + debug, + get_args, + get_test_devices, + get_tolerance, + infiniopOperatorDescriptor_t, + profile_operation, + test_operator, +) + +# ============================================================================== +# Configuration (Internal Use Only) +# ============================================================================== +# These are not meant to be imported from other modules +_TEST_CASES_ = [ + # tensor_shape, inplace + ((1, 3),), + ((3, 3),), + ((32, 20, 512),), + ((33, 333, 333),), + ((32, 256, 112, 112),), + ((3, 3, 13, 9, 17),), +] + + +class Inplace(Enum): + OUT_OF_PLACE = auto() + INPLACE_X = auto() + + +# Inplace options applied for each test case in _TEST_CASES_ +_INPLACE = [ + Inplace.OUT_OF_PLACE, + Inplace.INPLACE_X, +] + +# Form the test cases by appending each element of _INPLACE to each tuple in _TEST_CASES_ +_TEST_CASES = [ + test_case + (inplace_item,) + for test_case in _TEST_CASES_ + for inplace_item in _INPLACE +] + +# Data types used for testing (matching old operators library: only F16 and F32) +_TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.F32] + +# Tolerance map for different data types +_TOLERANCE_MAP = { + InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3}, + InfiniDtype.F32: {"atol": 1e-7, "rtol": 1e-7}, +} + +DEBUG = False +PROFILE = False +NUM_PRERUN = 10 +NUM_ITERATIONS = 1000 + + +def acos_op(x): + return torch.acos(x).to(x.dtype) + + +def test( + handle, device, shape, inplace=Inplace.OUT_OF_PLACE, dtype=InfiniDtype.F16, sync=None +): + # Generate test tensors with values in range [-1, 1) for acos operation + # acos domain is [-1, 1], so we use range [-1, 1) + x_torch_tensor = torch.rand(shape) * 2 - 1 + + x = TestTensor( + shape, + x_torch_tensor.stride(), + dtype, + device, + mode="manual", + set_tensor=x_torch_tensor, + ) + + if inplace == Inplace.INPLACE_X: + y = x + else: + y = TestTensor(shape, None, dtype, device) + + if y.is_broadcast(): + return + + print( + f"Testing Acos on {InfiniDeviceNames[device]} with shape:{shape} dtype:{InfiniDtypeNames[dtype]} inplace: {inplace}" + ) + + ans = acos_op(x.torch_tensor()) + + if sync is not None: + sync() + + descriptor = infiniopOperatorDescriptor_t() + check_error( + LIBINFINIOP.infiniopCreateAcosDescriptor( + handle, ctypes.byref(descriptor), y.descriptor, x.descriptor + ) + ) + + # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel + for tensor in [x, y]: + tensor.destroy_desc() + + workspace_size = c_uint64(0) + check_error( + LIBINFINIOP.infiniopGetAcosWorkspaceSize( + descriptor, ctypes.byref(workspace_size) + ) + ) + workspace = TestWorkspace(workspace_size.value, y.device) + + def lib_acos(): + check_error( + LIBINFINIOP.infiniopAcos( + descriptor, workspace.data(), workspace_size.value, y.data(), x.data(), None + ) + ) + + lib_acos() + if sync is not None: + sync() + + atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype) + if 
DEBUG: + debug(y.actual_tensor(), ans, atol=atol, rtol=rtol) + assert torch.allclose(y.actual_tensor(), ans, atol=atol, rtol=rtol, equal_nan=True) + + # Profiling workflow + if PROFILE: + # fmt: off + profile_operation("PyTorch", lambda: acos_op(x.torch_tensor()), device, NUM_PRERUN, NUM_ITERATIONS) + profile_operation(" lib", lambda: lib_acos(), device, NUM_PRERUN, NUM_ITERATIONS) + # fmt: on + + check_error(LIBINFINIOP.infiniopDestroyAcosDescriptor(descriptor)) + + +if __name__ == "__main__": + args = get_args() + + # Configure testing options + DEBUG = args.debug + PROFILE = args.profile + NUM_PRERUN = args.num_prerun + NUM_ITERATIONS = args.num_iterations + + for device in get_test_devices(args): + test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES) + + print("\033[92mTest passed!\033[0m") diff --git a/test/infiniop/acosh.py b/test/infiniop/acosh.py new file mode 100644 index 000000000..c6777998b --- /dev/null +++ b/test/infiniop/acosh.py @@ -0,0 +1,165 @@ +import ctypes +from ctypes import c_uint64 +from enum import Enum, auto + +import torch +from libinfiniop import ( + LIBINFINIOP, + InfiniDeviceNames, + InfiniDtype, + InfiniDtypeNames, + TestTensor, + TestWorkspace, + check_error, + debug, + get_args, + get_test_devices, + get_tolerance, + infiniopOperatorDescriptor_t, + profile_operation, + test_operator, +) + +# ============================================================================== +# Configuration (Internal Use Only) +# ============================================================================== +# These are not meant to be imported from other modules +_TEST_CASES_ = [ + # tensor_shape, inplace + ((1, 3),), + ((3, 3),), + ((32, 20, 512),), + ((33, 333, 333),), + ((32, 256, 112, 112),), + ((3, 3, 13, 9, 17),), +] + + +class Inplace(Enum): + OUT_OF_PLACE = auto() + INPLACE_X = auto() + + +# Inplace options applied for each test case in _TEST_CASES_ +_INPLACE = [ + Inplace.OUT_OF_PLACE, + Inplace.INPLACE_X, +] + +# Form the test cases by appending each element of _INPLACE to each tuple in _TEST_CASES_ +_TEST_CASES = [ + test_case + (inplace_item,) + for test_case in _TEST_CASES_ + for inplace_item in _INPLACE +] + +# Data types used for testing (matching old operators library: only F16 and F32) +_TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.F32] + +# Tolerance map for different data types +_TOLERANCE_MAP = { + InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3}, + InfiniDtype.F32: {"atol": 1e-7, "rtol": 1e-7}, +} + +DEBUG = False +PROFILE = False +NUM_PRERUN = 10 +NUM_ITERATIONS = 1000 + + +def acosh_op(x): + return torch.acosh(x).to(x.dtype) + + +def test( + handle, device, shape, inplace=Inplace.OUT_OF_PLACE, dtype=InfiniDtype.F16, sync=None +): + # Generate test tensors with values in range [1, 101) for acosh operation + # acosh domain is [1, +∞), so we use range [1, 101) + x_torch_tensor = torch.rand(shape) * 100 + 1 + + x = TestTensor( + shape, + x_torch_tensor.stride(), + dtype, + device, + mode="manual", + set_tensor=x_torch_tensor, + ) + + if inplace == Inplace.INPLACE_X: + y = x + else: + y = TestTensor(shape, None, dtype, device) + + if y.is_broadcast(): + return + + print( + f"Testing Acosh on {InfiniDeviceNames[device]} with shape:{shape} dtype:{InfiniDtypeNames[dtype]} inplace: {inplace}" + ) + + ans = acosh_op(x.torch_tensor()) + + if sync is not None: + sync() + + descriptor = infiniopOperatorDescriptor_t() + check_error( + LIBINFINIOP.infiniopCreateAcoshDescriptor( + handle, ctypes.byref(descriptor), y.descriptor, x.descriptor + ) + ) + + # Invalidate the 
shape and strides in the descriptor to prevent them from being directly used by the kernel + for tensor in [x, y]: + tensor.destroy_desc() + + workspace_size = c_uint64(0) + check_error( + LIBINFINIOP.infiniopGetAcoshWorkspaceSize( + descriptor, ctypes.byref(workspace_size) + ) + ) + workspace = TestWorkspace(workspace_size.value, y.device) + + def lib_acosh(): + check_error( + LIBINFINIOP.infiniopAcosh( + descriptor, workspace.data(), workspace_size.value, y.data(), x.data(), None + ) + ) + + lib_acosh() + if sync is not None: + sync() + + atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype) + if DEBUG: + debug(y.actual_tensor(), ans, atol=atol, rtol=rtol) + assert torch.allclose(y.actual_tensor(), ans, atol=atol, rtol=rtol, equal_nan=True) + + # Profiling workflow + if PROFILE: + # fmt: off + profile_operation("PyTorch", lambda: acosh_op(x.torch_tensor()), device, NUM_PRERUN, NUM_ITERATIONS) + profile_operation(" lib", lambda: lib_acosh(), device, NUM_PRERUN, NUM_ITERATIONS) + # fmt: on + + check_error(LIBINFINIOP.infiniopDestroyAcoshDescriptor(descriptor)) + + +if __name__ == "__main__": + args = get_args() + + # Configure testing options + DEBUG = args.debug + PROFILE = args.profile + NUM_PRERUN = args.num_prerun + NUM_ITERATIONS = args.num_iterations + + for device in get_test_devices(args): + test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES) + + print("\033[92mTest passed!\033[0m") diff --git a/test/infiniop/asin.py b/test/infiniop/asin.py new file mode 100644 index 000000000..18cf0ec8e --- /dev/null +++ b/test/infiniop/asin.py @@ -0,0 +1,165 @@ +import ctypes +from ctypes import c_uint64 +from enum import Enum, auto + +import torch +from libinfiniop import ( + LIBINFINIOP, + InfiniDeviceNames, + InfiniDtype, + InfiniDtypeNames, + TestTensor, + TestWorkspace, + check_error, + debug, + get_args, + get_test_devices, + get_tolerance, + infiniopOperatorDescriptor_t, + profile_operation, + test_operator, +) + +# ============================================================================== +# Configuration (Internal Use Only) +# ============================================================================== +# These are not meant to be imported from other modules +_TEST_CASES_ = [ + # tensor_shape, inplace + ((1, 3),), + ((3, 3),), + ((32, 20, 512),), + ((33, 333, 333),), + ((32, 256, 112, 112),), + ((3, 3, 13, 9, 17),), +] + + +class Inplace(Enum): + OUT_OF_PLACE = auto() + INPLACE_X = auto() + + +# Inplace options applied for each test case in _TEST_CASES_ +_INPLACE = [ + Inplace.OUT_OF_PLACE, + Inplace.INPLACE_X, +] + +# Form the test cases by appending each element of _INPLACE to each tuple in _TEST_CASES_ +_TEST_CASES = [ + test_case + (inplace_item,) + for test_case in _TEST_CASES_ + for inplace_item in _INPLACE +] + +# Data types used for testing (matching old operators library: only F16 and F32) +_TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.F32] + +# Tolerance map for different data types +_TOLERANCE_MAP = { + InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3}, + InfiniDtype.F32: {"atol": 1e-7, "rtol": 1e-7}, +} + +DEBUG = False +PROFILE = False +NUM_PRERUN = 10 +NUM_ITERATIONS = 1000 + + +def asin_op(x): + return torch.asin(x).to(x.dtype) + + +def test( + handle, device, shape, inplace=Inplace.OUT_OF_PLACE, dtype=InfiniDtype.F16, sync=None +): + # Generate test tensors with values in range [-1, 1) for asin operation + # asin domain is [-1, 1], so we use range [-1, 1) + x_torch_tensor = torch.rand(shape) * 2 - 1 + + x = TestTensor( + shape, + x_torch_tensor.stride(), + dtype, + 
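+        # mode="manual" with set_tensor seeds the tensor from x_torch_tensor
+        # above instead of the harness's default random init (as TestTensor is
+        # used throughout this test suite)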
device, + mode="manual", + set_tensor=x_torch_tensor, + ) + + if inplace == Inplace.INPLACE_X: + y = x + else: + y = TestTensor(shape, None, dtype, device) + + if y.is_broadcast(): + return + + print( + f"Testing Asin on {InfiniDeviceNames[device]} with shape:{shape} dtype:{InfiniDtypeNames[dtype]} inplace: {inplace}" + ) + + ans = asin_op(x.torch_tensor()) + + if sync is not None: + sync() + + descriptor = infiniopOperatorDescriptor_t() + check_error( + LIBINFINIOP.infiniopCreateAsinDescriptor( + handle, ctypes.byref(descriptor), y.descriptor, x.descriptor + ) + ) + + # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel + for tensor in [x, y]: + tensor.destroy_desc() + + workspace_size = c_uint64(0) + check_error( + LIBINFINIOP.infiniopGetAsinWorkspaceSize( + descriptor, ctypes.byref(workspace_size) + ) + ) + workspace = TestWorkspace(workspace_size.value, y.device) + + def lib_asin(): + check_error( + LIBINFINIOP.infiniopAsin( + descriptor, workspace.data(), workspace_size.value, y.data(), x.data(), None + ) + ) + + lib_asin() + if sync is not None: + sync() + + atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype) + if DEBUG: + debug(y.actual_tensor(), ans, atol=atol, rtol=rtol) + assert torch.allclose(y.actual_tensor(), ans, atol=atol, rtol=rtol, equal_nan=True) + + # Profiling workflow + if PROFILE: + # fmt: off + profile_operation("PyTorch", lambda: asin_op(x.torch_tensor()), device, NUM_PRERUN, NUM_ITERATIONS) + profile_operation(" lib", lambda: lib_asin(), device, NUM_PRERUN, NUM_ITERATIONS) + # fmt: on + + check_error(LIBINFINIOP.infiniopDestroyAsinDescriptor(descriptor)) + + +if __name__ == "__main__": + args = get_args() + + # Configure testing options + DEBUG = args.debug + PROFILE = args.profile + NUM_PRERUN = args.num_prerun + NUM_ITERATIONS = args.num_iterations + + for device in get_test_devices(args): + test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES) + + print("\033[92mTest passed!\033[0m") diff --git a/test/infiniop/asinh.py b/test/infiniop/asinh.py new file mode 100644 index 000000000..d051d486e --- /dev/null +++ b/test/infiniop/asinh.py @@ -0,0 +1,165 @@ +import ctypes +from ctypes import c_uint64 +from enum import Enum, auto + +import torch +from libinfiniop import ( + LIBINFINIOP, + InfiniDeviceNames, + InfiniDtype, + InfiniDtypeNames, + TestTensor, + TestWorkspace, + check_error, + debug, + get_args, + get_test_devices, + get_tolerance, + infiniopOperatorDescriptor_t, + profile_operation, + test_operator, +) + +# ============================================================================== +# Configuration (Internal Use Only) +# ============================================================================== +# These are not meant to be imported from other modules +_TEST_CASES_ = [ + # tensor_shape, inplace + ((1, 3),), + ((3, 3),), + ((32, 20, 512),), + ((33, 333, 333),), + ((32, 256, 112, 112),), + ((3, 3, 13, 9, 17),), +] + + +class Inplace(Enum): + OUT_OF_PLACE = auto() + INPLACE_X = auto() + + +# Inplace options applied for each test case in _TEST_CASES_ +_INPLACE = [ + Inplace.OUT_OF_PLACE, + Inplace.INPLACE_X, +] + +# Form the test cases by appending each element of _INPLACE to each tuple in _TEST_CASES_ +_TEST_CASES = [ + test_case + (inplace_item,) + for test_case in _TEST_CASES_ + for inplace_item in _INPLACE +] + +# Data types used for testing (matching old operators library: only F16 and F32) +_TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.F32] + +# Tolerance map for different data types 
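+# get_tolerance(_TOLERANCE_MAP, dtype) below unpacks an entry into the
+# (atol, rtol) pair passed to torch.allclose when validating the result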
+_TOLERANCE_MAP = { + InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3}, + InfiniDtype.F32: {"atol": 1e-7, "rtol": 1e-7}, +} + +DEBUG = False +PROFILE = False +NUM_PRERUN = 10 +NUM_ITERATIONS = 1000 + + +def asinh_op(x): + return torch.asinh(x).to(x.dtype) + + +def test( + handle, device, shape, inplace=Inplace.OUT_OF_PLACE, dtype=InfiniDtype.F16, sync=None +): + # Generate test tensors with values in range [0, 100) for asinh operation + # asinh domain is (-∞, +∞), so we use range [0, 100) + x_torch_tensor = torch.rand(shape) * 100 + + x = TestTensor( + shape, + x_torch_tensor.stride(), + dtype, + device, + mode="manual", + set_tensor=x_torch_tensor, + ) + + if inplace == Inplace.INPLACE_X: + y = x + else: + y = TestTensor(shape, None, dtype, device) + + if y.is_broadcast(): + return + + print( + f"Testing Asinh on {InfiniDeviceNames[device]} with shape:{shape} dtype:{InfiniDtypeNames[dtype]} inplace: {inplace}" + ) + + ans = asinh_op(x.torch_tensor()) + + if sync is not None: + sync() + + descriptor = infiniopOperatorDescriptor_t() + check_error( + LIBINFINIOP.infiniopCreateAsinhDescriptor( + handle, ctypes.byref(descriptor), y.descriptor, x.descriptor + ) + ) + + # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel + for tensor in [x, y]: + tensor.destroy_desc() + + workspace_size = c_uint64(0) + check_error( + LIBINFINIOP.infiniopGetAsinhWorkspaceSize( + descriptor, ctypes.byref(workspace_size) + ) + ) + workspace = TestWorkspace(workspace_size.value, y.device) + + def lib_asinh(): + check_error( + LIBINFINIOP.infiniopAsinh( + descriptor, workspace.data(), workspace_size.value, y.data(), x.data(), None + ) + ) + + lib_asinh() + if sync is not None: + sync() + + atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype) + if DEBUG: + debug(y.actual_tensor(), ans, atol=atol, rtol=rtol) + assert torch.allclose(y.actual_tensor(), ans, atol=atol, rtol=rtol, equal_nan=True) + + # Profiling workflow + if PROFILE: + # fmt: off + profile_operation("PyTorch", lambda: asinh_op(x.torch_tensor()), device, NUM_PRERUN, NUM_ITERATIONS) + profile_operation(" lib", lambda: lib_asinh(), device, NUM_PRERUN, NUM_ITERATIONS) + # fmt: on + + check_error(LIBINFINIOP.infiniopDestroyAsinhDescriptor(descriptor)) + + +if __name__ == "__main__": + args = get_args() + + # Configure testing options + DEBUG = args.debug + PROFILE = args.profile + NUM_PRERUN = args.num_prerun + NUM_ITERATIONS = args.num_iterations + + for device in get_test_devices(args): + test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES) + + print("\033[92mTest passed!\033[0m") diff --git a/test/infiniop/atan.py b/test/infiniop/atan.py new file mode 100644 index 000000000..01fceff5b --- /dev/null +++ b/test/infiniop/atan.py @@ -0,0 +1,164 @@ +import ctypes +from ctypes import c_uint64 +from enum import Enum, auto + +import torch +from libinfiniop import ( + LIBINFINIOP, + InfiniDeviceNames, + InfiniDtype, + InfiniDtypeNames, + TestTensor, + TestWorkspace, + check_error, + debug, + get_args, + get_test_devices, + get_tolerance, + infiniopOperatorDescriptor_t, + profile_operation, + test_operator, +) + +# ============================================================================== +# Configuration (Internal Use Only) +# ============================================================================== +# These are not meant to be imported from other modules +_TEST_CASES_ = [ + # tensor_shape, inplace + ((1, 3),), + ((3, 3, 13, 9, 17),), + ((32, 20, 512),), + ((33, 333, 333),), + ((32, 256, 112, 112),), +] 
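+# Each entry above is a 1-tuple holding only the shape; the comprehension
+# below pairs it with every Inplace option, yielding tuples such as
+# ((1, 3), Inplace.OUT_OF_PLACE) that test() receives as its shape and
+# inplace arguments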
+ + +class Inplace(Enum): + OUT_OF_PLACE = auto() + INPLACE_X = auto() + + +# Inplace options applied for each test case in _TEST_CASES_ +_INPLACE = [ + Inplace.OUT_OF_PLACE, + Inplace.INPLACE_X, +] + +# Form the test cases by appending each element of _INPLACE to each tuple in _TEST_CASES_ +_TEST_CASES = [ + test_case + (inplace_item,) + for test_case in _TEST_CASES_ + for inplace_item in _INPLACE +] + +# Data types used for testing (matching old operators library: only F16 and F32) +_TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.F32] + +# Tolerance map for different data types +_TOLERANCE_MAP = { + InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3}, + InfiniDtype.F32: {"atol": 1e-7, "rtol": 1e-7}, +} + +DEBUG = False +PROFILE = False +NUM_PRERUN = 10 +NUM_ITERATIONS = 1000 + + +def atan_op(x): + return torch.atan(x).to(x.dtype) + + +def test( + handle, device, shape, inplace=Inplace.OUT_OF_PLACE, dtype=InfiniDtype.F16, sync=None +): + # Generate test tensors with values in range [-200, -100) for atan operation + # atan domain is (-∞, +∞), so we use range [-200, -100) + x_torch_tensor = torch.rand(shape) * 100 - 200 + + x = TestTensor( + shape, + x_torch_tensor.stride(), + dtype, + device, + mode="manual", + set_tensor=x_torch_tensor, + ) + + if inplace == Inplace.INPLACE_X: + y = x + else: + y = TestTensor(shape, None, dtype, device) + + if y.is_broadcast(): + return + + print( + f"Testing Atan on {InfiniDeviceNames[device]} with shape:{shape} dtype:{InfiniDtypeNames[dtype]} inplace: {inplace}" + ) + + ans = atan_op(x.torch_tensor()) + + if sync is not None: + sync() + + descriptor = infiniopOperatorDescriptor_t() + check_error( + LIBINFINIOP.infiniopCreateAtanDescriptor( + handle, ctypes.byref(descriptor), y.descriptor, x.descriptor + ) + ) + + # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel + for tensor in [x, y]: + tensor.destroy_desc() + + workspace_size = c_uint64(0) + check_error( + LIBINFINIOP.infiniopGetAtanWorkspaceSize( + descriptor, ctypes.byref(workspace_size) + ) + ) + workspace = TestWorkspace(workspace_size.value, y.device) + + def lib_atan(): + check_error( + LIBINFINIOP.infiniopAtan( + descriptor, workspace.data(), workspace_size.value, y.data(), x.data(), None + ) + ) + + lib_atan() + if sync is not None: + sync() + + atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype) + if DEBUG: + debug(y.actual_tensor(), ans, atol=atol, rtol=rtol) + assert torch.allclose(y.actual_tensor(), ans, atol=atol, rtol=rtol, equal_nan=True) + + # Profiling workflow + if PROFILE: + # fmt: off + profile_operation("PyTorch", lambda: atan_op(x.torch_tensor()), device, NUM_PRERUN, NUM_ITERATIONS) + profile_operation(" lib", lambda: lib_atan(), device, NUM_PRERUN, NUM_ITERATIONS) + # fmt: on + + check_error(LIBINFINIOP.infiniopDestroyAtanDescriptor(descriptor)) + + +if __name__ == "__main__": + args = get_args() + + # Configure testing options + DEBUG = args.debug + PROFILE = args.profile + NUM_PRERUN = args.num_prerun + NUM_ITERATIONS = args.num_iterations + + for device in get_test_devices(args): + test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES) + + print("\033[92mTest passed!\033[0m") diff --git a/test/infiniop/atanh.py b/test/infiniop/atanh.py new file mode 100644 index 000000000..74073a6f2 --- /dev/null +++ b/test/infiniop/atanh.py @@ -0,0 +1,165 @@ +import ctypes +from ctypes import c_uint64 +from enum import Enum, auto + +import torch +from libinfiniop import ( + LIBINFINIOP, + InfiniDeviceNames, + InfiniDtype, + InfiniDtypeNames, 
+    TestTensor,
+    TestWorkspace,
+    check_error,
+    debug,
+    get_args,
+    get_test_devices,
+    get_tolerance,
+    infiniopOperatorDescriptor_t,
+    profile_operation,
+    test_operator,
+)
+
+# ==============================================================================
+#  Configuration (Internal Use Only)
+# ==============================================================================
+# These are not meant to be imported from other modules
+_TEST_CASES_ = [
+    # tensor_shape, inplace
+    ((1, 3),),
+    ((3, 3),),
+    ((32, 20, 512),),
+    ((33, 333, 333),),
+    ((32, 256, 112, 112),),
+    ((3, 3, 13, 9, 17),),
+]
+
+
+class Inplace(Enum):
+    OUT_OF_PLACE = auto()
+    INPLACE_X = auto()
+
+
+# Inplace options applied for each test case in _TEST_CASES_
+_INPLACE = [
+    Inplace.OUT_OF_PLACE,
+    Inplace.INPLACE_X,
+]
+
+# Form the test cases by appending each element of _INPLACE to each tuple in _TEST_CASES_
+_TEST_CASES = [
+    test_case + (inplace_item,)
+    for test_case in _TEST_CASES_
+    for inplace_item in _INPLACE
+]
+
+# Data types used for testing (matching old operators library: only F16 and F32)
+_TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.F32]
+
+# Tolerance map for different data types
+_TOLERANCE_MAP = {
+    InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3},
+    InfiniDtype.F32: {"atol": 1e-7, "rtol": 1e-7},
+}
+
+DEBUG = False
+PROFILE = False
+NUM_PRERUN = 10
+NUM_ITERATIONS = 1000
+
+
+def atanh_op(x):
+    return torch.atanh(x).to(x.dtype)
+
+
+def test(
+    handle, device, shape, inplace=Inplace.OUT_OF_PLACE, dtype=InfiniDtype.F16, sync=None
+):
+    # Generate test tensors with values in range [-1, 1) for atanh operation
+    # atanh domain is the open interval (-1, 1); x in [-1, 1) can hit -1 exactly, where atanh is -inf (allclose matches like-signed infinities)
+    x_torch_tensor = torch.rand(shape) * 2 - 1
+
+    x = TestTensor(
+        shape,
+        x_torch_tensor.stride(),
+        dtype,
+        device,
+        mode="manual",
+        set_tensor=x_torch_tensor,
+    )
+
+    if inplace == Inplace.INPLACE_X:
+        y = x
+    else:
+        y = TestTensor(shape, None, dtype, device)
+
+    if y.is_broadcast():
+        return
+
+    print(
+        f"Testing Atanh on {InfiniDeviceNames[device]} with shape:{shape} dtype:{InfiniDtypeNames[dtype]} inplace: {inplace}"
+    )
+
+    ans = atanh_op(x.torch_tensor())
+
+    if sync is not None:
+        sync()
+
+    descriptor = infiniopOperatorDescriptor_t()
+    check_error(
+        LIBINFINIOP.infiniopCreateAtanhDescriptor(
+            handle, ctypes.byref(descriptor), y.descriptor, x.descriptor
+        )
+    )
+
+    # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel
+    for tensor in [x, y]:
+        tensor.destroy_desc()
+
+    workspace_size = c_uint64(0)
+    check_error(
+        LIBINFINIOP.infiniopGetAtanhWorkspaceSize(
+            descriptor, ctypes.byref(workspace_size)
+        )
+    )
+    workspace = TestWorkspace(workspace_size.value, y.device)
+
+    def lib_atanh():
+        check_error(
+            LIBINFINIOP.infiniopAtanh(
+                descriptor, workspace.data(), workspace_size.value, y.data(), x.data(), None
+            )
+        )
+
+    lib_atanh()
+    if sync is not None:
+        sync()
+
+    atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype)
+    if DEBUG:
+        debug(y.actual_tensor(), ans, atol=atol, rtol=rtol)
+    assert torch.allclose(y.actual_tensor(), ans, atol=atol, rtol=rtol, equal_nan=True)
+
+    # Profiling workflow
+    if PROFILE:
+        # fmt: off
+        profile_operation("PyTorch", lambda: atanh_op(x.torch_tensor()), device, NUM_PRERUN, NUM_ITERATIONS)
+        profile_operation(" lib", lambda: lib_atanh(), device, NUM_PRERUN, NUM_ITERATIONS)
+        # fmt: on
+
+    check_error(LIBINFINIOP.infiniopDestroyAtanhDescriptor(descriptor))
+
+
+if __name__ == "__main__":
+    args = get_args()
+
+    # Configure testing options
+    DEBUG =
args.debug + PROFILE = args.profile + NUM_PRERUN = args.num_prerun + NUM_ITERATIONS = args.num_iterations + + for device in get_test_devices(args): + test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES) + + print("\033[92mTest passed!\033[0m") diff --git a/test/infiniop/ceil.py b/test/infiniop/ceil.py new file mode 100644 index 000000000..afc1993c1 --- /dev/null +++ b/test/infiniop/ceil.py @@ -0,0 +1,165 @@ +import ctypes +from ctypes import c_uint64 +from enum import Enum, auto + +import torch +from libinfiniop import ( + LIBINFINIOP, + InfiniDeviceNames, + InfiniDtype, + InfiniDtypeNames, + TestTensor, + TestWorkspace, + check_error, + debug, + get_args, + get_test_devices, + get_tolerance, + infiniopOperatorDescriptor_t, + profile_operation, + test_operator, +) + +# ============================================================================== +# Configuration (Internal Use Only) +# ============================================================================== +# These are not meant to be imported from other modules +_TEST_CASES_ = [ + # tensor_shape, inplace + ((1, 3),), + ((3, 3),), + ((32, 20, 512),), + ((33, 333, 333),), + ((32, 256, 112, 112),), + ((3, 3, 13, 9, 17),), +] + + +class Inplace(Enum): + OUT_OF_PLACE = auto() + INPLACE_X = auto() + + +# Inplace options applied for each test case in _TEST_CASES_ +_INPLACE = [ + Inplace.OUT_OF_PLACE, + Inplace.INPLACE_X, +] + +# Form the test cases by appending each element of _INPLACE to each tuple in _TEST_CASES_ +_TEST_CASES = [ + test_case + (inplace_item,) + for test_case in _TEST_CASES_ + for inplace_item in _INPLACE +] + +# Data types used for testing (matching old operators library: only F16 and F32) +_TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.F32] + +# Tolerance map for different data types +_TOLERANCE_MAP = { + InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3}, + InfiniDtype.F32: {"atol": 1e-7, "rtol": 1e-7}, +} + +DEBUG = False +PROFILE = False +NUM_PRERUN = 10 +NUM_ITERATIONS = 1000 + + +def ceil_op(x): + return torch.ceil(x).to(x.dtype) + + +def test( + handle, device, shape, inplace=Inplace.OUT_OF_PLACE, dtype=InfiniDtype.F16, sync=None +): + # Generate test tensors with values in range [-20, -10) for ceil operation + # ceil domain is (-∞, +∞), so we use range [-20, -10) + x_torch_tensor = torch.rand(shape) * 10 - 20 + + x = TestTensor( + shape, + x_torch_tensor.stride(), + dtype, + device, + mode="manual", + set_tensor=x_torch_tensor, + ) + + if inplace == Inplace.INPLACE_X: + y = x + else: + y = TestTensor(shape, None, dtype, device) + + if y.is_broadcast(): + return + + print( + f"Testing Ceil on {InfiniDeviceNames[device]} with shape:{shape} dtype:{InfiniDtypeNames[dtype]} inplace: {inplace}" + ) + + ans = ceil_op(x.torch_tensor()) + + if sync is not None: + sync() + + descriptor = infiniopOperatorDescriptor_t() + check_error( + LIBINFINIOP.infiniopCreateCeilDescriptor( + handle, ctypes.byref(descriptor), y.descriptor, x.descriptor + ) + ) + + # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel + for tensor in [x, y]: + tensor.destroy_desc() + + workspace_size = c_uint64(0) + check_error( + LIBINFINIOP.infiniopGetCeilWorkspaceSize( + descriptor, ctypes.byref(workspace_size) + ) + ) + workspace = TestWorkspace(workspace_size.value, y.device) + + def lib_ceil(): + check_error( + LIBINFINIOP.infiniopCeil( + descriptor, workspace.data(), workspace_size.value, y.data(), x.data(), None + ) + ) + + lib_ceil() + if sync is not None: + sync() + + atol, rtol = 
get_tolerance(_TOLERANCE_MAP, dtype) + if DEBUG: + debug(y.actual_tensor(), ans, atol=atol, rtol=rtol) + assert torch.allclose(y.actual_tensor(), ans, atol=atol, rtol=rtol, equal_nan=True) + + # Profiling workflow + if PROFILE: + # fmt: off + profile_operation("PyTorch", lambda: ceil_op(x.torch_tensor()), device, NUM_PRERUN, NUM_ITERATIONS) + profile_operation(" lib", lambda: lib_ceil(), device, NUM_PRERUN, NUM_ITERATIONS) + # fmt: on + + check_error(LIBINFINIOP.infiniopDestroyCeilDescriptor(descriptor)) + + +if __name__ == "__main__": + args = get_args() + + # Configure testing options + DEBUG = args.debug + PROFILE = args.profile + NUM_PRERUN = args.num_prerun + NUM_ITERATIONS = args.num_iterations + + for device in get_test_devices(args): + test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES) + + print("\033[92mTest passed!\033[0m") diff --git a/test/infiniop/cos.py b/test/infiniop/cos.py new file mode 100644 index 000000000..972f17b7b --- /dev/null +++ b/test/infiniop/cos.py @@ -0,0 +1,166 @@ +import ctypes +from ctypes import c_uint64 +from enum import Enum, auto + +import torch +from libinfiniop import ( + LIBINFINIOP, + InfiniDeviceNames, + InfiniDtype, + InfiniDtypeNames, + TestTensor, + TestWorkspace, + check_error, + debug, + get_args, + get_test_devices, + get_tolerance, + infiniopOperatorDescriptor_t, + profile_operation, + test_operator, +) + +# ============================================================================== +# Configuration (Internal Use Only) +# ============================================================================== +# These are not meant to be imported from other modules +_TEST_CASES_ = [ + # tensor_shape, inplace + ((1, 3),), + ((3, 3),), + ((32, 20, 512),), + ((33, 333, 333),), + ((32, 256, 112, 112),), + ((3, 3, 13, 9, 17),), +] + + +class Inplace(Enum): + OUT_OF_PLACE = auto() + INPLACE_X = auto() + + +# Inplace options applied for each test case in _TEST_CASES_ +_INPLACE = [ + Inplace.OUT_OF_PLACE, + Inplace.INPLACE_X, +] + +# Form the test cases by appending each element of _INPLACE to each tuple in _TEST_CASES_ +_TEST_CASES = [ + test_case + (inplace_item,) + for test_case in _TEST_CASES_ + for inplace_item in _INPLACE +] + +# Data types used for testing (matching old operators library: only F16 and F32) +_TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.F32] + +# Tolerance map for different data types +# Copied from old operators library: atol=1e-4, rtol=1e-2 +_TOLERANCE_MAP = { + InfiniDtype.F16: {"atol": 1e-4, "rtol": 1e-2}, + InfiniDtype.F32: {"atol": 1e-4, "rtol": 1e-2}, +} + +DEBUG = False +PROFILE = False +NUM_PRERUN = 10 +NUM_ITERATIONS = 1000 + + +def cos_op(x): + return torch.cos(x).to(x.dtype) + + +def test( + handle, device, shape, inplace=Inplace.OUT_OF_PLACE, dtype=InfiniDtype.F16, sync=None +): + # Generate test tensors with values in range [-200, -100) for cos operation + # cos domain is (-∞, +∞), so we use range [-200, -100) + x_torch_tensor = torch.rand(shape) * 100 - 200 + + x = TestTensor( + shape, + x_torch_tensor.stride(), + dtype, + device, + mode="manual", + set_tensor=x_torch_tensor, + ) + + if inplace == Inplace.INPLACE_X: + y = x + else: + y = TestTensor(shape, None, dtype, device) + + if y.is_broadcast(): + return + + print( + f"Testing Cos on {InfiniDeviceNames[device]} with shape:{shape} dtype:{InfiniDtypeNames[dtype]} inplace: {inplace}" + ) + + ans = cos_op(x.torch_tensor()) + + if sync is not None: + sync() + + descriptor = infiniopOperatorDescriptor_t() + check_error( + LIBINFINIOP.infiniopCreateCosDescriptor( 
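+            # ctypes.byref(descriptor) hands the C API a pointer it fills in
+            # with the new descriptor handle; check_error validates the
+            # returned infiniStatus_t (harness helper)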
+            handle, ctypes.byref(descriptor), y.descriptor, x.descriptor
+        )
+    )
+
+    # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel
+    for tensor in [x, y]:
+        tensor.destroy_desc()
+
+    workspace_size = c_uint64(0)
+    check_error(
+        LIBINFINIOP.infiniopGetCosWorkspaceSize(
+            descriptor, ctypes.byref(workspace_size)
+        )
+    )
+    workspace = TestWorkspace(workspace_size.value, y.device)
+
+    def lib_cos():
+        check_error(
+            LIBINFINIOP.infiniopCos(
+                descriptor, workspace.data(), workspace_size.value, y.data(), x.data(), None
+            )
+        )
+
+    lib_cos()
+    if sync is not None:
+        sync()
+
+    atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype)
+    if DEBUG:
+        debug(y.actual_tensor(), ans, atol=atol, rtol=rtol)
+    assert torch.allclose(y.actual_tensor(), ans, atol=atol, rtol=rtol, equal_nan=True)
+
+    # Profiling workflow
+    if PROFILE:
+        # fmt: off
+        profile_operation("PyTorch", lambda: cos_op(x.torch_tensor()), device, NUM_PRERUN, NUM_ITERATIONS)
+        profile_operation(" lib", lambda: lib_cos(), device, NUM_PRERUN, NUM_ITERATIONS)
+        # fmt: on
+
+    check_error(LIBINFINIOP.infiniopDestroyCosDescriptor(descriptor))
+
+
+if __name__ == "__main__":
+    args = get_args()
+
+    # Configure testing options
+    DEBUG = args.debug
+    PROFILE = args.profile
+    NUM_PRERUN = args.num_prerun
+    NUM_ITERATIONS = args.num_iterations
+
+    for device in get_test_devices(args):
+        test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES)
+
+    print("\033[92mTest passed!\033[0m")
diff --git a/test/infiniop/cosh.py b/test/infiniop/cosh.py
new file mode 100644
index 000000000..ee7994531
--- /dev/null
+++ b/test/infiniop/cosh.py
@@ -0,0 +1,165 @@
+import ctypes
+from ctypes import c_uint64
+from enum import Enum, auto
+
+import torch
+from libinfiniop import (
+    LIBINFINIOP,
+    InfiniDeviceNames,
+    InfiniDtype,
+    InfiniDtypeNames,
+    TestTensor,
+    TestWorkspace,
+    check_error,
+    debug,
+    get_args,
+    get_test_devices,
+    get_tolerance,
+    infiniopOperatorDescriptor_t,
+    profile_operation,
+    test_operator,
+)
+
+# ==============================================================================
+#  Configuration (Internal Use Only)
+# ==============================================================================
+# These are not meant to be imported from other modules
+_TEST_CASES_ = [
+    # tensor_shape, inplace
+    ((1, 3),),
+    ((3, 3),),
+    ((32, 20, 512),),
+    ((33, 333, 333),),
+    ((32, 256, 112, 112),),
+    ((3, 3, 13, 9, 17),),
+]
+
+
+class Inplace(Enum):
+    OUT_OF_PLACE = auto()
+    INPLACE_X = auto()
+
+
+# Inplace options applied for each test case in _TEST_CASES_
+_INPLACE = [
+    Inplace.OUT_OF_PLACE,
+    Inplace.INPLACE_X,
+]
+
+# Form the test cases by appending each element of _INPLACE to each tuple in _TEST_CASES_
+_TEST_CASES = [
+    test_case + (inplace_item,)
+    for test_case in _TEST_CASES_
+    for inplace_item in _INPLACE
+]
+
+# Data types used for testing (matching old operators library: only F16 and F32)
+_TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.F32]
+
+# Tolerance map for different data types
+_TOLERANCE_MAP = {
+    InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3},
+    InfiniDtype.F32: {"atol": 1e-7, "rtol": 1e-7},
+}
+
+DEBUG = False
+PROFILE = False
+NUM_PRERUN = 10
+NUM_ITERATIONS = 1000
+
+
+def cosh_op(x):
+    return torch.cosh(x).to(x.dtype)
+
+
+def test(
+    handle, device, shape, inplace=Inplace.OUT_OF_PLACE, dtype=InfiniDtype.F16, sync=None
+):
+    # Generate test tensors with values in range [-5, 5) for cosh operation
+    # cosh grows like e^|x|/2, so inputs near 100 overflow F16/F32 and the check would only compare inf == inf; [-5, 5) keeps results finite
+    x_torch_tensor = torch.rand(shape) * 10 - 5
+
+    x = TestTensor(
+        shape,
+        x_torch_tensor.stride(),
+        dtype,
+        device,
+        mode="manual",
+        set_tensor=x_torch_tensor,
+    )
+
+    if inplace == Inplace.INPLACE_X:
+        y = x
+    else:
+        y = TestTensor(shape, None, dtype, device)
+
+    if y.is_broadcast():
+        return
+
+    print(
+        f"Testing Cosh on {InfiniDeviceNames[device]} with shape:{shape} dtype:{InfiniDtypeNames[dtype]} inplace: {inplace}"
+    )
+
+    ans = cosh_op(x.torch_tensor())
+
+    if sync is not None:
+        sync()
+
+    descriptor = infiniopOperatorDescriptor_t()
+    check_error(
+        LIBINFINIOP.infiniopCreateCoshDescriptor(
+            handle, ctypes.byref(descriptor), y.descriptor, x.descriptor
+        )
+    )
+
+    # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel
+    for tensor in [x, y]:
+        tensor.destroy_desc()
+
+    workspace_size = c_uint64(0)
+    check_error(
+        LIBINFINIOP.infiniopGetCoshWorkspaceSize(
+            descriptor, ctypes.byref(workspace_size)
+        )
+    )
+    workspace = TestWorkspace(workspace_size.value, y.device)
+
+    def lib_cosh():
+        check_error(
+            LIBINFINIOP.infiniopCosh(
+                descriptor, workspace.data(), workspace_size.value, y.data(), x.data(), None
+            )
+        )
+
+    lib_cosh()
+    if sync is not None:
+        sync()
+
+    atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype)
+    if DEBUG:
+        debug(y.actual_tensor(), ans, atol=atol, rtol=rtol)
+    assert torch.allclose(y.actual_tensor(), ans, atol=atol, rtol=rtol, equal_nan=True)
+
+    # Profiling workflow
+    if PROFILE:
+        # fmt: off
+        profile_operation("PyTorch", lambda: cosh_op(x.torch_tensor()), device, NUM_PRERUN, NUM_ITERATIONS)
+        profile_operation(" lib", lambda: lib_cosh(), device, NUM_PRERUN, NUM_ITERATIONS)
+        # fmt: on
+
+    check_error(LIBINFINIOP.infiniopDestroyCoshDescriptor(descriptor))
+
+
+if __name__ == "__main__":
+    args = get_args()
+
+    # Configure testing options
+    DEBUG = args.debug
+    PROFILE = args.profile
+    NUM_PRERUN = args.num_prerun
+    NUM_ITERATIONS = args.num_iterations
+
+    for device in get_test_devices(args):
+        test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES)
+
+    print("\033[92mTest passed!\033[0m")
diff --git a/test/infiniop/erf.py b/test/infiniop/erf.py
new file mode 100644
index 000000000..f5f9c4cd9
--- /dev/null
+++ b/test/infiniop/erf.py
@@ -0,0 +1,165 @@
+import ctypes
+from ctypes import c_uint64
+from enum import Enum, auto
+
+import torch
+from libinfiniop import (
+    LIBINFINIOP,
+    InfiniDeviceNames,
+    InfiniDtype,
+    InfiniDtypeNames,
+    TestTensor,
+    TestWorkspace,
+    check_error,
+    debug,
+    get_args,
+    get_test_devices,
+    get_tolerance,
+    infiniopOperatorDescriptor_t,
+    profile_operation,
+    test_operator,
+)
+
+# ==============================================================================
+#  Configuration (Internal Use Only)
+# ==============================================================================
+# These are not meant to be imported from other modules
+_TEST_CASES_ = [
+    # tensor_shape, inplace
+    ((1, 3),),
+    ((3, 3),),
+    ((32, 20, 512),),
+    ((33, 333, 333),),
+    ((32, 256, 112, 112),),
+    ((3, 3, 13, 9, 17),),
+]
+
+
+class Inplace(Enum):
+    OUT_OF_PLACE = auto()
+    INPLACE_X = auto()
+
+
+# Inplace options applied for each test case in _TEST_CASES_
+_INPLACE = [
+    Inplace.OUT_OF_PLACE,
+    Inplace.INPLACE_X,
+]
+
+# Form the test cases by appending each element of _INPLACE to each tuple in _TEST_CASES_
+_TEST_CASES = [
+    test_case + (inplace_item,)
+    for test_case in _TEST_CASES_
+    for inplace_item in _INPLACE
+]
+
+# Data types used for testing (matching old operators library: only F16 and F32)
+_TENSOR_DTYPES
= [InfiniDtype.F16, InfiniDtype.F32] + +# Tolerance map for different data types +_TOLERANCE_MAP = { + InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3}, + InfiniDtype.F32: {"atol": 1e-7, "rtol": 1e-7}, +} + +DEBUG = False +PROFILE = False +NUM_PRERUN = 10 +NUM_ITERATIONS = 1000 + + +def erf_op(x): + return torch.erf(x).to(x.dtype) + + +def test( + handle, device, shape, inplace=Inplace.OUT_OF_PLACE, dtype=InfiniDtype.F16, sync=None +): + # Generate test tensors with values in range [-3, 3) for erf operation + # erf domain is (-∞, +∞), so we use range [-3, 3) + x_torch_tensor = torch.rand(shape) * 6 - 3 + + x = TestTensor( + shape, + x_torch_tensor.stride(), + dtype, + device, + mode="manual", + set_tensor=x_torch_tensor, + ) + + if inplace == Inplace.INPLACE_X: + y = x + else: + y = TestTensor(shape, None, dtype, device) + + if y.is_broadcast(): + return + + print( + f"Testing Erf on {InfiniDeviceNames[device]} with shape:{shape} dtype:{InfiniDtypeNames[dtype]} inplace: {inplace}" + ) + + ans = erf_op(x.torch_tensor()) + + if sync is not None: + sync() + + descriptor = infiniopOperatorDescriptor_t() + check_error( + LIBINFINIOP.infiniopCreateErfDescriptor( + handle, ctypes.byref(descriptor), y.descriptor, x.descriptor + ) + ) + + # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel + for tensor in [x, y]: + tensor.destroy_desc() + + workspace_size = c_uint64(0) + check_error( + LIBINFINIOP.infiniopGetErfWorkspaceSize( + descriptor, ctypes.byref(workspace_size) + ) + ) + workspace = TestWorkspace(workspace_size.value, y.device) + + def lib_erf(): + check_error( + LIBINFINIOP.infiniopErf( + descriptor, workspace.data(), workspace_size.value, y.data(), x.data(), None + ) + ) + + lib_erf() + if sync is not None: + sync() + + atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype) + if DEBUG: + debug(y.actual_tensor(), ans, atol=atol, rtol=rtol) + assert torch.allclose(y.actual_tensor(), ans, atol=atol, rtol=rtol, equal_nan=True) + + # Profiling workflow + if PROFILE: + # fmt: off + profile_operation("PyTorch", lambda: erf_op(x.torch_tensor()), device, NUM_PRERUN, NUM_ITERATIONS) + profile_operation(" lib", lambda: lib_erf(), device, NUM_PRERUN, NUM_ITERATIONS) + # fmt: on + + check_error(LIBINFINIOP.infiniopDestroyErfDescriptor(descriptor)) + + +if __name__ == "__main__": + args = get_args() + + # Configure testing options + DEBUG = args.debug + PROFILE = args.profile + NUM_PRERUN = args.num_prerun + NUM_ITERATIONS = args.num_iterations + + for device in get_test_devices(args): + test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES) + + print("\033[92mTest passed!\033[0m") diff --git a/test/infiniop/floor.py b/test/infiniop/floor.py new file mode 100644 index 000000000..b981da809 --- /dev/null +++ b/test/infiniop/floor.py @@ -0,0 +1,165 @@ +import ctypes +from ctypes import c_uint64 +from enum import Enum, auto + +import torch +from libinfiniop import ( + LIBINFINIOP, + InfiniDeviceNames, + InfiniDtype, + InfiniDtypeNames, + TestTensor, + TestWorkspace, + check_error, + debug, + get_args, + get_test_devices, + get_tolerance, + infiniopOperatorDescriptor_t, + profile_operation, + test_operator, +) + +# ============================================================================== +# Configuration (Internal Use Only) +# ============================================================================== +# These are not meant to be imported from other modules +_TEST_CASES_ = [ + # tensor_shape, inplace + ((1, 3),), + ((3, 3),), + ((32, 20, 512),), + 
((33, 333, 333),), + ((32, 256, 112, 112),), + ((3, 3, 13, 9, 17),), +] + + +class Inplace(Enum): + OUT_OF_PLACE = auto() + INPLACE_X = auto() + + +# Inplace options applied for each test case in _TEST_CASES_ +_INPLACE = [ + Inplace.OUT_OF_PLACE, + Inplace.INPLACE_X, +] + +# Form the test cases by appending each element of _INPLACE to each tuple in _TEST_CASES_ +_TEST_CASES = [ + test_case + (inplace_item,) + for test_case in _TEST_CASES_ + for inplace_item in _INPLACE +] + +# Data types used for testing (matching old operators library: only F16 and F32) +_TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.F32] + +# Tolerance map for different data types +_TOLERANCE_MAP = { + InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3}, + InfiniDtype.F32: {"atol": 1e-7, "rtol": 1e-7}, +} + +DEBUG = False +PROFILE = False +NUM_PRERUN = 10 +NUM_ITERATIONS = 1000 + + +def floor_op(x): + return torch.floor(x).to(x.dtype) + + +def test( + handle, device, shape, inplace=Inplace.OUT_OF_PLACE, dtype=InfiniDtype.F16, sync=None +): + # Generate test tensors with values in range [-20, -10) for floor operation + # floor domain is (-∞, +∞), so we use range [-20, -10) + x_torch_tensor = torch.rand(shape) * 10 - 20 + + x = TestTensor( + shape, + x_torch_tensor.stride(), + dtype, + device, + mode="manual", + set_tensor=x_torch_tensor, + ) + + if inplace == Inplace.INPLACE_X: + y = x + else: + y = TestTensor(shape, None, dtype, device) + + if y.is_broadcast(): + return + + print( + f"Testing Floor on {InfiniDeviceNames[device]} with shape:{shape} dtype:{InfiniDtypeNames[dtype]} inplace: {inplace}" + ) + + ans = floor_op(x.torch_tensor()) + + if sync is not None: + sync() + + descriptor = infiniopOperatorDescriptor_t() + check_error( + LIBINFINIOP.infiniopCreateFloorDescriptor( + handle, ctypes.byref(descriptor), y.descriptor, x.descriptor + ) + ) + + # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel + for tensor in [x, y]: + tensor.destroy_desc() + + workspace_size = c_uint64(0) + check_error( + LIBINFINIOP.infiniopGetFloorWorkspaceSize( + descriptor, ctypes.byref(workspace_size) + ) + ) + workspace = TestWorkspace(workspace_size.value, y.device) + + def lib_floor(): + check_error( + LIBINFINIOP.infiniopFloor( + descriptor, workspace.data(), workspace_size.value, y.data(), x.data(), None + ) + ) + + lib_floor() + if sync is not None: + sync() + + atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype) + if DEBUG: + debug(y.actual_tensor(), ans, atol=atol, rtol=rtol) + assert torch.allclose(y.actual_tensor(), ans, atol=atol, rtol=rtol, equal_nan=True) + + # Profiling workflow + if PROFILE: + # fmt: off + profile_operation("PyTorch", lambda: floor_op(x.torch_tensor()), device, NUM_PRERUN, NUM_ITERATIONS) + profile_operation(" lib", lambda: lib_floor(), device, NUM_PRERUN, NUM_ITERATIONS) + # fmt: on + + check_error(LIBINFINIOP.infiniopDestroyFloorDescriptor(descriptor)) + + +if __name__ == "__main__": + args = get_args() + + # Configure testing options + DEBUG = args.debug + PROFILE = args.profile + NUM_PRERUN = args.num_prerun + NUM_ITERATIONS = args.num_iterations + + for device in get_test_devices(args): + test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES) + + print("\033[92mTest passed!\033[0m") diff --git a/test/infiniop/libinfiniop/op_register.py b/test/infiniop/libinfiniop/op_register.py index a61cea018..20a9188d6 100644 --- a/test/infiniop/libinfiniop/op_register.py +++ b/test/infiniop/libinfiniop/op_register.py @@ -496,6 +496,589 @@ def rearrange_(lib): 
lib.infiniopDestroyRearrangeDescriptor.argtypes = [infiniopOperatorDescriptor_t] +@OpRegister.operator +def abs_(lib): + lib.infiniopCreateAbsDescriptor.restype = c_int32 + lib.infiniopCreateAbsDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopOperatorDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + ] + + lib.infiniopGetAbsWorkspaceSize.restype = c_int32 + lib.infiniopGetAbsWorkspaceSize.argtypes = [ + infiniopOperatorDescriptor_t, + POINTER(c_size_t), + ] + + lib.infiniopAbs.restype = c_int32 + lib.infiniopAbs.argtypes = [ + infiniopOperatorDescriptor_t, + c_void_p, + c_size_t, + c_void_p, + c_void_p, + c_void_p, + ] + + lib.infiniopDestroyAbsDescriptor.restype = c_int32 + lib.infiniopDestroyAbsDescriptor.argtypes = [ + infiniopOperatorDescriptor_t, + ] + + +@OpRegister.operator +def acos_(lib): + lib.infiniopCreateAcosDescriptor.restype = c_int32 + lib.infiniopCreateAcosDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopOperatorDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + ] + lib.infiniopGetAcosWorkspaceSize.restype = c_int32 + lib.infiniopGetAcosWorkspaceSize.argtypes = [ + infiniopOperatorDescriptor_t, + POINTER(c_size_t), + ] + lib.infiniopAcos.restype = c_int32 + lib.infiniopAcos.argtypes = [ + infiniopOperatorDescriptor_t, + c_void_p, + c_size_t, + c_void_p, + c_void_p, + c_void_p, + ] + lib.infiniopDestroyAcosDescriptor.restype = c_int32 + lib.infiniopDestroyAcosDescriptor.argtypes = [ + infiniopOperatorDescriptor_t, + ] + + +@OpRegister.operator +def acosh_(lib): + lib.infiniopCreateAcoshDescriptor.restype = c_int32 + lib.infiniopCreateAcoshDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopOperatorDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + ] + lib.infiniopGetAcoshWorkspaceSize.restype = c_int32 + lib.infiniopGetAcoshWorkspaceSize.argtypes = [ + infiniopOperatorDescriptor_t, + POINTER(c_size_t), + ] + lib.infiniopAcosh.restype = c_int32 + lib.infiniopAcosh.argtypes = [ + infiniopOperatorDescriptor_t, + c_void_p, + c_size_t, + c_void_p, + c_void_p, + c_void_p, + ] + lib.infiniopDestroyAcoshDescriptor.restype = c_int32 + lib.infiniopDestroyAcoshDescriptor.argtypes = [ + infiniopOperatorDescriptor_t, + ] + + +@OpRegister.operator +def asin_(lib): + lib.infiniopCreateAsinDescriptor.restype = c_int32 + lib.infiniopCreateAsinDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopOperatorDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + ] + lib.infiniopGetAsinWorkspaceSize.restype = c_int32 + lib.infiniopGetAsinWorkspaceSize.argtypes = [ + infiniopOperatorDescriptor_t, + POINTER(c_size_t), + ] + lib.infiniopAsin.restype = c_int32 + lib.infiniopAsin.argtypes = [ + infiniopOperatorDescriptor_t, + c_void_p, + c_size_t, + c_void_p, + c_void_p, + c_void_p, + ] + lib.infiniopDestroyAsinDescriptor.restype = c_int32 + lib.infiniopDestroyAsinDescriptor.argtypes = [ + infiniopOperatorDescriptor_t, + ] + + +@OpRegister.operator +def asinh_(lib): + lib.infiniopCreateAsinhDescriptor.restype = c_int32 + lib.infiniopCreateAsinhDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopOperatorDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + ] + lib.infiniopGetAsinhWorkspaceSize.restype = c_int32 + lib.infiniopGetAsinhWorkspaceSize.argtypes = [ + infiniopOperatorDescriptor_t, + POINTER(c_size_t), + ] + lib.infiniopAsinh.restype = c_int32 + lib.infiniopAsinh.argtypes = [ + infiniopOperatorDescriptor_t, + c_void_p, 
+ c_size_t, + c_void_p, + c_void_p, + c_void_p, + ] + lib.infiniopDestroyAsinhDescriptor.restype = c_int32 + lib.infiniopDestroyAsinhDescriptor.argtypes = [ + infiniopOperatorDescriptor_t, + ] + + +@OpRegister.operator +def atan_(lib): + lib.infiniopCreateAtanDescriptor.restype = c_int32 + lib.infiniopCreateAtanDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopOperatorDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + ] + lib.infiniopGetAtanWorkspaceSize.restype = c_int32 + lib.infiniopGetAtanWorkspaceSize.argtypes = [ + infiniopOperatorDescriptor_t, + POINTER(c_size_t), + ] + lib.infiniopAtan.restype = c_int32 + lib.infiniopAtan.argtypes = [ + infiniopOperatorDescriptor_t, + c_void_p, + c_size_t, + c_void_p, + c_void_p, + c_void_p, + ] + lib.infiniopDestroyAtanDescriptor.restype = c_int32 + lib.infiniopDestroyAtanDescriptor.argtypes = [ + infiniopOperatorDescriptor_t, + ] + + +@OpRegister.operator +def atanh_(lib): + lib.infiniopCreateAtanhDescriptor.restype = c_int32 + lib.infiniopCreateAtanhDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopOperatorDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + ] + lib.infiniopGetAtanhWorkspaceSize.restype = c_int32 + lib.infiniopGetAtanhWorkspaceSize.argtypes = [ + infiniopOperatorDescriptor_t, + POINTER(c_size_t), + ] + lib.infiniopAtanh.restype = c_int32 + lib.infiniopAtanh.argtypes = [ + infiniopOperatorDescriptor_t, + c_void_p, + c_size_t, + c_void_p, + c_void_p, + c_void_p, + ] + lib.infiniopDestroyAtanhDescriptor.restype = c_int32 + lib.infiniopDestroyAtanhDescriptor.argtypes = [ + infiniopOperatorDescriptor_t, + ] + + +@OpRegister.operator +def ceil_(lib): + lib.infiniopCreateCeilDescriptor.restype = c_int32 + lib.infiniopCreateCeilDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopOperatorDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + ] + lib.infiniopGetCeilWorkspaceSize.restype = c_int32 + lib.infiniopGetCeilWorkspaceSize.argtypes = [ + infiniopOperatorDescriptor_t, + POINTER(c_size_t), + ] + lib.infiniopCeil.restype = c_int32 + lib.infiniopCeil.argtypes = [ + infiniopOperatorDescriptor_t, + c_void_p, + c_size_t, + c_void_p, + c_void_p, + c_void_p, + ] + lib.infiniopDestroyCeilDescriptor.restype = c_int32 + lib.infiniopDestroyCeilDescriptor.argtypes = [ + infiniopOperatorDescriptor_t, + ] + + +@OpRegister.operator +def cos_(lib): + lib.infiniopCreateCosDescriptor.restype = c_int32 + lib.infiniopCreateCosDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopOperatorDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + ] + lib.infiniopGetCosWorkspaceSize.restype = c_int32 + lib.infiniopGetCosWorkspaceSize.argtypes = [ + infiniopOperatorDescriptor_t, + POINTER(c_size_t), + ] + lib.infiniopCos.restype = c_int32 + lib.infiniopCos.argtypes = [ + infiniopOperatorDescriptor_t, + c_void_p, + c_size_t, + c_void_p, + c_void_p, + c_void_p, + ] + lib.infiniopDestroyCosDescriptor.restype = c_int32 + lib.infiniopDestroyCosDescriptor.argtypes = [ + infiniopOperatorDescriptor_t, + ] + + +@OpRegister.operator +def cosh_(lib): + lib.infiniopCreateCoshDescriptor.restype = c_int32 + lib.infiniopCreateCoshDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopOperatorDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + ] + lib.infiniopGetCoshWorkspaceSize.restype = c_int32 + lib.infiniopGetCoshWorkspaceSize.argtypes = [ + infiniopOperatorDescriptor_t, + POINTER(c_size_t), + ] + 
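+    # The argtypes below mirror the C call used by the tests:
+    # infiniopCosh(descriptor, workspace, workspace_size, y, x, stream)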
lib.infiniopCosh.restype = c_int32 + lib.infiniopCosh.argtypes = [ + infiniopOperatorDescriptor_t, + c_void_p, + c_size_t, + c_void_p, + c_void_p, + c_void_p, + ] + lib.infiniopDestroyCoshDescriptor.restype = c_int32 + lib.infiniopDestroyCoshDescriptor.argtypes = [ + infiniopOperatorDescriptor_t, + ] + + +@OpRegister.operator +def sinh_(lib): + lib.infiniopCreateSinhDescriptor.restype = c_int32 + lib.infiniopCreateSinhDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopOperatorDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + ] + lib.infiniopGetSinhWorkspaceSize.restype = c_int32 + lib.infiniopGetSinhWorkspaceSize.argtypes = [ + infiniopOperatorDescriptor_t, + POINTER(c_size_t), + ] + lib.infiniopSinh.restype = c_int32 + lib.infiniopSinh.argtypes = [ + infiniopOperatorDescriptor_t, + c_void_p, + c_size_t, + c_void_p, + c_void_p, + c_void_p, + ] + lib.infiniopDestroySinhDescriptor.restype = c_int32 + lib.infiniopDestroySinhDescriptor.argtypes = [ + infiniopOperatorDescriptor_t, + ] + + +@OpRegister.operator +def erf_(lib): + lib.infiniopCreateErfDescriptor.restype = c_int32 + lib.infiniopCreateErfDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopOperatorDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + ] + lib.infiniopGetErfWorkspaceSize.restype = c_int32 + lib.infiniopGetErfWorkspaceSize.argtypes = [ + infiniopOperatorDescriptor_t, + POINTER(c_size_t), + ] + lib.infiniopErf.restype = c_int32 + lib.infiniopErf.argtypes = [ + infiniopOperatorDescriptor_t, + c_void_p, + c_size_t, + c_void_p, + c_void_p, + c_void_p, + ] + lib.infiniopDestroyErfDescriptor.restype = c_int32 + lib.infiniopDestroyErfDescriptor.argtypes = [ + infiniopOperatorDescriptor_t, + ] + + +@OpRegister.operator +def floor_(lib): + lib.infiniopCreateFloorDescriptor.restype = c_int32 + lib.infiniopCreateFloorDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopOperatorDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + ] + lib.infiniopGetFloorWorkspaceSize.restype = c_int32 + lib.infiniopGetFloorWorkspaceSize.argtypes = [ + infiniopOperatorDescriptor_t, + POINTER(c_size_t), + ] + lib.infiniopFloor.restype = c_int32 + lib.infiniopFloor.argtypes = [ + infiniopOperatorDescriptor_t, + c_void_p, + c_size_t, + c_void_p, + c_void_p, + c_void_p, + ] + lib.infiniopDestroyFloorDescriptor.restype = c_int32 + lib.infiniopDestroyFloorDescriptor.argtypes = [ + infiniopOperatorDescriptor_t, + ] + + +@OpRegister.operator +def neg_(lib): + lib.infiniopCreateNegDescriptor.restype = c_int32 + lib.infiniopCreateNegDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopOperatorDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + ] + lib.infiniopGetNegWorkspaceSize.restype = c_int32 + lib.infiniopGetNegWorkspaceSize.argtypes = [ + infiniopOperatorDescriptor_t, + POINTER(c_size_t), + ] + lib.infiniopNeg.restype = c_int32 + lib.infiniopNeg.argtypes = [ + infiniopOperatorDescriptor_t, + c_void_p, + c_size_t, + c_void_p, + c_void_p, + c_void_p, + ] + lib.infiniopDestroyNegDescriptor.restype = c_int32 + lib.infiniopDestroyNegDescriptor.argtypes = [ + infiniopOperatorDescriptor_t, + ] + + +@OpRegister.operator +def reciprocal_(lib): + lib.infiniopCreateReciprocalDescriptor.restype = c_int32 + lib.infiniopCreateReciprocalDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopOperatorDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + ] + lib.infiniopGetReciprocalWorkspaceSize.restype = 
c_int32 + lib.infiniopGetReciprocalWorkspaceSize.argtypes = [ + infiniopOperatorDescriptor_t, + POINTER(c_size_t), + ] + lib.infiniopReciprocal.restype = c_int32 + lib.infiniopReciprocal.argtypes = [ + infiniopOperatorDescriptor_t, + c_void_p, + c_size_t, + c_void_p, + c_void_p, + c_void_p, + ] + lib.infiniopDestroyReciprocalDescriptor.restype = c_int32 + lib.infiniopDestroyReciprocalDescriptor.argtypes = [ + infiniopOperatorDescriptor_t, + ] + + +@OpRegister.operator +def round_(lib): + lib.infiniopCreateRoundDescriptor.restype = c_int32 + lib.infiniopCreateRoundDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopOperatorDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + ] + lib.infiniopGetRoundWorkspaceSize.restype = c_int32 + lib.infiniopGetRoundWorkspaceSize.argtypes = [ + infiniopOperatorDescriptor_t, + POINTER(c_size_t), + ] + lib.infiniopRound.restype = c_int32 + lib.infiniopRound.argtypes = [ + infiniopOperatorDescriptor_t, + c_void_p, + c_size_t, + c_void_p, + c_void_p, + c_void_p, + ] + lib.infiniopDestroyRoundDescriptor.restype = c_int32 + lib.infiniopDestroyRoundDescriptor.argtypes = [ + infiniopOperatorDescriptor_t, + ] + + +@OpRegister.operator +def sign_(lib): + lib.infiniopCreateSignDescriptor.restype = c_int32 + lib.infiniopCreateSignDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopOperatorDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + ] + lib.infiniopGetSignWorkspaceSize.restype = c_int32 + lib.infiniopGetSignWorkspaceSize.argtypes = [ + infiniopOperatorDescriptor_t, + POINTER(c_size_t), + ] + lib.infiniopSign.restype = c_int32 + lib.infiniopSign.argtypes = [ + infiniopOperatorDescriptor_t, + c_void_p, + c_size_t, + c_void_p, + c_void_p, + c_void_p, + ] + lib.infiniopDestroySignDescriptor.restype = c_int32 + lib.infiniopDestroySignDescriptor.argtypes = [ + infiniopOperatorDescriptor_t, + ] + + +@OpRegister.operator +def sqrt_(lib): + lib.infiniopCreateSqrtDescriptor.restype = c_int32 + lib.infiniopCreateSqrtDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopOperatorDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + ] + lib.infiniopGetSqrtWorkspaceSize.restype = c_int32 + lib.infiniopGetSqrtWorkspaceSize.argtypes = [ + infiniopOperatorDescriptor_t, + POINTER(c_size_t), + ] + lib.infiniopSqrt.restype = c_int32 + lib.infiniopSqrt.argtypes = [ + infiniopOperatorDescriptor_t, + c_void_p, + c_size_t, + c_void_p, + c_void_p, + c_void_p, + ] + lib.infiniopDestroySqrtDescriptor.restype = c_int32 + lib.infiniopDestroySqrtDescriptor.argtypes = [ + infiniopOperatorDescriptor_t, + ] + + +@OpRegister.operator +def log_(lib): + lib.infiniopCreateLogDescriptor.restype = c_int32 + lib.infiniopCreateLogDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopOperatorDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + ] + lib.infiniopGetLogWorkspaceSize.restype = c_int32 + lib.infiniopGetLogWorkspaceSize.argtypes = [ + infiniopOperatorDescriptor_t, + POINTER(c_size_t), + ] + lib.infiniopLog.restype = c_int32 + lib.infiniopLog.argtypes = [ + infiniopOperatorDescriptor_t, + c_void_p, + c_size_t, + c_void_p, + c_void_p, + c_void_p, + ] + lib.infiniopDestroyLogDescriptor.restype = c_int32 + lib.infiniopDestroyLogDescriptor.argtypes = [ + infiniopOperatorDescriptor_t, + ] + + +@OpRegister.operator +def tan_(lib): + lib.infiniopCreateTanDescriptor.restype = c_int32 + lib.infiniopCreateTanDescriptor.argtypes = [ + infiniopHandle_t, + 
POINTER(infiniopOperatorDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + ] + lib.infiniopGetTanWorkspaceSize.restype = c_int32 + lib.infiniopGetTanWorkspaceSize.argtypes = [ + infiniopOperatorDescriptor_t, + POINTER(c_size_t), + ] + lib.infiniopTan.restype = c_int32 + lib.infiniopTan.argtypes = [ + infiniopOperatorDescriptor_t, + c_void_p, + c_size_t, + c_void_p, + c_void_p, + c_void_p, + ] + lib.infiniopDestroyTanDescriptor.restype = c_int32 + lib.infiniopDestroyTanDescriptor.argtypes = [ + infiniopOperatorDescriptor_t, + ] + + @OpRegister.operator def relu_(lib): lib.infiniopCreateReluDescriptor.restype = c_int32 diff --git a/test/infiniop/log.py b/test/infiniop/log.py new file mode 100644 index 000000000..4f97de374 --- /dev/null +++ b/test/infiniop/log.py @@ -0,0 +1,166 @@ +import ctypes +from ctypes import c_uint64 +from enum import Enum, auto + +import torch +from libinfiniop import ( + LIBINFINIOP, + InfiniDeviceNames, + InfiniDtype, + InfiniDtypeNames, + TestTensor, + TestWorkspace, + check_error, + debug, + get_args, + get_test_devices, + get_tolerance, + infiniopOperatorDescriptor_t, + profile_operation, + test_operator, +) + +# ============================================================================== +# Configuration (Internal Use Only) +# ============================================================================== +# These are not meant to be imported from other modules +_TEST_CASES_ = [ + # tensor_shape, inplace + ((1, 3),), + ((3, 3),), + ((32, 20, 512),), + ((33, 333, 333),), + ((32, 256, 112, 112),), + ((3, 3, 13, 9, 17),), +] + + +class Inplace(Enum): + OUT_OF_PLACE = auto() + INPLACE_X = auto() + + +# Inplace options applied for each test case in _TEST_CASES_ +_INPLACE = [ + Inplace.OUT_OF_PLACE, + Inplace.INPLACE_X, +] + +# Form the test cases by appending each element of _INPLACE to each tuple in _TEST_CASES_ +_TEST_CASES = [ + test_case + (inplace_item,) + for test_case in _TEST_CASES_ + for inplace_item in _INPLACE +] + +# Data types used for testing (matching old operators library: only F16 and F32) +_TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.F32] + +# Tolerance map for different data types +# Copied from old operators library: atol=1e-7, rtol=1e-3 +_TOLERANCE_MAP = { + InfiniDtype.F16: {"atol": 1e-7, "rtol": 1e-3}, + InfiniDtype.F32: {"atol": 1e-7, "rtol": 1e-3}, +} + +DEBUG = False +PROFILE = False +NUM_PRERUN = 10 +NUM_ITERATIONS = 1000 + + +def log_op(x): + return torch.log(x).to(x.dtype) + + +def test( + handle, device, shape, inplace=Inplace.OUT_OF_PLACE, dtype=InfiniDtype.F16, sync=None +): + # Generate test tensors with values in range [0.1, 1.1) for log operation + # log domain is (0, +∞), so we use range [0.1, 1.1) + x_torch_tensor = torch.rand(shape) + 0.1 + + x = TestTensor( + shape, + x_torch_tensor.stride(), + dtype, + device, + mode="manual", + set_tensor=x_torch_tensor, + ) + + if inplace == Inplace.INPLACE_X: + y = x + else: + y = TestTensor(shape, None, dtype, device) + + if y.is_broadcast(): + return + + print( + f"Testing Log on {InfiniDeviceNames[device]} with shape:{shape} dtype:{InfiniDtypeNames[dtype]} inplace: {inplace}" + ) + + ans = log_op(x.torch_tensor()) + + if sync is not None: + sync() + + descriptor = infiniopOperatorDescriptor_t() + check_error( + LIBINFINIOP.infiniopCreateLogDescriptor( + handle, ctypes.byref(descriptor), y.descriptor, x.descriptor + ) + ) + + # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel + for tensor in [x, y]: + 
tensor.destroy_desc() + + workspace_size = c_uint64(0) + check_error( + LIBINFINIOP.infiniopGetLogWorkspaceSize( + descriptor, ctypes.byref(workspace_size) + ) + ) + workspace = TestWorkspace(workspace_size.value, y.device) + + def lib_log(): + check_error( + LIBINFINIOP.infiniopLog( + descriptor, workspace.data(), workspace_size.value, y.data(), x.data(), None + ) + ) + + lib_log() + if sync is not None: + sync() + + atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype) + if DEBUG: + debug(y.actual_tensor(), ans, atol=atol, rtol=rtol) + assert torch.allclose(y.actual_tensor(), ans, atol=atol, rtol=rtol, equal_nan=True) + + # Profiling workflow + if PROFILE: + # fmt: off + profile_operation("PyTorch", lambda: log_op(x.torch_tensor()), device, NUM_PRERUN, NUM_ITERATIONS) + profile_operation(" lib", lambda: lib_log(), device, NUM_PRERUN, NUM_ITERATIONS) + # fmt: on + + check_error(LIBINFINIOP.infiniopDestroyLogDescriptor(descriptor)) + + +if __name__ == "__main__": + args = get_args() + + # Configure testing options + DEBUG = args.debug + PROFILE = args.profile + NUM_PRERUN = args.num_prerun + NUM_ITERATIONS = args.num_iterations + + for device in get_test_devices(args): + test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES) + + print("\033[92mTest passed!\033[0m") diff --git a/test/infiniop/neg.py b/test/infiniop/neg.py new file mode 100644 index 000000000..62607bce0 --- /dev/null +++ b/test/infiniop/neg.py @@ -0,0 +1,165 @@ +import ctypes +from ctypes import c_uint64 +from enum import Enum, auto + +import torch +from libinfiniop import ( + LIBINFINIOP, + InfiniDeviceNames, + InfiniDtype, + InfiniDtypeNames, + TestTensor, + TestWorkspace, + check_error, + debug, + get_args, + get_test_devices, + get_tolerance, + infiniopOperatorDescriptor_t, + profile_operation, + test_operator, +) + +# ============================================================================== +# Configuration (Internal Use Only) +# ============================================================================== +# These are not meant to be imported from other modules +_TEST_CASES_ = [ + # tensor_shape, inplace + ((1, 3),), + ((3, 3),), + ((32, 20, 512),), + ((33, 333, 333),), + ((32, 256, 112, 112),), + ((3, 3, 13, 9, 17),), +] + + +class Inplace(Enum): + OUT_OF_PLACE = auto() + INPLACE_X = auto() + + +# Inplace options applied for each test case in _TEST_CASES_ +_INPLACE = [ + Inplace.OUT_OF_PLACE, + Inplace.INPLACE_X, +] + +# Form the test cases by appending each element of _INPLACE to each tuple in _TEST_CASES_ +_TEST_CASES = [ + test_case + (inplace_item,) + for test_case in _TEST_CASES_ + for inplace_item in _INPLACE +] + +# Data types used for testing (matching old operators library: only F16 and F32) +_TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.F32] + +# Tolerance map for different data types +_TOLERANCE_MAP = { + InfiniDtype.F16: {"atol": 0, "rtol": 1e-3}, + InfiniDtype.F32: {"atol": 0, "rtol": 1e-7}, +} + +DEBUG = False +PROFILE = False +NUM_PRERUN = 10 +NUM_ITERATIONS = 1000 + + +def neg_op(x): + return torch.neg(x).to(x.dtype) + + +def test( + handle, device, shape, inplace=Inplace.OUT_OF_PLACE, dtype=InfiniDtype.F16, sync=None +): + # Generate test tensors with values in range [-200, -100) for neg operation + # This matches the original test case: * 100 - 200 + x_torch_tensor = torch.rand(shape) * 100 - 200 + + x = TestTensor( + shape, + x_torch_tensor.stride(), + dtype, + device, + mode="manual", + set_tensor=x_torch_tensor, + ) + + if inplace == Inplace.INPLACE_X: + y = x + else: + y = TestTensor(shape, 
None, dtype, device) + + if y.is_broadcast(): + return + + print( + f"Testing Neg on {InfiniDeviceNames[device]} with shape:{shape} dtype:{InfiniDtypeNames[dtype]} inplace: {inplace}" + ) + + ans = neg_op(x.torch_tensor()) + + if sync is not None: + sync() + + descriptor = infiniopOperatorDescriptor_t() + check_error( + LIBINFINIOP.infiniopCreateNegDescriptor( + handle, ctypes.byref(descriptor), y.descriptor, x.descriptor + ) + ) + + # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel + for tensor in [x, y]: + tensor.destroy_desc() + + workspace_size = c_uint64(0) + check_error( + LIBINFINIOP.infiniopGetNegWorkspaceSize( + descriptor, ctypes.byref(workspace_size) + ) + ) + workspace = TestWorkspace(workspace_size.value, y.device) + + def lib_neg(): + check_error( + LIBINFINIOP.infiniopNeg( + descriptor, workspace.data(), workspace_size.value, y.data(), x.data(), None + ) + ) + + lib_neg() + if sync is not None: + sync() + + atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype) + if DEBUG: + debug(y.actual_tensor(), ans, atol=atol, rtol=rtol) + assert torch.allclose(y.actual_tensor(), ans, atol=atol, rtol=rtol, equal_nan=True) + + # Profiling workflow + if PROFILE: + # fmt: off + profile_operation("PyTorch", lambda: neg_op(x.torch_tensor()), device, NUM_PRERUN, NUM_ITERATIONS) + profile_operation(" lib", lambda: lib_neg(), device, NUM_PRERUN, NUM_ITERATIONS) + # fmt: on + + check_error(LIBINFINIOP.infiniopDestroyNegDescriptor(descriptor)) + + +if __name__ == "__main__": + args = get_args() + + # Configure testing options + DEBUG = args.debug + PROFILE = args.profile + NUM_PRERUN = args.num_prerun + NUM_ITERATIONS = args.num_iterations + + for device in get_test_devices(args): + test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES) + + print("\033[92mTest passed!\033[0m") diff --git a/test/infiniop/reciprocal.py b/test/infiniop/reciprocal.py new file mode 100644 index 000000000..4e816481c --- /dev/null +++ b/test/infiniop/reciprocal.py @@ -0,0 +1,168 @@ +import ctypes +from ctypes import c_uint64 +from enum import Enum, auto + +import torch +from libinfiniop import ( + LIBINFINIOP, + InfiniDeviceNames, + InfiniDtype, + InfiniDtypeNames, + TestTensor, + TestWorkspace, + check_error, + debug, + get_args, + get_test_devices, + get_tolerance, + infiniopOperatorDescriptor_t, + profile_operation, + test_operator, +) + +# ============================================================================== +# Configuration (Internal Use Only) +# ============================================================================== +# These are not meant to be imported from other modules +_TEST_CASES_ = [ + # tensor_shape, inplace + ((1, 3),), + ((3, 3),), + ((32, 20, 512),), + ((33, 333, 333),), + ((32, 256, 112, 112),), + ((3, 3, 13, 9, 17),), +] + + +class Inplace(Enum): + OUT_OF_PLACE = auto() + INPLACE_X = auto() + + +# Inplace options applied for each test case in _TEST_CASES_ +_INPLACE = [ + Inplace.OUT_OF_PLACE, + Inplace.INPLACE_X, +] + +# Form the test cases by appending each element of _INPLACE to each tuple in _TEST_CASES_ +_TEST_CASES = [ + test_case + (inplace_item,) + for test_case in _TEST_CASES_ + for inplace_item in _INPLACE +] + +# Data types used for testing (matching old operators library: only F16 and F32) +_TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.F32] + +# Tolerance map for different data types +_TOLERANCE_MAP = { + InfiniDtype.F16: {"atol": 0, "rtol": 1e-3}, + InfiniDtype.F32: {"atol": 0, "rtol": 1e-7}, +} + +DEBUG = False 
+PROFILE = False +NUM_PRERUN = 10 +NUM_ITERATIONS = 1000 + + +def reciprocal_op(x): + return torch.reciprocal(x).to(x.dtype) + + +def test( + handle, device, shape, inplace=Inplace.OUT_OF_PLACE, dtype=InfiniDtype.F16, sync=None +): + # Generate test tensors with values in range [-10, 10) for reciprocal operation + # This matches the original test case: * 20 - 10 + # Note: Avoid values too close to zero to prevent division by zero issues + x_torch_tensor = torch.rand(shape) * 20 - 10 + # Ensure no zero values + x_torch_tensor = torch.where(x_torch_tensor == 0, torch.ones_like(x_torch_tensor), x_torch_tensor) + + x = TestTensor( + shape, + x_torch_tensor.stride(), + dtype, + device, + mode="manual", + set_tensor=x_torch_tensor, + ) + + if inplace == Inplace.INPLACE_X: + y = x + else: + y = TestTensor(shape, None, dtype, device) + + if y.is_broadcast(): + return + + print( + f"Testing Reciprocal on {InfiniDeviceNames[device]} with shape:{shape} dtype:{InfiniDtypeNames[dtype]} inplace: {inplace}" + ) + + ans = reciprocal_op(x.torch_tensor()) + + if sync is not None: + sync() + + descriptor = infiniopOperatorDescriptor_t() + check_error( + LIBINFINIOP.infiniopCreateReciprocalDescriptor( + handle, ctypes.byref(descriptor), y.descriptor, x.descriptor + ) + ) + + # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel + for tensor in [x, y]: + tensor.destroy_desc() + + workspace_size = c_uint64(0) + check_error( + LIBINFINIOP.infiniopGetReciprocalWorkspaceSize( + descriptor, ctypes.byref(workspace_size) + ) + ) + workspace = TestWorkspace(workspace_size.value, y.device) + + def lib_reciprocal(): + check_error( + LIBINFINIOP.infiniopReciprocal( + descriptor, workspace.data(), workspace_size.value, y.data(), x.data(), None + ) + ) + + lib_reciprocal() + if sync is not None: + sync() + + atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype) + if DEBUG: + debug(y.actual_tensor(), ans, atol=atol, rtol=rtol) + assert torch.allclose(y.actual_tensor(), ans, atol=atol, rtol=rtol, equal_nan=True) + + # Profiling workflow + if PROFILE: + # fmt: off + profile_operation("PyTorch", lambda: reciprocal_op(x.torch_tensor()), device, NUM_PRERUN, NUM_ITERATIONS) + profile_operation(" lib", lambda: lib_reciprocal(), device, NUM_PRERUN, NUM_ITERATIONS) + # fmt: on + + check_error(LIBINFINIOP.infiniopDestroyReciprocalDescriptor(descriptor)) + + +if __name__ == "__main__": + args = get_args() + + # Configure testing options + DEBUG = args.debug + PROFILE = args.profile + NUM_PRERUN = args.num_prerun + NUM_ITERATIONS = args.num_iterations + + for device in get_test_devices(args): + test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES) + + print("\033[92mTest passed!\033[0m") diff --git a/test/infiniop/round.py b/test/infiniop/round.py new file mode 100644 index 000000000..d6053f676 --- /dev/null +++ b/test/infiniop/round.py @@ -0,0 +1,165 @@ +import ctypes +from ctypes import c_uint64 +from enum import Enum, auto + +import torch +from libinfiniop import ( + LIBINFINIOP, + InfiniDeviceNames, + InfiniDtype, + InfiniDtypeNames, + TestTensor, + TestWorkspace, + check_error, + debug, + get_args, + get_test_devices, + get_tolerance, + infiniopOperatorDescriptor_t, + profile_operation, + test_operator, +) + +# ============================================================================== +# Configuration (Internal Use Only) +# ============================================================================== +# These are not meant to be imported from other modules +_TEST_CASES_ = [ 
+ # tensor_shape, inplace + ((1, 3),), + ((3, 3),), + ((32, 20, 512),), + ((33, 333, 333),), + ((32, 256, 112, 112),), + ((3, 3, 13, 9, 17),), +] + + +class Inplace(Enum): + OUT_OF_PLACE = auto() + INPLACE_X = auto() + + +# Inplace options applied for each test case in _TEST_CASES_ +_INPLACE = [ + Inplace.OUT_OF_PLACE, + Inplace.INPLACE_X, +] + +# Form the test cases by appending each element of _INPLACE to each tuple in _TEST_CASES_ +_TEST_CASES = [ + test_case + (inplace_item,) + for test_case in _TEST_CASES_ + for inplace_item in _INPLACE +] + +# Data types used for testing (matching old operators library: only F16 and F32) +_TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.F32] + +# Tolerance map for different data types +_TOLERANCE_MAP = { + InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3}, + InfiniDtype.F32: {"atol": 1e-7, "rtol": 1e-7}, +} + +DEBUG = False +PROFILE = False +NUM_PRERUN = 10 +NUM_ITERATIONS = 1000 + + +def round_op(x): + return torch.round(x).to(x.dtype) + + +def test( + handle, device, shape, inplace=Inplace.OUT_OF_PLACE, dtype=InfiniDtype.F16, sync=None +): + # Generate test tensors with values in range [-20, -10) for round operation + # This matches the original test case: * 10 - 20 + x_torch_tensor = torch.rand(shape) * 10 - 20 + + x = TestTensor( + shape, + x_torch_tensor.stride(), + dtype, + device, + mode="manual", + set_tensor=x_torch_tensor, + ) + + if inplace == Inplace.INPLACE_X: + y = x + else: + y = TestTensor(shape, None, dtype, device) + + if y.is_broadcast(): + return + + print( + f"Testing Round on {InfiniDeviceNames[device]} with shape:{shape} dtype:{InfiniDtypeNames[dtype]} inplace: {inplace}" + ) + + ans = round_op(x.torch_tensor()) + + if sync is not None: + sync() + + descriptor = infiniopOperatorDescriptor_t() + check_error( + LIBINFINIOP.infiniopCreateRoundDescriptor( + handle, ctypes.byref(descriptor), y.descriptor, x.descriptor + ) + ) + + # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel + for tensor in [x, y]: + tensor.destroy_desc() + + workspace_size = c_uint64(0) + check_error( + LIBINFINIOP.infiniopGetRoundWorkspaceSize( + descriptor, ctypes.byref(workspace_size) + ) + ) + workspace = TestWorkspace(workspace_size.value, y.device) + + def lib_round(): + check_error( + LIBINFINIOP.infiniopRound( + descriptor, workspace.data(), workspace_size.value, y.data(), x.data(), None + ) + ) + + lib_round() + if sync is not None: + sync() + + atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype) + if DEBUG: + debug(y.actual_tensor(), ans, atol=atol, rtol=rtol) + assert torch.allclose(y.actual_tensor(), ans, atol=atol, rtol=rtol, equal_nan=True) + + # Profiling workflow + if PROFILE: + # fmt: off + profile_operation("PyTorch", lambda: round_op(x.torch_tensor()), device, NUM_PRERUN, NUM_ITERATIONS) + profile_operation(" lib", lambda: lib_round(), device, NUM_PRERUN, NUM_ITERATIONS) + # fmt: on + + check_error(LIBINFINIOP.infiniopDestroyRoundDescriptor(descriptor)) + + +if __name__ == "__main__": + args = get_args() + + # Configure testing options + DEBUG = args.debug + PROFILE = args.profile + NUM_PRERUN = args.num_prerun + NUM_ITERATIONS = args.num_iterations + + for device in get_test_devices(args): + test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES) + + print("\033[92mTest passed!\033[0m") diff --git a/test/infiniop/sign.py b/test/infiniop/sign.py new file mode 100644 index 000000000..f0eb5b5f8 --- /dev/null +++ b/test/infiniop/sign.py @@ -0,0 +1,166 @@ +import ctypes +from ctypes import 
c_uint64 +from enum import Enum, auto + +import torch +from libinfiniop import ( + LIBINFINIOP, + InfiniDeviceNames, + InfiniDtype, + InfiniDtypeNames, + TestTensor, + TestWorkspace, + check_error, + debug, + get_args, + get_test_devices, + get_tolerance, + infiniopOperatorDescriptor_t, + profile_operation, + test_operator, +) + +# ============================================================================== +# Configuration (Internal Use Only) +# ============================================================================== +# These are not meant to be imported from other modules +_TEST_CASES_ = [ + # tensor_shape, inplace + ((1, 3),), + ((3, 3),), + ((32, 20, 512),), + ((33, 333, 333),), + ((32, 256, 112, 112),), + ((3, 3, 13, 9, 17),), +] + + +class Inplace(Enum): + OUT_OF_PLACE = auto() + INPLACE_X = auto() + + +# Inplace options applied for each test case in _TEST_CASES_ +_INPLACE = [ + Inplace.OUT_OF_PLACE, + Inplace.INPLACE_X, +] + +# Form the test cases by appending each element of _INPLACE to each tuple in _TEST_CASES_ +_TEST_CASES = [ + test_case + (inplace_item,) + for test_case in _TEST_CASES_ + for inplace_item in _INPLACE +] + +# Data types used for testing (matching old operators library: only F16 and F32) +_TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.F32] + +# Tolerance map for different data types +# Copied from old operators library: atol=0, rtol=0 +_TOLERANCE_MAP = { + InfiniDtype.F16: {"atol": 0, "rtol": 0}, + InfiniDtype.F32: {"atol": 0, "rtol": 0}, +} + +DEBUG = False +PROFILE = False +NUM_PRERUN = 10 +NUM_ITERATIONS = 1000 + + +def sign_op(x): + return torch.sign(x).to(x.dtype) + + +def test( + handle, device, shape, inplace=Inplace.OUT_OF_PLACE, dtype=InfiniDtype.F16, sync=None +): + # Generate test tensors with values in range [-200, -100) for sign operation + # sign domain is (-∞, +∞), so we use range [-200, -100) + x_torch_tensor = torch.rand(shape) * 100 - 200 + + x = TestTensor( + shape, + x_torch_tensor.stride(), + dtype, + device, + mode="manual", + set_tensor=x_torch_tensor, + ) + + if inplace == Inplace.INPLACE_X: + y = x + else: + y = TestTensor(shape, None, dtype, device) + + if y.is_broadcast(): + return + + print( + f"Testing Sign on {InfiniDeviceNames[device]} with shape:{shape} dtype:{InfiniDtypeNames[dtype]} inplace: {inplace}" + ) + + ans = sign_op(x.torch_tensor()) + + if sync is not None: + sync() + + descriptor = infiniopOperatorDescriptor_t() + check_error( + LIBINFINIOP.infiniopCreateSignDescriptor( + handle, ctypes.byref(descriptor), y.descriptor, x.descriptor + ) + ) + + # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel + for tensor in [x, y]: + tensor.destroy_desc() + + workspace_size = c_uint64(0) + check_error( + LIBINFINIOP.infiniopGetSignWorkspaceSize( + descriptor, ctypes.byref(workspace_size) + ) + ) + workspace = TestWorkspace(workspace_size.value, y.device) + + def lib_sign(): + check_error( + LIBINFINIOP.infiniopSign( + descriptor, workspace.data(), workspace_size.value, y.data(), x.data(), None + ) + ) + + lib_sign() + if sync is not None: + sync() + + atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype) + if DEBUG: + debug(y.actual_tensor(), ans, atol=atol, rtol=rtol) + assert torch.allclose(y.actual_tensor(), ans, atol=atol, rtol=rtol, equal_nan=True) + + # Profiling workflow + if PROFILE: + # fmt: off + profile_operation("PyTorch", lambda: sign_op(x.torch_tensor()), device, NUM_PRERUN, NUM_ITERATIONS) + profile_operation(" lib", lambda: lib_sign(), device, NUM_PRERUN, 
NUM_ITERATIONS) + # fmt: on + + check_error(LIBINFINIOP.infiniopDestroySignDescriptor(descriptor)) + + +if __name__ == "__main__": + args = get_args() + + # Configure testing options + DEBUG = args.debug + PROFILE = args.profile + NUM_PRERUN = args.num_prerun + NUM_ITERATIONS = args.num_iterations + + for device in get_test_devices(args): + test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES) + + print("\033[92mTest passed!\033[0m") diff --git a/test/infiniop/sinh.py b/test/infiniop/sinh.py new file mode 100644 index 000000000..99bc02c58 --- /dev/null +++ b/test/infiniop/sinh.py @@ -0,0 +1,166 @@ +import ctypes +from ctypes import c_uint64 +from enum import Enum, auto + +import torch +from libinfiniop import ( + LIBINFINIOP, + InfiniDeviceNames, + InfiniDtype, + InfiniDtypeNames, + TestTensor, + TestWorkspace, + check_error, + debug, + get_args, + get_test_devices, + get_tolerance, + infiniopOperatorDescriptor_t, + profile_operation, + test_operator, +) + +# ============================================================================== +# Configuration (Internal Use Only) +# ============================================================================== +# These are not meant to be imported from other modules +_TEST_CASES_ = [ + # tensor_shape, inplace + ((1, 3),), + ((3, 3),), + ((32, 20, 512),), + ((33, 333, 333),), + ((32, 256, 112, 112),), + ((3, 3, 13, 9, 17),), +] + + +class Inplace(Enum): + OUT_OF_PLACE = auto() + INPLACE_X = auto() + + +# Inplace options applied for each test case in _TEST_CASES_ +_INPLACE = [ + Inplace.OUT_OF_PLACE, + Inplace.INPLACE_X, +] + +# Form the test cases by appending each element of _INPLACE to each tuple in _TEST_CASES_ +_TEST_CASES = [ + test_case + (inplace_item,) + for test_case in _TEST_CASES_ + for inplace_item in _INPLACE +] + +# Data types used for testing (matching old operators library: only F16 and F32) +_TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.F32] + +# Tolerance map for different data types +# Copied from old operators library: atol=0, rtol=0 +_TOLERANCE_MAP = { + InfiniDtype.F16: {"atol": 0, "rtol": 0}, + InfiniDtype.F32: {"atol": 0, "rtol": 0}, +} + +DEBUG = False +PROFILE = False +NUM_PRERUN = 10 +NUM_ITERATIONS = 1000 + + +def sinh_op(x): + return torch.sinh(x).to(x.dtype) + + +def test( + handle, device, shape, inplace=Inplace.OUT_OF_PLACE, dtype=InfiniDtype.F16, sync=None +): + # Generate test tensors with values in range [-200, -100) for sinh operation + # sinh domain is (-∞, +∞), so we use range [-200, -100) + x_torch_tensor = torch.rand(shape) * 100 - 200 + + x = TestTensor( + shape, + x_torch_tensor.stride(), + dtype, + device, + mode="manual", + set_tensor=x_torch_tensor, + ) + + if inplace == Inplace.INPLACE_X: + y = x + else: + y = TestTensor(shape, None, dtype, device) + + if y.is_broadcast(): + return + + print( + f"Testing Sinh on {InfiniDeviceNames[device]} with shape:{shape} dtype:{InfiniDtypeNames[dtype]} inplace: {inplace}" + ) + + ans = sinh_op(x.torch_tensor()) + + if sync is not None: + sync() + + descriptor = infiniopOperatorDescriptor_t() + check_error( + LIBINFINIOP.infiniopCreateSinhDescriptor( + handle, ctypes.byref(descriptor), y.descriptor, x.descriptor + ) + ) + + # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel + for tensor in [x, y]: + tensor.destroy_desc() + + workspace_size = c_uint64(0) + check_error( + LIBINFINIOP.infiniopGetSinhWorkspaceSize( + descriptor, ctypes.byref(workspace_size) + ) + ) + workspace = 
TestWorkspace(workspace_size.value, y.device) + + def lib_sinh(): + check_error( + LIBINFINIOP.infiniopSinh( + descriptor, workspace.data(), workspace_size.value, y.data(), x.data(), None + ) + ) + + lib_sinh() + if sync is not None: + sync() + + atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype) + if DEBUG: + debug(y.actual_tensor(), ans, atol=atol, rtol=rtol) + assert torch.allclose(y.actual_tensor(), ans, atol=atol, rtol=rtol, equal_nan=True) + + # Profiling workflow + if PROFILE: + # fmt: off + profile_operation("PyTorch", lambda: sinh_op(x.torch_tensor()), device, NUM_PRERUN, NUM_ITERATIONS) + profile_operation(" lib", lambda: lib_sinh(), device, NUM_PRERUN, NUM_ITERATIONS) + # fmt: on + + check_error(LIBINFINIOP.infiniopDestroySinhDescriptor(descriptor)) + + +if __name__ == "__main__": + args = get_args() + + # Configure testing options + DEBUG = args.debug + PROFILE = args.profile + NUM_PRERUN = args.num_prerun + NUM_ITERATIONS = args.num_iterations + + for device in get_test_devices(args): + test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES) + + print("\033[92mTest passed!\033[0m") diff --git a/test/infiniop/sqrt.py b/test/infiniop/sqrt.py new file mode 100644 index 000000000..6e1419971 --- /dev/null +++ b/test/infiniop/sqrt.py @@ -0,0 +1,166 @@ +import ctypes +from ctypes import c_uint64 +from enum import Enum, auto + +import torch +from libinfiniop import ( + LIBINFINIOP, + InfiniDeviceNames, + InfiniDtype, + InfiniDtypeNames, + TestTensor, + TestWorkspace, + check_error, + debug, + get_args, + get_test_devices, + get_tolerance, + infiniopOperatorDescriptor_t, + profile_operation, + test_operator, +) + +# ============================================================================== +# Configuration (Internal Use Only) +# ============================================================================== +# These are not meant to be imported from other modules +_TEST_CASES_ = [ + # tensor_shape, inplace + ((1, 3),), + ((3, 3),), + ((32, 20, 512),), + ((33, 333, 333),), + ((32, 256, 112, 112),), + ((3, 3, 13, 9, 17),), +] + + +class Inplace(Enum): + OUT_OF_PLACE = auto() + INPLACE_X = auto() + + +# Inplace options applied for each test case in _TEST_CASES_ +_INPLACE = [ + Inplace.OUT_OF_PLACE, + Inplace.INPLACE_X, +] + +# Form the test cases by appending each element of _INPLACE to each tuple in _TEST_CASES_ +_TEST_CASES = [ + test_case + (inplace_item,) + for test_case in _TEST_CASES_ + for inplace_item in _INPLACE +] + +# Data types used for testing (matching old operators library: only F16 and F32) +_TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.F32] + +# Tolerance map for different data types +# Copied from old operators library: atol=0, rtol=1e-3 +_TOLERANCE_MAP = { + InfiniDtype.F16: {"atol": 0, "rtol": 1e-3}, + InfiniDtype.F32: {"atol": 0, "rtol": 1e-3}, +} + +DEBUG = False +PROFILE = False +NUM_PRERUN = 10 +NUM_ITERATIONS = 1000 + + +def sqrt_op(x): + return torch.sqrt(x).to(x.dtype) + + +def test( + handle, device, shape, inplace=Inplace.OUT_OF_PLACE, dtype=InfiniDtype.F16, sync=None +): + # Generate test tensors with values in range [0, 100) for sqrt operation + # sqrt domain is [0, +∞), so we use range [0, 100) + x_torch_tensor = torch.rand(shape) * 100 + + x = TestTensor( + shape, + x_torch_tensor.stride(), + dtype, + device, + mode="manual", + set_tensor=x_torch_tensor, + ) + + if inplace == Inplace.INPLACE_X: + y = x + else: + y = TestTensor(shape, None, dtype, device) + + if y.is_broadcast(): + return + + print( + f"Testing Sqrt on {InfiniDeviceNames[device]} with 
shape:{shape} dtype:{InfiniDtypeNames[dtype]} inplace: {inplace}" + ) + + ans = sqrt_op(x.torch_tensor()) + + if sync is not None: + sync() + + descriptor = infiniopOperatorDescriptor_t() + check_error( + LIBINFINIOP.infiniopCreateSqrtDescriptor( + handle, ctypes.byref(descriptor), y.descriptor, x.descriptor + ) + ) + + # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel + for tensor in [x, y]: + tensor.destroy_desc() + + workspace_size = c_uint64(0) + check_error( + LIBINFINIOP.infiniopGetSqrtWorkspaceSize( + descriptor, ctypes.byref(workspace_size) + ) + ) + workspace = TestWorkspace(workspace_size.value, y.device) + + def lib_sqrt(): + check_error( + LIBINFINIOP.infiniopSqrt( + descriptor, workspace.data(), workspace_size.value, y.data(), x.data(), None + ) + ) + + lib_sqrt() + if sync is not None: + sync() + + atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype) + if DEBUG: + debug(y.actual_tensor(), ans, atol=atol, rtol=rtol) + assert torch.allclose(y.actual_tensor(), ans, atol=atol, rtol=rtol, equal_nan=True) + + # Profiling workflow + if PROFILE: + # fmt: off + profile_operation("PyTorch", lambda: sqrt_op(x.torch_tensor()), device, NUM_PRERUN, NUM_ITERATIONS) + profile_operation(" lib", lambda: lib_sqrt(), device, NUM_PRERUN, NUM_ITERATIONS) + # fmt: on + + check_error(LIBINFINIOP.infiniopDestroySqrtDescriptor(descriptor)) + + +if __name__ == "__main__": + args = get_args() + + # Configure testing options + DEBUG = args.debug + PROFILE = args.profile + NUM_PRERUN = args.num_prerun + NUM_ITERATIONS = args.num_iterations + + for device in get_test_devices(args): + test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES) + + print("\033[92mTest passed!\033[0m") diff --git a/test/infiniop/tan.py b/test/infiniop/tan.py new file mode 100644 index 000000000..877f5dd58 --- /dev/null +++ b/test/infiniop/tan.py @@ -0,0 +1,167 @@ +import ctypes +import math +from ctypes import c_uint64 +from enum import Enum, auto + +import torch +from libinfiniop import ( + LIBINFINIOP, + InfiniDeviceNames, + InfiniDtype, + InfiniDtypeNames, + TestTensor, + TestWorkspace, + check_error, + debug, + get_args, + get_test_devices, + get_tolerance, + infiniopOperatorDescriptor_t, + profile_operation, + test_operator, +) + +# ============================================================================== +# Configuration (Internal Use Only) +# ============================================================================== +# These are not meant to be imported from other modules +_TEST_CASES_ = [ + # tensor_shape, inplace + ((1, 3),), + ((3, 3),), + ((32, 20, 512),), + ((33, 333, 333),), + ((32, 256, 112, 112),), + ((3, 3, 13, 9, 17),), +] + + +class Inplace(Enum): + OUT_OF_PLACE = auto() + INPLACE_X = auto() + + +# Inplace options applied for each test case in _TEST_CASES_ +_INPLACE = [ + Inplace.OUT_OF_PLACE, + Inplace.INPLACE_X, +] + +# Form the test cases by appending each element of _INPLACE to each tuple in _TEST_CASES_ +_TEST_CASES = [ + test_case + (inplace_item,) + for test_case in _TEST_CASES_ + for inplace_item in _INPLACE +] + +# Data types used for testing (matching old operators library: only F16 and F32) +_TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.F32] + +# Tolerance map for different data types +# Copied from old operators library: atol=1e-6, rtol=1e-2 +_TOLERANCE_MAP = { + InfiniDtype.F16: {"atol": 1e-6, "rtol": 1e-2}, + InfiniDtype.F32: {"atol": 1e-6, "rtol": 1e-2}, +} + +DEBUG = False +PROFILE = False +NUM_PRERUN = 10 +NUM_ITERATIONS = 1000 + 
+ +def tan_op(x): + return torch.tan(x).to(x.dtype) + + +def test( + handle, device, shape, inplace=Inplace.OUT_OF_PLACE, dtype=InfiniDtype.F16, sync=None +): + # Generate test tensors with values in range [-2π, 2π) for tan operation + # tan domain is (-∞, +∞), so we use range [-2π, 2π) + x_torch_tensor = torch.rand(shape) * 4 * math.pi - 2 * math.pi + + x = TestTensor( + shape, + x_torch_tensor.stride(), + dtype, + device, + mode="manual", + set_tensor=x_torch_tensor, + ) + + if inplace == Inplace.INPLACE_X: + y = x + else: + y = TestTensor(shape, None, dtype, device) + + if y.is_broadcast(): + return + + print( + f"Testing Tan on {InfiniDeviceNames[device]} with shape:{shape} dtype:{InfiniDtypeNames[dtype]} inplace: {inplace}" + ) + + ans = tan_op(x.torch_tensor()) + + if sync is not None: + sync() + + descriptor = infiniopOperatorDescriptor_t() + check_error( + LIBINFINIOP.infiniopCreateTanDescriptor( + handle, ctypes.byref(descriptor), y.descriptor, x.descriptor + ) + ) + + # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel + for tensor in [x, y]: + tensor.destroy_desc() + + workspace_size = c_uint64(0) + check_error( + LIBINFINIOP.infiniopGetTanWorkspaceSize( + descriptor, ctypes.byref(workspace_size) + ) + ) + workspace = TestWorkspace(workspace_size.value, y.device) + + def lib_tan(): + check_error( + LIBINFINIOP.infiniopTan( + descriptor, workspace.data(), workspace_size.value, y.data(), x.data(), None + ) + ) + + lib_tan() + if sync is not None: + sync() + + atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype) + if DEBUG: + debug(y.actual_tensor(), ans, atol=atol, rtol=rtol) + assert torch.allclose(y.actual_tensor(), ans, atol=atol, rtol=rtol, equal_nan=True) + + # Profiling workflow + if PROFILE: + # fmt: off + profile_operation("PyTorch", lambda: tan_op(x.torch_tensor()), device, NUM_PRERUN, NUM_ITERATIONS) + profile_operation(" lib", lambda: lib_tan(), device, NUM_PRERUN, NUM_ITERATIONS) + # fmt: on + + check_error(LIBINFINIOP.infiniopDestroyTanDescriptor(descriptor)) + + +if __name__ == "__main__": + args = get_args() + + # Configure testing options + DEBUG = args.debug + PROFILE = args.profile + NUM_PRERUN = args.num_prerun + NUM_ITERATIONS = args.num_iterations + + for device in get_test_devices(args): + test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES) + + print("\033[92mTest passed!\033[0m") From 8ca4b3248f4d5b7b0bd4de4184957a8e9d93dbb7 Mon Sep 17 00:00:00 2001 From: gongchensu Date: Thu, 15 Jan 2026 02:33:13 +0000 Subject: [PATCH 3/3] Issue/887 - Refactor binary and unary operators to reduce code duplication. 
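Note for review: this refactor replaces each hand-written operator API header with a single macro invocation; e.g. include/infiniop/ops/div.h reduces to BINARY_OP_API_DECLARE(div, Div). Expanding that macro by hand (shown below purely for reviewer convenience; it is not part of the patch) yields exactly the declarations the old header spelled out:

    /* Illustrative hand-expansion of BINARY_OP_API_DECLARE(div, Div);
       not part of the patch itself. */
    typedef struct InfiniopDescriptor *infiniopDivDescriptor_t;

    __C __export infiniStatus_t infiniopCreateDivDescriptor(infiniopHandle_t handle,
                                                            infiniopDivDescriptor_t *desc_ptr,
                                                            infiniopTensorDescriptor_t c,
                                                            infiniopTensorDescriptor_t a,
                                                            infiniopTensorDescriptor_t b);

    __C __export infiniStatus_t infiniopGetDivWorkspaceSize(infiniopDivDescriptor_t desc, size_t *size);

    __C __export infiniStatus_t infiniopDiv(infiniopDivDescriptor_t desc,
                                            void *workspace,
                                            size_t workspace_size,
                                            void *c,
                                            const void *a,
                                            const void *b,
                                            void *stream);

    __C __export infiniStatus_t infiniopDestroyDivDescriptor(infiniopDivDescriptor_t desc);

UNARY_OP_API_DECLARE in include/infiniop/ops/unary_op_api.h follows the same scheme for single-input operators, generating (y, x) signatures in place of (c, a, b). Because the macros paste together the same identifiers the old headers declared, the exported symbol names are unchanged, so existing callers and the ctypes bindings in test/infiniop/libinfiniop/op_register.py continue to work without modification.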
--- include/infiniop/ops/abs.h | 20 +- include/infiniop/ops/acos.h | 20 +- include/infiniop/ops/acosh.h | 20 +- include/infiniop/ops/add.h | 22 +- include/infiniop/ops/asin.h | 20 +- include/infiniop/ops/asinh.h | 20 +- include/infiniop/ops/atan.h | 20 +- include/infiniop/ops/atanh.h | 20 +- include/infiniop/ops/binary_op_api.h | 50 ++ include/infiniop/ops/ceil.h | 20 +- include/infiniop/ops/cos.h | 20 +- include/infiniop/ops/cosh.h | 20 +- include/infiniop/ops/div.h | 22 +- include/infiniop/ops/erf.h | 20 +- include/infiniop/ops/floor.h | 20 +- include/infiniop/ops/log.h | 20 +- include/infiniop/ops/max.h | 22 +- include/infiniop/ops/min.h | 22 +- include/infiniop/ops/mod.h | 22 +- include/infiniop/ops/mul.h | 22 +- include/infiniop/ops/neg.h | 20 +- include/infiniop/ops/pow.h | 22 +- include/infiniop/ops/reciprocal.h | 20 +- include/infiniop/ops/round.h | 20 +- include/infiniop/ops/sign.h | 20 +- include/infiniop/ops/sinh.h | 20 +- include/infiniop/ops/sqrt.h | 20 +- include/infiniop/ops/sub.h | 22 +- include/infiniop/ops/tan.h | 20 +- include/infiniop/ops/unary_op_api.h | 48 ++ scripts/test_binary_unary.py | 143 +++++ src/infiniop/elementwise/binary.h | 261 +++++++++ .../elementwise/cpu/elementwise_cpu_impl.h | 130 +++++ .../nvidia/elementwise_nvidia_impl.cuh | 134 +++++ src/infiniop/elementwise/unary.h | 524 ++++++++++++++++++ src/infiniop/operator_impl.h | 288 ++++++++++ src/infiniop/ops/abs/cpu/abs_cpu.cc | 44 +- src/infiniop/ops/abs/cpu/abs_cpu.h | 21 +- src/infiniop/ops/abs/cuda/kernel.cuh | 20 +- src/infiniop/ops/abs/nvidia/abs_nvidia.cu | 48 +- src/infiniop/ops/abs/operator.cc | 132 +---- src/infiniop/ops/acos/cpu/acos_cpu.cc | 44 +- src/infiniop/ops/acos/cpu/acos_cpu.h | 17 +- src/infiniop/ops/acos/cuda/kernel.cuh | 26 +- src/infiniop/ops/acos/nvidia/acos_nvidia.cu | 48 +- src/infiniop/ops/acos/operator.cc | 132 +---- src/infiniop/ops/acosh/cpu/acosh_cpu.cc | 44 +- src/infiniop/ops/acosh/cpu/acosh_cpu.h | 17 +- src/infiniop/ops/acosh/cuda/kernel.cuh | 26 +- src/infiniop/ops/acosh/nvidia/acosh_nvidia.cu | 48 +- src/infiniop/ops/acosh/operator.cc | 132 +---- src/infiniop/ops/asin/cpu/asin_cpu.cc | 44 +- src/infiniop/ops/asin/cpu/asin_cpu.h | 17 +- src/infiniop/ops/asin/cuda/kernel.cuh | 26 +- src/infiniop/ops/asin/nvidia/asin_nvidia.cu | 48 +- src/infiniop/ops/asin/operator.cc | 132 +---- src/infiniop/ops/asinh/cpu/asinh_cpu.cc | 44 +- src/infiniop/ops/asinh/cpu/asinh_cpu.h | 17 +- src/infiniop/ops/asinh/cuda/kernel.cuh | 26 +- src/infiniop/ops/asinh/nvidia/asinh_nvidia.cu | 48 +- src/infiniop/ops/asinh/operator.cc | 132 +---- src/infiniop/ops/atan/cpu/atan_cpu.cc | 44 +- src/infiniop/ops/atan/cpu/atan_cpu.h | 17 +- src/infiniop/ops/atan/cuda/kernel.cuh | 26 +- src/infiniop/ops/atan/nvidia/atan_nvidia.cu | 48 +- src/infiniop/ops/atan/operator.cc | 132 +---- src/infiniop/ops/atanh/cpu/atanh_cpu.cc | 44 +- src/infiniop/ops/atanh/cpu/atanh_cpu.h | 17 +- src/infiniop/ops/atanh/cuda/kernel.cuh | 26 +- src/infiniop/ops/atanh/nvidia/atanh_nvidia.cu | 48 +- src/infiniop/ops/atanh/operator.cc | 132 +---- src/infiniop/ops/ceil/cpu/ceil_cpu.cc | 44 +- src/infiniop/ops/ceil/cpu/ceil_cpu.h | 21 +- src/infiniop/ops/ceil/cuda/kernel.cuh | 28 +- src/infiniop/ops/ceil/nvidia/ceil_nvidia.cu | 48 +- src/infiniop/ops/ceil/operator.cc | 132 +---- src/infiniop/ops/cos/cpu/cos_cpu.cc | 44 +- src/infiniop/ops/cos/cpu/cos_cpu.h | 17 +- src/infiniop/ops/cos/cuda/kernel.cuh | 26 +- src/infiniop/ops/cos/nvidia/cos_nvidia.cu | 48 +- src/infiniop/ops/cos/operator.cc | 132 +---- 
src/infiniop/ops/cosh/cpu/cosh_cpu.cc | 44 +- src/infiniop/ops/cosh/cpu/cosh_cpu.h | 17 +- src/infiniop/ops/cosh/cuda/kernel.cuh | 26 +- src/infiniop/ops/cosh/nvidia/cosh_nvidia.cu | 48 +- src/infiniop/ops/cosh/operator.cc | 132 +---- src/infiniop/ops/div/cpu/div_cpu.cc | 46 +- src/infiniop/ops/div/cpu/div_cpu.h | 14 +- src/infiniop/ops/div/cuda/kernel.cuh | 19 +- src/infiniop/ops/div/nvidia/div_nvidia.cu | 51 +- src/infiniop/ops/div/operator.cc | 195 +------ src/infiniop/ops/erf/cpu/erf_cpu.cc | 44 +- src/infiniop/ops/erf/cpu/erf_cpu.h | 17 +- src/infiniop/ops/erf/cuda/kernel.cuh | 26 +- src/infiniop/ops/erf/nvidia/erf_nvidia.cu | 48 +- src/infiniop/ops/erf/operator.cc | 132 +---- src/infiniop/ops/floor/cpu/floor_cpu.cc | 44 +- src/infiniop/ops/floor/cpu/floor_cpu.h | 21 +- src/infiniop/ops/floor/cuda/kernel.cuh | 28 +- src/infiniop/ops/floor/nvidia/floor_nvidia.cu | 48 +- src/infiniop/ops/floor/operator.cc | 132 +---- src/infiniop/ops/log/cpu/log_cpu.cc | 44 +- src/infiniop/ops/log/cpu/log_cpu.h | 17 +- src/infiniop/ops/log/cuda/kernel.cuh | 26 +- src/infiniop/ops/log/nvidia/log_nvidia.cu | 48 +- src/infiniop/ops/log/operator.cc | 132 +---- src/infiniop/ops/max/cpu/max_cpu.cc | 46 +- src/infiniop/ops/max/cpu/max_cpu.h | 15 +- src/infiniop/ops/max/cuda/kernel.cuh | 19 +- src/infiniop/ops/max/nvidia/max_nvidia.cu | 51 +- src/infiniop/ops/max/operator.cc | 195 +------ src/infiniop/ops/min/cpu/min_cpu.cc | 46 +- src/infiniop/ops/min/cpu/min_cpu.h | 15 +- src/infiniop/ops/min/cuda/kernel.cuh | 19 +- src/infiniop/ops/min/nvidia/min_nvidia.cu | 51 +- src/infiniop/ops/min/operator.cc | 195 +------ src/infiniop/ops/mod/cpu/mod_cpu.cc | 45 +- src/infiniop/ops/mod/cpu/mod_cpu.h | 18 +- src/infiniop/ops/mod/cuda/kernel.cuh | 24 +- src/infiniop/ops/mod/nvidia/mod_nvidia.cu | 51 +- src/infiniop/ops/mod/operator.cc | 135 +---- src/infiniop/ops/neg/cpu/neg_cpu.cc | 44 +- src/infiniop/ops/neg/cpu/neg_cpu.h | 15 +- src/infiniop/ops/neg/cuda/kernel.cuh | 17 +- src/infiniop/ops/neg/nvidia/neg_nvidia.cu | 48 +- src/infiniop/ops/neg/operator.cc | 132 +---- src/infiniop/ops/pow/cpu/pow_cpu.cc | 45 +- src/infiniop/ops/pow/cpu/pow_cpu.h | 14 +- src/infiniop/ops/pow/cuda/kernel.cuh | 34 +- src/infiniop/ops/pow/nvidia/pow_nvidia.cu | 51 +- src/infiniop/ops/pow/operator.cc | 135 +---- .../ops/reciprocal/cpu/reciprocal_cpu.cc | 44 +- .../ops/reciprocal/cpu/reciprocal_cpu.h | 15 +- src/infiniop/ops/reciprocal/cuda/kernel.cuh | 26 +- .../reciprocal/nvidia/reciprocal_nvidia.cu | 48 +- src/infiniop/ops/reciprocal/operator.cc | 132 +---- src/infiniop/ops/round/cpu/round_cpu.cc | 44 +- src/infiniop/ops/round/cpu/round_cpu.h | 20 +- src/infiniop/ops/round/cuda/kernel.cuh | 28 +- src/infiniop/ops/round/nvidia/round_nvidia.cu | 48 +- src/infiniop/ops/round/operator.cc | 132 +---- src/infiniop/ops/sign/cpu/sign_cpu.cc | 44 +- src/infiniop/ops/sign/cpu/sign_cpu.h | 15 +- src/infiniop/ops/sign/cuda/kernel.cuh | 19 +- src/infiniop/ops/sign/nvidia/sign_nvidia.cu | 48 +- src/infiniop/ops/sign/operator.cc | 132 +---- src/infiniop/ops/sinh/cpu/sinh_cpu.cc | 44 +- src/infiniop/ops/sinh/cpu/sinh_cpu.h | 17 +- src/infiniop/ops/sinh/cuda/kernel.cuh | 26 +- src/infiniop/ops/sinh/nvidia/sinh_nvidia.cu | 48 +- src/infiniop/ops/sinh/operator.cc | 132 +---- src/infiniop/ops/sqrt/cpu/sqrt_cpu.cc | 44 +- src/infiniop/ops/sqrt/cpu/sqrt_cpu.h | 17 +- src/infiniop/ops/sqrt/cuda/kernel.cuh | 26 +- src/infiniop/ops/sqrt/nvidia/sqrt_nvidia.cu | 48 +- src/infiniop/ops/sqrt/operator.cc | 132 +---- src/infiniop/ops/tan/cpu/tan_cpu.cc | 44 +- 
src/infiniop/ops/tan/cpu/tan_cpu.h | 17 +- src/infiniop/ops/tan/cuda/kernel.cuh | 49 +- src/infiniop/ops/tan/nvidia/tan_nvidia.cu | 48 +- src/infiniop/ops/tan/operator.cc | 132 +---- src/infiniop/ops/tanh/cuda/kernel.cuh | 38 +- src/infiniop/ops/tanh/nvidia/tanh_nvidia.cu | 53 +- 163 files changed, 1891 insertions(+), 7239 deletions(-) create mode 100644 include/infiniop/ops/binary_op_api.h create mode 100644 include/infiniop/ops/unary_op_api.h create mode 100755 scripts/test_binary_unary.py create mode 100644 src/infiniop/elementwise/binary.h create mode 100644 src/infiniop/elementwise/cpu/elementwise_cpu_impl.h create mode 100644 src/infiniop/elementwise/nvidia/elementwise_nvidia_impl.cuh create mode 100644 src/infiniop/elementwise/unary.h create mode 100644 src/infiniop/operator_impl.h diff --git a/include/infiniop/ops/abs.h b/include/infiniop/ops/abs.h index 7b5872657..1d1f1cbd1 100644 --- a/include/infiniop/ops/abs.h +++ b/include/infiniop/ops/abs.h @@ -1,24 +1,8 @@ #ifndef __INFINIOP_ABS_API_H__ #define __INFINIOP_ABS_API_H__ -#include "../operator_descriptor.h" +#include "unary_op_api.h" -typedef struct InfiniopDescriptor *infiniopAbsDescriptor_t; - -__C __export infiniStatus_t infiniopCreateAbsDescriptor(infiniopHandle_t handle, - infiniopAbsDescriptor_t *desc_ptr, - infiniopTensorDescriptor_t y, - infiniopTensorDescriptor_t x); - -__C __export infiniStatus_t infiniopGetAbsWorkspaceSize(infiniopAbsDescriptor_t desc, size_t *size); - -__C __export infiniStatus_t infiniopAbs(infiniopAbsDescriptor_t desc, - void *workspace, - size_t workspace_size, - void *y, - const void *x, - void *stream); - -__C __export infiniStatus_t infiniopDestroyAbsDescriptor(infiniopAbsDescriptor_t desc); +UNARY_OP_API_DECLARE(abs, Abs) #endif diff --git a/include/infiniop/ops/acos.h b/include/infiniop/ops/acos.h index fe6af01ed..c2f4de837 100644 --- a/include/infiniop/ops/acos.h +++ b/include/infiniop/ops/acos.h @@ -1,24 +1,8 @@ #ifndef __INFINIOP_ACOS_API_H__ #define __INFINIOP_ACOS_API_H__ -#include "../operator_descriptor.h" +#include "unary_op_api.h" -typedef struct InfiniopDescriptor *infiniopAcosDescriptor_t; - -__C __export infiniStatus_t infiniopCreateAcosDescriptor(infiniopHandle_t handle, - infiniopAcosDescriptor_t *desc_ptr, - infiniopTensorDescriptor_t y, - infiniopTensorDescriptor_t x); - -__C __export infiniStatus_t infiniopGetAcosWorkspaceSize(infiniopAcosDescriptor_t desc, size_t *size); - -__C __export infiniStatus_t infiniopAcos(infiniopAcosDescriptor_t desc, - void *workspace, - size_t workspace_size, - void *y, - const void *x, - void *stream); - -__C __export infiniStatus_t infiniopDestroyAcosDescriptor(infiniopAcosDescriptor_t desc); +UNARY_OP_API_DECLARE(acos, Acos) #endif diff --git a/include/infiniop/ops/acosh.h b/include/infiniop/ops/acosh.h index be28918bb..e8630b7d5 100644 --- a/include/infiniop/ops/acosh.h +++ b/include/infiniop/ops/acosh.h @@ -1,24 +1,8 @@ #ifndef __INFINIOP_ACOSH_API_H__ #define __INFINIOP_ACOSH_API_H__ -#include "../operator_descriptor.h" +#include "unary_op_api.h" -typedef struct InfiniopDescriptor *infiniopAcoshDescriptor_t; - -__C __export infiniStatus_t infiniopCreateAcoshDescriptor(infiniopHandle_t handle, - infiniopAcoshDescriptor_t *desc_ptr, - infiniopTensorDescriptor_t y, - infiniopTensorDescriptor_t x); - -__C __export infiniStatus_t infiniopGetAcoshWorkspaceSize(infiniopAcoshDescriptor_t desc, size_t *size); - -__C __export infiniStatus_t infiniopAcosh(infiniopAcoshDescriptor_t desc, - void *workspace, - size_t workspace_size, - void *y, - const 
void *x, - void *stream); - -__C __export infiniStatus_t infiniopDestroyAcoshDescriptor(infiniopAcoshDescriptor_t desc); +UNARY_OP_API_DECLARE(acosh, Acosh) #endif diff --git a/include/infiniop/ops/add.h b/include/infiniop/ops/add.h index 02f6225fb..abedb7f9d 100644 --- a/include/infiniop/ops/add.h +++ b/include/infiniop/ops/add.h @@ -1,26 +1,8 @@ #ifndef __INFINIOP_ADD_API_H__ #define __INFINIOP_ADD_API_H__ -#include "../operator_descriptor.h" +#include "binary_op_api.h" -typedef struct InfiniopDescriptor *infiniopAddDescriptor_t; - -__C __export infiniStatus_t infiniopCreateAddDescriptor(infiniopHandle_t handle, - infiniopAddDescriptor_t *desc_ptr, - infiniopTensorDescriptor_t c, - infiniopTensorDescriptor_t a, - infiniopTensorDescriptor_t b); - -__C __export infiniStatus_t infiniopGetAddWorkspaceSize(infiniopAddDescriptor_t desc, size_t *size); - -__C __export infiniStatus_t infiniopAdd(infiniopAddDescriptor_t desc, - void *workspace, - size_t workspace_size, - void *c, - const void *a, - const void *b, - void *stream); - -__C __export infiniStatus_t infiniopDestroyAddDescriptor(infiniopAddDescriptor_t desc); +BINARY_OP_API_DECLARE(add, Add) #endif diff --git a/include/infiniop/ops/asin.h b/include/infiniop/ops/asin.h index 2aac6d1e1..1a8bdd7b8 100644 --- a/include/infiniop/ops/asin.h +++ b/include/infiniop/ops/asin.h @@ -1,24 +1,8 @@ #ifndef __INFINIOP_ASIN_API_H__ #define __INFINIOP_ASIN_API_H__ -#include "../operator_descriptor.h" +#include "unary_op_api.h" -typedef struct InfiniopDescriptor *infiniopAsinDescriptor_t; - -__C __export infiniStatus_t infiniopCreateAsinDescriptor(infiniopHandle_t handle, - infiniopAsinDescriptor_t *desc_ptr, - infiniopTensorDescriptor_t y, - infiniopTensorDescriptor_t x); - -__C __export infiniStatus_t infiniopGetAsinWorkspaceSize(infiniopAsinDescriptor_t desc, size_t *size); - -__C __export infiniStatus_t infiniopAsin(infiniopAsinDescriptor_t desc, - void *workspace, - size_t workspace_size, - void *y, - const void *x, - void *stream); - -__C __export infiniStatus_t infiniopDestroyAsinDescriptor(infiniopAsinDescriptor_t desc); +UNARY_OP_API_DECLARE(asin, Asin) #endif diff --git a/include/infiniop/ops/asinh.h b/include/infiniop/ops/asinh.h index d1385fc01..2a3aebf5a 100644 --- a/include/infiniop/ops/asinh.h +++ b/include/infiniop/ops/asinh.h @@ -1,24 +1,8 @@ #ifndef __INFINIOP_ASINH_API_H__ #define __INFINIOP_ASINH_API_H__ -#include "../operator_descriptor.h" +#include "unary_op_api.h" -typedef struct InfiniopDescriptor *infiniopAsinhDescriptor_t; - -__C __export infiniStatus_t infiniopCreateAsinhDescriptor(infiniopHandle_t handle, - infiniopAsinhDescriptor_t *desc_ptr, - infiniopTensorDescriptor_t y, - infiniopTensorDescriptor_t x); - -__C __export infiniStatus_t infiniopGetAsinhWorkspaceSize(infiniopAsinhDescriptor_t desc, size_t *size); - -__C __export infiniStatus_t infiniopAsinh(infiniopAsinhDescriptor_t desc, - void *workspace, - size_t workspace_size, - void *y, - const void *x, - void *stream); - -__C __export infiniStatus_t infiniopDestroyAsinhDescriptor(infiniopAsinhDescriptor_t desc); +UNARY_OP_API_DECLARE(asinh, Asinh) #endif diff --git a/include/infiniop/ops/atan.h b/include/infiniop/ops/atan.h index 3b1a5bde3..18eed316f 100644 --- a/include/infiniop/ops/atan.h +++ b/include/infiniop/ops/atan.h @@ -1,24 +1,8 @@ #ifndef __INFINIOP_ATAN_API_H__ #define __INFINIOP_ATAN_API_H__ -#include "../operator_descriptor.h" +#include "unary_op_api.h" -typedef struct InfiniopDescriptor *infiniopAtanDescriptor_t; - -__C __export infiniStatus_t 
infiniopCreateAtanDescriptor(infiniopHandle_t handle, - infiniopAtanDescriptor_t *desc_ptr, - infiniopTensorDescriptor_t y, - infiniopTensorDescriptor_t x); - -__C __export infiniStatus_t infiniopGetAtanWorkspaceSize(infiniopAtanDescriptor_t desc, size_t *size); - -__C __export infiniStatus_t infiniopAtan(infiniopAtanDescriptor_t desc, - void *workspace, - size_t workspace_size, - void *y, - const void *x, - void *stream); - -__C __export infiniStatus_t infiniopDestroyAtanDescriptor(infiniopAtanDescriptor_t desc); +UNARY_OP_API_DECLARE(atan, Atan) #endif diff --git a/include/infiniop/ops/atanh.h b/include/infiniop/ops/atanh.h index 800afd5d5..e7db5b53c 100644 --- a/include/infiniop/ops/atanh.h +++ b/include/infiniop/ops/atanh.h @@ -1,24 +1,8 @@ #ifndef __INFINIOP_ATANH_API_H__ #define __INFINIOP_ATANH_API_H__ -#include "../operator_descriptor.h" +#include "unary_op_api.h" -typedef struct InfiniopDescriptor *infiniopAtanhDescriptor_t; - -__C __export infiniStatus_t infiniopCreateAtanhDescriptor(infiniopHandle_t handle, - infiniopAtanhDescriptor_t *desc_ptr, - infiniopTensorDescriptor_t y, - infiniopTensorDescriptor_t x); - -__C __export infiniStatus_t infiniopGetAtanhWorkspaceSize(infiniopAtanhDescriptor_t desc, size_t *size); - -__C __export infiniStatus_t infiniopAtanh(infiniopAtanhDescriptor_t desc, - void *workspace, - size_t workspace_size, - void *y, - const void *x, - void *stream); - -__C __export infiniStatus_t infiniopDestroyAtanhDescriptor(infiniopAtanhDescriptor_t desc); +UNARY_OP_API_DECLARE(atanh, Atanh) #endif diff --git a/include/infiniop/ops/binary_op_api.h b/include/infiniop/ops/binary_op_api.h new file mode 100644 index 000000000..4ab2401b9 --- /dev/null +++ b/include/infiniop/ops/binary_op_api.h @@ -0,0 +1,50 @@ +#ifndef __INFINIOP_BINARY_OP_API_H__ +#define __INFINIOP_BINARY_OP_API_H__ + +#include "../operator_descriptor.h" + +/** + * @brief Macro to generate the C API header for a binary operator. 
+ * + * This macro generates all the necessary declarations for a binary operator: + * - Descriptor type definition + * - Create descriptor function + * - Get workspace size function + * - Execute operator function + * - Destroy descriptor function + * + * Usage: + * BINARY_OP_API_DECLARE(div, Div) + * BINARY_OP_API_DECLARE(pow, Pow) + * + * @param OP_NAME Lowercase operator name (e.g., div, pow, mod) + * @param OP_NAME_UPPER Capitalized operator name (e.g., Div, Pow, Mod) + */ +#define BINARY_OP_API_DECLARE(OP_NAME, OP_NAME_UPPER) \ + \ + typedef struct InfiniopDescriptor *infiniop##OP_NAME_UPPER##Descriptor_t; \ + \ + __C __export infiniStatus_t infiniopCreate##OP_NAME_UPPER##Descriptor( \ + infiniopHandle_t handle, \ + infiniop##OP_NAME_UPPER##Descriptor_t *desc_ptr, \ + infiniopTensorDescriptor_t c, \ + infiniopTensorDescriptor_t a, \ + infiniopTensorDescriptor_t b); \ + \ + __C __export infiniStatus_t infiniopGet##OP_NAME_UPPER##WorkspaceSize( \ + infiniop##OP_NAME_UPPER##Descriptor_t desc, \ + size_t *size); \ + \ + __C __export infiniStatus_t infiniop##OP_NAME_UPPER( \ + infiniop##OP_NAME_UPPER##Descriptor_t desc, \ + void *workspace, \ + size_t workspace_size, \ + void *c, \ + const void *a, \ + const void *b, \ + void *stream); \ + \ + __C __export infiniStatus_t infiniopDestroy##OP_NAME_UPPER##Descriptor( \ + infiniop##OP_NAME_UPPER##Descriptor_t desc); + +#endif // __INFINIOP_BINARY_OP_API_H__ diff --git a/include/infiniop/ops/ceil.h b/include/infiniop/ops/ceil.h index 4539d77fd..8fca73b2e 100644 --- a/include/infiniop/ops/ceil.h +++ b/include/infiniop/ops/ceil.h @@ -1,24 +1,8 @@ #ifndef __INFINIOP_CEIL_API_H__ #define __INFINIOP_CEIL_API_H__ -#include "../operator_descriptor.h" +#include "unary_op_api.h" -typedef struct InfiniopDescriptor *infiniopCeilDescriptor_t; - -__C __export infiniStatus_t infiniopCreateCeilDescriptor(infiniopHandle_t handle, - infiniopCeilDescriptor_t *desc_ptr, - infiniopTensorDescriptor_t y, - infiniopTensorDescriptor_t x); - -__C __export infiniStatus_t infiniopGetCeilWorkspaceSize(infiniopCeilDescriptor_t desc, size_t *size); - -__C __export infiniStatus_t infiniopCeil(infiniopCeilDescriptor_t desc, - void *workspace, - size_t workspace_size, - void *y, - const void *x, - void *stream); - -__C __export infiniStatus_t infiniopDestroyCeilDescriptor(infiniopCeilDescriptor_t desc); +UNARY_OP_API_DECLARE(ceil, Ceil) #endif diff --git a/include/infiniop/ops/cos.h b/include/infiniop/ops/cos.h index 8f0b6eeb7..ed33b0a0e 100644 --- a/include/infiniop/ops/cos.h +++ b/include/infiniop/ops/cos.h @@ -1,24 +1,8 @@ #ifndef __INFINIOP_COS_API_H__ #define __INFINIOP_COS_API_H__ -#include "../operator_descriptor.h" +#include "unary_op_api.h" -typedef struct InfiniopDescriptor *infiniopCosDescriptor_t; - -__C __export infiniStatus_t infiniopCreateCosDescriptor(infiniopHandle_t handle, - infiniopCosDescriptor_t *desc_ptr, - infiniopTensorDescriptor_t y, - infiniopTensorDescriptor_t x); - -__C __export infiniStatus_t infiniopGetCosWorkspaceSize(infiniopCosDescriptor_t desc, size_t *size); - -__C __export infiniStatus_t infiniopCos(infiniopCosDescriptor_t desc, - void *workspace, - size_t workspace_size, - void *y, - const void *x, - void *stream); - -__C __export infiniStatus_t infiniopDestroyCosDescriptor(infiniopCosDescriptor_t desc); +UNARY_OP_API_DECLARE(cos, Cos) #endif diff --git a/include/infiniop/ops/cosh.h b/include/infiniop/ops/cosh.h index 3328151ad..b607b8fd1 100644 --- a/include/infiniop/ops/cosh.h +++ b/include/infiniop/ops/cosh.h @@ -1,24 +1,8 @@ #ifndef
__INFINIOP_COSH_API_H__ #define __INFINIOP_COSH_API_H__ -#include "../operator_descriptor.h" +#include "unary_op_api.h" -typedef struct InfiniopDescriptor *infiniopCoshDescriptor_t; - -__C __export infiniStatus_t infiniopCreateCoshDescriptor(infiniopHandle_t handle, - infiniopCoshDescriptor_t *desc_ptr, - infiniopTensorDescriptor_t y, - infiniopTensorDescriptor_t x); - -__C __export infiniStatus_t infiniopGetCoshWorkspaceSize(infiniopCoshDescriptor_t desc, size_t *size); - -__C __export infiniStatus_t infiniopCosh(infiniopCoshDescriptor_t desc, - void *workspace, - size_t workspace_size, - void *y, - const void *x, - void *stream); - -__C __export infiniStatus_t infiniopDestroyCoshDescriptor(infiniopCoshDescriptor_t desc); +UNARY_OP_API_DECLARE(cosh, Cosh) #endif diff --git a/include/infiniop/ops/div.h b/include/infiniop/ops/div.h index e539b440c..6f146bf4c 100644 --- a/include/infiniop/ops/div.h +++ b/include/infiniop/ops/div.h @@ -1,26 +1,8 @@ #ifndef __INFINIOP_DIV_API_H__ #define __INFINIOP_DIV_API_H__ -#include "../operator_descriptor.h" +#include "binary_op_api.h" -typedef struct InfiniopDescriptor *infiniopDivDescriptor_t; - -__C __export infiniStatus_t infiniopCreateDivDescriptor(infiniopHandle_t handle, - infiniopDivDescriptor_t *desc_ptr, - infiniopTensorDescriptor_t c, - infiniopTensorDescriptor_t a, - infiniopTensorDescriptor_t b); - -__C __export infiniStatus_t infiniopGetDivWorkspaceSize(infiniopDivDescriptor_t desc, size_t *size); - -__C __export infiniStatus_t infiniopDiv(infiniopDivDescriptor_t desc, - void *workspace, - size_t workspace_size, - void *c, - const void *a, - const void *b, - void *stream); - -__C __export infiniStatus_t infiniopDestroyDivDescriptor(infiniopDivDescriptor_t desc); +BINARY_OP_API_DECLARE(div, Div) #endif diff --git a/include/infiniop/ops/erf.h b/include/infiniop/ops/erf.h index 8cbb8fb74..0dcc149da 100644 --- a/include/infiniop/ops/erf.h +++ b/include/infiniop/ops/erf.h @@ -1,24 +1,8 @@ #ifndef __INFINIOP_ERF_API_H__ #define __INFINIOP_ERF_API_H__ -#include "../operator_descriptor.h" +#include "unary_op_api.h" -typedef struct InfiniopDescriptor *infiniopErfDescriptor_t; - -__C __export infiniStatus_t infiniopCreateErfDescriptor(infiniopHandle_t handle, - infiniopErfDescriptor_t *desc_ptr, - infiniopTensorDescriptor_t y, - infiniopTensorDescriptor_t x); - -__C __export infiniStatus_t infiniopGetErfWorkspaceSize(infiniopErfDescriptor_t desc, size_t *size); - -__C __export infiniStatus_t infiniopErf(infiniopErfDescriptor_t desc, - void *workspace, - size_t workspace_size, - void *y, - const void *x, - void *stream); - -__C __export infiniStatus_t infiniopDestroyErfDescriptor(infiniopErfDescriptor_t desc); +UNARY_OP_API_DECLARE(erf, Erf) #endif diff --git a/include/infiniop/ops/floor.h b/include/infiniop/ops/floor.h index 2f65f8f4a..02efc6761 100644 --- a/include/infiniop/ops/floor.h +++ b/include/infiniop/ops/floor.h @@ -1,24 +1,8 @@ #ifndef __INFINIOP_FLOOR_API_H__ #define __INFINIOP_FLOOR_API_H__ -#include "../operator_descriptor.h" +#include "unary_op_api.h" -typedef struct InfiniopDescriptor *infiniopFloorDescriptor_t; - -__C __export infiniStatus_t infiniopCreateFloorDescriptor(infiniopHandle_t handle, - infiniopFloorDescriptor_t *desc_ptr, - infiniopTensorDescriptor_t y, - infiniopTensorDescriptor_t x); - -__C __export infiniStatus_t infiniopGetFloorWorkspaceSize(infiniopFloorDescriptor_t desc, size_t *size); - -__C __export infiniStatus_t infiniopFloor(infiniopFloorDescriptor_t desc, - void *workspace, - size_t workspace_size, - void *y, - 
const void *x, - void *stream); - -__C __export infiniStatus_t infiniopDestroyFloorDescriptor(infiniopFloorDescriptor_t desc); +UNARY_OP_API_DECLARE(floor, Floor) #endif diff --git a/include/infiniop/ops/log.h b/include/infiniop/ops/log.h index f5bec4382..3892ccb6e 100644 --- a/include/infiniop/ops/log.h +++ b/include/infiniop/ops/log.h @@ -1,24 +1,8 @@ #ifndef __INFINIOP_LOG_API_H__ #define __INFINIOP_LOG_API_H__ -#include "../operator_descriptor.h" +#include "unary_op_api.h" -typedef struct InfiniopDescriptor *infiniopLogDescriptor_t; - -__C __export infiniStatus_t infiniopCreateLogDescriptor(infiniopHandle_t handle, - infiniopLogDescriptor_t *desc_ptr, - infiniopTensorDescriptor_t y, - infiniopTensorDescriptor_t x); - -__C __export infiniStatus_t infiniopGetLogWorkspaceSize(infiniopLogDescriptor_t desc, size_t *size); - -__C __export infiniStatus_t infiniopLog(infiniopLogDescriptor_t desc, - void *workspace, - size_t workspace_size, - void *y, - const void *x, - void *stream); - -__C __export infiniStatus_t infiniopDestroyLogDescriptor(infiniopLogDescriptor_t desc); +UNARY_OP_API_DECLARE(log, Log) #endif diff --git a/include/infiniop/ops/max.h b/include/infiniop/ops/max.h index e6f2f5d4c..4b91e5c83 100644 --- a/include/infiniop/ops/max.h +++ b/include/infiniop/ops/max.h @@ -1,26 +1,8 @@ #ifndef __INFINIOP_MAX_API_H__ #define __INFINIOP_MAX_API_H__ -#include "../operator_descriptor.h" +#include "binary_op_api.h" -typedef struct InfiniopDescriptor *infiniopMaxDescriptor_t; - -__C __export infiniStatus_t infiniopCreateMaxDescriptor(infiniopHandle_t handle, - infiniopMaxDescriptor_t *desc_ptr, - infiniopTensorDescriptor_t c, - infiniopTensorDescriptor_t a, - infiniopTensorDescriptor_t b); - -__C __export infiniStatus_t infiniopGetMaxWorkspaceSize(infiniopMaxDescriptor_t desc, size_t *size); - -__C __export infiniStatus_t infiniopMax(infiniopMaxDescriptor_t desc, - void *workspace, - size_t workspace_size, - void *c, - const void *a, - const void *b, - void *stream); - -__C __export infiniStatus_t infiniopDestroyMaxDescriptor(infiniopMaxDescriptor_t desc); +BINARY_OP_API_DECLARE(max, Max) #endif diff --git a/include/infiniop/ops/min.h b/include/infiniop/ops/min.h index f72f0c4db..1496806df 100644 --- a/include/infiniop/ops/min.h +++ b/include/infiniop/ops/min.h @@ -1,26 +1,8 @@ #ifndef __INFINIOP_MIN_API_H__ #define __INFINIOP_MIN_API_H__ -#include "../operator_descriptor.h" +#include "binary_op_api.h" -typedef struct InfiniopDescriptor *infiniopMinDescriptor_t; - -__C __export infiniStatus_t infiniopCreateMinDescriptor(infiniopHandle_t handle, - infiniopMinDescriptor_t *desc_ptr, - infiniopTensorDescriptor_t c, - infiniopTensorDescriptor_t a, - infiniopTensorDescriptor_t b); - -__C __export infiniStatus_t infiniopGetMinWorkspaceSize(infiniopMinDescriptor_t desc, size_t *size); - -__C __export infiniStatus_t infiniopMin(infiniopMinDescriptor_t desc, - void *workspace, - size_t workspace_size, - void *c, - const void *a, - const void *b, - void *stream); - -__C __export infiniStatus_t infiniopDestroyMinDescriptor(infiniopMinDescriptor_t desc); +BINARY_OP_API_DECLARE(min, Min) #endif diff --git a/include/infiniop/ops/mod.h b/include/infiniop/ops/mod.h index 5a6cd5bbf..e4fcd571e 100644 --- a/include/infiniop/ops/mod.h +++ b/include/infiniop/ops/mod.h @@ -1,26 +1,8 @@ #ifndef __INFINIOP_MOD_API_H__ #define __INFINIOP_MOD_API_H__ -#include "../operator_descriptor.h" +#include "binary_op_api.h" -typedef struct InfiniopDescriptor *infiniopModDescriptor_t; - -__C __export infiniStatus_t 
infiniopCreateModDescriptor(infiniopHandle_t handle, - infiniopModDescriptor_t *desc_ptr, - infiniopTensorDescriptor_t c, - infiniopTensorDescriptor_t a, - infiniopTensorDescriptor_t b); - -__C __export infiniStatus_t infiniopGetModWorkspaceSize(infiniopModDescriptor_t desc, size_t *size); - -__C __export infiniStatus_t infiniopMod(infiniopModDescriptor_t desc, - void *workspace, - size_t workspace_size, - void *c, - const void *a, - const void *b, - void *stream); - -__C __export infiniStatus_t infiniopDestroyModDescriptor(infiniopModDescriptor_t desc); +BINARY_OP_API_DECLARE(mod, Mod) #endif diff --git a/include/infiniop/ops/mul.h b/include/infiniop/ops/mul.h index 06200b55b..2dfd92ef4 100644 --- a/include/infiniop/ops/mul.h +++ b/include/infiniop/ops/mul.h @@ -1,26 +1,8 @@ #ifndef __INFINIOP_MUL_API_H__ #define __INFINIOP_MUL_API_H__ -#include "../operator_descriptor.h" +#include "binary_op_api.h" -typedef struct InfiniopDescriptor *infiniopMulDescriptor_t; - -__C __export infiniStatus_t infiniopCreateMulDescriptor(infiniopHandle_t handle, - infiniopMulDescriptor_t *desc_ptr, - infiniopTensorDescriptor_t c, - infiniopTensorDescriptor_t a, - infiniopTensorDescriptor_t b); - -__C __export infiniStatus_t infiniopGetMulWorkspaceSize(infiniopMulDescriptor_t desc, size_t *size); - -__C __export infiniStatus_t infiniopMul(infiniopMulDescriptor_t desc, - void *workspace, - size_t workspace_size, - void *c, - const void *a, - const void *b, - void *stream); - -__C __export infiniStatus_t infiniopDestroyMulDescriptor(infiniopMulDescriptor_t desc); +BINARY_OP_API_DECLARE(mul, Mul) #endif diff --git a/include/infiniop/ops/neg.h b/include/infiniop/ops/neg.h index 4d3b06e21..0d18bbd5c 100644 --- a/include/infiniop/ops/neg.h +++ b/include/infiniop/ops/neg.h @@ -1,24 +1,8 @@ #ifndef __INFINIOP_NEG_API_H__ #define __INFINIOP_NEG_API_H__ -#include "../operator_descriptor.h" +#include "unary_op_api.h" -typedef struct InfiniopDescriptor *infiniopNegDescriptor_t; - -__C __export infiniStatus_t infiniopCreateNegDescriptor(infiniopHandle_t handle, - infiniopNegDescriptor_t *desc_ptr, - infiniopTensorDescriptor_t y, - infiniopTensorDescriptor_t x); - -__C __export infiniStatus_t infiniopGetNegWorkspaceSize(infiniopNegDescriptor_t desc, size_t *size); - -__C __export infiniStatus_t infiniopNeg(infiniopNegDescriptor_t desc, - void *workspace, - size_t workspace_size, - void *y, - const void *x, - void *stream); - -__C __export infiniStatus_t infiniopDestroyNegDescriptor(infiniopNegDescriptor_t desc); +UNARY_OP_API_DECLARE(neg, Neg) #endif diff --git a/include/infiniop/ops/pow.h b/include/infiniop/ops/pow.h index 6449d8622..f4e263a58 100644 --- a/include/infiniop/ops/pow.h +++ b/include/infiniop/ops/pow.h @@ -1,26 +1,8 @@ #ifndef __INFINIOP_POW_API_H__ #define __INFINIOP_POW_API_H__ -#include "../operator_descriptor.h" +#include "binary_op_api.h" -typedef struct InfiniopDescriptor *infiniopPowDescriptor_t; - -__C __export infiniStatus_t infiniopCreatePowDescriptor(infiniopHandle_t handle, - infiniopPowDescriptor_t *desc_ptr, - infiniopTensorDescriptor_t c, - infiniopTensorDescriptor_t a, - infiniopTensorDescriptor_t b); - -__C __export infiniStatus_t infiniopGetPowWorkspaceSize(infiniopPowDescriptor_t desc, size_t *size); - -__C __export infiniStatus_t infiniopPow(infiniopPowDescriptor_t desc, - void *workspace, - size_t workspace_size, - void *c, - const void *a, - const void *b, - void *stream); - -__C __export infiniStatus_t infiniopDestroyPowDescriptor(infiniopPowDescriptor_t desc); 
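+// The five boilerplate declarations removed above collapse into this single
+// macro call (BINARY_OP_API_DECLARE is defined in binary_op_api.h).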
+BINARY_OP_API_DECLARE(pow, Pow) #endif diff --git a/include/infiniop/ops/reciprocal.h b/include/infiniop/ops/reciprocal.h index 73836fea4..7d5626176 100644 --- a/include/infiniop/ops/reciprocal.h +++ b/include/infiniop/ops/reciprocal.h @@ -1,24 +1,8 @@ #ifndef __INFINIOP_RECIPROCAL_API_H__ #define __INFINIOP_RECIPROCAL_API_H__ -#include "../operator_descriptor.h" +#include "unary_op_api.h" -typedef struct InfiniopDescriptor *infiniopReciprocalDescriptor_t; - -__C __export infiniStatus_t infiniopCreateReciprocalDescriptor(infiniopHandle_t handle, - infiniopReciprocalDescriptor_t *desc_ptr, - infiniopTensorDescriptor_t y, - infiniopTensorDescriptor_t x); - -__C __export infiniStatus_t infiniopGetReciprocalWorkspaceSize(infiniopReciprocalDescriptor_t desc, size_t *size); - -__C __export infiniStatus_t infiniopReciprocal(infiniopReciprocalDescriptor_t desc, - void *workspace, - size_t workspace_size, - void *y, - const void *x, - void *stream); - -__C __export infiniStatus_t infiniopDestroyReciprocalDescriptor(infiniopReciprocalDescriptor_t desc); +UNARY_OP_API_DECLARE(reciprocal, Reciprocal) #endif diff --git a/include/infiniop/ops/round.h b/include/infiniop/ops/round.h index 18c7fe44e..1bf4377ff 100644 --- a/include/infiniop/ops/round.h +++ b/include/infiniop/ops/round.h @@ -1,24 +1,8 @@ #ifndef __INFINIOP_ROUND_API_H__ #define __INFINIOP_ROUND_API_H__ -#include "../operator_descriptor.h" +#include "unary_op_api.h" -typedef struct InfiniopDescriptor *infiniopRoundDescriptor_t; - -__C __export infiniStatus_t infiniopCreateRoundDescriptor(infiniopHandle_t handle, - infiniopRoundDescriptor_t *desc_ptr, - infiniopTensorDescriptor_t y, - infiniopTensorDescriptor_t x); - -__C __export infiniStatus_t infiniopGetRoundWorkspaceSize(infiniopRoundDescriptor_t desc, size_t *size); - -__C __export infiniStatus_t infiniopRound(infiniopRoundDescriptor_t desc, - void *workspace, - size_t workspace_size, - void *y, - const void *x, - void *stream); - -__C __export infiniStatus_t infiniopDestroyRoundDescriptor(infiniopRoundDescriptor_t desc); +UNARY_OP_API_DECLARE(round, Round) #endif diff --git a/include/infiniop/ops/sign.h b/include/infiniop/ops/sign.h index fe47c7190..ef7854de8 100644 --- a/include/infiniop/ops/sign.h +++ b/include/infiniop/ops/sign.h @@ -1,24 +1,8 @@ #ifndef __INFINIOP_SIGN_API_H__ #define __INFINIOP_SIGN_API_H__ -#include "../operator_descriptor.h" +#include "unary_op_api.h" -typedef struct InfiniopDescriptor *infiniopSignDescriptor_t; - -__C __export infiniStatus_t infiniopCreateSignDescriptor(infiniopHandle_t handle, - infiniopSignDescriptor_t *desc_ptr, - infiniopTensorDescriptor_t y, - infiniopTensorDescriptor_t x); - -__C __export infiniStatus_t infiniopGetSignWorkspaceSize(infiniopSignDescriptor_t desc, size_t *size); - -__C __export infiniStatus_t infiniopSign(infiniopSignDescriptor_t desc, - void *workspace, - size_t workspace_size, - void *y, - const void *x, - void *stream); - -__C __export infiniStatus_t infiniopDestroySignDescriptor(infiniopSignDescriptor_t desc); +UNARY_OP_API_DECLARE(sign, Sign) #endif diff --git a/include/infiniop/ops/sinh.h b/include/infiniop/ops/sinh.h index a5325fb81..ea8511a2b 100644 --- a/include/infiniop/ops/sinh.h +++ b/include/infiniop/ops/sinh.h @@ -1,24 +1,8 @@ #ifndef __INFINIOP_SINH_API_H__ #define __INFINIOP_SINH_API_H__ -#include "../operator_descriptor.h" +#include "unary_op_api.h" -typedef struct InfiniopDescriptor *infiniopSinhDescriptor_t; - -__C __export infiniStatus_t infiniopCreateSinhDescriptor(infiniopHandle_t handle, - 
infiniopSinhDescriptor_t *desc_ptr, - infiniopTensorDescriptor_t y, - infiniopTensorDescriptor_t x); - -__C __export infiniStatus_t infiniopGetSinhWorkspaceSize(infiniopSinhDescriptor_t desc, size_t *size); - -__C __export infiniStatus_t infiniopSinh(infiniopSinhDescriptor_t desc, - void *workspace, - size_t workspace_size, - void *y, - const void *x, - void *stream); - -__C __export infiniStatus_t infiniopDestroySinhDescriptor(infiniopSinhDescriptor_t desc); +UNARY_OP_API_DECLARE(sinh, Sinh) #endif diff --git a/include/infiniop/ops/sqrt.h b/include/infiniop/ops/sqrt.h index db04ec8bc..6df6fe89c 100644 --- a/include/infiniop/ops/sqrt.h +++ b/include/infiniop/ops/sqrt.h @@ -1,24 +1,8 @@ #ifndef __INFINIOP_SQRT_API_H__ #define __INFINIOP_SQRT_API_H__ -#include "../operator_descriptor.h" +#include "unary_op_api.h" -typedef struct InfiniopDescriptor *infiniopSqrtDescriptor_t; - -__C __export infiniStatus_t infiniopCreateSqrtDescriptor(infiniopHandle_t handle, - infiniopSqrtDescriptor_t *desc_ptr, - infiniopTensorDescriptor_t y, - infiniopTensorDescriptor_t x); - -__C __export infiniStatus_t infiniopGetSqrtWorkspaceSize(infiniopSqrtDescriptor_t desc, size_t *size); - -__C __export infiniStatus_t infiniopSqrt(infiniopSqrtDescriptor_t desc, - void *workspace, - size_t workspace_size, - void *y, - const void *x, - void *stream); - -__C __export infiniStatus_t infiniopDestroySqrtDescriptor(infiniopSqrtDescriptor_t desc); +UNARY_OP_API_DECLARE(sqrt, Sqrt) #endif diff --git a/include/infiniop/ops/sub.h b/include/infiniop/ops/sub.h index da2aa8568..9b5fa397b 100644 --- a/include/infiniop/ops/sub.h +++ b/include/infiniop/ops/sub.h @@ -1,26 +1,8 @@ #ifndef __INFINIOP_SUB_API_H__ #define __INFINIOP_SUB_API_H__ -#include "../operator_descriptor.h" +#include "binary_op_api.h" -typedef struct InfiniopDescriptor *infiniopSubDescriptor_t; - -__C __export infiniStatus_t infiniopCreateSubDescriptor(infiniopHandle_t handle, - infiniopSubDescriptor_t *desc_ptr, - infiniopTensorDescriptor_t c, - infiniopTensorDescriptor_t a, - infiniopTensorDescriptor_t b); - -__C __export infiniStatus_t infiniopGetSubWorkspaceSize(infiniopSubDescriptor_t desc, size_t *size); - -__C __export infiniStatus_t infiniopSub(infiniopSubDescriptor_t desc, - void *workspace, - size_t workspace_size, - void *c, - const void *a, - const void *b, - void *stream); - -__C __export infiniStatus_t infiniopDestroySubDescriptor(infiniopSubDescriptor_t desc); +BINARY_OP_API_DECLARE(sub, Sub) #endif diff --git a/include/infiniop/ops/tan.h b/include/infiniop/ops/tan.h index 69fc47bf1..d4a2f0bf2 100644 --- a/include/infiniop/ops/tan.h +++ b/include/infiniop/ops/tan.h @@ -1,24 +1,8 @@ #ifndef __INFINIOP_TAN_API_H__ #define __INFINIOP_TAN_API_H__ -#include "../operator_descriptor.h" +#include "unary_op_api.h" -typedef struct InfiniopDescriptor *infiniopTanDescriptor_t; - -__C __export infiniStatus_t infiniopCreateTanDescriptor(infiniopHandle_t handle, - infiniopTanDescriptor_t *desc_ptr, - infiniopTensorDescriptor_t y, - infiniopTensorDescriptor_t x); - -__C __export infiniStatus_t infiniopGetTanWorkspaceSize(infiniopTanDescriptor_t desc, size_t *size); - -__C __export infiniStatus_t infiniopTan(infiniopTanDescriptor_t desc, - void *workspace, - size_t workspace_size, - void *y, - const void *x, - void *stream); - -__C __export infiniStatus_t infiniopDestroyTanDescriptor(infiniopTanDescriptor_t desc); +UNARY_OP_API_DECLARE(tan, Tan) #endif diff --git a/include/infiniop/ops/unary_op_api.h b/include/infiniop/ops/unary_op_api.h new file mode 100644 index 
000000000..eefe3c3a4
--- /dev/null
+++ b/include/infiniop/ops/unary_op_api.h
@@ -0,0 +1,48 @@
+#ifndef __INFINIOP_UNARY_OP_API_H__
+#define __INFINIOP_UNARY_OP_API_H__
+
+#include "../operator_descriptor.h"
+
+/**
+ * @brief Macro to generate the C API header for a unary operator.
+ *
+ * This macro generates all the necessary declarations for a unary operator:
+ * - Descriptor type definition
+ * - Create descriptor function
+ * - Get workspace size function
+ * - Execute operator function
+ * - Destroy descriptor function
+ *
+ * Usage:
+ *   UNARY_OP_API_DECLARE(abs, Abs)
+ *   UNARY_OP_API_DECLARE(log, Log)
+ *
+ * @param OP_NAME Lowercase operator name (e.g., abs, log, sin)
+ * @param OP_NAME_UPPER Capitalized operator name (e.g., Abs, Log, Sin)
+ */
+#define UNARY_OP_API_DECLARE(OP_NAME, OP_NAME_UPPER)                          \
+                                                                              \
+    typedef struct InfiniopDescriptor *infiniop##OP_NAME_UPPER##Descriptor_t; \
+                                                                              \
+    __C __export infiniStatus_t infiniopCreate##OP_NAME_UPPER##Descriptor(    \
+        infiniopHandle_t handle,                                              \
+        infiniop##OP_NAME_UPPER##Descriptor_t *desc_ptr,                      \
+        infiniopTensorDescriptor_t y,                                         \
+        infiniopTensorDescriptor_t x);                                        \
+                                                                              \
+    __C __export infiniStatus_t infiniopGet##OP_NAME_UPPER##WorkspaceSize(    \
+        infiniop##OP_NAME_UPPER##Descriptor_t desc,                           \
+        size_t *size);                                                        \
+                                                                              \
+    __C __export infiniStatus_t infiniop##OP_NAME_UPPER(                      \
+        infiniop##OP_NAME_UPPER##Descriptor_t desc,                           \
+        void *workspace,                                                      \
+        size_t workspace_size,                                                \
+        void *y,                                                              \
+        const void *x,                                                        \
+        void *stream);                                                        \
+                                                                              \
+    __C __export infiniStatus_t infiniopDestroy##OP_NAME_UPPER##Descriptor(   \
+        infiniop##OP_NAME_UPPER##Descriptor_t desc);
+
+#endif // __INFINIOP_UNARY_OP_API_H__
diff --git a/scripts/test_binary_unary.py b/scripts/test_binary_unary.py
new file mode 100755
index 000000000..8dbbfbf53
--- /dev/null
+++ b/scripts/test_binary_unary.py
@@ -0,0 +1,143 @@
+import os
+import subprocess
+import sys
+from set_env import set_env
+
+PROJECT_DIR = os.path.abspath(
+    os.path.join(os.path.dirname(__file__), "..", "test", "infiniop")
+)
+os.chdir(PROJECT_DIR)
+
+
+def run_tests(args):
+    failed = []
+
+    # Binary operators (refactored)
+    binary_tests = [
+        "div.py",
+        "pow.py",
+        "mod.py",
+        "min.py",
+        "max.py",
+    ]
+
+    # Unary operators (refactored)
+    unary_tests = [
+        "abs.py",
+        "log.py",
+        "cos.py",
+        "sqrt.py",
+        "neg.py",
+        "sign.py",
+        "reciprocal.py",
+        "round.py",
+        "floor.py",
+        "ceil.py",
+        "erf.py",
+        "cosh.py",
+        "sinh.py",
+        "tan.py",
+        "acos.py",
+        "acosh.py",
+        "asin.py",
+        "asinh.py",
+        "atan.py",
+        "atanh.py",
+    ]
+
+    all_tests = binary_tests + unary_tests
+
+    print("\033[94m" + "=" * 60 + "\033[0m")
+    print("\033[94mTesting Binary and Unary Operators (Refactored)\033[0m")
+    print("\033[94m" + "=" * 60 + "\033[0m")
+    print(f"\033[94mTotal tests: {len(all_tests)}\033[0m")
+    print(f"\033[94m  - Binary operators: {len(binary_tests)}\033[0m")
+    print(f"\033[94m  - Unary operators: {len(unary_tests)}\033[0m")
+    print()
+
+    for test in all_tests:
+        if not os.path.exists(test):
+            print(f"\033[93m[SKIP] {test} - test file not found\033[0m")
+            continue
+
+        print(f"\033[96m[RUN] {test}\033[0m", end=" ... ", flush=True)
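+        # Each test file runs in its own process so a crash in one operator
+        # cannot abort the whole sweep; output is captured and replayed
+        # only when the test fails.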
", flush=True) + result = subprocess.run( + f"python3 {test} {args}", + text=True, + encoding="utf-8", + shell=True, + capture_output=True + ) + + if result.returncode != 0: + print(f"\033[91m[FAIL]\033[0m") + print(f"\033[91mError output:\033[0m") + print(result.stderr) + failed.append(test) + else: + print(f"\033[92m[PASS]\033[0m") + + return failed + + +if __name__ == "__main__": + import argparse + + parser = argparse.ArgumentParser( + description="Test refactored binary and unary operators", + formatter_class=argparse.RawDescriptionHelpFormatter, + epilog=""" +Examples: + # Test on CPU only (default) + python3 scripts/test_binary_unary.py --cpu + + # Test on NVIDIA GPU only + python3 scripts/test_binary_unary.py --nvidia + + # Test on both CPU and NVIDIA + python3 scripts/test_binary_unary.py --cpu --nvidia + + # Test with debug mode + python3 scripts/test_binary_unary.py --cpu --debug + + # Test with profiling + python3 scripts/test_binary_unary.py --nvidia --profile + """ + ) + + # Device selection arguments (same as test files) + parser.add_argument("--cpu", action="store_true", help="Run CPU tests") + parser.add_argument("--nvidia", action="store_true", help="Run NVIDIA GPU tests") + parser.add_argument("--debug", action="store_true", help="Enable debug mode") + parser.add_argument("--profile", action="store_true", help="Enable profiling") + + args, unknown = parser.parse_known_args() + + # Build command line arguments to pass to test files + test_args = [] + if args.cpu: + test_args.append("--cpu") + if args.nvidia: + test_args.append("--nvidia") + if args.debug: + test_args.append("--debug") + if args.profile: + test_args.append("--profile") + + # Add any unknown arguments (for compatibility) + test_args.extend(unknown) + + set_env() + failed = run_tests(" ".join(test_args)) + + print() + print("\033[94m" + "=" * 60 + "\033[0m") + if len(failed) == 0: + print("\033[92m✓ All tests passed!\033[0m") + else: + print(f"\033[91m✗ {len(failed)} test(s) failed:\033[0m") + for test in failed: + print(f"\033[91m - {test}\033[0m") + print("\033[94m" + "=" * 60 + "\033[0m") + + exit(len(failed)) diff --git a/src/infiniop/elementwise/binary.h b/src/infiniop/elementwise/binary.h new file mode 100644 index 000000000..1823fac3f --- /dev/null +++ b/src/infiniop/elementwise/binary.h @@ -0,0 +1,261 @@ +#ifndef __INFINIOP_ELEMENTWISE_BINARY_H__ +#define __INFINIOP_ELEMENTWISE_BINARY_H__ + +#include +#include +#include + +#ifdef __CUDACC__ +#include +#include +#include +// Include device-specific type aliases for cuda_bfloat16 +#include "../devices/nvidia/nvidia_kernel_common.cuh" +#endif + +namespace op::elementwise::binary { + +/** + * @brief Represents all the currently defined binary operations. + * + * This enum is used to specify which binary operation to perform + * in the generic BinaryOp template. + */ +enum class BinaryMode { + // Arithmetic operations: + Add, + Subtract, + Multiply, + Divide, + Pow, + Mod, + Max, + Min, + // Logical operations (for future use): + // And, Or, Xor, Less, LessOrEqual, Equal, Greater, GreaterOrEqual +}; + +/** + * @brief Generic binary operation template that performs different operations + * based on the specified BinaryMode. + * + * This template allows multiple binary operators (pow, div, mod, min, max, etc.) + * to share the same implementation infrastructure while only differing in the + * operation mode. 
+ *
+ * @tparam Mode The binary operation mode (from BinaryMode enum)
+ */
+template <BinaryMode Mode>
+struct BinaryOp {
+    static constexpr size_t num_inputs = 2;
+
+    template <typename T>
+    T operator()(const T &a, const T &b) const {
+        if constexpr (Mode == BinaryMode::Add) {
+            return a + b;
+        } else if constexpr (Mode == BinaryMode::Subtract) {
+            return a - b;
+        } else if constexpr (Mode == BinaryMode::Multiply) {
+            return a * b;
+        } else if constexpr (Mode == BinaryMode::Divide) {
+            return a / b;
+        } else if constexpr (Mode == BinaryMode::Pow) {
+            return std::pow(a, b);
+        } else if constexpr (Mode == BinaryMode::Mod) {
+            if constexpr (std::is_floating_point_v<T>) {
+                return std::fmod(a, b);
+            } else {
+                return a % b;
+            }
+        } else if constexpr (Mode == BinaryMode::Max) {
+            if constexpr (std::is_floating_point_v<T>) {
+                return std::fmax(a, b);
+            } else {
+                return std::max(a, b);
+            }
+        } else if constexpr (Mode == BinaryMode::Min) {
+            if constexpr (std::is_floating_point_v<T>) {
+                return std::fmin(a, b);
+            } else {
+                return std::min(a, b);
+            }
+        } else {
+            static_assert(Mode != Mode, "Unsupported binary operation mode");
+            return a;
+        }
+    }
+};
+
+#ifdef __CUDACC__
+/**
+ * @brief CUDA-specific binary operation template that performs different operations
+ * based on the specified BinaryMode, using CUDA-optimized functions.
+ *
+ * This template provides CUDA device functions optimized for GPU execution,
+ * using intrinsics like __powf, __h2div, __hmin2, __hmax2, etc.
+ *
+ * @tparam Mode The binary operation mode (from BinaryMode enum)
+ */
+namespace cuda {
+template <BinaryMode Mode>
+struct BinaryOp {
+    static constexpr size_t num_inputs = 2;
+
+    template <typename T>
+    __device__ __forceinline__ T operator()(const T &a, const T &b) const {
+        if constexpr (Mode == BinaryMode::Add) {
+            if constexpr (std::is_same_v<T, half2>) {
+                return __hadd2(a, b);
+            } else if constexpr (std::is_same_v<T, half> || std::is_same_v<T, cuda_bfloat16>) {
+                return __hadd(a, b);
+            } else if constexpr (std::is_same_v<T, float>) {
+                return __fadd_rn(a, b);
+            } else {
+                return a + b;
+            }
+        } else if constexpr (Mode == BinaryMode::Subtract) {
+            if constexpr (std::is_same_v<T, half2>) {
+                return __hsub2(a, b);
+            } else if constexpr (std::is_same_v<T, half> || std::is_same_v<T, cuda_bfloat16>) {
+                return __hsub(a, b);
+            } else if constexpr (std::is_same_v<T, float>) {
+                return __fsub_rn(a, b);
+            } else {
+                return a - b;
+            }
+        } else if constexpr (Mode == BinaryMode::Multiply) {
+            if constexpr (std::is_same_v<T, half2>) {
+                return __hmul2(a, b);
+            } else if constexpr (std::is_same_v<T, half> || std::is_same_v<T, cuda_bfloat16>) {
+                return __hmul(a, b);
+            } else if constexpr (std::is_same_v<T, float>) {
+                return __fmul_rd(a, b);
+            } else {
+                return a * b;
+            }
+        } else if constexpr (Mode == BinaryMode::Divide) {
+            if constexpr (std::is_same_v<T, half2>) {
+                return __h2div(a, b);
+            } else if constexpr (std::is_same_v<T, half> || std::is_same_v<T, cuda_bfloat16>) {
+                return a / b;
+            } else if constexpr (std::is_same_v<T, float>) {
+                return __fdividef(a, b);
+            } else {
+                return a / b;
+            }
+        } else if constexpr (Mode == BinaryMode::Pow) {
+            if constexpr (std::is_same_v<T, half2>) {
+                float2 a_f2 = __half22float2(a);
+                float2 b_f2 = __half22float2(b);
+                return __float22half2_rn(make_float2(__powf(a_f2.x, b_f2.x), __powf(a_f2.y, b_f2.y)));
+            } else if constexpr (std::is_same_v<T, half>) {
+                float a_ = __half2float(a);
+                float b_ = __half2float(b);
+                float ans_f = __powf(a_, b_);
+                // __powf is a fast approximation that yields NaN for negative
+                // bases; fall back to the accurate pow in that case.
+                return __float2half(isnan(ans_f) ? std::pow(a_, b_) : ans_f);
+            } else if constexpr (std::is_same_v<T, cuda_bfloat162>) {
+                float2 a_f2 = __bfloat1622float2(a);
+                float2 b_f2 = __bfloat1622float2(b);
+                return __floats2bfloat162_rn(__powf(a_f2.x, b_f2.x), __powf(a_f2.y, b_f2.y));
+            } else if constexpr (std::is_same_v<T, cuda_bfloat16>) {
+                float a_ = __bfloat162float(a);
+                float b_ = __bfloat162float(b);
+                return __float2bfloat16_rn(__powf(a_, b_));
+            } else if constexpr (std::is_same_v<T, float>) {
+                return __powf(a, b);
+            } else {
+                return std::pow(a, b);
+            }
+        } else if constexpr (Mode == BinaryMode::Mod) {
+            if constexpr (std::is_same_v<T, half2>) {
+                float2 a_f2 = __half22float2(a);
+                float2 b_f2 = __half22float2(b);
+                return __float22half2_rn(make_float2(std::fmod(a_f2.x, b_f2.x), std::fmod(a_f2.y, b_f2.y)));
+            } else if constexpr (std::is_same_v<T, half>) {
+                float a_ = __half2float(a);
+                float b_ = __half2float(b);
+                return __float2half(std::fmod(a_, b_));
+            } else if constexpr (std::is_floating_point_v<T>) {
+                return std::fmod(a, b);
+            } else {
+                return a % b;
+            }
+        } else if constexpr (Mode == BinaryMode::Max) {
+            if constexpr (std::is_same_v<T, half2>) {
+                return __hmax2(a, b);
+            } else if constexpr (std::is_same_v<T, half> || std::is_same_v<T, cuda_bfloat16>) {
+                return a > b ? a : b;
+            } else if constexpr (std::is_same_v<T, float>) {
+                return fmaxf(a, b);
+            } else {
+                return a > b ? a : b;
+            }
+        } else if constexpr (Mode == BinaryMode::Min) {
+            if constexpr (std::is_same_v<T, half2>) {
+                return __hmin2(a, b);
+            } else if constexpr (std::is_same_v<T, half> || std::is_same_v<T, cuda_bfloat16>) {
+                return a < b ? a : b;
+            } else if constexpr (std::is_same_v<T, float>) {
+                return fminf(a, b);
+            } else {
+                return a < b ? a : b;
+            }
+        } else {
+            static_assert(Mode != Mode, "Unsupported binary operation mode");
+            return a;
+        }
+    }
+};
+} // namespace cuda
+#endif // __CUDACC__
+
+/**
+ * @brief Macro to define a binary elementwise descriptor for a specific operation.
+ *
+ * This macro simplifies the definition of binary operators (pow, div, mod, min, max, etc.)
+ * by automatically generating the Descriptor class and operation struct using the
+ * ELEMENTWISE_DESCRIPTOR macro and the BinaryOp template.
+ *
+ * Usage:
+ *   BINARY_ELEMENTWISE_DESCRIPTOR(pow, cpu, BinaryMode::Pow)
+ *   BINARY_ELEMENTWISE_DESCRIPTOR(div, cpu, BinaryMode::Divide)
+ *
+ * @param OP The operator name (e.g., pow, div, mod)
+ * @param NAMESPACE The device namespace (e.g., cpu, nvidia)
+ * @param MODE The BinaryMode enum value for this operation
+ */
+#define BINARY_ELEMENTWISE_DESCRIPTOR(OP, NAMESPACE, MODE)    \
+                                                              \
+    ELEMENTWISE_DESCRIPTOR(OP, NAMESPACE)                     \
+                                                              \
+    namespace op::OP::NAMESPACE {                             \
+    using Op = op::elementwise::binary::BinaryOp<MODE>;       \
+    }
+
+/**
+ * @brief Macro to define a binary elementwise descriptor for the CUDA/NVIDIA backend.
+ *
+ * This macro is similar to BINARY_ELEMENTWISE_DESCRIPTOR but uses the CUDA-specific
+ * BinaryOp implementation for better GPU performance.
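+ *
+ * Sketch of the resulting lookup (assuming ELEMENTWISE_DESCRIPTOR supplies the
+ * Descriptor class itself): the nvidia backend finds the intrinsic-based
+ * functor under op::<OP>::cuda, e.g. for div:
+ *   namespace op::div::cuda {
+ *   using Op = op::elementwise::binary::cuda::BinaryOp<BinaryMode::Divide>;
+ *   }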
+ *
+ * Usage:
+ *   BINARY_ELEMENTWISE_DESCRIPTOR_CUDA(pow, nvidia, BinaryMode::Pow)
+ *   BINARY_ELEMENTWISE_DESCRIPTOR_CUDA(div, nvidia, BinaryMode::Divide)
+ *
+ * @param OP The operator name (e.g., pow, div, mod)
+ * @param NAMESPACE The device namespace (e.g., nvidia)
+ * @param MODE The BinaryMode enum value for this operation
+ */
+#ifdef __CUDACC__
+#define BINARY_ELEMENTWISE_DESCRIPTOR_CUDA(OP, NAMESPACE, MODE)    \
+                                                                   \
+    ELEMENTWISE_DESCRIPTOR(OP, NAMESPACE)                          \
+                                                                   \
+    namespace op::OP::cuda {                                       \
+    using Op = op::elementwise::binary::cuda::BinaryOp<MODE>;      \
+    }
+#endif // __CUDACC__
+
+} // namespace op::elementwise::binary
+
+#endif // __INFINIOP_ELEMENTWISE_BINARY_H__
diff --git a/src/infiniop/elementwise/cpu/elementwise_cpu_impl.h b/src/infiniop/elementwise/cpu/elementwise_cpu_impl.h
new file mode 100644
index 000000000..030f4d87e
--- /dev/null
+++ b/src/infiniop/elementwise/cpu/elementwise_cpu_impl.h
@@ -0,0 +1,130 @@
+#ifndef __INFINIOP_ELEMENTWISE_CPU_IMPL_H__
+#define __INFINIOP_ELEMENTWISE_CPU_IMPL_H__
+
+#include "../../../utils/check.h"
+#include "../../../utils/result.hpp"
+#include "../../devices/cpu/common_cpu.h"
+#include "elementwise_cpu.h"
+
+/**
+ * @brief Generic implementation for elementwise CPU operators.
+ *
+ * This file provides a generic implementation template that can be used
+ * by all binary and unary operators to reduce code duplication.
+ *
+ * Usage:
+ *   #include "elementwise_cpu_impl.h"
+ *   namespace op::pow::cpu {
+ *   using Op = op::elementwise::binary::BinaryOp<BinaryMode::Pow>;
+ *   ELEMENTWISE_CPU_IMPL_BINARY(pow)
+ *   }
+ *
+ *   namespace op::sqrt::cpu {
+ *   using Op = op::elementwise::unary::UnaryOp<UnaryMode::Sqrt>;
+ *   ELEMENTWISE_CPU_IMPL_UNARY(sqrt)
+ *   }
+ */
+
+/**
+ * @brief Macro to generate binary operator implementation.
+ *
+ * This macro generates the Descriptor destructor, create, and calculate methods
+ * for binary operators, using the generic implementation.
+ *
+ * Usage:
+ *   namespace op::pow::cpu {
+ *   using Op = op::elementwise::binary::BinaryOp<BinaryMode::Pow>;
+ *   ELEMENTWISE_CPU_IMPL_BINARY(pow)
+ *   }
+ */
+#define ELEMENTWISE_CPU_IMPL_BINARY(OP)                                             \
+                                                                                    \
+    Descriptor::~Descriptor() = default;                                            \
+                                                                                    \
+    infiniStatus_t Descriptor::create(                                              \
+        infiniopHandle_t handle_,                                                   \
+        Descriptor **desc_ptr,                                                      \
+        infiniopTensorDescriptor_t out_desc,                                        \
+        std::vector<infiniopTensorDescriptor_t> input_desc_vec) {                   \
+        auto handle = reinterpret_cast<device::cpu::Handle *>(handle_);             \
+        auto dtype = out_desc->dtype();                                             \
+        const auto &a_desc = input_desc_vec.at(0);                                  \
+        const auto &b_desc = input_desc_vec.at(1);                                  \
+        const auto &out_shape = out_desc->shape();                                  \
+        const auto &a_shape = a_desc->shape();                                      \
+        const auto &b_shape = b_desc->shape();                                      \
+        CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32);                     \
+        CHECK_SAME_SHAPE(out_shape, a_shape, b_shape);                              \
+        CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec); \
+        return INFINI_STATUS_SUCCESS;                                               \
+    }                                                                               \
+                                                                                    \
+    infiniStatus_t Descriptor::calculate(                                           \
+        void *workspace,                                                            \
+        size_t workspace_size,                                                      \
+        void *output,                                                               \
+        std::vector<const void *> inputs,                                           \
+        void *stream) const {                                                       \
+        switch (_dtype) {                                                           \
+        case INFINI_DTYPE_F16:                                                      \
+            return _device_info->template calculate<Op, fp16_t>(                    \
+                _info, output, inputs, stream);                                     \
+        case INFINI_DTYPE_F32:                                                      \
+            return _device_info->template calculate<Op, float>(                     \
+                _info, output, inputs, stream);                                     \
+        default:                                                                    \
+            return INFINI_STATUS_BAD_TENSOR_DTYPE;                                  \
+        }                                                                           \
+    }
+
+/**
+ * @brief Macro to generate unary operator implementation.
+ *
+ * This macro generates the Descriptor destructor, create, and calculate methods
+ * for unary operators, using the generic implementation.
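+ *
+ * With this macro a complete CPU operator source file shrinks to a few lines;
+ * an illustrative sketch for sqrt (mirroring the abs refactor in this patch):
+ *   #include "sqrt_cpu.h"
+ *   #include "../../../elementwise/cpu/elementwise_cpu_impl.h"
+ *   namespace op::sqrt::cpu {
+ *   ELEMENTWISE_CPU_IMPL_UNARY(sqrt)
+ *   }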
+ * + * Usage: + * namespace op::sqrt::cpu { + * using Op = op::elementwise::unary::UnaryOp; + * ELEMENTWISE_CPU_IMPL_UNARY(sqrt) + * } + */ +#define ELEMENTWISE_CPU_IMPL_UNARY(OP) \ + \ + Descriptor::~Descriptor() = default; \ + \ + infiniStatus_t Descriptor::create( \ + infiniopHandle_t handle_, \ + Descriptor **desc_ptr, \ + infiniopTensorDescriptor_t out_desc, \ + std::vector input_desc_vec) { \ + auto handle = reinterpret_cast(handle_); \ + auto dtype = out_desc->dtype(); \ + const auto &x_desc = input_desc_vec.at(0); \ + const auto &y_shape = out_desc->shape(); \ + const auto &x_shape = x_desc->shape(); \ + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32); \ + CHECK_SAME_SHAPE(y_shape, x_shape); \ + CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec); \ + return INFINI_STATUS_SUCCESS; \ + } \ + \ + infiniStatus_t Descriptor::calculate( \ + void *workspace, \ + size_t workspace_size, \ + void *output, \ + std::vector inputs, \ + void *stream) const { \ + switch (_dtype) { \ + case INFINI_DTYPE_F16: \ + return _device_info->template calculate( \ + _info, output, inputs, stream); \ + case INFINI_DTYPE_F32: \ + return _device_info->template calculate( \ + _info, output, inputs, stream); \ + default: \ + return INFINI_STATUS_BAD_TENSOR_DTYPE; \ + } \ + } + +#endif // __INFINIOP_ELEMENTWISE_CPU_IMPL_H__ diff --git a/src/infiniop/elementwise/nvidia/elementwise_nvidia_impl.cuh b/src/infiniop/elementwise/nvidia/elementwise_nvidia_impl.cuh new file mode 100644 index 000000000..39b78884a --- /dev/null +++ b/src/infiniop/elementwise/nvidia/elementwise_nvidia_impl.cuh @@ -0,0 +1,134 @@ +#ifndef __INFINIOP_ELEMENTWISE_NVIDIA_IMPL_CUH__ +#define __INFINIOP_ELEMENTWISE_NVIDIA_IMPL_CUH__ + +#include "../../../utils/check.h" +#include "../../../utils/result.hpp" +#include "../../devices/nvidia/nvidia_common.cuh" +#include "elementwise_nvidia.cuh" +#include +#include + +/** + * @brief Generic implementation for elementwise NVIDIA/CUDA operators. + * + * This file provides a generic implementation template that can be used + * by all binary and unary operators to reduce code duplication. + * + * Usage: + * #include "elementwise_nvidia_impl.cuh" + * namespace op::pow::nvidia { + * ELEMENTWISE_NVIDIA_IMPL_BINARY(pow) + * } + * + * namespace op::sqrt::nvidia { + * ELEMENTWISE_NVIDIA_IMPL_UNARY(sqrt) + * } + */ + +/** + * @brief Macro to generate binary operator implementation for NVIDIA/CUDA. + * + * This macro generates the Descriptor destructor, create, and calculate methods + * for binary operators, using the generic implementation. 
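+ *
+ * The matching NVIDIA operator source reduces the same way (sketch; the .cu
+ * translation unit must be compiled by nvcc so that cuda::Op and the
+ * calculate<256, cuda::Op, T> instantiations are available):
+ *   #include "pow_nvidia.cuh"
+ *   #include "../../../elementwise/nvidia/elementwise_nvidia_impl.cuh"
+ *   namespace op::pow::nvidia {
+ *   ELEMENTWISE_NVIDIA_IMPL_BINARY(pow)
+ *   }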
+ * + * Usage: + * namespace op::pow::nvidia { + * ELEMENTWISE_NVIDIA_IMPL_BINARY(pow) + * } + */ +#define ELEMENTWISE_NVIDIA_IMPL_BINARY(OP) \ + \ + Descriptor::~Descriptor() = default; \ + \ + infiniStatus_t Descriptor::create( \ + infiniopHandle_t handle_, \ + Descriptor **desc_ptr, \ + infiniopTensorDescriptor_t out_desc, \ + std::vector input_desc_vec) { \ + auto handle = reinterpret_cast(handle_); \ + auto dtype = out_desc->dtype(); \ + const auto &a_desc = input_desc_vec.at(0); \ + const auto &b_desc = input_desc_vec.at(1); \ + const auto &c_shape = out_desc->shape(); \ + const auto &a_shape = a_desc->shape(); \ + const auto &b_shape = b_desc->shape(); \ + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32); \ + CHECK_SAME_SHAPE(c_shape, a_shape, b_shape); \ + CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec); \ + return INFINI_STATUS_SUCCESS; \ + } \ + \ + infiniStatus_t Descriptor::calculate( \ + void *workspace, \ + size_t workspace_size, \ + void *output, \ + std::vector inputs, \ + void *stream) const { \ + if (workspace_size < _workspace_size) { \ + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; \ + } \ + switch (_dtype) { \ + case INFINI_DTYPE_F16: \ + return _device_info->calculate<256, cuda::Op, half>( \ + _info, workspace, output, inputs, stream); \ + case INFINI_DTYPE_F32: \ + return _device_info->calculate<256, cuda::Op, float>( \ + _info, workspace, output, inputs, stream); \ + default: \ + return INFINI_STATUS_BAD_TENSOR_DTYPE; \ + } \ + } + +/** + * @brief Macro to generate unary operator implementation for NVIDIA/CUDA. + * + * This macro generates the Descriptor destructor, create, and calculate methods + * for unary operators, using the generic implementation. + * + * Usage: + * namespace op::sqrt::nvidia { + * ELEMENTWISE_NVIDIA_IMPL_UNARY(sqrt) + * } + */ +#define ELEMENTWISE_NVIDIA_IMPL_UNARY(OP) \ + \ + Descriptor::~Descriptor() = default; \ + \ + infiniStatus_t Descriptor::create( \ + infiniopHandle_t handle_, \ + Descriptor **desc_ptr, \ + infiniopTensorDescriptor_t out_desc, \ + std::vector input_desc_vec) { \ + auto handle = reinterpret_cast(handle_); \ + auto dtype = out_desc->dtype(); \ + const auto &x_desc = input_desc_vec.at(0); \ + const auto &y_shape = out_desc->shape(); \ + const auto &x_shape = x_desc->shape(); \ + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32); \ + CHECK_SAME_SHAPE(y_shape, x_shape); \ + CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec); \ + return INFINI_STATUS_SUCCESS; \ + } \ + \ + infiniStatus_t Descriptor::calculate( \ + void *workspace, \ + size_t workspace_size, \ + void *output, \ + std::vector inputs, \ + void *stream) const { \ + if (workspace_size < _workspace_size) { \ + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; \ + } \ + switch (_dtype) { \ + case INFINI_DTYPE_F16: \ + return _device_info->calculate<256, cuda::Op, half>( \ + _info, workspace, output, inputs, stream); \ + case INFINI_DTYPE_F32: \ + return _device_info->calculate<256, cuda::Op, float>( \ + _info, workspace, output, inputs, stream); \ + default: \ + return INFINI_STATUS_BAD_TENSOR_DTYPE; \ + } \ + } + +#endif // __INFINIOP_ELEMENTWISE_NVIDIA_IMPL_CUH__ diff --git a/src/infiniop/elementwise/unary.h b/src/infiniop/elementwise/unary.h new file mode 100644 index 000000000..9f41dedb2 --- /dev/null +++ b/src/infiniop/elementwise/unary.h @@ -0,0 +1,524 @@ +#ifndef __INFINIOP_ELEMENTWISE_UNARY_H__ +#define __INFINIOP_ELEMENTWISE_UNARY_H__ + +#include +#include +#include + +#ifdef __CUDACC__ +#include 
+#include +#include +// Include device-specific type aliases for cuda_bfloat16 +#include "../devices/nvidia/nvidia_kernel_common.cuh" +#endif + +namespace op::elementwise::unary { + +/** + * @brief Represents all the currently defined unary operations. + * + * This enum is used to specify which unary operation to perform + * in the generic UnaryOp template. + */ +enum class UnaryMode { + // Math operations: + Abs, + Exp, + Log, + Reciprocal, + Sqrt, + Neg, + Ceil, + Floor, + Round, + Sin, + Cos, + Tan, + Asin, + Acos, + Atan, + Sinh, + Cosh, + Tanh, + Asinh, + Acosh, + Atanh, + Relu, + Sigmoid, + Sign, + Erf, +}; + +/** + * @brief Generic unary operation template that performs different operations + * based on the specified UnaryMode. + * + * This template allows multiple unary operators (abs, log, sin, cos, etc.) + * to share the same implementation infrastructure while only differing in the + * operation mode. + * + * @tparam Mode The unary operation mode (from UnaryMode enum) + */ +template +struct UnaryOp { + static constexpr size_t num_inputs = 1; + + template + T operator()(const T &x) const { + if constexpr (Mode == UnaryMode::Abs) { + if constexpr (std::is_floating_point_v) { + return std::fabs(x); + } else { + return std::abs(x); + } + } else if constexpr (Mode == UnaryMode::Exp) { + return std::exp(x); + } else if constexpr (Mode == UnaryMode::Log) { + return std::log(x); + } else if constexpr (Mode == UnaryMode::Reciprocal) { + return T(1) / x; + } else if constexpr (Mode == UnaryMode::Sqrt) { + return std::sqrt(x); + } else if constexpr (Mode == UnaryMode::Neg) { + return -x; + } else if constexpr (Mode == UnaryMode::Ceil) { + return std::ceil(x); + } else if constexpr (Mode == UnaryMode::Floor) { + return std::floor(x); + } else if constexpr (Mode == UnaryMode::Round) { + if constexpr (std::is_integral_v) { + return x; + } else { + return std::nearbyint(x); + } + } else if constexpr (Mode == UnaryMode::Sin) { + return std::sin(x); + } else if constexpr (Mode == UnaryMode::Cos) { + return std::cos(x); + } else if constexpr (Mode == UnaryMode::Tan) { + return std::tan(x); + } else if constexpr (Mode == UnaryMode::Asin) { + return std::asin(x); + } else if constexpr (Mode == UnaryMode::Acos) { + return std::acos(x); + } else if constexpr (Mode == UnaryMode::Atan) { + return std::atan(x); + } else if constexpr (Mode == UnaryMode::Sinh) { + return std::sinh(x); + } else if constexpr (Mode == UnaryMode::Cosh) { + return std::cosh(x); + } else if constexpr (Mode == UnaryMode::Tanh) { + return std::tanh(x); + } else if constexpr (Mode == UnaryMode::Asinh) { + return std::asinh(x); + } else if constexpr (Mode == UnaryMode::Acosh) { + return std::acosh(x); + } else if constexpr (Mode == UnaryMode::Atanh) { + return std::atanh(x); + } else if constexpr (Mode == UnaryMode::Relu) { + return x > T(0) ? x : T(0); + } else if constexpr (Mode == UnaryMode::Sigmoid) { + return T(1) / (T(1) + std::exp(-x)); + } else if constexpr (Mode == UnaryMode::Sign) { + return x > T(0) ? T(1) : (x == T(0) ? T(0) : T(-1)); + } else if constexpr (Mode == UnaryMode::Erf) { + return std::erf(x); + } else { + static_assert(Mode != Mode, "Unsupported unary operation mode"); + return x; + } + } +}; + +#ifdef __CUDACC__ +/** + * @brief CUDA-specific unary operation template that performs different operations + * based on the specified UnaryMode, using CUDA-optimized functions. + * + * This template provides CUDA device functions optimized for GPU execution, + * using intrinsics like __habs2, __logf, __sinf, etc. 
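+ *
+ * Example (illustrative): for f16 tensors the elementwise framework can pack
+ * two neighboring values into one half2, so UnaryOp<UnaryMode::Abs> costs a
+ * single __habs2 per pair rather than two scalar __habs calls.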
+ * + * @tparam Mode The unary operation mode (from UnaryMode enum) + */ +namespace cuda { +template +struct UnaryOp { + static constexpr size_t num_inputs = 1; + + template + __device__ __forceinline__ T operator()(const T &x) const { + if constexpr (Mode == UnaryMode::Abs) { + if constexpr (std::is_same_v) { + return __habs2(x); + } else if constexpr (std::is_same_v) { + return __habs(x); + } else if constexpr (std::is_floating_point_v) { + return std::fabs(x); + } else { + return std::abs(x); + } + } else if constexpr (Mode == UnaryMode::Exp) { + if constexpr (std::is_same_v) { + float2 x_f2 = __half22float2(x); + return __float22half2_rn(make_float2(__expf(x_f2.x), __expf(x_f2.y))); + } else if constexpr (std::is_same_v) { + return __float2half(__expf(__half2float(x))); + } else if constexpr (std::is_same_v) { + float2 x_f2 = __bfloat1622float2(x); + return __floats2bfloat162_rn(__expf(x_f2.x), __expf(x_f2.y)); + } else if constexpr (std::is_same_v) { + return __float2bfloat16_rn(__expf(__bfloat162float(x))); + } else if constexpr (std::is_same_v) { + return __expf(x); + } else { + return std::exp(x); + } + } else if constexpr (Mode == UnaryMode::Log) { + if constexpr (std::is_same_v) { + return h2log(x); + } else if constexpr (std::is_same_v) { + return __float2half(__logf(__half2float(x))); + } else if constexpr (std::is_same_v) { + float x0 = __bfloat162float(__low2bfloat16(x)); + float x1 = __bfloat162float(__high2bfloat16(x)); + return __floats2bfloat162_rn(logf(x0), logf(x1)); + } else if constexpr (std::is_same_v) { + return __float2bfloat16_rn(logf(__bfloat162float(x))); + } else if constexpr (std::is_same_v) { + return __logf(x); + } else { + return std::log(x); + } + } else if constexpr (Mode == UnaryMode::Reciprocal) { + if constexpr (std::is_same_v) { + return h2rcp(x); + } else if constexpr (std::is_same_v) { + return hrcp(x); + } else if constexpr (std::is_same_v) { + float x0 = __bfloat162float(__low2bfloat16(x)); + float x1 = __bfloat162float(__high2bfloat16(x)); + return __floats2bfloat162_rn(__frcp_rn(x0), __frcp_rn(x1)); + } else if constexpr (std::is_same_v) { + return __float2bfloat16_rn(__frcp_rn(__bfloat162float(x))); + } else if constexpr (std::is_same_v) { + return __frcp_rn(x); + } else { + return T(1) / x; + } + } else if constexpr (Mode == UnaryMode::Sqrt) { + if constexpr (std::is_same_v) { + return h2sqrt(x); + } else if constexpr (std::is_same_v) { + return hsqrt(x); + } else if constexpr (std::is_same_v) { + float x0 = __bfloat162float(__low2bfloat16(x)); + float x1 = __bfloat162float(__high2bfloat16(x)); + return __floats2bfloat162_rn(sqrtf(x0), sqrtf(x1)); + } else if constexpr (std::is_same_v) { + return __float2bfloat16_rn(sqrtf(__bfloat162float(x))); + } else if constexpr (std::is_same_v) { + return __fsqrt_rn(x); + } else { + return std::sqrt(x); + } + } else if constexpr (Mode == UnaryMode::Neg) { + if constexpr (std::is_same_v) { + return __hneg2(x); + } else if constexpr (std::is_same_v) { + return __hneg(x); + } else { + return -x; + } + } else if constexpr (Mode == UnaryMode::Ceil) { + if constexpr (std::is_same_v) { + return h2ceil(x); + } else if constexpr (std::is_same_v) { + return hceil(x); + } else if constexpr (std::is_same_v) { + float x0 = __bfloat162float(__low2bfloat16(x)); + float x1 = __bfloat162float(__high2bfloat16(x)); + return __floats2bfloat162_rn(ceilf(x0), ceilf(x1)); + } else if constexpr (std::is_same_v) { + return __float2bfloat16_rn(ceilf(__bfloat162float(x))); + } else if constexpr (std::is_same_v) { + return ceilf(x); 
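+                // Integral values are already whole numbers, so Ceil (and
+                // Floor/Round below) is the identity for integer types.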
+ } else if constexpr (std::is_integral_v) { + return x; + } else { + return std::ceil(x); + } + } else if constexpr (Mode == UnaryMode::Floor) { + if constexpr (std::is_same_v) { + return h2floor(x); + } else if constexpr (std::is_same_v) { + return hfloor(x); + } else if constexpr (std::is_same_v) { + float x0 = __bfloat162float(__low2bfloat16(x)); + float x1 = __bfloat162float(__high2bfloat16(x)); + return __floats2bfloat162_rn(floorf(x0), floorf(x1)); + } else if constexpr (std::is_same_v) { + return __float2bfloat16_rn(floorf(__bfloat162float(x))); + } else if constexpr (std::is_same_v) { + return floorf(x); + } else if constexpr (std::is_integral_v) { + return x; + } else { + return std::floor(x); + } + } else if constexpr (Mode == UnaryMode::Round) { + if constexpr (std::is_same_v) { + return h2rint(x); + } else if constexpr (std::is_same_v) { + return hrint(x); + } else if constexpr (std::is_same_v) { + float x0 = __bfloat162float(__low2bfloat16(x)); + float x1 = __bfloat162float(__high2bfloat16(x)); + return __floats2bfloat162_rn(rintf(x0), rintf(x1)); + } else if constexpr (std::is_same_v) { + return __float2bfloat16_rn(rintf(__bfloat162float(x))); + } else if constexpr (std::is_same_v) { + return rintf(x); + } else if constexpr (std::is_integral_v) { + return x; + } else { + return std::nearbyint(x); + } + } else if constexpr (Mode == UnaryMode::Sin) { + if constexpr (std::is_same_v) { + float2 x_f2 = __half22float2(x); + return __float22half2_rn(make_float2(__sinf(x_f2.x), __sinf(x_f2.y))); + } else if constexpr (std::is_same_v) { + return __float2half(__sinf(__half2float(x))); + } else if constexpr (std::is_same_v) { + float x0 = __bfloat162float(__low2bfloat16(x)); + float x1 = __bfloat162float(__high2bfloat16(x)); + return __floats2bfloat162_rn(sinf(x0), sinf(x1)); + } else if constexpr (std::is_same_v) { + return __float2bfloat16_rn(sinf(__bfloat162float(x))); + } else if constexpr (std::is_same_v) { + return __sinf(x); + } else { + return std::sin(x); + } + } else if constexpr (Mode == UnaryMode::Cos) { + if constexpr (std::is_same_v) { + float2 x_f2 = __half22float2(x); + return __float22half2_rn(make_float2(__cosf(x_f2.x), __cosf(x_f2.y))); + } else if constexpr (std::is_same_v) { + return __float2half(__cosf(__half2float(x))); + } else if constexpr (std::is_same_v) { + float x0 = __bfloat162float(__low2bfloat16(x)); + float x1 = __bfloat162float(__high2bfloat16(x)); + return __floats2bfloat162_rn(cosf(x0), cosf(x1)); + } else if constexpr (std::is_same_v) { + return __float2bfloat16_rn(cosf(__bfloat162float(x))); + } else if constexpr (std::is_same_v) { + return __cosf(x); + } else { + return std::cos(x); + } + } else if constexpr (Mode == UnaryMode::Tan) { + if constexpr (std::is_same_v) { + float2 x_f2 = __half22float2(x); + return __float22half2_rn(make_float2(tanf(x_f2.x), tanf(x_f2.y))); + } else if constexpr (std::is_same_v) { + return __float2half(tanf(__half2float(x))); + } else if constexpr (std::is_same_v) { + return tanf(x); + } else { + return std::tan(x); + } + } else if constexpr (Mode == UnaryMode::Asin) { + if constexpr (std::is_same_v) { + float2 x_f2 = __half22float2(x); + return __float22half2_rn(make_float2(asinf(x_f2.x), asinf(x_f2.y))); + } else if constexpr (std::is_same_v) { + return __float2half(asinf(__half2float(x))); + } else if constexpr (std::is_same_v) { + return asinf(x); + } else { + return std::asin(x); + } + } else if constexpr (Mode == UnaryMode::Acos) { + if constexpr (std::is_same_v) { + float2 x_f2 = __half22float2(x); + return 
__float22half2_rn(make_float2(acosf(x_f2.x), acosf(x_f2.y))); + } else if constexpr (std::is_same_v) { + return __float2half(acosf(__half2float(x))); + } else if constexpr (std::is_same_v) { + return acosf(x); + } else { + return std::acos(x); + } + } else if constexpr (Mode == UnaryMode::Atan) { + if constexpr (std::is_same_v) { + float2 x_f2 = __half22float2(x); + return __float22half2_rn(make_float2(atanf(x_f2.x), atanf(x_f2.y))); + } else if constexpr (std::is_same_v) { + return __float2half(atanf(__half2float(x))); + } else if constexpr (std::is_same_v) { + return atanf(x); + } else { + return std::atan(x); + } + } else if constexpr (Mode == UnaryMode::Sinh) { + if constexpr (std::is_same_v) { + float2 x_f2 = __half22float2(x); + return __float22half2_rn(make_float2(sinhf(x_f2.x), sinhf(x_f2.y))); + } else if constexpr (std::is_same_v) { + return __float2half(sinhf(__half2float(x))); + } else if constexpr (std::is_same_v) { + return sinhf(x); + } else { + return std::sinh(x); + } + } else if constexpr (Mode == UnaryMode::Cosh) { + if constexpr (std::is_same_v) { + float2 x_f2 = __half22float2(x); + return __float22half2_rn(make_float2(coshf(x_f2.x), coshf(x_f2.y))); + } else if constexpr (std::is_same_v) { + return __float2half(coshf(__half2float(x))); + } else if constexpr (std::is_same_v) { + return coshf(x); + } else { + return std::cosh(x); + } + } else if constexpr (Mode == UnaryMode::Tanh) { + if constexpr (std::is_same_v) { + return __h2tanh(x); + } else if constexpr (std::is_same_v) { + return __float2half(tanhf(__half2float(x))); + } else if constexpr (std::is_same_v) { + float f0 = __bfloat162float(__low2bfloat16(x)); + float f1 = __bfloat162float(__high2bfloat16(x)); + return __floats2bfloat162_rn(tanhf(f0), tanhf(f1)); + } else if constexpr (std::is_same_v) { + return __float2bfloat16_rn(tanhf(__bfloat162float(x))); + } else if constexpr (std::is_same_v) { + return tanhf(x); + } else { + return std::tanh(x); + } + } else if constexpr (Mode == UnaryMode::Asinh) { + if constexpr (std::is_same_v) { + return __floats2half2_rn(asinhf(__half2float(__low2half(x))), asinhf(__half2float(__high2half(x)))); + } else if constexpr (std::is_same_v) { + return __float2half(asinhf(__half2float(x))); + } else if constexpr (std::is_same_v) { + float x0 = __bfloat162float(__low2bfloat16(x)); + float x1 = __bfloat162float(__high2bfloat16(x)); + return __floats2bfloat162_rn(asinhf(x0), asinhf(x1)); + } else if constexpr (std::is_same_v) { + return __float2bfloat16_rn(asinhf(__bfloat162float(x))); + } else if constexpr (std::is_same_v) { + return asinhf(x); + } else { + return std::asinh(x); + } + } else if constexpr (Mode == UnaryMode::Acosh) { + if constexpr (std::is_same_v) { + return __floats2half2_rn(acoshf(__half2float(__low2half(x))), acoshf(__half2float(__high2half(x)))); + } else if constexpr (std::is_same_v) { + return __float2half(acoshf(__half2float(x))); + } else if constexpr (std::is_same_v) { + float x0 = __bfloat162float(__low2bfloat16(x)); + float x1 = __bfloat162float(__high2bfloat16(x)); + return __floats2bfloat162_rn(acoshf(x0), acoshf(x1)); + } else if constexpr (std::is_same_v) { + return __float2bfloat16_rn(acoshf(__bfloat162float(x))); + } else if constexpr (std::is_same_v) { + return acoshf(x); + } else { + return std::acosh(x); + } + } else if constexpr (Mode == UnaryMode::Atanh) { + if constexpr (std::is_same_v) { + return __floats2half2_rn(atanhf(__half2float(__low2half(x))), atanhf(__half2float(__high2half(x)))); + } else if constexpr (std::is_same_v) { + return 
__float2half(atanhf(__half2float(x))); + } else if constexpr (std::is_same_v) { + float x0 = __bfloat162float(__low2bfloat16(x)); + float x1 = __bfloat162float(__high2bfloat16(x)); + return __floats2bfloat162_rn(atanhf(x0), atanhf(x1)); + } else if constexpr (std::is_same_v) { + return __float2bfloat16_rn(atanhf(__bfloat162float(x))); + } else if constexpr (std::is_same_v) { + return atanhf(x); + } else { + return std::atanh(x); + } + } else if constexpr (Mode == UnaryMode::Relu) { + if constexpr (std::is_same_v) { + return __hmax2(x, __floats2half2_rn(0.0f, 0.0f)); + } else { + return x > T(0) ? x : T(0); + } + } else if constexpr (Mode == UnaryMode::Sigmoid) { + if constexpr (std::is_same_v) { + float2 x_f2 = __half22float2(x); + float2 exp_neg_x = make_float2(__expf(-x_f2.x), __expf(-x_f2.y)); + return __float22half2_rn(make_float2(1.0f / (1.0f + exp_neg_x.x), 1.0f / (1.0f + exp_neg_x.y))); + } else if constexpr (std::is_same_v) { + float x_ = __half2float(x); + return __float2half(1.0f / (1.0f + __expf(-x_))); + } else if constexpr (std::is_same_v) { + return 1.0f / (1.0f + __expf(-x)); + } else { + return T(1) / (T(1) + std::exp(-x)); + } + } else if constexpr (Mode == UnaryMode::Sign) { + if constexpr (std::is_same_v) { + const auto lt_mask = __hlt2(x, __floats2half2_rn(0.0f, 0.0f)); + return __hadd2(__hneg2(lt_mask), __hsub2(__floats2half2_rn(1.0f, 1.0f), lt_mask)); + } else if constexpr (std::is_same_v) { + return x > half(0) ? half(1) : (x == half(0) ? half(0) : half(-1)); + } else { + return x > T(0) ? T(1) : (x == T(0) ? T(0) : T(-1)); + } + } else if constexpr (Mode == UnaryMode::Erf) { + if constexpr (std::is_same_v) { + float2 x_f2 = __half22float2(x); + return __float22half2_rn(make_float2(erff(x_f2.x), erff(x_f2.y))); + } else if constexpr (std::is_same_v) { + return __float2half(erff(__half2float(x))); + } else if constexpr (std::is_same_v) { + return erff(x); + } else { + return std::erf(x); + } + } else { + static_assert(Mode != Mode, "Unsupported unary operation mode"); + return x; + } + } +}; +} // namespace cuda +#endif // __CUDACC__ + +/** + * @brief Macro to define a unary elementwise descriptor for a specific operation. + * + * This macro simplifies the definition of unary operators (abs, log, sin, cos, etc.) + * by automatically generating the Descriptor class and operation struct using the + * ELEMENTWISE_DESCRIPTOR macro and UnaryOp template. + * + * Usage: + * UNARY_ELEMENTWISE_DESCRIPTOR(abs, cpu, UnaryMode::Abs) + * UNARY_ELEMENTWISE_DESCRIPTOR(log, cpu, UnaryMode::Log) + * + * @param OP The operator name (e.g., abs, log, sin) + * @param NAMESPACE The device namespace (e.g., cpu, nvidia) + * @param MODE The UnaryMode enum value for this operation + */ +#define UNARY_ELEMENTWISE_DESCRIPTOR(OP, NAMESPACE, MODE) \ + \ + ELEMENTWISE_DESCRIPTOR(OP, NAMESPACE) \ + \ + namespace op::OP::NAMESPACE { \ + using Op = op::elementwise::unary::UnaryOp; \ + } + +} // namespace op::elementwise::unary + +#endif // __INFINIOP_ELEMENTWISE_UNARY_H__ diff --git a/src/infiniop/operator_impl.h b/src/infiniop/operator_impl.h new file mode 100644 index 000000000..3ff543f7e --- /dev/null +++ b/src/infiniop/operator_impl.h @@ -0,0 +1,288 @@ +#ifndef __INFINIOP_OPERATOR_IMPL_H__ +#define __INFINIOP_OPERATOR_IMPL_H__ + +#include "handle.h" +#include "operator.h" + +// Conditional compilation helpers +#ifdef ENABLE_CPU_API +#define IF_ENABLE_CPU_API(...) __VA_ARGS__ +#else +#define IF_ENABLE_CPU_API(...) +#endif + +#ifdef ENABLE_NVIDIA_API +#define IF_ENABLE_NVIDIA_API(...) 
__VA_ARGS__ +#else +#define IF_ENABLE_NVIDIA_API(...) +#endif + +#ifdef ENABLE_ILUVATAR_API +#define IF_ENABLE_ILUVATAR_API(...) __VA_ARGS__ +#else +#define IF_ENABLE_ILUVATAR_API(...) +#endif + +#ifdef ENABLE_QY_API +#define IF_ENABLE_QY_API(...) __VA_ARGS__ +#else +#define IF_ENABLE_QY_API(...) +#endif + +#ifdef ENABLE_METAX_API +#define IF_ENABLE_METAX_API(...) __VA_ARGS__ +#else +#define IF_ENABLE_METAX_API(...) +#endif + +#ifdef ENABLE_KUNLUN_API +#define IF_ENABLE_KUNLUN_API(...) __VA_ARGS__ +#else +#define IF_ENABLE_KUNLUN_API(...) +#endif + +#ifdef ENABLE_CAMBRICON_API +#define IF_ENABLE_CAMBRICON_API(...) __VA_ARGS__ +#else +#define IF_ENABLE_CAMBRICON_API(...) +#endif + +#ifdef ENABLE_MOORE_API +#define IF_ENABLE_MOORE_API(...) __VA_ARGS__ +#else +#define IF_ENABLE_MOORE_API(...) +#endif + +/** + * Binary operator implementation macros + */ +#define BINARY_OP_IMPL_CASE(OP_NAME, DEVICE, NAMESPACE, c_desc, a_desc, b_desc) \ + IF_ENABLE_##DEVICE##_API( \ + case INFINI_DEVICE_##DEVICE \ + : return op::OP_NAME::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast(desc_ptr), \ + c_desc, \ + {a_desc, b_desc});) + +#define BINARY_OP_IMPL_DEVICE_CASES(OP_NAME, c_desc, a_desc, b_desc) \ + BINARY_OP_IMPL_CASE(OP_NAME, CPU, cpu, c_desc, a_desc, b_desc) \ + BINARY_OP_IMPL_CASE(OP_NAME, NVIDIA, nvidia, c_desc, a_desc, b_desc) \ + BINARY_OP_IMPL_CASE(OP_NAME, ILUVATAR, nvidia, c_desc, a_desc, b_desc) \ + BINARY_OP_IMPL_CASE(OP_NAME, QY, nvidia, c_desc, a_desc, b_desc) \ + BINARY_OP_IMPL_CASE(OP_NAME, METAX, metax, c_desc, a_desc, b_desc) \ + BINARY_OP_IMPL_CASE(OP_NAME, KUNLUN, kunlun, c_desc, a_desc, b_desc) \ + BINARY_OP_IMPL_CASE(OP_NAME, CAMBRICON, bang, c_desc, a_desc, b_desc) \ + BINARY_OP_IMPL_CASE(OP_NAME, MOORE, moore, c_desc, a_desc, b_desc) + +#define BINARY_OP_IMPL_GET_WORKSPACE_CASE(OP_NAME, DEVICE, NAMESPACE) \ + IF_ENABLE_##DEVICE##_API( \ + case INFINI_DEVICE_##DEVICE \ + : \ + *size = reinterpret_cast(desc)->workspaceSize(); \ + return INFINI_STATUS_SUCCESS;) + +#define BINARY_OP_IMPL_GET_WORKSPACE_CASES(OP_NAME) \ + BINARY_OP_IMPL_GET_WORKSPACE_CASE(OP_NAME, CPU, cpu) \ + BINARY_OP_IMPL_GET_WORKSPACE_CASE(OP_NAME, NVIDIA, nvidia) \ + BINARY_OP_IMPL_GET_WORKSPACE_CASE(OP_NAME, ILUVATAR, nvidia) \ + BINARY_OP_IMPL_GET_WORKSPACE_CASE(OP_NAME, QY, nvidia) \ + BINARY_OP_IMPL_GET_WORKSPACE_CASE(OP_NAME, METAX, metax) \ + BINARY_OP_IMPL_GET_WORKSPACE_CASE(OP_NAME, KUNLUN, kunlun) \ + BINARY_OP_IMPL_GET_WORKSPACE_CASE(OP_NAME, CAMBRICON, bang) \ + BINARY_OP_IMPL_GET_WORKSPACE_CASE(OP_NAME, MOORE, moore) + +#define BINARY_OP_IMPL_CALCULATE_CASE(OP_NAME, DEVICE, NAMESPACE, c, a, b) \ + IF_ENABLE_##DEVICE##_API( \ + case INFINI_DEVICE_##DEVICE \ + : return reinterpret_cast(desc) \ + ->calculate(workspace, workspace_size, c, {a, b}, stream);) + +#define BINARY_OP_IMPL_CALCULATE_CASES(OP_NAME, c, a, b) \ + BINARY_OP_IMPL_CALCULATE_CASE(OP_NAME, CPU, cpu, c, a, b) \ + BINARY_OP_IMPL_CALCULATE_CASE(OP_NAME, NVIDIA, nvidia, c, a, b) \ + BINARY_OP_IMPL_CALCULATE_CASE(OP_NAME, ILUVATAR, nvidia, c, a, b) \ + BINARY_OP_IMPL_CALCULATE_CASE(OP_NAME, QY, nvidia, c, a, b) \ + BINARY_OP_IMPL_CALCULATE_CASE(OP_NAME, METAX, metax, c, a, b) \ + BINARY_OP_IMPL_CALCULATE_CASE(OP_NAME, KUNLUN, kunlun, c, a, b) \ + BINARY_OP_IMPL_CALCULATE_CASE(OP_NAME, CAMBRICON, bang, c, a, b) \ + BINARY_OP_IMPL_CALCULATE_CASE(OP_NAME, MOORE, moore, c, a, b) + +#define BINARY_OP_IMPL_DESTROY_CASE(OP_NAME, DEVICE, NAMESPACE) \ + IF_ENABLE_##DEVICE##_API( \ + case INFINI_DEVICE_##DEVICE \ + : delete 
reinterpret_cast(desc); \ + return INFINI_STATUS_SUCCESS;) + +#define BINARY_OP_IMPL_DESTROY_CASES(OP_NAME) \ + BINARY_OP_IMPL_DESTROY_CASE(OP_NAME, CPU, cpu) \ + BINARY_OP_IMPL_DESTROY_CASE(OP_NAME, NVIDIA, nvidia) \ + BINARY_OP_IMPL_DESTROY_CASE(OP_NAME, ILUVATAR, nvidia) \ + BINARY_OP_IMPL_DESTROY_CASE(OP_NAME, QY, nvidia) \ + BINARY_OP_IMPL_DESTROY_CASE(OP_NAME, METAX, metax) \ + BINARY_OP_IMPL_DESTROY_CASE(OP_NAME, KUNLUN, kunlun) \ + BINARY_OP_IMPL_DESTROY_CASE(OP_NAME, CAMBRICON, bang) \ + BINARY_OP_IMPL_DESTROY_CASE(OP_NAME, MOORE, moore) + +#define BINARY_OP_IMPL(OP_NAME, OP_NAME_UPPER) \ + __C infiniStatus_t infiniopCreate##OP_NAME_UPPER##Descriptor( \ + infiniopHandle_t handle, \ + infiniop##OP_NAME_UPPER##Descriptor_t *desc_ptr, \ + infiniopTensorDescriptor_t c_desc, \ + infiniopTensorDescriptor_t a_desc, \ + infiniopTensorDescriptor_t b_desc) { \ + switch (handle->device) { \ + BINARY_OP_IMPL_DEVICE_CASES(OP_NAME, c_desc, a_desc, b_desc) \ + default: \ + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; \ + } \ + } \ + __C infiniStatus_t infiniopGet##OP_NAME_UPPER##WorkspaceSize( \ + infiniop##OP_NAME_UPPER##Descriptor_t desc, \ + size_t *size) { \ + switch (desc->device_type) { \ + BINARY_OP_IMPL_GET_WORKSPACE_CASES(OP_NAME) \ + default: \ + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; \ + } \ + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; \ + } \ + __C infiniStatus_t infiniop##OP_NAME_UPPER( \ + infiniop##OP_NAME_UPPER##Descriptor_t desc, \ + void *workspace, \ + size_t workspace_size, \ + void *c, \ + const void *a, \ + const void *b, \ + void *stream) { \ + switch (desc->device_type) { \ + BINARY_OP_IMPL_CALCULATE_CASES(OP_NAME, c, a, b) \ + default: \ + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; \ + } \ + } \ + __C infiniStatus_t infiniopDestroy##OP_NAME_UPPER##Descriptor( \ + infiniop##OP_NAME_UPPER##Descriptor_t desc) { \ + switch (desc->device_type) { \ + BINARY_OP_IMPL_DESTROY_CASES(OP_NAME) \ + default: \ + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; \ + } \ + } + +/** + * Unary operator implementation macros + */ +#define UNARY_OP_IMPL_CASE(OP_NAME, DEVICE, NAMESPACE, y_desc, x_desc) \ + IF_ENABLE_##DEVICE##_API( \ + case INFINI_DEVICE_##DEVICE \ + : return op::OP_NAME::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast(desc_ptr), \ + y_desc, \ + {x_desc});) + +#define UNARY_OP_IMPL_DEVICE_CASES(OP_NAME, y_desc, x_desc) \ + UNARY_OP_IMPL_CASE(OP_NAME, CPU, cpu, y_desc, x_desc) \ + UNARY_OP_IMPL_CASE(OP_NAME, NVIDIA, nvidia, y_desc, x_desc) \ + UNARY_OP_IMPL_CASE(OP_NAME, ILUVATAR, nvidia, y_desc, x_desc) \ + UNARY_OP_IMPL_CASE(OP_NAME, QY, nvidia, y_desc, x_desc) \ + UNARY_OP_IMPL_CASE(OP_NAME, METAX, metax, y_desc, x_desc) \ + UNARY_OP_IMPL_CASE(OP_NAME, KUNLUN, kunlun, y_desc, x_desc) \ + UNARY_OP_IMPL_CASE(OP_NAME, CAMBRICON, bang, y_desc, x_desc) \ + UNARY_OP_IMPL_CASE(OP_NAME, MOORE, moore, y_desc, x_desc) + +#define UNARY_OP_IMPL_GET_WORKSPACE_CASE(OP_NAME, DEVICE, NAMESPACE) \ + IF_ENABLE_##DEVICE##_API( \ + case INFINI_DEVICE_##DEVICE \ + : \ + *size = reinterpret_cast(desc)->workspaceSize(); \ + return INFINI_STATUS_SUCCESS;) + +#define UNARY_OP_IMPL_GET_WORKSPACE_CASES(OP_NAME) \ + UNARY_OP_IMPL_GET_WORKSPACE_CASE(OP_NAME, CPU, cpu) \ + UNARY_OP_IMPL_GET_WORKSPACE_CASE(OP_NAME, NVIDIA, nvidia) \ + UNARY_OP_IMPL_GET_WORKSPACE_CASE(OP_NAME, ILUVATAR, nvidia) \ + UNARY_OP_IMPL_GET_WORKSPACE_CASE(OP_NAME, QY, nvidia) \ + UNARY_OP_IMPL_GET_WORKSPACE_CASE(OP_NAME, METAX, metax) \ + UNARY_OP_IMPL_GET_WORKSPACE_CASE(OP_NAME, KUNLUN, 
kunlun) \ + UNARY_OP_IMPL_GET_WORKSPACE_CASE(OP_NAME, CAMBRICON, bang) \ + UNARY_OP_IMPL_GET_WORKSPACE_CASE(OP_NAME, MOORE, moore) + +#define UNARY_OP_IMPL_CALCULATE_CASE(OP_NAME, DEVICE, NAMESPACE, y, x) \ + IF_ENABLE_##DEVICE##_API( \ + case INFINI_DEVICE_##DEVICE \ + : return reinterpret_cast(desc) \ + ->calculate(workspace, workspace_size, y, {x}, stream);) + +#define UNARY_OP_IMPL_CALCULATE_CASES(OP_NAME, y, x) \ + UNARY_OP_IMPL_CALCULATE_CASE(OP_NAME, CPU, cpu, y, x) \ + UNARY_OP_IMPL_CALCULATE_CASE(OP_NAME, NVIDIA, nvidia, y, x) \ + UNARY_OP_IMPL_CALCULATE_CASE(OP_NAME, ILUVATAR, nvidia, y, x) \ + UNARY_OP_IMPL_CALCULATE_CASE(OP_NAME, QY, nvidia, y, x) \ + UNARY_OP_IMPL_CALCULATE_CASE(OP_NAME, METAX, metax, y, x) \ + UNARY_OP_IMPL_CALCULATE_CASE(OP_NAME, KUNLUN, kunlun, y, x) \ + UNARY_OP_IMPL_CALCULATE_CASE(OP_NAME, CAMBRICON, bang, y, x) \ + UNARY_OP_IMPL_CALCULATE_CASE(OP_NAME, MOORE, moore, y, x) + +#define UNARY_OP_IMPL_DESTROY_CASE(OP_NAME, DEVICE, NAMESPACE) \ + IF_ENABLE_##DEVICE##_API( \ + case INFINI_DEVICE_##DEVICE \ + : delete reinterpret_cast(desc); \ + return INFINI_STATUS_SUCCESS;) + +#define UNARY_OP_IMPL_DESTROY_CASES(OP_NAME) \ + UNARY_OP_IMPL_DESTROY_CASE(OP_NAME, CPU, cpu) \ + UNARY_OP_IMPL_DESTROY_CASE(OP_NAME, NVIDIA, nvidia) \ + UNARY_OP_IMPL_DESTROY_CASE(OP_NAME, ILUVATAR, nvidia) \ + UNARY_OP_IMPL_DESTROY_CASE(OP_NAME, QY, nvidia) \ + UNARY_OP_IMPL_DESTROY_CASE(OP_NAME, METAX, metax) \ + UNARY_OP_IMPL_DESTROY_CASE(OP_NAME, KUNLUN, kunlun) \ + UNARY_OP_IMPL_DESTROY_CASE(OP_NAME, CAMBRICON, bang) \ + UNARY_OP_IMPL_DESTROY_CASE(OP_NAME, MOORE, moore) + +#define UNARY_OP_IMPL(OP_NAME, OP_NAME_UPPER) \ + __C infiniStatus_t infiniopCreate##OP_NAME_UPPER##Descriptor( \ + infiniopHandle_t handle, \ + infiniop##OP_NAME_UPPER##Descriptor_t *desc_ptr, \ + infiniopTensorDescriptor_t y_desc, \ + infiniopTensorDescriptor_t x_desc) { \ + switch (handle->device) { \ + UNARY_OP_IMPL_DEVICE_CASES(OP_NAME, y_desc, x_desc) \ + default: \ + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; \ + } \ + } \ + __C infiniStatus_t infiniopGet##OP_NAME_UPPER##WorkspaceSize( \ + infiniop##OP_NAME_UPPER##Descriptor_t desc, \ + size_t *size) { \ + switch (desc->device_type) { \ + UNARY_OP_IMPL_GET_WORKSPACE_CASES(OP_NAME) \ + default: \ + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; \ + } \ + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; \ + } \ + __C infiniStatus_t infiniop##OP_NAME_UPPER( \ + infiniop##OP_NAME_UPPER##Descriptor_t desc, \ + void *workspace, \ + size_t workspace_size, \ + void *y, \ + const void *x, \ + void *stream) { \ + switch (desc->device_type) { \ + UNARY_OP_IMPL_CALCULATE_CASES(OP_NAME, y, x) \ + default: \ + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; \ + } \ + } \ + __C infiniStatus_t infiniopDestroy##OP_NAME_UPPER##Descriptor( \ + infiniop##OP_NAME_UPPER##Descriptor_t desc) { \ + switch (desc->device_type) { \ + UNARY_OP_IMPL_DESTROY_CASES(OP_NAME) \ + default: \ + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; \ + } \ + } + +#endif // __INFINIOP_OPERATOR_IMPL_H__ diff --git a/src/infiniop/ops/abs/cpu/abs_cpu.cc b/src/infiniop/ops/abs/cpu/abs_cpu.cc index 7d6e81d04..d4b541ba7 100644 --- a/src/infiniop/ops/abs/cpu/abs_cpu.cc +++ b/src/infiniop/ops/abs/cpu/abs_cpu.cc @@ -1,48 +1,8 @@ #include "abs_cpu.h" +#include "../../../elementwise/cpu/elementwise_cpu_impl.h" namespace op::abs::cpu { -Descriptor::~Descriptor() = default; +ELEMENTWISE_CPU_IMPL_UNARY(abs) -infiniStatus_t Descriptor::create( - infiniopHandle_t handle_, - Descriptor **desc_ptr, - 
infiniopTensorDescriptor_t out_desc, - std::vector input_desc_vec) { - - auto handle = reinterpret_cast(handle_); - auto dtype = out_desc->dtype(); - - const auto &x_desc = input_desc_vec.at(0); - const auto &y_shape = out_desc->shape(); - const auto &x_shape = x_desc->shape(); - - CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32); - - CHECK_SAME_SHAPE(y_shape, x_shape); - - // create CPU elementwise descriptor - CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec); - - return INFINI_STATUS_SUCCESS; -} - -infiniStatus_t Descriptor::calculate( - void *workspace, - size_t workspace_size, - void *output, - std::vector inputs, - void *stream) const { - - switch (_dtype) { - case INFINI_DTYPE_F16: - return _device_info->calculate(_info, output, inputs, stream); - case INFINI_DTYPE_F32: - return _device_info->calculate(_info, output, inputs, stream); - default: - return INFINI_STATUS_BAD_TENSOR_DTYPE; - } - - return INFINI_STATUS_SUCCESS; -} } // namespace op::abs::cpu diff --git a/src/infiniop/ops/abs/cpu/abs_cpu.h b/src/infiniop/ops/abs/cpu/abs_cpu.h index 5b9773298..cba8274e6 100644 --- a/src/infiniop/ops/abs/cpu/abs_cpu.h +++ b/src/infiniop/ops/abs/cpu/abs_cpu.h @@ -1,26 +1,9 @@ #ifndef __ABS_CPU_H__ #define __ABS_CPU_H__ -#include - #include "../../../elementwise/cpu/elementwise_cpu.h" +#include "../../../elementwise/unary.h" -ELEMENTWISE_DESCRIPTOR(abs, cpu) - -namespace op::abs::cpu { -typedef struct AbsOp { -public: - static constexpr size_t num_inputs = 1; - - template - T operator()(const T &x) const { - if constexpr (std::is_floating_point_v) { - return std::fabs(x); - } else { - return std::abs(x); - } - } -} AbsOp; -} // namespace op::abs::cpu +UNARY_ELEMENTWISE_DESCRIPTOR(abs, cpu, op::elementwise::unary::UnaryMode::Abs) #endif // __ABS_CPU_H__ diff --git a/src/infiniop/ops/abs/cuda/kernel.cuh b/src/infiniop/ops/abs/cuda/kernel.cuh index d7ff2db12..406aa423f 100644 --- a/src/infiniop/ops/abs/cuda/kernel.cuh +++ b/src/infiniop/ops/abs/cuda/kernel.cuh @@ -1,26 +1,10 @@ #ifndef __ABS_CUDA_H__ #define __ABS_CUDA_H__ -#include -#include +#include "../../../elementwise/unary.h" namespace op::abs::cuda { -typedef struct AbsOp { -public: - static constexpr size_t num_inputs = 1; - template - __device__ __forceinline__ T operator()(const T &x) const { - if constexpr (std::is_same_v) { - return __habs2(x); - } else if constexpr (std::is_same_v) { - return __habs(x); - } else if constexpr (std::is_floating_point_v) { - return std::fabs(x); - } else { - return std::abs(x); - } - } -} AbsOp; +using Op = op::elementwise::unary::cuda::UnaryOp; } // namespace op::abs::cuda #endif // __ABS_CUDA_H__ diff --git a/src/infiniop/ops/abs/nvidia/abs_nvidia.cu b/src/infiniop/ops/abs/nvidia/abs_nvidia.cu index 485f0406a..b9687226a 100644 --- a/src/infiniop/ops/abs/nvidia/abs_nvidia.cu +++ b/src/infiniop/ops/abs/nvidia/abs_nvidia.cu @@ -1,54 +1,10 @@ -#include "../../../elementwise/nvidia/elementwise_nvidia.cuh" +#include "../../../elementwise/nvidia/elementwise_nvidia_impl.cuh" #include "../cuda/kernel.cuh" #include "abs_nvidia.cuh" namespace op::abs::nvidia { -Descriptor::~Descriptor() = default; +ELEMENTWISE_NVIDIA_IMPL_UNARY(abs) -infiniStatus_t Descriptor::create( - infiniopHandle_t handle_, - Descriptor **desc_ptr, - infiniopTensorDescriptor_t out_desc, - std::vector input_desc_vec) { - - auto handle = reinterpret_cast(handle_); - auto dtype = out_desc->dtype(); - - const auto &x_desc = input_desc_vec.at(0); - const auto &y_shape = out_desc->shape(); - const auto &x_shape = 
x_desc->shape(); - - CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32); - - CHECK_SAME_SHAPE(y_shape, x_shape); - - // create CUDA elementwise descriptor - CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec) - - return INFINI_STATUS_SUCCESS; -} - -infiniStatus_t Descriptor::calculate( - void *workspace, - size_t workspace_size, - void *output, - std::vector inputs, - void *stream) const { - if (workspace_size < _workspace_size) { - return INFINI_STATUS_INSUFFICIENT_WORKSPACE; - } - - switch (_dtype) { - case INFINI_DTYPE_F16: - return _device_info->calculate<256, cuda::AbsOp, half>(_info, workspace, output, inputs, stream); - case INFINI_DTYPE_F32: - return _device_info->calculate<256, cuda::AbsOp, float>(_info, workspace, output, inputs, stream); - default: - return INFINI_STATUS_BAD_TENSOR_DTYPE; - } - - return INFINI_STATUS_SUCCESS; -} } // namespace op::abs::nvidia diff --git a/src/infiniop/ops/abs/operator.cc b/src/infiniop/ops/abs/operator.cc index b6820079d..051b8711a 100644 --- a/src/infiniop/ops/abs/operator.cc +++ b/src/infiniop/ops/abs/operator.cc @@ -1,5 +1,4 @@ -#include "../../operator.h" -#include "../../handle.h" +#include "../../operator_impl.h" #include "infiniop/ops/abs.h" #ifdef ENABLE_CPU_API @@ -9,131 +8,4 @@ #include "nvidia/abs_nvidia.cuh" #endif -__C infiniStatus_t infiniopCreateAbsDescriptor( - infiniopHandle_t handle, - infiniopAbsDescriptor_t *desc_ptr, - infiniopTensorDescriptor_t y_desc, - infiniopTensorDescriptor_t x_desc) { - -#define CREATE(CASE, NAMESPACE) \ - case CASE: \ - return op::abs::NAMESPACE::Descriptor::create( \ - handle, \ - reinterpret_cast(desc_ptr), \ - y_desc, \ - {x_desc}) - - switch (handle->device) { - -#ifdef ENABLE_CPU_API - CREATE(INFINI_DEVICE_CPU, cpu); -#endif -#ifdef ENABLE_NVIDIA_API - CREATE(INFINI_DEVICE_NVIDIA, nvidia); -#endif -#ifdef ENABLE_ILUVATAR_API - CREATE(INFINI_DEVICE_ILUVATAR, nvidia); -#endif -#ifdef ENABLE_QY_API - CREATE(INFINI_DEVICE_QY, nvidia); -#endif - - default: - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; - } - -#undef CREATE -} - -__C infiniStatus_t infiniopGetAbsWorkspaceSize(infiniopAbsDescriptor_t desc, size_t *size) { - -#define GET(CASE, NAMESPACE) \ - case CASE: \ - *size = reinterpret_cast(desc)->workspaceSize(); \ - return INFINI_STATUS_SUCCESS; - - switch (desc->device_type) { -#ifdef ENABLE_CPU_API - GET(INFINI_DEVICE_CPU, cpu) -#endif -#ifdef ENABLE_NVIDIA_API - GET(INFINI_DEVICE_NVIDIA, nvidia) -#endif -#ifdef ENABLE_ILUVATAR_API - GET(INFINI_DEVICE_ILUVATAR, nvidia) -#endif -#ifdef ENABLE_QY_API - GET(INFINI_DEVICE_QY, nvidia) -#endif - default: - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; - } -#undef GET - - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; -} - -__C infiniStatus_t infiniopAbs( - infiniopAbsDescriptor_t desc, - void *workspace, - size_t workspace_size, - void *y, - const void *x, - void *stream) { - -#define CALCULATE(CASE, NAMESPACE) \ - case CASE: \ - return reinterpret_cast(desc) \ - ->calculate(workspace, workspace_size, y, {x}, stream) - - switch (desc->device_type) { - -#ifdef ENABLE_CPU_API - CALCULATE(INFINI_DEVICE_CPU, cpu); -#endif -#ifdef ENABLE_NVIDIA_API - CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); -#endif -#ifdef ENABLE_ILUVATAR_API - CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); -#endif -#ifdef ENABLE_QY_API - CALCULATE(INFINI_DEVICE_QY, nvidia); -#endif - - default: - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; - } - -#undef CALCULATE -} - -__C infiniStatus_t -infiniopDestroyAbsDescriptor(infiniopAbsDescriptor_t desc) { - 
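An aside on the pattern at work in these deletions: every removed per-operator functor (AbsOp above, AcosOp below, and so on) collapses into the single UnaryOp template from elementwise/unary.h, which branches on a non-type UnaryMode parameter with `if constexpr`, so untaken modes are discarded at compile time and each instantiation compiles down to exactly one math routine. A minimal host-only sketch of the idea follows; the three-mode enum is illustrative, not the patch's full mode list, and the dependent-false static_assert mirrors the one in the patch.

    #include <cmath>
    #include <cstddef>
    #include <cstdio>

    enum class UnaryMode { Abs, Relu, Sigmoid };

    template <UnaryMode Mode>
    struct UnaryOp {
        static constexpr size_t num_inputs = 1;
        template <typename T>
        T operator()(const T &x) const {
            if constexpr (Mode == UnaryMode::Abs) {
                return std::fabs(x);
            } else if constexpr (Mode == UnaryMode::Relu) {
                return x > T(0) ? x : T(0);
            } else if constexpr (Mode == UnaryMode::Sigmoid) {
                return T(1) / (T(1) + std::exp(-x));
            } else {
                // Dependent-false assert: fires only if an unhandled
                // mode is actually instantiated.
                static_assert(Mode != Mode, "Unsupported unary operation mode");
                return x;
            }
        }
    };

    int main() {
        std::printf("%f\n", UnaryOp<UnaryMode::Relu>{}(-2.5f)); // 0.000000
        std::printf("%f\n", UnaryOp<UnaryMode::Abs>{}(-2.5f));  // 2.500000
    }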
-#define DELETE(CASE, NAMESPACE) \ - case CASE: \ - delete reinterpret_cast(desc); \ - return INFINI_STATUS_SUCCESS - - switch (desc->device_type) { - -#ifdef ENABLE_CPU_API - DELETE(INFINI_DEVICE_CPU, cpu); -#endif -#ifdef ENABLE_NVIDIA_API - DELETE(INFINI_DEVICE_NVIDIA, nvidia); -#endif -#ifdef ENABLE_ILUVATAR_API - DELETE(INFINI_DEVICE_ILUVATAR, nvidia); -#endif -#ifdef ENABLE_QY_API - DELETE(INFINI_DEVICE_QY, nvidia); -#endif - - default: - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; - } - -#undef DELETE -} +UNARY_OP_IMPL(abs, Abs) diff --git a/src/infiniop/ops/acos/cpu/acos_cpu.cc b/src/infiniop/ops/acos/cpu/acos_cpu.cc index 1accb6752..9be4ca1fe 100644 --- a/src/infiniop/ops/acos/cpu/acos_cpu.cc +++ b/src/infiniop/ops/acos/cpu/acos_cpu.cc @@ -1,48 +1,8 @@ #include "acos_cpu.h" +#include "../../../elementwise/cpu/elementwise_cpu_impl.h" namespace op::acos::cpu { -Descriptor::~Descriptor() = default; +ELEMENTWISE_CPU_IMPL_UNARY(acos) -infiniStatus_t Descriptor::create( - infiniopHandle_t handle_, - Descriptor **desc_ptr, - infiniopTensorDescriptor_t out_desc, - std::vector input_desc_vec) { - - auto handle = reinterpret_cast(handle_); - auto dtype = out_desc->dtype(); - - const auto &x_desc = input_desc_vec.at(0); - const auto &y_shape = out_desc->shape(); - const auto &x_shape = x_desc->shape(); - - CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32); - - CHECK_SAME_SHAPE(y_shape, x_shape); - - // create CPU elementwise descriptor - CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec); - - return INFINI_STATUS_SUCCESS; -} - -infiniStatus_t Descriptor::calculate( - void *workspace, - size_t workspace_size, - void *output, - std::vector inputs, - void *stream) const { - - switch (_dtype) { - case INFINI_DTYPE_F16: - return _device_info->calculate(_info, output, inputs, stream); - case INFINI_DTYPE_F32: - return _device_info->calculate(_info, output, inputs, stream); - default: - return INFINI_STATUS_BAD_TENSOR_DTYPE; - } - - return INFINI_STATUS_SUCCESS; -} } // namespace op::acos::cpu diff --git a/src/infiniop/ops/acos/cpu/acos_cpu.h b/src/infiniop/ops/acos/cpu/acos_cpu.h index 14e74b75c..50900e217 100644 --- a/src/infiniop/ops/acos/cpu/acos_cpu.h +++ b/src/infiniop/ops/acos/cpu/acos_cpu.h @@ -1,22 +1,9 @@ #ifndef __ACOS_CPU_H__ #define __ACOS_CPU_H__ -#include - #include "../../../elementwise/cpu/elementwise_cpu.h" +#include "../../../elementwise/unary.h" -ELEMENTWISE_DESCRIPTOR(acos, cpu) - -namespace op::acos::cpu { -typedef struct AcosOp { -public: - static constexpr size_t num_inputs = 1; - - template - T operator()(const T &x) const { - return std::acos(x); - } -} AcosOp; -} // namespace op::acos::cpu +UNARY_ELEMENTWISE_DESCRIPTOR(acos, cpu, op::elementwise::unary::UnaryMode::Acos) #endif // __ACOS_CPU_H__ diff --git a/src/infiniop/ops/acos/cuda/kernel.cuh b/src/infiniop/ops/acos/cuda/kernel.cuh index c3281c7e3..b62bf1e88 100644 --- a/src/infiniop/ops/acos/cuda/kernel.cuh +++ b/src/infiniop/ops/acos/cuda/kernel.cuh @@ -1,32 +1,10 @@ #ifndef __ACOS_CUDA_H__ #define __ACOS_CUDA_H__ -#include -#include +#include "../../../elementwise/unary.h" namespace op::acos::cuda { -typedef struct AcosOp { -public: - static constexpr size_t num_inputs = 1; - template - __device__ __forceinline__ T operator()(const T &x) const { - if constexpr (std::is_same_v) { - return __floats2half2_rn(acosf(__half2float(__low2half(x))), acosf(__half2float(__high2half(x)))); - } else if constexpr (std::is_same_v) { - return __float2half(acosf(__half2float(x))); - } else if 
constexpr (std::is_same_v) { - float x0 = __bfloat162float(__low2bfloat16(x)); - float x1 = __bfloat162float(__high2bfloat16(x)); - return __floats2bfloat162_rn(acosf(x0), acosf(x1)); - } else if constexpr (std::is_same_v) { - return __float2bfloat16_rn(acosf(__bfloat162float(x))); - } else if constexpr (std::is_same_v) { - return acosf(x); - } else { - return std::acos(x); - } - } -} AcosOp; +using Op = op::elementwise::unary::cuda::UnaryOp; } // namespace op::acos::cuda #endif // __ACOS_CUDA_H__ diff --git a/src/infiniop/ops/acos/nvidia/acos_nvidia.cu b/src/infiniop/ops/acos/nvidia/acos_nvidia.cu index 8480219bc..e7cf1feea 100644 --- a/src/infiniop/ops/acos/nvidia/acos_nvidia.cu +++ b/src/infiniop/ops/acos/nvidia/acos_nvidia.cu @@ -1,54 +1,10 @@ -#include "../../../elementwise/nvidia/elementwise_nvidia.cuh" +#include "../../../elementwise/nvidia/elementwise_nvidia_impl.cuh" #include "../cuda/kernel.cuh" #include "acos_nvidia.cuh" namespace op::acos::nvidia { -Descriptor::~Descriptor() = default; +ELEMENTWISE_NVIDIA_IMPL_UNARY(acos) -infiniStatus_t Descriptor::create( - infiniopHandle_t handle_, - Descriptor **desc_ptr, - infiniopTensorDescriptor_t out_desc, - std::vector input_desc_vec) { - - auto handle = reinterpret_cast(handle_); - auto dtype = out_desc->dtype(); - - const auto &x_desc = input_desc_vec.at(0); - const auto &y_shape = out_desc->shape(); - const auto &x_shape = x_desc->shape(); - - CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32); - - CHECK_SAME_SHAPE(y_shape, x_shape); - - // create CUDA elementwise descriptor - CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec) - - return INFINI_STATUS_SUCCESS; -} - -infiniStatus_t Descriptor::calculate( - void *workspace, - size_t workspace_size, - void *output, - std::vector inputs, - void *stream) const { - if (workspace_size < _workspace_size) { - return INFINI_STATUS_INSUFFICIENT_WORKSPACE; - } - - switch (_dtype) { - case INFINI_DTYPE_F16: - return _device_info->calculate<256, cuda::AcosOp, half>(_info, workspace, output, inputs, stream); - case INFINI_DTYPE_F32: - return _device_info->calculate<256, cuda::AcosOp, float>(_info, workspace, output, inputs, stream); - default: - return INFINI_STATUS_BAD_TENSOR_DTYPE; - } - - return INFINI_STATUS_SUCCESS; -} } // namespace op::acos::nvidia diff --git a/src/infiniop/ops/acos/operator.cc b/src/infiniop/ops/acos/operator.cc index e775a005a..15872b754 100644 --- a/src/infiniop/ops/acos/operator.cc +++ b/src/infiniop/ops/acos/operator.cc @@ -1,5 +1,4 @@ -#include "../../operator.h" -#include "../../handle.h" +#include "../../operator_impl.h" #include "infiniop/ops/acos.h" #ifdef ENABLE_CPU_API @@ -9,131 +8,4 @@ #include "nvidia/acos_nvidia.cuh" #endif -__C infiniStatus_t infiniopCreateAcosDescriptor( - infiniopHandle_t handle, - infiniopAcosDescriptor_t *desc_ptr, - infiniopTensorDescriptor_t y_desc, - infiniopTensorDescriptor_t x_desc) { - -#define CREATE(CASE, NAMESPACE) \ - case CASE: \ - return op::acos::NAMESPACE::Descriptor::create( \ - handle, \ - reinterpret_cast(desc_ptr), \ - y_desc, \ - {x_desc}) - - switch (handle->device) { - -#ifdef ENABLE_CPU_API - CREATE(INFINI_DEVICE_CPU, cpu); -#endif -#ifdef ENABLE_NVIDIA_API - CREATE(INFINI_DEVICE_NVIDIA, nvidia); -#endif -#ifdef ENABLE_ILUVATAR_API - CREATE(INFINI_DEVICE_ILUVATAR, nvidia); -#endif -#ifdef ENABLE_QY_API - CREATE(INFINI_DEVICE_QY, nvidia); -#endif - - default: - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; - } - -#undef CREATE -} - -__C infiniStatus_t 
infiniopGetAcosWorkspaceSize(infiniopAcosDescriptor_t desc, size_t *size) { - -#define GET(CASE, NAMESPACE) \ - case CASE: \ - *size = reinterpret_cast(desc)->workspaceSize(); \ - return INFINI_STATUS_SUCCESS; - - switch (desc->device_type) { -#ifdef ENABLE_CPU_API - GET(INFINI_DEVICE_CPU, cpu) -#endif -#ifdef ENABLE_NVIDIA_API - GET(INFINI_DEVICE_NVIDIA, nvidia) -#endif -#ifdef ENABLE_ILUVATAR_API - GET(INFINI_DEVICE_ILUVATAR, nvidia) -#endif -#ifdef ENABLE_QY_API - GET(INFINI_DEVICE_QY, nvidia) -#endif - default: - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; - } -#undef GET - - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; -} - -__C infiniStatus_t infiniopAcos( - infiniopAcosDescriptor_t desc, - void *workspace, - size_t workspace_size, - void *y, - const void *x, - void *stream) { - -#define CALCULATE(CASE, NAMESPACE) \ - case CASE: \ - return reinterpret_cast(desc) \ - ->calculate(workspace, workspace_size, y, {x}, stream) - - switch (desc->device_type) { - -#ifdef ENABLE_CPU_API - CALCULATE(INFINI_DEVICE_CPU, cpu); -#endif -#ifdef ENABLE_NVIDIA_API - CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); -#endif -#ifdef ENABLE_ILUVATAR_API - CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); -#endif -#ifdef ENABLE_QY_API - CALCULATE(INFINI_DEVICE_QY, nvidia); -#endif - - default: - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; - } - -#undef CALCULATE -} - -__C infiniStatus_t -infiniopDestroyAcosDescriptor(infiniopAcosDescriptor_t desc) { - -#define DELETE(CASE, NAMESPACE) \ - case CASE: \ - delete reinterpret_cast(desc); \ - return INFINI_STATUS_SUCCESS - - switch (desc->device_type) { - -#ifdef ENABLE_CPU_API - DELETE(INFINI_DEVICE_CPU, cpu); -#endif -#ifdef ENABLE_NVIDIA_API - DELETE(INFINI_DEVICE_NVIDIA, nvidia); -#endif -#ifdef ENABLE_ILUVATAR_API - DELETE(INFINI_DEVICE_ILUVATAR, nvidia); -#endif -#ifdef ENABLE_QY_API - DELETE(INFINI_DEVICE_QY, nvidia); -#endif - - default: - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; - } - -#undef DELETE -} +UNARY_OP_IMPL(acos, Acos) diff --git a/src/infiniop/ops/acosh/cpu/acosh_cpu.cc b/src/infiniop/ops/acosh/cpu/acosh_cpu.cc index 005463679..0cb424c00 100644 --- a/src/infiniop/ops/acosh/cpu/acosh_cpu.cc +++ b/src/infiniop/ops/acosh/cpu/acosh_cpu.cc @@ -1,48 +1,8 @@ #include "acosh_cpu.h" +#include "../../../elementwise/cpu/elementwise_cpu_impl.h" namespace op::acosh::cpu { -Descriptor::~Descriptor() = default; +ELEMENTWISE_CPU_IMPL_UNARY(acosh) -infiniStatus_t Descriptor::create( - infiniopHandle_t handle_, - Descriptor **desc_ptr, - infiniopTensorDescriptor_t out_desc, - std::vector input_desc_vec) { - - auto handle = reinterpret_cast(handle_); - auto dtype = out_desc->dtype(); - - const auto &x_desc = input_desc_vec.at(0); - const auto &y_shape = out_desc->shape(); - const auto &x_shape = x_desc->shape(); - - CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32); - - CHECK_SAME_SHAPE(y_shape, x_shape); - - // create CPU elementwise descriptor - CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec); - - return INFINI_STATUS_SUCCESS; -} - -infiniStatus_t Descriptor::calculate( - void *workspace, - size_t workspace_size, - void *output, - std::vector inputs, - void *stream) const { - - switch (_dtype) { - case INFINI_DTYPE_F16: - return _device_info->calculate(_info, output, inputs, stream); - case INFINI_DTYPE_F32: - return _device_info->calculate(_info, output, inputs, stream); - default: - return INFINI_STATUS_BAD_TENSOR_DTYPE; - } - - return INFINI_STATUS_SUCCESS; -} } // namespace op::acosh::cpu diff --git 
a/src/infiniop/ops/acosh/cpu/acosh_cpu.h b/src/infiniop/ops/acosh/cpu/acosh_cpu.h index b4b710ed5..bb05baf14 100644 --- a/src/infiniop/ops/acosh/cpu/acosh_cpu.h +++ b/src/infiniop/ops/acosh/cpu/acosh_cpu.h @@ -1,22 +1,9 @@ #ifndef __ACOSH_CPU_H__ #define __ACOSH_CPU_H__ -#include - #include "../../../elementwise/cpu/elementwise_cpu.h" +#include "../../../elementwise/unary.h" -ELEMENTWISE_DESCRIPTOR(acosh, cpu) - -namespace op::acosh::cpu { -typedef struct AcoshOp { -public: - static constexpr size_t num_inputs = 1; - - template - T operator()(const T &x) const { - return std::acosh(x); - } -} AcoshOp; -} // namespace op::acosh::cpu +UNARY_ELEMENTWISE_DESCRIPTOR(acosh, cpu, op::elementwise::unary::UnaryMode::Acosh) #endif // __ACOSH_CPU_H__ diff --git a/src/infiniop/ops/acosh/cuda/kernel.cuh b/src/infiniop/ops/acosh/cuda/kernel.cuh index fe444b1b4..9fbb54636 100644 --- a/src/infiniop/ops/acosh/cuda/kernel.cuh +++ b/src/infiniop/ops/acosh/cuda/kernel.cuh @@ -1,32 +1,10 @@ #ifndef __ACOSH_CUDA_H__ #define __ACOSH_CUDA_H__ -#include -#include +#include "../../../elementwise/unary.h" namespace op::acosh::cuda { -typedef struct AcoshOp { -public: - static constexpr size_t num_inputs = 1; - template - __device__ __forceinline__ T operator()(const T &x) const { - if constexpr (std::is_same_v) { - return __floats2half2_rn(acoshf(__half2float(__low2half(x))), acoshf(__half2float(__high2half(x)))); - } else if constexpr (std::is_same_v) { - return __float2half(acoshf(__half2float(x))); - } else if constexpr (std::is_same_v) { - float x0 = __bfloat162float(__low2bfloat16(x)); - float x1 = __bfloat162float(__high2bfloat16(x)); - return __floats2bfloat162_rn(acoshf(x0), acoshf(x1)); - } else if constexpr (std::is_same_v) { - return __float2bfloat16_rn(acoshf(__bfloat162float(x))); - } else if constexpr (std::is_same_v) { - return acoshf(x); - } else { - return std::acosh(x); - } - } -} AcoshOp; +using Op = op::elementwise::unary::cuda::UnaryOp; } // namespace op::acosh::cuda #endif // __ACOSH_CUDA_H__ diff --git a/src/infiniop/ops/acosh/nvidia/acosh_nvidia.cu b/src/infiniop/ops/acosh/nvidia/acosh_nvidia.cu index fc06590a7..5d065bdbc 100644 --- a/src/infiniop/ops/acosh/nvidia/acosh_nvidia.cu +++ b/src/infiniop/ops/acosh/nvidia/acosh_nvidia.cu @@ -1,54 +1,10 @@ -#include "../../../elementwise/nvidia/elementwise_nvidia.cuh" +#include "../../../elementwise/nvidia/elementwise_nvidia_impl.cuh" #include "../cuda/kernel.cuh" #include "acosh_nvidia.cuh" namespace op::acosh::nvidia { -Descriptor::~Descriptor() = default; +ELEMENTWISE_NVIDIA_IMPL_UNARY(acosh) -infiniStatus_t Descriptor::create( - infiniopHandle_t handle_, - Descriptor **desc_ptr, - infiniopTensorDescriptor_t out_desc, - std::vector input_desc_vec) { - - auto handle = reinterpret_cast(handle_); - auto dtype = out_desc->dtype(); - - const auto &x_desc = input_desc_vec.at(0); - const auto &y_shape = out_desc->shape(); - const auto &x_shape = x_desc->shape(); - - CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32); - - CHECK_SAME_SHAPE(y_shape, x_shape); - - // create CUDA elementwise descriptor - CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec) - - return INFINI_STATUS_SUCCESS; -} - -infiniStatus_t Descriptor::calculate( - void *workspace, - size_t workspace_size, - void *output, - std::vector inputs, - void *stream) const { - if (workspace_size < _workspace_size) { - return INFINI_STATUS_INSUFFICIENT_WORKSPACE; - } - - switch (_dtype) { - case INFINI_DTYPE_F16: - return _device_info->calculate<256, cuda::AcoshOp, 
half>(_info, workspace, output, inputs, stream); - case INFINI_DTYPE_F32: - return _device_info->calculate<256, cuda::AcoshOp, float>(_info, workspace, output, inputs, stream); - default: - return INFINI_STATUS_BAD_TENSOR_DTYPE; - } - - return INFINI_STATUS_SUCCESS; -} } // namespace op::acosh::nvidia diff --git a/src/infiniop/ops/acosh/operator.cc b/src/infiniop/ops/acosh/operator.cc index 9bba3389a..c1939a54c 100644 --- a/src/infiniop/ops/acosh/operator.cc +++ b/src/infiniop/ops/acosh/operator.cc @@ -1,5 +1,4 @@ -#include "../../operator.h" -#include "../../handle.h" +#include "../../operator_impl.h" #include "infiniop/ops/acosh.h" #ifdef ENABLE_CPU_API @@ -9,131 +8,4 @@ #include "nvidia/acosh_nvidia.cuh" #endif -__C infiniStatus_t infiniopCreateAcoshDescriptor( - infiniopHandle_t handle, - infiniopAcoshDescriptor_t *desc_ptr, - infiniopTensorDescriptor_t y_desc, - infiniopTensorDescriptor_t x_desc) { - -#define CREATE(CASE, NAMESPACE) \ - case CASE: \ - return op::acosh::NAMESPACE::Descriptor::create( \ - handle, \ - reinterpret_cast(desc_ptr), \ - y_desc, \ - {x_desc}) - - switch (handle->device) { - -#ifdef ENABLE_CPU_API - CREATE(INFINI_DEVICE_CPU, cpu); -#endif -#ifdef ENABLE_NVIDIA_API - CREATE(INFINI_DEVICE_NVIDIA, nvidia); -#endif -#ifdef ENABLE_ILUVATAR_API - CREATE(INFINI_DEVICE_ILUVATAR, nvidia); -#endif -#ifdef ENABLE_QY_API - CREATE(INFINI_DEVICE_QY, nvidia); -#endif - - default: - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; - } - -#undef CREATE -} - -__C infiniStatus_t infiniopGetAcoshWorkspaceSize(infiniopAcoshDescriptor_t desc, size_t *size) { - -#define GET(CASE, NAMESPACE) \ - case CASE: \ - *size = reinterpret_cast(desc)->workspaceSize(); \ - return INFINI_STATUS_SUCCESS; - - switch (desc->device_type) { -#ifdef ENABLE_CPU_API - GET(INFINI_DEVICE_CPU, cpu) -#endif -#ifdef ENABLE_NVIDIA_API - GET(INFINI_DEVICE_NVIDIA, nvidia) -#endif -#ifdef ENABLE_ILUVATAR_API - GET(INFINI_DEVICE_ILUVATAR, nvidia) -#endif -#ifdef ENABLE_QY_API - GET(INFINI_DEVICE_QY, nvidia) -#endif - default: - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; - } -#undef GET - - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; -} - -__C infiniStatus_t infiniopAcosh( - infiniopAcoshDescriptor_t desc, - void *workspace, - size_t workspace_size, - void *y, - const void *x, - void *stream) { - -#define CALCULATE(CASE, NAMESPACE) \ - case CASE: \ - return reinterpret_cast(desc) \ - ->calculate(workspace, workspace_size, y, {x}, stream) - - switch (desc->device_type) { - -#ifdef ENABLE_CPU_API - CALCULATE(INFINI_DEVICE_CPU, cpu); -#endif -#ifdef ENABLE_NVIDIA_API - CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); -#endif -#ifdef ENABLE_ILUVATAR_API - CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); -#endif -#ifdef ENABLE_QY_API - CALCULATE(INFINI_DEVICE_QY, nvidia); -#endif - - default: - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; - } - -#undef CALCULATE -} - -__C infiniStatus_t -infiniopDestroyAcoshDescriptor(infiniopAcoshDescriptor_t desc) { - -#define DELETE(CASE, NAMESPACE) \ - case CASE: \ - delete reinterpret_cast(desc); \ - return INFINI_STATUS_SUCCESS - - switch (desc->device_type) { - -#ifdef ENABLE_CPU_API - DELETE(INFINI_DEVICE_CPU, cpu); -#endif -#ifdef ENABLE_NVIDIA_API - DELETE(INFINI_DEVICE_NVIDIA, nvidia); -#endif -#ifdef ENABLE_ILUVATAR_API - DELETE(INFINI_DEVICE_ILUVATAR, nvidia); -#endif -#ifdef ENABLE_QY_API - DELETE(INFINI_DEVICE_QY, nvidia); -#endif - - default: - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; - } - -#undef DELETE -} +UNARY_OP_IMPL(acosh, Acosh) diff --git 
a/src/infiniop/ops/asin/cpu/asin_cpu.cc b/src/infiniop/ops/asin/cpu/asin_cpu.cc index e149044f1..de42639ff 100644 --- a/src/infiniop/ops/asin/cpu/asin_cpu.cc +++ b/src/infiniop/ops/asin/cpu/asin_cpu.cc @@ -1,48 +1,8 @@ #include "asin_cpu.h" +#include "../../../elementwise/cpu/elementwise_cpu_impl.h" namespace op::asin::cpu { -Descriptor::~Descriptor() = default; +ELEMENTWISE_CPU_IMPL_UNARY(asin) -infiniStatus_t Descriptor::create( - infiniopHandle_t handle_, - Descriptor **desc_ptr, - infiniopTensorDescriptor_t out_desc, - std::vector input_desc_vec) { - - auto handle = reinterpret_cast(handle_); - auto dtype = out_desc->dtype(); - - const auto &x_desc = input_desc_vec.at(0); - const auto &y_shape = out_desc->shape(); - const auto &x_shape = x_desc->shape(); - - CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32); - - CHECK_SAME_SHAPE(y_shape, x_shape); - - // create CPU elementwise descriptor - CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec); - - return INFINI_STATUS_SUCCESS; -} - -infiniStatus_t Descriptor::calculate( - void *workspace, - size_t workspace_size, - void *output, - std::vector inputs, - void *stream) const { - - switch (_dtype) { - case INFINI_DTYPE_F16: - return _device_info->calculate(_info, output, inputs, stream); - case INFINI_DTYPE_F32: - return _device_info->calculate(_info, output, inputs, stream); - default: - return INFINI_STATUS_BAD_TENSOR_DTYPE; - } - - return INFINI_STATUS_SUCCESS; -} } // namespace op::asin::cpu diff --git a/src/infiniop/ops/asin/cpu/asin_cpu.h b/src/infiniop/ops/asin/cpu/asin_cpu.h index 22bcba337..8c6da5e20 100644 --- a/src/infiniop/ops/asin/cpu/asin_cpu.h +++ b/src/infiniop/ops/asin/cpu/asin_cpu.h @@ -1,22 +1,9 @@ #ifndef __ASIN_CPU_H__ #define __ASIN_CPU_H__ -#include - #include "../../../elementwise/cpu/elementwise_cpu.h" +#include "../../../elementwise/unary.h" -ELEMENTWISE_DESCRIPTOR(asin, cpu) - -namespace op::asin::cpu { -typedef struct AsinOp { -public: - static constexpr size_t num_inputs = 1; - - template - T operator()(const T &x) const { - return std::asin(x); - } -} AsinOp; -} // namespace op::asin::cpu +UNARY_ELEMENTWISE_DESCRIPTOR(asin, cpu, op::elementwise::unary::UnaryMode::Asin) #endif // __ASIN_CPU_H__ diff --git a/src/infiniop/ops/asin/cuda/kernel.cuh b/src/infiniop/ops/asin/cuda/kernel.cuh index 3e8d11a07..a7063f015 100644 --- a/src/infiniop/ops/asin/cuda/kernel.cuh +++ b/src/infiniop/ops/asin/cuda/kernel.cuh @@ -1,32 +1,10 @@ #ifndef __ASIN_CUDA_H__ #define __ASIN_CUDA_H__ -#include -#include +#include "../../../elementwise/unary.h" namespace op::asin::cuda { -typedef struct AsinOp { -public: - static constexpr size_t num_inputs = 1; - template - __device__ __forceinline__ T operator()(const T &x) const { - if constexpr (std::is_same_v) { - return __floats2half2_rn(asinf(__half2float(__low2half(x))), asinf(__half2float(__high2half(x)))); - } else if constexpr (std::is_same_v) { - return __float2half(asinf(__half2float(x))); - } else if constexpr (std::is_same_v) { - float x0 = __bfloat162float(__low2bfloat16(x)); - float x1 = __bfloat162float(__high2bfloat16(x)); - return __floats2bfloat162_rn(asinf(x0), asinf(x1)); - } else if constexpr (std::is_same_v) { - return __float2bfloat16_rn(asinf(__bfloat162float(x))); - } else if constexpr (std::is_same_v) { - return asinf(x); - } else { - return std::asin(x); - } - } -} AsinOp; +using Op = op::elementwise::unary::cuda::UnaryOp; } // namespace op::asin::cuda #endif // __ASIN_CUDA_H__ diff --git a/src/infiniop/ops/asin/nvidia/asin_nvidia.cu 
b/src/infiniop/ops/asin/nvidia/asin_nvidia.cu index 714d2b1b3..262755d50 100644 --- a/src/infiniop/ops/asin/nvidia/asin_nvidia.cu +++ b/src/infiniop/ops/asin/nvidia/asin_nvidia.cu @@ -1,54 +1,10 @@ -#include "../../../elementwise/nvidia/elementwise_nvidia.cuh" +#include "../../../elementwise/nvidia/elementwise_nvidia_impl.cuh" #include "../cuda/kernel.cuh" #include "asin_nvidia.cuh" namespace op::asin::nvidia { -Descriptor::~Descriptor() = default; +ELEMENTWISE_NVIDIA_IMPL_UNARY(asin) -infiniStatus_t Descriptor::create( - infiniopHandle_t handle_, - Descriptor **desc_ptr, - infiniopTensorDescriptor_t out_desc, - std::vector input_desc_vec) { - - auto handle = reinterpret_cast(handle_); - auto dtype = out_desc->dtype(); - - const auto &x_desc = input_desc_vec.at(0); - const auto &y_shape = out_desc->shape(); - const auto &x_shape = x_desc->shape(); - - CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32); - - CHECK_SAME_SHAPE(y_shape, x_shape); - - // create CUDA elementwise descriptor - CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec) - - return INFINI_STATUS_SUCCESS; -} - -infiniStatus_t Descriptor::calculate( - void *workspace, - size_t workspace_size, - void *output, - std::vector inputs, - void *stream) const { - if (workspace_size < _workspace_size) { - return INFINI_STATUS_INSUFFICIENT_WORKSPACE; - } - - switch (_dtype) { - case INFINI_DTYPE_F16: - return _device_info->calculate<256, cuda::AsinOp, half>(_info, workspace, output, inputs, stream); - case INFINI_DTYPE_F32: - return _device_info->calculate<256, cuda::AsinOp, float>(_info, workspace, output, inputs, stream); - default: - return INFINI_STATUS_BAD_TENSOR_DTYPE; - } - - return INFINI_STATUS_SUCCESS; -} } // namespace op::asin::nvidia diff --git a/src/infiniop/ops/asin/operator.cc b/src/infiniop/ops/asin/operator.cc index c4973e9f5..edb8fa867 100644 --- a/src/infiniop/ops/asin/operator.cc +++ b/src/infiniop/ops/asin/operator.cc @@ -1,5 +1,4 @@ -#include "../../operator.h" -#include "../../handle.h" +#include "../../operator_impl.h" #include "infiniop/ops/asin.h" #ifdef ENABLE_CPU_API @@ -9,131 +8,4 @@ #include "nvidia/asin_nvidia.cuh" #endif -__C infiniStatus_t infiniopCreateAsinDescriptor( - infiniopHandle_t handle, - infiniopAsinDescriptor_t *desc_ptr, - infiniopTensorDescriptor_t y_desc, - infiniopTensorDescriptor_t x_desc) { - -#define CREATE(CASE, NAMESPACE) \ - case CASE: \ - return op::asin::NAMESPACE::Descriptor::create( \ - handle, \ - reinterpret_cast(desc_ptr), \ - y_desc, \ - {x_desc}) - - switch (handle->device) { - -#ifdef ENABLE_CPU_API - CREATE(INFINI_DEVICE_CPU, cpu); -#endif -#ifdef ENABLE_NVIDIA_API - CREATE(INFINI_DEVICE_NVIDIA, nvidia); -#endif -#ifdef ENABLE_ILUVATAR_API - CREATE(INFINI_DEVICE_ILUVATAR, nvidia); -#endif -#ifdef ENABLE_QY_API - CREATE(INFINI_DEVICE_QY, nvidia); -#endif - - default: - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; - } - -#undef CREATE -} - -__C infiniStatus_t infiniopGetAsinWorkspaceSize(infiniopAsinDescriptor_t desc, size_t *size) { - -#define GET(CASE, NAMESPACE) \ - case CASE: \ - *size = reinterpret_cast(desc)->workspaceSize(); \ - return INFINI_STATUS_SUCCESS; - - switch (desc->device_type) { -#ifdef ENABLE_CPU_API - GET(INFINI_DEVICE_CPU, cpu) -#endif -#ifdef ENABLE_NVIDIA_API - GET(INFINI_DEVICE_NVIDIA, nvidia) -#endif -#ifdef ENABLE_ILUVATAR_API - GET(INFINI_DEVICE_ILUVATAR, nvidia) -#endif -#ifdef ENABLE_QY_API - GET(INFINI_DEVICE_QY, nvidia) -#endif - default: - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; - } -#undef GET - - 
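For orientation amid these deletions: the one-line UNARY_OP_IMPL(asin, Asin) that replaces this file regenerates the same four entry points from the macros in operator_impl.h. Hand-expanding just the create function, for a build with only ENABLE_CPU_API and ENABLE_NVIDIA_API defined, gives roughly the following (the other three entry points expand the same way):

    __C infiniStatus_t infiniopCreateAsinDescriptor(
        infiniopHandle_t handle,
        infiniopAsinDescriptor_t *desc_ptr,
        infiniopTensorDescriptor_t y_desc,
        infiniopTensorDescriptor_t x_desc) {
        switch (handle->device) {
        case INFINI_DEVICE_CPU:
            // Each enabled device contributes one case via IF_ENABLE_*_API.
            return op::asin::cpu::Descriptor::create(
                handle, reinterpret_cast<op::asin::cpu::Descriptor **>(desc_ptr),
                y_desc, {x_desc});
        case INFINI_DEVICE_NVIDIA:
            return op::asin::nvidia::Descriptor::create(
                handle, reinterpret_cast<op::asin::nvidia::Descriptor **>(desc_ptr),
                y_desc, {x_desc});
        default:
            return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
        }
    }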
return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; -} - -__C infiniStatus_t infiniopAsin( - infiniopAsinDescriptor_t desc, - void *workspace, - size_t workspace_size, - void *y, - const void *x, - void *stream) { - -#define CALCULATE(CASE, NAMESPACE) \ - case CASE: \ - return reinterpret_cast(desc) \ - ->calculate(workspace, workspace_size, y, {x}, stream) - - switch (desc->device_type) { - -#ifdef ENABLE_CPU_API - CALCULATE(INFINI_DEVICE_CPU, cpu); -#endif -#ifdef ENABLE_NVIDIA_API - CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); -#endif -#ifdef ENABLE_ILUVATAR_API - CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); -#endif -#ifdef ENABLE_QY_API - CALCULATE(INFINI_DEVICE_QY, nvidia); -#endif - - default: - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; - } - -#undef CALCULATE -} - -__C infiniStatus_t -infiniopDestroyAsinDescriptor(infiniopAsinDescriptor_t desc) { - -#define DELETE(CASE, NAMESPACE) \ - case CASE: \ - delete reinterpret_cast(desc); \ - return INFINI_STATUS_SUCCESS - - switch (desc->device_type) { - -#ifdef ENABLE_CPU_API - DELETE(INFINI_DEVICE_CPU, cpu); -#endif -#ifdef ENABLE_NVIDIA_API - DELETE(INFINI_DEVICE_NVIDIA, nvidia); -#endif -#ifdef ENABLE_ILUVATAR_API - DELETE(INFINI_DEVICE_ILUVATAR, nvidia); -#endif -#ifdef ENABLE_QY_API - DELETE(INFINI_DEVICE_QY, nvidia); -#endif - - default: - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; - } - -#undef DELETE -} +UNARY_OP_IMPL(asin, Asin) diff --git a/src/infiniop/ops/asinh/cpu/asinh_cpu.cc b/src/infiniop/ops/asinh/cpu/asinh_cpu.cc index e0d5b749a..8b18ab6f8 100644 --- a/src/infiniop/ops/asinh/cpu/asinh_cpu.cc +++ b/src/infiniop/ops/asinh/cpu/asinh_cpu.cc @@ -1,48 +1,8 @@ #include "asinh_cpu.h" +#include "../../../elementwise/cpu/elementwise_cpu_impl.h" namespace op::asinh::cpu { -Descriptor::~Descriptor() = default; +ELEMENTWISE_CPU_IMPL_UNARY(asinh) -infiniStatus_t Descriptor::create( - infiniopHandle_t handle_, - Descriptor **desc_ptr, - infiniopTensorDescriptor_t out_desc, - std::vector input_desc_vec) { - - auto handle = reinterpret_cast(handle_); - auto dtype = out_desc->dtype(); - - const auto &x_desc = input_desc_vec.at(0); - const auto &y_shape = out_desc->shape(); - const auto &x_shape = x_desc->shape(); - - CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32); - - CHECK_SAME_SHAPE(y_shape, x_shape); - - // create CPU elementwise descriptor - CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec); - - return INFINI_STATUS_SUCCESS; -} - -infiniStatus_t Descriptor::calculate( - void *workspace, - size_t workspace_size, - void *output, - std::vector inputs, - void *stream) const { - - switch (_dtype) { - case INFINI_DTYPE_F16: - return _device_info->calculate(_info, output, inputs, stream); - case INFINI_DTYPE_F32: - return _device_info->calculate(_info, output, inputs, stream); - default: - return INFINI_STATUS_BAD_TENSOR_DTYPE; - } - - return INFINI_STATUS_SUCCESS; -} } // namespace op::asinh::cpu diff --git a/src/infiniop/ops/asinh/cpu/asinh_cpu.h b/src/infiniop/ops/asinh/cpu/asinh_cpu.h index 0a999b63b..4c3603752 100644 --- a/src/infiniop/ops/asinh/cpu/asinh_cpu.h +++ b/src/infiniop/ops/asinh/cpu/asinh_cpu.h @@ -1,22 +1,9 @@ #ifndef __ASINH_CPU_H__ #define __ASINH_CPU_H__ -#include - #include "../../../elementwise/cpu/elementwise_cpu.h" +#include "../../../elementwise/unary.h" -ELEMENTWISE_DESCRIPTOR(asinh, cpu) - -namespace op::asinh::cpu { -typedef struct AsinhOp { -public: - static constexpr size_t num_inputs = 1; - - template - T operator()(const T &x) const { - return std::asinh(x); - } -} AsinhOp; -} 
// namespace op::asinh::cpu +UNARY_ELEMENTWISE_DESCRIPTOR(asinh, cpu, op::elementwise::unary::UnaryMode::Asinh) #endif // __ASINH_CPU_H__ diff --git a/src/infiniop/ops/asinh/cuda/kernel.cuh b/src/infiniop/ops/asinh/cuda/kernel.cuh index 7cb018c8a..866ea147a 100644 --- a/src/infiniop/ops/asinh/cuda/kernel.cuh +++ b/src/infiniop/ops/asinh/cuda/kernel.cuh @@ -1,32 +1,10 @@ #ifndef __ASINH_CUDA_H__ #define __ASINH_CUDA_H__ -#include -#include +#include "../../../elementwise/unary.h" namespace op::asinh::cuda { -typedef struct AsinhOp { -public: - static constexpr size_t num_inputs = 1; - template - __device__ __forceinline__ T operator()(const T &x) const { - if constexpr (std::is_same_v) { - return __floats2half2_rn(asinhf(__half2float(__low2half(x))), asinhf(__half2float(__high2half(x)))); - } else if constexpr (std::is_same_v) { - return __float2half(asinhf(__half2float(x))); - } else if constexpr (std::is_same_v) { - float x0 = __bfloat162float(__low2bfloat16(x)); - float x1 = __bfloat162float(__high2bfloat16(x)); - return __floats2bfloat162_rn(asinhf(x0), asinhf(x1)); - } else if constexpr (std::is_same_v) { - return __float2bfloat16_rn(asinhf(__bfloat162float(x))); - } else if constexpr (std::is_same_v) { - return asinhf(x); - } else { - return std::asinh(x); - } - } -} AsinhOp; +using Op = op::elementwise::unary::cuda::UnaryOp; } // namespace op::asinh::cuda #endif // __ASINH_CUDA_H__ diff --git a/src/infiniop/ops/asinh/nvidia/asinh_nvidia.cu b/src/infiniop/ops/asinh/nvidia/asinh_nvidia.cu index 203008b81..37c44baf0 100644 --- a/src/infiniop/ops/asinh/nvidia/asinh_nvidia.cu +++ b/src/infiniop/ops/asinh/nvidia/asinh_nvidia.cu @@ -1,54 +1,10 @@ -#include "../../../elementwise/nvidia/elementwise_nvidia.cuh" +#include "../../../elementwise/nvidia/elementwise_nvidia_impl.cuh" #include "../cuda/kernel.cuh" #include "asinh_nvidia.cuh" namespace op::asinh::nvidia { -Descriptor::~Descriptor() = default; +ELEMENTWISE_NVIDIA_IMPL_UNARY(asinh) -infiniStatus_t Descriptor::create( - infiniopHandle_t handle_, - Descriptor **desc_ptr, - infiniopTensorDescriptor_t out_desc, - std::vector input_desc_vec) { - - auto handle = reinterpret_cast(handle_); - auto dtype = out_desc->dtype(); - - const auto &x_desc = input_desc_vec.at(0); - const auto &y_shape = out_desc->shape(); - const auto &x_shape = x_desc->shape(); - - CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32); - - CHECK_SAME_SHAPE(y_shape, x_shape); - - // create CUDA elementwise descriptor - CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec) - - return INFINI_STATUS_SUCCESS; -} - -infiniStatus_t Descriptor::calculate( - void *workspace, - size_t workspace_size, - void *output, - std::vector inputs, - void *stream) const { - if (workspace_size < _workspace_size) { - return INFINI_STATUS_INSUFFICIENT_WORKSPACE; - } - - switch (_dtype) { - case INFINI_DTYPE_F16: - return _device_info->calculate<256, cuda::AsinhOp, half>(_info, workspace, output, inputs, stream); - case INFINI_DTYPE_F32: - return _device_info->calculate<256, cuda::AsinhOp, float>(_info, workspace, output, inputs, stream); - default: - return INFINI_STATUS_BAD_TENSOR_DTYPE; - } - - return INFINI_STATUS_SUCCESS; -} } // namespace op::asinh::nvidia diff --git a/src/infiniop/ops/asinh/operator.cc b/src/infiniop/ops/asinh/operator.cc index d9ff5beda..7b519ec05 100644 --- a/src/infiniop/ops/asinh/operator.cc +++ b/src/infiniop/ops/asinh/operator.cc @@ -1,5 +1,4 @@ -#include "../../operator.h" -#include "../../handle.h" +#include "../../operator_impl.h" #include 
"infiniop/ops/asinh.h" #ifdef ENABLE_CPU_API @@ -9,131 +8,4 @@ #include "nvidia/asinh_nvidia.cuh" #endif -__C infiniStatus_t infiniopCreateAsinhDescriptor( - infiniopHandle_t handle, - infiniopAsinhDescriptor_t *desc_ptr, - infiniopTensorDescriptor_t y_desc, - infiniopTensorDescriptor_t x_desc) { - -#define CREATE(CASE, NAMESPACE) \ - case CASE: \ - return op::asinh::NAMESPACE::Descriptor::create( \ - handle, \ - reinterpret_cast(desc_ptr), \ - y_desc, \ - {x_desc}) - - switch (handle->device) { - -#ifdef ENABLE_CPU_API - CREATE(INFINI_DEVICE_CPU, cpu); -#endif -#ifdef ENABLE_NVIDIA_API - CREATE(INFINI_DEVICE_NVIDIA, nvidia); -#endif -#ifdef ENABLE_ILUVATAR_API - CREATE(INFINI_DEVICE_ILUVATAR, nvidia); -#endif -#ifdef ENABLE_QY_API - CREATE(INFINI_DEVICE_QY, nvidia); -#endif - - default: - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; - } - -#undef CREATE -} - -__C infiniStatus_t infiniopGetAsinhWorkspaceSize(infiniopAsinhDescriptor_t desc, size_t *size) { - -#define GET(CASE, NAMESPACE) \ - case CASE: \ - *size = reinterpret_cast(desc)->workspaceSize(); \ - return INFINI_STATUS_SUCCESS; - - switch (desc->device_type) { -#ifdef ENABLE_CPU_API - GET(INFINI_DEVICE_CPU, cpu) -#endif -#ifdef ENABLE_NVIDIA_API - GET(INFINI_DEVICE_NVIDIA, nvidia) -#endif -#ifdef ENABLE_ILUVATAR_API - GET(INFINI_DEVICE_ILUVATAR, nvidia) -#endif -#ifdef ENABLE_QY_API - GET(INFINI_DEVICE_QY, nvidia) -#endif - default: - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; - } -#undef GET - - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; -} - -__C infiniStatus_t infiniopAsinh( - infiniopAsinhDescriptor_t desc, - void *workspace, - size_t workspace_size, - void *y, - const void *x, - void *stream) { - -#define CALCULATE(CASE, NAMESPACE) \ - case CASE: \ - return reinterpret_cast(desc) \ - ->calculate(workspace, workspace_size, y, {x}, stream) - - switch (desc->device_type) { - -#ifdef ENABLE_CPU_API - CALCULATE(INFINI_DEVICE_CPU, cpu); -#endif -#ifdef ENABLE_NVIDIA_API - CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); -#endif -#ifdef ENABLE_ILUVATAR_API - CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); -#endif -#ifdef ENABLE_QY_API - CALCULATE(INFINI_DEVICE_QY, nvidia); -#endif - - default: - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; - } - -#undef CALCULATE -} - -__C infiniStatus_t -infiniopDestroyAsinhDescriptor(infiniopAsinhDescriptor_t desc) { - -#define DELETE(CASE, NAMESPACE) \ - case CASE: \ - delete reinterpret_cast(desc); \ - return INFINI_STATUS_SUCCESS - - switch (desc->device_type) { - -#ifdef ENABLE_CPU_API - DELETE(INFINI_DEVICE_CPU, cpu); -#endif -#ifdef ENABLE_NVIDIA_API - DELETE(INFINI_DEVICE_NVIDIA, nvidia); -#endif -#ifdef ENABLE_ILUVATAR_API - DELETE(INFINI_DEVICE_ILUVATAR, nvidia); -#endif -#ifdef ENABLE_QY_API - DELETE(INFINI_DEVICE_QY, nvidia); -#endif - - default: - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; - } - -#undef DELETE -} +UNARY_OP_IMPL(asinh, Asinh) diff --git a/src/infiniop/ops/atan/cpu/atan_cpu.cc b/src/infiniop/ops/atan/cpu/atan_cpu.cc index a8c613d1e..075c7fd4e 100644 --- a/src/infiniop/ops/atan/cpu/atan_cpu.cc +++ b/src/infiniop/ops/atan/cpu/atan_cpu.cc @@ -1,48 +1,8 @@ #include "atan_cpu.h" +#include "../../../elementwise/cpu/elementwise_cpu_impl.h" namespace op::atan::cpu { -Descriptor::~Descriptor() = default; +ELEMENTWISE_CPU_IMPL_UNARY(atan) -infiniStatus_t Descriptor::create( - infiniopHandle_t handle_, - Descriptor **desc_ptr, - infiniopTensorDescriptor_t out_desc, - std::vector input_desc_vec) { - - auto handle = reinterpret_cast(handle_); - auto dtype = out_desc->dtype(); 
- - const auto &x_desc = input_desc_vec.at(0); - const auto &y_shape = out_desc->shape(); - const auto &x_shape = x_desc->shape(); - - CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32); - - CHECK_SAME_SHAPE(y_shape, x_shape); - - // create CPU elementwise descriptor - CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec); - - return INFINI_STATUS_SUCCESS; -} - -infiniStatus_t Descriptor::calculate( - void *workspace, - size_t workspace_size, - void *output, - std::vector inputs, - void *stream) const { - - switch (_dtype) { - case INFINI_DTYPE_F16: - return _device_info->calculate(_info, output, inputs, stream); - case INFINI_DTYPE_F32: - return _device_info->calculate(_info, output, inputs, stream); - default: - return INFINI_STATUS_BAD_TENSOR_DTYPE; - } - - return INFINI_STATUS_SUCCESS; -} } // namespace op::atan::cpu diff --git a/src/infiniop/ops/atan/cpu/atan_cpu.h b/src/infiniop/ops/atan/cpu/atan_cpu.h index ac2a1bc0c..6b333cfb1 100644 --- a/src/infiniop/ops/atan/cpu/atan_cpu.h +++ b/src/infiniop/ops/atan/cpu/atan_cpu.h @@ -1,22 +1,9 @@ #ifndef __ATAN_CPU_H__ #define __ATAN_CPU_H__ -#include - #include "../../../elementwise/cpu/elementwise_cpu.h" +#include "../../../elementwise/unary.h" -ELEMENTWISE_DESCRIPTOR(atan, cpu) - -namespace op::atan::cpu { -typedef struct AtanOp { -public: - static constexpr size_t num_inputs = 1; - - template - T operator()(const T &x) const { - return std::atan(x); - } -} AtanOp; -} // namespace op::atan::cpu +UNARY_ELEMENTWISE_DESCRIPTOR(atan, cpu, op::elementwise::unary::UnaryMode::Atan) #endif // __ATAN_CPU_H__ diff --git a/src/infiniop/ops/atan/cuda/kernel.cuh b/src/infiniop/ops/atan/cuda/kernel.cuh index 0c7745196..ce553c1c1 100644 --- a/src/infiniop/ops/atan/cuda/kernel.cuh +++ b/src/infiniop/ops/atan/cuda/kernel.cuh @@ -1,32 +1,10 @@ #ifndef __ATAN_CUDA_H__ #define __ATAN_CUDA_H__ -#include -#include +#include "../../../elementwise/unary.h" namespace op::atan::cuda { -typedef struct AtanOp { -public: - static constexpr size_t num_inputs = 1; - template - __device__ __forceinline__ T operator()(const T &x) const { - if constexpr (std::is_same_v) { - return __floats2half2_rn(atanf(__half2float(__low2half(x))), atanf(__half2float(__high2half(x)))); - } else if constexpr (std::is_same_v) { - return __float2half(atanf(__half2float(x))); - } else if constexpr (std::is_same_v) { - float x0 = __bfloat162float(__low2bfloat16(x)); - float x1 = __bfloat162float(__high2bfloat16(x)); - return __floats2bfloat162_rn(atanf(x0), atanf(x1)); - } else if constexpr (std::is_same_v) { - return __float2bfloat16_rn(atanf(__bfloat162float(x))); - } else if constexpr (std::is_same_v) { - return atanf(x); - } else { - return std::atan(x); - } - } -} AtanOp; +using Op = op::elementwise::unary::cuda::UnaryOp; } // namespace op::atan::cuda #endif // __ATAN_CUDA_H__ diff --git a/src/infiniop/ops/atan/nvidia/atan_nvidia.cu b/src/infiniop/ops/atan/nvidia/atan_nvidia.cu index 2c6cf53d4..a05d65b79 100644 --- a/src/infiniop/ops/atan/nvidia/atan_nvidia.cu +++ b/src/infiniop/ops/atan/nvidia/atan_nvidia.cu @@ -1,54 +1,10 @@ -#include "../../../elementwise/nvidia/elementwise_nvidia.cuh" +#include "../../../elementwise/nvidia/elementwise_nvidia_impl.cuh" #include "../cuda/kernel.cuh" #include "atan_nvidia.cuh" namespace op::atan::nvidia { -Descriptor::~Descriptor() = default; +ELEMENTWISE_NVIDIA_IMPL_UNARY(atan) -infiniStatus_t Descriptor::create( - infiniopHandle_t handle_, - Descriptor **desc_ptr, - infiniopTensorDescriptor_t out_desc, - std::vector 
input_desc_vec) { - - auto handle = reinterpret_cast(handle_); - auto dtype = out_desc->dtype(); - - const auto &x_desc = input_desc_vec.at(0); - const auto &y_shape = out_desc->shape(); - const auto &x_shape = x_desc->shape(); - - CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32); - - CHECK_SAME_SHAPE(y_shape, x_shape); - - // create CUDA elementwise descriptor - CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec) - - return INFINI_STATUS_SUCCESS; -} - -infiniStatus_t Descriptor::calculate( - void *workspace, - size_t workspace_size, - void *output, - std::vector inputs, - void *stream) const { - if (workspace_size < _workspace_size) { - return INFINI_STATUS_INSUFFICIENT_WORKSPACE; - } - - switch (_dtype) { - case INFINI_DTYPE_F16: - return _device_info->calculate<256, cuda::AtanOp, half>(_info, workspace, output, inputs, stream); - case INFINI_DTYPE_F32: - return _device_info->calculate<256, cuda::AtanOp, float>(_info, workspace, output, inputs, stream); - default: - return INFINI_STATUS_BAD_TENSOR_DTYPE; - } - - return INFINI_STATUS_SUCCESS; -} } // namespace op::atan::nvidia diff --git a/src/infiniop/ops/atan/operator.cc b/src/infiniop/ops/atan/operator.cc index c56e101d2..9025489c3 100644 --- a/src/infiniop/ops/atan/operator.cc +++ b/src/infiniop/ops/atan/operator.cc @@ -1,5 +1,4 @@ -#include "../../operator.h" -#include "../../handle.h" +#include "../../operator_impl.h" #include "infiniop/ops/atan.h" #ifdef ENABLE_CPU_API @@ -9,131 +8,4 @@ #include "nvidia/atan_nvidia.cuh" #endif -__C infiniStatus_t infiniopCreateAtanDescriptor( - infiniopHandle_t handle, - infiniopAtanDescriptor_t *desc_ptr, - infiniopTensorDescriptor_t y_desc, - infiniopTensorDescriptor_t x_desc) { - -#define CREATE(CASE, NAMESPACE) \ - case CASE: \ - return op::atan::NAMESPACE::Descriptor::create( \ - handle, \ - reinterpret_cast(desc_ptr), \ - y_desc, \ - {x_desc}) - - switch (handle->device) { - -#ifdef ENABLE_CPU_API - CREATE(INFINI_DEVICE_CPU, cpu); -#endif -#ifdef ENABLE_NVIDIA_API - CREATE(INFINI_DEVICE_NVIDIA, nvidia); -#endif -#ifdef ENABLE_ILUVATAR_API - CREATE(INFINI_DEVICE_ILUVATAR, nvidia); -#endif -#ifdef ENABLE_QY_API - CREATE(INFINI_DEVICE_QY, nvidia); -#endif - - default: - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; - } - -#undef CREATE -} - -__C infiniStatus_t infiniopGetAtanWorkspaceSize(infiniopAtanDescriptor_t desc, size_t *size) { - -#define GET(CASE, NAMESPACE) \ - case CASE: \ - *size = reinterpret_cast(desc)->workspaceSize(); \ - return INFINI_STATUS_SUCCESS; - - switch (desc->device_type) { -#ifdef ENABLE_CPU_API - GET(INFINI_DEVICE_CPU, cpu) -#endif -#ifdef ENABLE_NVIDIA_API - GET(INFINI_DEVICE_NVIDIA, nvidia) -#endif -#ifdef ENABLE_ILUVATAR_API - GET(INFINI_DEVICE_ILUVATAR, nvidia) -#endif -#ifdef ENABLE_QY_API - GET(INFINI_DEVICE_QY, nvidia) -#endif - default: - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; - } -#undef GET - - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; -} - -__C infiniStatus_t infiniopAtan( - infiniopAtanDescriptor_t desc, - void *workspace, - size_t workspace_size, - void *y, - const void *x, - void *stream) { - -#define CALCULATE(CASE, NAMESPACE) \ - case CASE: \ - return reinterpret_cast(desc) \ - ->calculate(workspace, workspace_size, y, {x}, stream) - - switch (desc->device_type) { - -#ifdef ENABLE_CPU_API - CALCULATE(INFINI_DEVICE_CPU, cpu); -#endif -#ifdef ENABLE_NVIDIA_API - CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); -#endif -#ifdef ENABLE_ILUVATAR_API - CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); -#endif -#ifdef 
ENABLE_QY_API - CALCULATE(INFINI_DEVICE_QY, nvidia); -#endif - - default: - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; - } - -#undef CALCULATE -} - -__C infiniStatus_t -infiniopDestroyAtanDescriptor(infiniopAtanDescriptor_t desc) { - -#define DELETE(CASE, NAMESPACE) \ - case CASE: \ - delete reinterpret_cast(desc); \ - return INFINI_STATUS_SUCCESS - - switch (desc->device_type) { - -#ifdef ENABLE_CPU_API - DELETE(INFINI_DEVICE_CPU, cpu); -#endif -#ifdef ENABLE_NVIDIA_API - DELETE(INFINI_DEVICE_NVIDIA, nvidia); -#endif -#ifdef ENABLE_ILUVATAR_API - DELETE(INFINI_DEVICE_ILUVATAR, nvidia); -#endif -#ifdef ENABLE_QY_API - DELETE(INFINI_DEVICE_QY, nvidia); -#endif - - default: - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; - } - -#undef DELETE -} +UNARY_OP_IMPL(atan, Atan) diff --git a/src/infiniop/ops/atanh/cpu/atanh_cpu.cc b/src/infiniop/ops/atanh/cpu/atanh_cpu.cc index 66ef4b1df..d19c978e4 100644 --- a/src/infiniop/ops/atanh/cpu/atanh_cpu.cc +++ b/src/infiniop/ops/atanh/cpu/atanh_cpu.cc @@ -1,48 +1,8 @@ #include "atanh_cpu.h" +#include "../../../elementwise/cpu/elementwise_cpu_impl.h" namespace op::atanh::cpu { -Descriptor::~Descriptor() = default; +ELEMENTWISE_CPU_IMPL_UNARY(atanh) -infiniStatus_t Descriptor::create( - infiniopHandle_t handle_, - Descriptor **desc_ptr, - infiniopTensorDescriptor_t out_desc, - std::vector input_desc_vec) { - - auto handle = reinterpret_cast(handle_); - auto dtype = out_desc->dtype(); - - const auto &x_desc = input_desc_vec.at(0); - const auto &y_shape = out_desc->shape(); - const auto &x_shape = x_desc->shape(); - - CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32); - - CHECK_SAME_SHAPE(y_shape, x_shape); - - // create CPU elementwise descriptor - CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec); - - return INFINI_STATUS_SUCCESS; -} - -infiniStatus_t Descriptor::calculate( - void *workspace, - size_t workspace_size, - void *output, - std::vector inputs, - void *stream) const { - - switch (_dtype) { - case INFINI_DTYPE_F16: - return _device_info->calculate(_info, output, inputs, stream); - case INFINI_DTYPE_F32: - return _device_info->calculate(_info, output, inputs, stream); - default: - return INFINI_STATUS_BAD_TENSOR_DTYPE; - } - - return INFINI_STATUS_SUCCESS; -} } // namespace op::atanh::cpu diff --git a/src/infiniop/ops/atanh/cpu/atanh_cpu.h b/src/infiniop/ops/atanh/cpu/atanh_cpu.h index 8c2b04755..1a37453f0 100644 --- a/src/infiniop/ops/atanh/cpu/atanh_cpu.h +++ b/src/infiniop/ops/atanh/cpu/atanh_cpu.h @@ -1,22 +1,9 @@ #ifndef __ATANH_CPU_H__ #define __ATANH_CPU_H__ -#include - #include "../../../elementwise/cpu/elementwise_cpu.h" +#include "../../../elementwise/unary.h" -ELEMENTWISE_DESCRIPTOR(atanh, cpu) - -namespace op::atanh::cpu { -typedef struct AtanhOp { -public: - static constexpr size_t num_inputs = 1; - - template - T operator()(const T &x) const { - return std::atanh(x); - } -} AtanhOp; -} // namespace op::atanh::cpu +UNARY_ELEMENTWISE_DESCRIPTOR(atanh, cpu, op::elementwise::unary::UnaryMode::Atanh) #endif // __ATANH_CPU_H__ diff --git a/src/infiniop/ops/atanh/cuda/kernel.cuh b/src/infiniop/ops/atanh/cuda/kernel.cuh index 5337d8243..de0866ba5 100644 --- a/src/infiniop/ops/atanh/cuda/kernel.cuh +++ b/src/infiniop/ops/atanh/cuda/kernel.cuh @@ -1,32 +1,10 @@ #ifndef __ATANH_CUDA_H__ #define __ATANH_CUDA_H__ -#include -#include +#include "../../../elementwise/unary.h" namespace op::atanh::cuda { -typedef struct AtanhOp { -public: - static constexpr size_t num_inputs = 1; - template - __device__ 
__forceinline__ T operator()(const T &x) const { - if constexpr (std::is_same_v) { - return __floats2half2_rn(atanhf(__half2float(__low2half(x))), atanhf(__half2float(__high2half(x)))); - } else if constexpr (std::is_same_v) { - return __float2half(atanhf(__half2float(x))); - } else if constexpr (std::is_same_v) { - float x0 = __bfloat162float(__low2bfloat16(x)); - float x1 = __bfloat162float(__high2bfloat16(x)); - return __floats2bfloat162_rn(atanhf(x0), atanhf(x1)); - } else if constexpr (std::is_same_v) { - return __float2bfloat16_rn(atanhf(__bfloat162float(x))); - } else if constexpr (std::is_same_v) { - return atanhf(x); - } else { - return std::atanh(x); - } - } -} AtanhOp; +using Op = op::elementwise::unary::cuda::UnaryOp; } // namespace op::atanh::cuda #endif // __ATANH_CUDA_H__ diff --git a/src/infiniop/ops/atanh/nvidia/atanh_nvidia.cu b/src/infiniop/ops/atanh/nvidia/atanh_nvidia.cu index cb5a1ff03..55b435920 100644 --- a/src/infiniop/ops/atanh/nvidia/atanh_nvidia.cu +++ b/src/infiniop/ops/atanh/nvidia/atanh_nvidia.cu @@ -1,54 +1,10 @@ -#include "../../../elementwise/nvidia/elementwise_nvidia.cuh" +#include "../../../elementwise/nvidia/elementwise_nvidia_impl.cuh" #include "../cuda/kernel.cuh" #include "atanh_nvidia.cuh" namespace op::atanh::nvidia { -Descriptor::~Descriptor() = default; +ELEMENTWISE_NVIDIA_IMPL_UNARY(atanh) -infiniStatus_t Descriptor::create( - infiniopHandle_t handle_, - Descriptor **desc_ptr, - infiniopTensorDescriptor_t out_desc, - std::vector input_desc_vec) { - - auto handle = reinterpret_cast(handle_); - auto dtype = out_desc->dtype(); - - const auto &x_desc = input_desc_vec.at(0); - const auto &y_shape = out_desc->shape(); - const auto &x_shape = x_desc->shape(); - - CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32); - - CHECK_SAME_SHAPE(y_shape, x_shape); - - // create CUDA elementwise descriptor - CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec) - - return INFINI_STATUS_SUCCESS; -} - -infiniStatus_t Descriptor::calculate( - void *workspace, - size_t workspace_size, - void *output, - std::vector inputs, - void *stream) const { - if (workspace_size < _workspace_size) { - return INFINI_STATUS_INSUFFICIENT_WORKSPACE; - } - - switch (_dtype) { - case INFINI_DTYPE_F16: - return _device_info->calculate<256, cuda::AtanhOp, half>(_info, workspace, output, inputs, stream); - case INFINI_DTYPE_F32: - return _device_info->calculate<256, cuda::AtanhOp, float>(_info, workspace, output, inputs, stream); - default: - return INFINI_STATUS_BAD_TENSOR_DTYPE; - } - - return INFINI_STATUS_SUCCESS; -} } // namespace op::atanh::nvidia diff --git a/src/infiniop/ops/atanh/operator.cc b/src/infiniop/ops/atanh/operator.cc index a73adcb23..cc9d6131e 100644 --- a/src/infiniop/ops/atanh/operator.cc +++ b/src/infiniop/ops/atanh/operator.cc @@ -1,5 +1,4 @@ -#include "../../operator.h" -#include "../../handle.h" +#include "../../operator_impl.h" #include "infiniop/ops/atanh.h" #ifdef ENABLE_CPU_API @@ -9,131 +8,4 @@ #include "nvidia/atanh_nvidia.cuh" #endif -__C infiniStatus_t infiniopCreateAtanhDescriptor( - infiniopHandle_t handle, - infiniopAtanhDescriptor_t *desc_ptr, - infiniopTensorDescriptor_t y_desc, - infiniopTensorDescriptor_t x_desc) { - -#define CREATE(CASE, NAMESPACE) \ - case CASE: \ - return op::atanh::NAMESPACE::Descriptor::create( \ - handle, \ - reinterpret_cast(desc_ptr), \ - y_desc, \ - {x_desc}) - - switch (handle->device) { - -#ifdef ENABLE_CPU_API - CREATE(INFINI_DEVICE_CPU, cpu); -#endif -#ifdef ENABLE_NVIDIA_API - 
CREATE(INFINI_DEVICE_NVIDIA, nvidia); -#endif -#ifdef ENABLE_ILUVATAR_API - CREATE(INFINI_DEVICE_ILUVATAR, nvidia); -#endif -#ifdef ENABLE_QY_API - CREATE(INFINI_DEVICE_QY, nvidia); -#endif - - default: - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; - } - -#undef CREATE -} - -__C infiniStatus_t infiniopGetAtanhWorkspaceSize(infiniopAtanhDescriptor_t desc, size_t *size) { - -#define GET(CASE, NAMESPACE) \ - case CASE: \ - *size = reinterpret_cast(desc)->workspaceSize(); \ - return INFINI_STATUS_SUCCESS; - - switch (desc->device_type) { -#ifdef ENABLE_CPU_API - GET(INFINI_DEVICE_CPU, cpu) -#endif -#ifdef ENABLE_NVIDIA_API - GET(INFINI_DEVICE_NVIDIA, nvidia) -#endif -#ifdef ENABLE_ILUVATAR_API - GET(INFINI_DEVICE_ILUVATAR, nvidia) -#endif -#ifdef ENABLE_QY_API - GET(INFINI_DEVICE_QY, nvidia) -#endif - default: - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; - } -#undef GET - - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; -} - -__C infiniStatus_t infiniopAtanh( - infiniopAtanhDescriptor_t desc, - void *workspace, - size_t workspace_size, - void *y, - const void *x, - void *stream) { - -#define CALCULATE(CASE, NAMESPACE) \ - case CASE: \ - return reinterpret_cast(desc) \ - ->calculate(workspace, workspace_size, y, {x}, stream) - - switch (desc->device_type) { - -#ifdef ENABLE_CPU_API - CALCULATE(INFINI_DEVICE_CPU, cpu); -#endif -#ifdef ENABLE_NVIDIA_API - CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); -#endif -#ifdef ENABLE_ILUVATAR_API - CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); -#endif -#ifdef ENABLE_QY_API - CALCULATE(INFINI_DEVICE_QY, nvidia); -#endif - - default: - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; - } - -#undef CALCULATE -} - -__C infiniStatus_t -infiniopDestroyAtanhDescriptor(infiniopAtanhDescriptor_t desc) { - -#define DELETE(CASE, NAMESPACE) \ - case CASE: \ - delete reinterpret_cast(desc); \ - return INFINI_STATUS_SUCCESS - - switch (desc->device_type) { - -#ifdef ENABLE_CPU_API - DELETE(INFINI_DEVICE_CPU, cpu); -#endif -#ifdef ENABLE_NVIDIA_API - DELETE(INFINI_DEVICE_NVIDIA, nvidia); -#endif -#ifdef ENABLE_ILUVATAR_API - DELETE(INFINI_DEVICE_ILUVATAR, nvidia); -#endif -#ifdef ENABLE_QY_API - DELETE(INFINI_DEVICE_QY, nvidia); -#endif - - default: - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; - } - -#undef DELETE -} +UNARY_OP_IMPL(atanh, Atanh) diff --git a/src/infiniop/ops/ceil/cpu/ceil_cpu.cc b/src/infiniop/ops/ceil/cpu/ceil_cpu.cc index 17b3ec888..81ca2fe7a 100644 --- a/src/infiniop/ops/ceil/cpu/ceil_cpu.cc +++ b/src/infiniop/ops/ceil/cpu/ceil_cpu.cc @@ -1,48 +1,8 @@ #include "ceil_cpu.h" +#include "../../../elementwise/cpu/elementwise_cpu_impl.h" namespace op::ceil::cpu { -Descriptor::~Descriptor() = default; +ELEMENTWISE_CPU_IMPL_UNARY(ceil) -infiniStatus_t Descriptor::create( - infiniopHandle_t handle_, - Descriptor **desc_ptr, - infiniopTensorDescriptor_t out_desc, - std::vector input_desc_vec) { - - auto handle = reinterpret_cast(handle_); - auto dtype = out_desc->dtype(); - - const auto &x_desc = input_desc_vec.at(0); - const auto &y_shape = out_desc->shape(); - const auto &x_shape = x_desc->shape(); - - CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32); - - CHECK_SAME_SHAPE(y_shape, x_shape); - - // create CPU elementwise descriptor - CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec); - - return INFINI_STATUS_SUCCESS; -} - -infiniStatus_t Descriptor::calculate( - void *workspace, - size_t workspace_size, - void *output, - std::vector inputs, - void *stream) const { - - switch (_dtype) { - case INFINI_DTYPE_F16: - return 
_device_info->calculate(_info, output, inputs, stream); - case INFINI_DTYPE_F32: - return _device_info->calculate(_info, output, inputs, stream); - default: - return INFINI_STATUS_BAD_TENSOR_DTYPE; - } - - return INFINI_STATUS_SUCCESS; -} } // namespace op::ceil::cpu diff --git a/src/infiniop/ops/ceil/cpu/ceil_cpu.h b/src/infiniop/ops/ceil/cpu/ceil_cpu.h index c3ca8e441..423c784cc 100644 --- a/src/infiniop/ops/ceil/cpu/ceil_cpu.h +++ b/src/infiniop/ops/ceil/cpu/ceil_cpu.h @@ -1,26 +1,9 @@ #ifndef __CEIL_CPU_H__ #define __CEIL_CPU_H__ -#include - #include "../../../elementwise/cpu/elementwise_cpu.h" +#include "../../../elementwise/unary.h" -ELEMENTWISE_DESCRIPTOR(ceil, cpu) - -namespace op::ceil::cpu { -typedef struct CeilOp { -public: - static constexpr size_t num_inputs = 1; - - template - T operator()(const T &x) const { - if constexpr (std::is_integral_v) { - return x; - } else { - return std::ceil(x); - } - } -} CeilOp; -} // namespace op::ceil::cpu +UNARY_ELEMENTWISE_DESCRIPTOR(ceil, cpu, op::elementwise::unary::UnaryMode::Ceil) #endif // __CEIL_CPU_H__ diff --git a/src/infiniop/ops/ceil/cuda/kernel.cuh b/src/infiniop/ops/ceil/cuda/kernel.cuh index a2d2e7fb5..1d30a42eb 100644 --- a/src/infiniop/ops/ceil/cuda/kernel.cuh +++ b/src/infiniop/ops/ceil/cuda/kernel.cuh @@ -1,34 +1,10 @@ #ifndef __CEIL_CUDA_H__ #define __CEIL_CUDA_H__ -#include "../../../elementwise/nvidia/elementwise_nvidia.cuh" -#include +#include "../../../elementwise/unary.h" namespace op::ceil::cuda { -typedef struct CeilOp { -public: - static constexpr size_t num_inputs = 1; - template - __device__ __forceinline__ T operator()(const T &x) const { - if constexpr (std::is_same_v) { - return h2ceil(x); - } else if constexpr (std::is_same_v) { - return hceil(x); - } else if constexpr (std::is_same_v) { - float x0 = __bfloat162float(__low2bfloat16(x)); - float x1 = __bfloat162float(__high2bfloat16(x)); - return __floats2bfloat162_rn(ceilf(x0), ceilf(x1)); - } else if constexpr (std::is_same_v) { - return __float2bfloat16_rn(ceilf(__bfloat162float(x))); - } else if constexpr (std::is_same_v) { - return ceilf(x); - } else if constexpr (std::is_integral_v) { - return x; - } else { - return std::ceil(x); - } - } -} CeilOp; +using Op = op::elementwise::unary::cuda::UnaryOp; } // namespace op::ceil::cuda #endif // __CEIL_CUDA_H__ diff --git a/src/infiniop/ops/ceil/nvidia/ceil_nvidia.cu b/src/infiniop/ops/ceil/nvidia/ceil_nvidia.cu index c7ad2ee5b..88ee35be8 100644 --- a/src/infiniop/ops/ceil/nvidia/ceil_nvidia.cu +++ b/src/infiniop/ops/ceil/nvidia/ceil_nvidia.cu @@ -1,54 +1,10 @@ -#include "../../../elementwise/nvidia/elementwise_nvidia.cuh" +#include "../../../elementwise/nvidia/elementwise_nvidia_impl.cuh" #include "../cuda/kernel.cuh" #include "ceil_nvidia.cuh" namespace op::ceil::nvidia { -Descriptor::~Descriptor() = default; +ELEMENTWISE_NVIDIA_IMPL_UNARY(ceil) -infiniStatus_t Descriptor::create( - infiniopHandle_t handle_, - Descriptor **desc_ptr, - infiniopTensorDescriptor_t out_desc, - std::vector input_desc_vec) { - - auto handle = reinterpret_cast(handle_); - auto dtype = out_desc->dtype(); - - const auto &x_desc = input_desc_vec.at(0); - const auto &y_shape = out_desc->shape(); - const auto &x_shape = x_desc->shape(); - - CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32); - - CHECK_SAME_SHAPE(y_shape, x_shape); - - // create CUDA elementwise descriptor - CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec) - - return INFINI_STATUS_SUCCESS; -} - -infiniStatus_t Descriptor::calculate( - void 
*workspace, - size_t workspace_size, - void *output, - std::vector inputs, - void *stream) const { - if (workspace_size < _workspace_size) { - return INFINI_STATUS_INSUFFICIENT_WORKSPACE; - } - - switch (_dtype) { - case INFINI_DTYPE_F16: - return _device_info->calculate<256, cuda::CeilOp, half>(_info, workspace, output, inputs, stream); - case INFINI_DTYPE_F32: - return _device_info->calculate<256, cuda::CeilOp, float>(_info, workspace, output, inputs, stream); - default: - return INFINI_STATUS_BAD_TENSOR_DTYPE; - } - - return INFINI_STATUS_SUCCESS; -} } // namespace op::ceil::nvidia diff --git a/src/infiniop/ops/ceil/operator.cc b/src/infiniop/ops/ceil/operator.cc index 4e5ee7800..dbe591043 100644 --- a/src/infiniop/ops/ceil/operator.cc +++ b/src/infiniop/ops/ceil/operator.cc @@ -1,5 +1,4 @@ -#include "../../operator.h" -#include "../../handle.h" +#include "../../operator_impl.h" #include "infiniop/ops/ceil.h" #ifdef ENABLE_CPU_API @@ -9,131 +8,4 @@ #include "nvidia/ceil_nvidia.cuh" #endif -__C infiniStatus_t infiniopCreateCeilDescriptor( - infiniopHandle_t handle, - infiniopCeilDescriptor_t *desc_ptr, - infiniopTensorDescriptor_t y_desc, - infiniopTensorDescriptor_t x_desc) { - -#define CREATE(CASE, NAMESPACE) \ - case CASE: \ - return op::ceil::NAMESPACE::Descriptor::create( \ - handle, \ - reinterpret_cast(desc_ptr), \ - y_desc, \ - {x_desc}) - - switch (handle->device) { - -#ifdef ENABLE_CPU_API - CREATE(INFINI_DEVICE_CPU, cpu); -#endif -#ifdef ENABLE_NVIDIA_API - CREATE(INFINI_DEVICE_NVIDIA, nvidia); -#endif -#ifdef ENABLE_ILUVATAR_API - CREATE(INFINI_DEVICE_ILUVATAR, nvidia); -#endif -#ifdef ENABLE_QY_API - CREATE(INFINI_DEVICE_QY, nvidia); -#endif - - default: - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; - } - -#undef CREATE -} - -__C infiniStatus_t infiniopGetCeilWorkspaceSize(infiniopCeilDescriptor_t desc, size_t *size) { - -#define GET(CASE, NAMESPACE) \ - case CASE: \ - *size = reinterpret_cast(desc)->workspaceSize(); \ - return INFINI_STATUS_SUCCESS; - - switch (desc->device_type) { -#ifdef ENABLE_CPU_API - GET(INFINI_DEVICE_CPU, cpu) -#endif -#ifdef ENABLE_NVIDIA_API - GET(INFINI_DEVICE_NVIDIA, nvidia) -#endif -#ifdef ENABLE_ILUVATAR_API - GET(INFINI_DEVICE_ILUVATAR, nvidia) -#endif -#ifdef ENABLE_QY_API - GET(INFINI_DEVICE_QY, nvidia) -#endif - default: - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; - } -#undef GET - - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; -} - -__C infiniStatus_t infiniopCeil( - infiniopCeilDescriptor_t desc, - void *workspace, - size_t workspace_size, - void *y, - const void *x, - void *stream) { - -#define CALCULATE(CASE, NAMESPACE) \ - case CASE: \ - return reinterpret_cast(desc) \ - ->calculate(workspace, workspace_size, y, {x}, stream) - - switch (desc->device_type) { - -#ifdef ENABLE_CPU_API - CALCULATE(INFINI_DEVICE_CPU, cpu); -#endif -#ifdef ENABLE_NVIDIA_API - CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); -#endif -#ifdef ENABLE_ILUVATAR_API - CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); -#endif -#ifdef ENABLE_QY_API - CALCULATE(INFINI_DEVICE_QY, nvidia); -#endif - - default: - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; - } - -#undef CALCULATE -} - -__C infiniStatus_t -infiniopDestroyCeilDescriptor(infiniopCeilDescriptor_t desc) { - -#define DELETE(CASE, NAMESPACE) \ - case CASE: \ - delete reinterpret_cast(desc); \ - return INFINI_STATUS_SUCCESS - - switch (desc->device_type) { - -#ifdef ENABLE_CPU_API - DELETE(INFINI_DEVICE_CPU, cpu); -#endif -#ifdef ENABLE_NVIDIA_API - DELETE(INFINI_DEVICE_NVIDIA, nvidia); -#endif -#ifdef 
ENABLE_ILUVATAR_API
-    DELETE(INFINI_DEVICE_ILUVATAR, nvidia);
-#endif
-#ifdef ENABLE_QY_API
-    DELETE(INFINI_DEVICE_QY, nvidia);
-#endif
-
-    default:
-        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
-    }
-
-#undef DELETE
-}
+UNARY_OP_IMPL(ceil, Ceil)
diff --git a/src/infiniop/ops/cos/cpu/cos_cpu.cc b/src/infiniop/ops/cos/cpu/cos_cpu.cc
index 9dc68d327..19ef002cf 100644
--- a/src/infiniop/ops/cos/cpu/cos_cpu.cc
+++ b/src/infiniop/ops/cos/cpu/cos_cpu.cc
@@ -1,48 +1,8 @@
 #include "cos_cpu.h"
+#include "../../../elementwise/cpu/elementwise_cpu_impl.h"
 
 namespace op::cos::cpu {
-Descriptor::~Descriptor() = default;
+ELEMENTWISE_CPU_IMPL_UNARY(cos)
 
-infiniStatus_t Descriptor::create(
-    infiniopHandle_t handle_,
-    Descriptor **desc_ptr,
-    infiniopTensorDescriptor_t out_desc,
-    std::vector<infiniopTensorDescriptor_t> input_desc_vec) {
-
-    auto handle = reinterpret_cast<device::cpu::Handle *>(handle_);
-    auto dtype = out_desc->dtype();
-
-    const auto &x_desc = input_desc_vec.at(0);
-    const auto &y_shape = out_desc->shape();
-    const auto &x_shape = x_desc->shape();
-
-    CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32);
-
-    CHECK_SAME_SHAPE(y_shape, x_shape);
-
-    // create CPU elementwise descriptor
-    CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec);
-
-    return INFINI_STATUS_SUCCESS;
-}
-
-infiniStatus_t Descriptor::calculate(
-    void *workspace,
-    size_t workspace_size,
-    void *output,
-    std::vector<const void *> inputs,
-    void *stream) const {
-
-    switch (_dtype) {
-    case INFINI_DTYPE_F16:
-        return _device_info->calculate<CosOp, fp16_t>(_info, output, inputs, stream);
-    case INFINI_DTYPE_F32:
-        return _device_info->calculate<CosOp, float>(_info, output, inputs, stream);
-    default:
-        return INFINI_STATUS_BAD_TENSOR_DTYPE;
-    }
-
-    return INFINI_STATUS_SUCCESS;
-}
 } // namespace op::cos::cpu
diff --git a/src/infiniop/ops/cos/cpu/cos_cpu.h b/src/infiniop/ops/cos/cpu/cos_cpu.h
index 9b4236fc2..d62aa91b8 100644
--- a/src/infiniop/ops/cos/cpu/cos_cpu.h
+++ b/src/infiniop/ops/cos/cpu/cos_cpu.h
@@ -1,22 +1,9 @@
 #ifndef __COS_CPU_H__
 #define __COS_CPU_H__
 
-#include <cmath>
-
 #include "../../../elementwise/cpu/elementwise_cpu.h"
+#include "../../../elementwise/unary.h"
 
-ELEMENTWISE_DESCRIPTOR(cos, cpu)
-
-namespace op::cos::cpu {
-typedef struct CosOp {
-public:
-    static constexpr size_t num_inputs = 1;
-
-    template <typename T>
-    T operator()(const T &x) const {
-        return std::cos(x);
-    }
-} CosOp;
-} // namespace op::cos::cpu
+UNARY_ELEMENTWISE_DESCRIPTOR(cos, cpu, op::elementwise::unary::UnaryMode::Cos)
 
 #endif // __COS_CPU_H__
diff --git a/src/infiniop/ops/cos/cuda/kernel.cuh b/src/infiniop/ops/cos/cuda/kernel.cuh
index b0dabb340..57fe4f50e 100644
--- a/src/infiniop/ops/cos/cuda/kernel.cuh
+++ b/src/infiniop/ops/cos/cuda/kernel.cuh
@@ -1,32 +1,10 @@
 #ifndef __COS_CUDA_H__
 #define __COS_CUDA_H__
 
-#include "../../../elementwise/nvidia/elementwise_nvidia.cuh"
-#include <cuda_bf16.h>
+#include "../../../elementwise/unary.h"
 
 namespace op::cos::cuda {
-typedef struct CosOp {
-public:
-    static constexpr size_t num_inputs = 1;
-    template <typename T>
-    __device__ __forceinline__ T operator()(const T &x) const {
-        if constexpr (std::is_same_v<T, half2>) {
-            return h2cos(x);
-        } else if constexpr (std::is_same_v<T, half>) {
-            return hcos(x);
-        } else if constexpr (std::is_same_v<T, cuda_bfloat162>) {
-            float x0 = __bfloat162float(__low2bfloat16(x));
-            float x1 = __bfloat162float(__high2bfloat16(x));
-            return __floats2bfloat162_rn(cosf(x0), cosf(x1));
-        } else if constexpr (std::is_same_v<T, cuda_bfloat16>) {
-            return __float2bfloat16_rn(cosf(__bfloat162float(x)));
-        } else if constexpr (std::is_same_v<T, float>) {
-            return __cosf(x);
-        } else {
-            return std::cos(x);
-        }
-    }
-} CosOp;
+using Op 
= op::elementwise::unary::cuda::UnaryOp; } // namespace op::cos::cuda #endif // __COS_CUDA_H__ diff --git a/src/infiniop/ops/cos/nvidia/cos_nvidia.cu b/src/infiniop/ops/cos/nvidia/cos_nvidia.cu index 044c59ca0..5da3c02e8 100644 --- a/src/infiniop/ops/cos/nvidia/cos_nvidia.cu +++ b/src/infiniop/ops/cos/nvidia/cos_nvidia.cu @@ -1,54 +1,10 @@ -#include "../../../elementwise/nvidia/elementwise_nvidia.cuh" +#include "../../../elementwise/nvidia/elementwise_nvidia_impl.cuh" #include "../cuda/kernel.cuh" #include "cos_nvidia.cuh" namespace op::cos::nvidia { -Descriptor::~Descriptor() = default; +ELEMENTWISE_NVIDIA_IMPL_UNARY(cos) -infiniStatus_t Descriptor::create( - infiniopHandle_t handle_, - Descriptor **desc_ptr, - infiniopTensorDescriptor_t out_desc, - std::vector input_desc_vec) { - - auto handle = reinterpret_cast(handle_); - auto dtype = out_desc->dtype(); - - const auto &x_desc = input_desc_vec.at(0); - const auto &y_shape = out_desc->shape(); - const auto &x_shape = x_desc->shape(); - - CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32); - - CHECK_SAME_SHAPE(y_shape, x_shape); - - // create CUDA elementwise descriptor - CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec) - - return INFINI_STATUS_SUCCESS; -} - -infiniStatus_t Descriptor::calculate( - void *workspace, - size_t workspace_size, - void *output, - std::vector inputs, - void *stream) const { - if (workspace_size < _workspace_size) { - return INFINI_STATUS_INSUFFICIENT_WORKSPACE; - } - - switch (_dtype) { - case INFINI_DTYPE_F16: - return _device_info->calculate<256, cuda::CosOp, half>(_info, workspace, output, inputs, stream); - case INFINI_DTYPE_F32: - return _device_info->calculate<256, cuda::CosOp, float>(_info, workspace, output, inputs, stream); - default: - return INFINI_STATUS_BAD_TENSOR_DTYPE; - } - - return INFINI_STATUS_SUCCESS; -} } // namespace op::cos::nvidia diff --git a/src/infiniop/ops/cos/operator.cc b/src/infiniop/ops/cos/operator.cc index 5c464ad60..1531c6caa 100644 --- a/src/infiniop/ops/cos/operator.cc +++ b/src/infiniop/ops/cos/operator.cc @@ -1,5 +1,4 @@ -#include "../../operator.h" -#include "../../handle.h" +#include "../../operator_impl.h" #include "infiniop/ops/cos.h" #ifdef ENABLE_CPU_API @@ -9,131 +8,4 @@ #include "nvidia/cos_nvidia.cuh" #endif -__C infiniStatus_t infiniopCreateCosDescriptor( - infiniopHandle_t handle, - infiniopCosDescriptor_t *desc_ptr, - infiniopTensorDescriptor_t y_desc, - infiniopTensorDescriptor_t x_desc) { - -#define CREATE(CASE, NAMESPACE) \ - case CASE: \ - return op::cos::NAMESPACE::Descriptor::create( \ - handle, \ - reinterpret_cast(desc_ptr), \ - y_desc, \ - {x_desc}) - - switch (handle->device) { - -#ifdef ENABLE_CPU_API - CREATE(INFINI_DEVICE_CPU, cpu); -#endif -#ifdef ENABLE_NVIDIA_API - CREATE(INFINI_DEVICE_NVIDIA, nvidia); -#endif -#ifdef ENABLE_ILUVATAR_API - CREATE(INFINI_DEVICE_ILUVATAR, nvidia); -#endif -#ifdef ENABLE_QY_API - CREATE(INFINI_DEVICE_QY, nvidia); -#endif - - default: - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; - } - -#undef CREATE -} - -__C infiniStatus_t infiniopGetCosWorkspaceSize(infiniopCosDescriptor_t desc, size_t *size) { - -#define GET(CASE, NAMESPACE) \ - case CASE: \ - *size = reinterpret_cast(desc)->workspaceSize(); \ - return INFINI_STATUS_SUCCESS; - - switch (desc->device_type) { -#ifdef ENABLE_CPU_API - GET(INFINI_DEVICE_CPU, cpu) -#endif -#ifdef ENABLE_NVIDIA_API - GET(INFINI_DEVICE_NVIDIA, nvidia) -#endif -#ifdef ENABLE_ILUVATAR_API - GET(INFINI_DEVICE_ILUVATAR, nvidia) -#endif -#ifdef 
ENABLE_QY_API - GET(INFINI_DEVICE_QY, nvidia) -#endif - default: - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; - } -#undef GET - - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; -} - -__C infiniStatus_t infiniopCos( - infiniopCosDescriptor_t desc, - void *workspace, - size_t workspace_size, - void *y, - const void *x, - void *stream) { - -#define CALCULATE(CASE, NAMESPACE) \ - case CASE: \ - return reinterpret_cast(desc) \ - ->calculate(workspace, workspace_size, y, {x}, stream) - - switch (desc->device_type) { - -#ifdef ENABLE_CPU_API - CALCULATE(INFINI_DEVICE_CPU, cpu); -#endif -#ifdef ENABLE_NVIDIA_API - CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); -#endif -#ifdef ENABLE_ILUVATAR_API - CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); -#endif -#ifdef ENABLE_QY_API - CALCULATE(INFINI_DEVICE_QY, nvidia); -#endif - - default: - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; - } - -#undef CALCULATE -} - -__C infiniStatus_t -infiniopDestroyCosDescriptor(infiniopCosDescriptor_t desc) { - -#define DELETE(CASE, NAMESPACE) \ - case CASE: \ - delete reinterpret_cast(desc); \ - return INFINI_STATUS_SUCCESS - - switch (desc->device_type) { - -#ifdef ENABLE_CPU_API - DELETE(INFINI_DEVICE_CPU, cpu); -#endif -#ifdef ENABLE_NVIDIA_API - DELETE(INFINI_DEVICE_NVIDIA, nvidia); -#endif -#ifdef ENABLE_ILUVATAR_API - DELETE(INFINI_DEVICE_ILUVATAR, nvidia); -#endif -#ifdef ENABLE_QY_API - DELETE(INFINI_DEVICE_QY, nvidia); -#endif - - default: - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; - } - -#undef DELETE -} +UNARY_OP_IMPL(cos, Cos) diff --git a/src/infiniop/ops/cosh/cpu/cosh_cpu.cc b/src/infiniop/ops/cosh/cpu/cosh_cpu.cc index 9ed8e33da..e7b2a6dad 100644 --- a/src/infiniop/ops/cosh/cpu/cosh_cpu.cc +++ b/src/infiniop/ops/cosh/cpu/cosh_cpu.cc @@ -1,48 +1,8 @@ #include "cosh_cpu.h" +#include "../../../elementwise/cpu/elementwise_cpu_impl.h" namespace op::cosh::cpu { -Descriptor::~Descriptor() = default; +ELEMENTWISE_CPU_IMPL_UNARY(cosh) -infiniStatus_t Descriptor::create( - infiniopHandle_t handle_, - Descriptor **desc_ptr, - infiniopTensorDescriptor_t out_desc, - std::vector input_desc_vec) { - - auto handle = reinterpret_cast(handle_); - auto dtype = out_desc->dtype(); - - const auto &x_desc = input_desc_vec.at(0); - const auto &y_shape = out_desc->shape(); - const auto &x_shape = x_desc->shape(); - - CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32); - - CHECK_SAME_SHAPE(y_shape, x_shape); - - // create CPU elementwise descriptor - CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec); - - return INFINI_STATUS_SUCCESS; -} - -infiniStatus_t Descriptor::calculate( - void *workspace, - size_t workspace_size, - void *output, - std::vector inputs, - void *stream) const { - - switch (_dtype) { - case INFINI_DTYPE_F16: - return _device_info->calculate(_info, output, inputs, stream); - case INFINI_DTYPE_F32: - return _device_info->calculate(_info, output, inputs, stream); - default: - return INFINI_STATUS_BAD_TENSOR_DTYPE; - } - - return INFINI_STATUS_SUCCESS; -} } // namespace op::cosh::cpu diff --git a/src/infiniop/ops/cosh/cpu/cosh_cpu.h b/src/infiniop/ops/cosh/cpu/cosh_cpu.h index aea359ef2..c789d38ea 100644 --- a/src/infiniop/ops/cosh/cpu/cosh_cpu.h +++ b/src/infiniop/ops/cosh/cpu/cosh_cpu.h @@ -1,22 +1,9 @@ #ifndef __COSH_CPU_H__ #define __COSH_CPU_H__ -#include - #include "../../../elementwise/cpu/elementwise_cpu.h" +#include "../../../elementwise/unary.h" -ELEMENTWISE_DESCRIPTOR(cosh, cpu) - -namespace op::cosh::cpu { -typedef struct CoshOp { -public: - static constexpr size_t 
num_inputs = 1; - - template - T operator()(const T &x) const { - return std::cosh(x); - } -} CoshOp; -} // namespace op::cosh::cpu +UNARY_ELEMENTWISE_DESCRIPTOR(cosh, cpu, op::elementwise::unary::UnaryMode::Cosh) #endif // __COSH_CPU_H__ diff --git a/src/infiniop/ops/cosh/cuda/kernel.cuh b/src/infiniop/ops/cosh/cuda/kernel.cuh index ce6806433..934bfe12d 100644 --- a/src/infiniop/ops/cosh/cuda/kernel.cuh +++ b/src/infiniop/ops/cosh/cuda/kernel.cuh @@ -1,32 +1,10 @@ #ifndef __COSH_CUDA_H__ #define __COSH_CUDA_H__ -#include -#include +#include "../../../elementwise/unary.h" namespace op::cosh::cuda { -typedef struct CoshOp { -public: - static constexpr size_t num_inputs = 1; - template - __device__ __forceinline__ T operator()(const T &x) const { - if constexpr (std::is_same_v) { - return __floats2half2_rn(coshf(__half2float(__low2half(x))), coshf(__half2float(__high2half(x)))); - } else if constexpr (std::is_same_v) { - return __float2half(coshf(__half2float(x))); - } else if constexpr (std::is_same_v) { - float x0 = __bfloat162float(__low2bfloat16(x)); - float x1 = __bfloat162float(__high2bfloat16(x)); - return __floats2bfloat162_rn(coshf(x0), coshf(x1)); - } else if constexpr (std::is_same_v) { - return __float2bfloat16_rn(coshf(__bfloat162float(x))); - } else if constexpr (std::is_same_v) { - return coshf(x); - } else { - return std::cosh(x); - } - } -} CoshOp; +using Op = op::elementwise::unary::cuda::UnaryOp; } // namespace op::cosh::cuda #endif // __COSH_CUDA_H__ diff --git a/src/infiniop/ops/cosh/nvidia/cosh_nvidia.cu b/src/infiniop/ops/cosh/nvidia/cosh_nvidia.cu index a5e1442ce..038b0373e 100644 --- a/src/infiniop/ops/cosh/nvidia/cosh_nvidia.cu +++ b/src/infiniop/ops/cosh/nvidia/cosh_nvidia.cu @@ -1,54 +1,10 @@ -#include "../../../elementwise/nvidia/elementwise_nvidia.cuh" +#include "../../../elementwise/nvidia/elementwise_nvidia_impl.cuh" #include "../cuda/kernel.cuh" #include "cosh_nvidia.cuh" namespace op::cosh::nvidia { -Descriptor::~Descriptor() = default; +ELEMENTWISE_NVIDIA_IMPL_UNARY(cosh) -infiniStatus_t Descriptor::create( - infiniopHandle_t handle_, - Descriptor **desc_ptr, - infiniopTensorDescriptor_t out_desc, - std::vector input_desc_vec) { - - auto handle = reinterpret_cast(handle_); - auto dtype = out_desc->dtype(); - - const auto &x_desc = input_desc_vec.at(0); - const auto &y_shape = out_desc->shape(); - const auto &x_shape = x_desc->shape(); - - CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32); - - CHECK_SAME_SHAPE(y_shape, x_shape); - - // create CUDA elementwise descriptor - CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec) - - return INFINI_STATUS_SUCCESS; -} - -infiniStatus_t Descriptor::calculate( - void *workspace, - size_t workspace_size, - void *output, - std::vector inputs, - void *stream) const { - if (workspace_size < _workspace_size) { - return INFINI_STATUS_INSUFFICIENT_WORKSPACE; - } - - switch (_dtype) { - case INFINI_DTYPE_F16: - return _device_info->calculate<256, cuda::CoshOp, half>(_info, workspace, output, inputs, stream); - case INFINI_DTYPE_F32: - return _device_info->calculate<256, cuda::CoshOp, float>(_info, workspace, output, inputs, stream); - default: - return INFINI_STATUS_BAD_TENSOR_DTYPE; - } - - return INFINI_STATUS_SUCCESS; -} } // namespace op::cosh::nvidia diff --git a/src/infiniop/ops/cosh/operator.cc b/src/infiniop/ops/cosh/operator.cc index 75aac0c91..9b18b47ee 100644 --- a/src/infiniop/ops/cosh/operator.cc +++ b/src/infiniop/ops/cosh/operator.cc @@ -1,5 +1,4 @@ -#include "../../operator.h" 
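[Note on the shared unary machinery: the per-operator functors removed throughout this patch collapse into one mode-parameterized functor shared by every unary elementwise operator. Each CPU header keeps only a UNARY_ELEMENTWISE_DESCRIPTOR(name, cpu, UnaryMode::X) binding, each backend source keeps only an ELEMENTWISE_*_IMPL_UNARY(name) expansion, and each CUDA kernel header keeps only a using Op alias. The shared header elementwise/unary.h is not part of this section, so the following is a minimal sketch of the shape it plausibly takes; UnaryMode and cuda::UnaryOp are taken from the hunks above, and everything else is illustrative.

// Sketch of src/infiniop/elementwise/unary.h (assumed layout, not the real header).
namespace op::elementwise::unary {

// One enumerator per unary operator; the descriptor macros bind an
// operator directory to one of these modes.
enum class UnaryMode { Atan, Atanh, Ceil, Cos, Cosh, Erf, Floor, Log /* ... */ };

namespace cuda {
// Mode-parameterized device functor; the real header presumably also
// branches on T (half2, cuda_bfloat162, ...) the way the removed
// per-operator functors did.
template <UnaryMode Mode>
struct UnaryOp {
    static constexpr size_t num_inputs = 1;

    template <typename T>
    __device__ __forceinline__ T operator()(const T &x) const {
        if constexpr (Mode == UnaryMode::Cos) {
            return cosf(x);
        } else if constexpr (Mode == UnaryMode::Floor) {
            return floorf(x);
        } else {
            return x; // placeholder; the real header implements every mode
        }
    }
};
} // namespace cuda

} // namespace op::elementwise::unary

Adding an operator under this scheme means one new enumerator plus one dispatch branch, instead of a hand-written functor and four C entry points per backend.]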
-#include "../../handle.h" +#include "../../operator_impl.h" #include "infiniop/ops/cosh.h" #ifdef ENABLE_CPU_API @@ -9,131 +8,4 @@ #include "nvidia/cosh_nvidia.cuh" #endif -__C infiniStatus_t infiniopCreateCoshDescriptor( - infiniopHandle_t handle, - infiniopCoshDescriptor_t *desc_ptr, - infiniopTensorDescriptor_t y_desc, - infiniopTensorDescriptor_t x_desc) { - -#define CREATE(CASE, NAMESPACE) \ - case CASE: \ - return op::cosh::NAMESPACE::Descriptor::create( \ - handle, \ - reinterpret_cast(desc_ptr), \ - y_desc, \ - {x_desc}) - - switch (handle->device) { - -#ifdef ENABLE_CPU_API - CREATE(INFINI_DEVICE_CPU, cpu); -#endif -#ifdef ENABLE_NVIDIA_API - CREATE(INFINI_DEVICE_NVIDIA, nvidia); -#endif -#ifdef ENABLE_ILUVATAR_API - CREATE(INFINI_DEVICE_ILUVATAR, nvidia); -#endif -#ifdef ENABLE_QY_API - CREATE(INFINI_DEVICE_QY, nvidia); -#endif - - default: - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; - } - -#undef CREATE -} - -__C infiniStatus_t infiniopGetCoshWorkspaceSize(infiniopCoshDescriptor_t desc, size_t *size) { - -#define GET(CASE, NAMESPACE) \ - case CASE: \ - *size = reinterpret_cast(desc)->workspaceSize(); \ - return INFINI_STATUS_SUCCESS; - - switch (desc->device_type) { -#ifdef ENABLE_CPU_API - GET(INFINI_DEVICE_CPU, cpu) -#endif -#ifdef ENABLE_NVIDIA_API - GET(INFINI_DEVICE_NVIDIA, nvidia) -#endif -#ifdef ENABLE_ILUVATAR_API - GET(INFINI_DEVICE_ILUVATAR, nvidia) -#endif -#ifdef ENABLE_QY_API - GET(INFINI_DEVICE_QY, nvidia) -#endif - default: - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; - } -#undef GET - - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; -} - -__C infiniStatus_t infiniopCosh( - infiniopCoshDescriptor_t desc, - void *workspace, - size_t workspace_size, - void *y, - const void *x, - void *stream) { - -#define CALCULATE(CASE, NAMESPACE) \ - case CASE: \ - return reinterpret_cast(desc) \ - ->calculate(workspace, workspace_size, y, {x}, stream) - - switch (desc->device_type) { - -#ifdef ENABLE_CPU_API - CALCULATE(INFINI_DEVICE_CPU, cpu); -#endif -#ifdef ENABLE_NVIDIA_API - CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); -#endif -#ifdef ENABLE_ILUVATAR_API - CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); -#endif -#ifdef ENABLE_QY_API - CALCULATE(INFINI_DEVICE_QY, nvidia); -#endif - - default: - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; - } - -#undef CALCULATE -} - -__C infiniStatus_t -infiniopDestroyCoshDescriptor(infiniopCoshDescriptor_t desc) { - -#define DELETE(CASE, NAMESPACE) \ - case CASE: \ - delete reinterpret_cast(desc); \ - return INFINI_STATUS_SUCCESS - - switch (desc->device_type) { - -#ifdef ENABLE_CPU_API - DELETE(INFINI_DEVICE_CPU, cpu); -#endif -#ifdef ENABLE_NVIDIA_API - DELETE(INFINI_DEVICE_NVIDIA, nvidia); -#endif -#ifdef ENABLE_ILUVATAR_API - DELETE(INFINI_DEVICE_ILUVATAR, nvidia); -#endif -#ifdef ENABLE_QY_API - DELETE(INFINI_DEVICE_QY, nvidia); -#endif - - default: - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; - } - -#undef DELETE -} +UNARY_OP_IMPL(cosh, Cosh) diff --git a/src/infiniop/ops/div/cpu/div_cpu.cc b/src/infiniop/ops/div/cpu/div_cpu.cc index 19e222031..6d150070c 100644 --- a/src/infiniop/ops/div/cpu/div_cpu.cc +++ b/src/infiniop/ops/div/cpu/div_cpu.cc @@ -1,50 +1,8 @@ #include "div_cpu.h" +#include "../../../elementwise/cpu/elementwise_cpu_impl.h" namespace op::div::cpu { -Descriptor::~Descriptor() = default; +ELEMENTWISE_CPU_IMPL_BINARY(div) -infiniStatus_t Descriptor::create( - infiniopHandle_t handle_, - Descriptor **desc_ptr, - infiniopTensorDescriptor_t out_desc, - std::vector input_desc_vec) { - - auto handle = 
reinterpret_cast<device::cpu::Handle *>(handle_);
-    auto dtype = out_desc->dtype();
-
-    const auto &a_desc = input_desc_vec.at(0);
-    const auto &b_desc = input_desc_vec.at(1);
-    const auto &c_shape = out_desc->shape();
-    const auto &a_shape = a_desc->shape();
-    const auto &b_shape = b_desc->shape();
-
-    CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32);
-
-    CHECK_SAME_SHAPE(c_shape, a_shape, b_shape);
-
-    // create CPU elementwise descriptor
-    CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec);
-
-    return INFINI_STATUS_SUCCESS;
-}
-
-infiniStatus_t Descriptor::calculate(
-    void *workspace,
-    size_t workspace_size,
-    void *output,
-    std::vector<const void *> inputs,
-    void *stream) const {
-
-    switch (_dtype) {
-    case INFINI_DTYPE_F16:
-        return _device_info->calculate<DivOp, fp16_t>(_info, output, inputs, stream);
-    case INFINI_DTYPE_F32:
-        return _device_info->calculate<DivOp, float>(_info, output, inputs, stream);
-    default:
-        return INFINI_STATUS_BAD_TENSOR_DTYPE;
-    }
-
-    return INFINI_STATUS_SUCCESS;
-}
 } // namespace op::div::cpu
diff --git a/src/infiniop/ops/div/cpu/div_cpu.h b/src/infiniop/ops/div/cpu/div_cpu.h
index 0373b766f..ad76e7ef1 100644
--- a/src/infiniop/ops/div/cpu/div_cpu.h
+++ b/src/infiniop/ops/div/cpu/div_cpu.h
@@ -1,19 +1,9 @@
 #ifndef __DIV_CPU_H__
 #define __DIV_CPU_H__
 
+#include "../../../elementwise/binary.h"
 #include "../../../elementwise/cpu/elementwise_cpu.h"
 
-ELEMENTWISE_DESCRIPTOR(div, cpu)
-
-namespace op::div::cpu {
-typedef struct DivOp {
-public:
-    static constexpr size_t num_inputs = 2;
-    template <typename T>
-    T operator()(const T &a, const T &b) const {
-        return a / b;
-    }
-} DivOp;
-} // namespace op::div::cpu
+BINARY_ELEMENTWISE_DESCRIPTOR(div, cpu, op::elementwise::binary::BinaryMode::Divide)
 
 #endif // __DIV_CPU_H__
diff --git a/src/infiniop/ops/div/cuda/kernel.cuh b/src/infiniop/ops/div/cuda/kernel.cuh
index a67993da5..f1ab13152 100644
--- a/src/infiniop/ops/div/cuda/kernel.cuh
+++ b/src/infiniop/ops/div/cuda/kernel.cuh
@@ -1,23 +1,10 @@
 #ifndef __DIV_CUDA_H__
 #define __DIV_CUDA_H__
 
+#include "../../../elementwise/binary.h"
+
 namespace op::div::cuda {
-typedef struct DivOp {
-public:
-    static constexpr size_t num_inputs = 2;
-    template <typename T>
-    __device__ __forceinline__ T operator()(const T &a, const T &b) const {
-        if constexpr (std::is_same_v<T, half2>) {
-            return __h2div(a, b);
-        } else if constexpr (std::is_same_v<T, half> || std::is_same_v<T, cuda_bfloat16>) {
-            return a / b;
-        } else if constexpr (std::is_same_v<T, float>) {
-            return __fdividef(a, b);
-        } else {
-            return a / b;
-        }
-    }
-} DivOp;
+using Op = op::elementwise::binary::cuda::BinaryOp<op::elementwise::binary::BinaryMode::Divide>;
 } // namespace op::div::cuda
 
 #endif // __DIV_CUDA_H__
diff --git a/src/infiniop/ops/div/nvidia/div_nvidia.cu b/src/infiniop/ops/div/nvidia/div_nvidia.cu
index 1abffe816..8aaba09b4 100644
--- a/src/infiniop/ops/div/nvidia/div_nvidia.cu
+++ b/src/infiniop/ops/div/nvidia/div_nvidia.cu
@@ -1,57 +1,10 @@
-#include "../../../elementwise/nvidia/elementwise_nvidia.cuh"
+#include "../../../elementwise/nvidia/elementwise_nvidia_impl.cuh"
 #include "../cuda/kernel.cuh"
 #include "div_nvidia.cuh"
 
 namespace op::div::nvidia {
-Descriptor::~Descriptor() = default;
+ELEMENTWISE_NVIDIA_IMPL_BINARY(div)
 
-infiniStatus_t Descriptor::create(
-    infiniopHandle_t handle_,
-    Descriptor **desc_ptr,
-    infiniopTensorDescriptor_t out_desc,
-    std::vector<infiniopTensorDescriptor_t> input_desc_vec) {
-
-    auto handle = reinterpret_cast<device::nvidia::Handle *>(handle_);
-    auto dtype = out_desc->dtype();
-
-    const auto &a_desc = input_desc_vec.at(0);
-    const auto &b_desc = input_desc_vec.at(1);
-    const auto &c_shape = out_desc->shape();
-    const auto &a_shape = a_desc->shape();
-    const 
auto &b_shape = b_desc->shape(); - - CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32); - - CHECK_SAME_SHAPE(c_shape, a_shape, b_shape); - - // create CUDA elementwise descriptor - CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec) - - return INFINI_STATUS_SUCCESS; -} - -infiniStatus_t Descriptor::calculate( - void *workspace, - size_t workspace_size, - void *output, - std::vector inputs, - void *stream) const { - - if (workspace_size < _workspace_size) { - return INFINI_STATUS_INSUFFICIENT_WORKSPACE; - } - - switch (_dtype) { - case INFINI_DTYPE_F16: - return _device_info->calculate<256, cuda::DivOp, half>(_info, workspace, output, inputs, stream); - case INFINI_DTYPE_F32: - return _device_info->calculate<256, cuda::DivOp, float>(_info, workspace, output, inputs, stream); - default: - return INFINI_STATUS_BAD_TENSOR_DTYPE; - } - - return INFINI_STATUS_SUCCESS; -} } // namespace op::div::nvidia diff --git a/src/infiniop/ops/div/operator.cc b/src/infiniop/ops/div/operator.cc index 84021a1af..af9d1929a 100644 --- a/src/infiniop/ops/div/operator.cc +++ b/src/infiniop/ops/div/operator.cc @@ -1,5 +1,4 @@ -#include "../../operator.h" -#include "../../handle.h" +#include "../../operator_impl.h" #include "infiniop/ops/div.h" #ifdef ENABLE_CPU_API @@ -8,195 +7,5 @@ #if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API) #include "nvidia/div_nvidia.cuh" #endif -#ifdef ENABLE_METAX_API -#include "metax/div_metax.h" -#endif -#ifdef ENABLE_KUNLUN_API -#include "kunlun/div_kunlun.h" -#endif -#ifdef ENABLE_CAMBRICON_API -#include "bang/div_bang.h" -#endif -#ifdef ENABLE_MOORE_API -#include "moore/div_moore.h" -#endif - -__C infiniStatus_t infiniopCreateDivDescriptor( - infiniopHandle_t handle, - infiniopDivDescriptor_t *desc_ptr, - infiniopTensorDescriptor_t c_desc, - infiniopTensorDescriptor_t a_desc, - infiniopTensorDescriptor_t b_desc) { - -#define CREATE(CASE, NAMESPACE) \ - case CASE: \ - return op::div::NAMESPACE::Descriptor::create( \ - handle, \ - reinterpret_cast(desc_ptr), \ - c_desc, \ - {a_desc, \ - b_desc}) - - switch (handle->device) { - -#ifdef ENABLE_CPU_API - CREATE(INFINI_DEVICE_CPU, cpu); -#endif -#ifdef ENABLE_NVIDIA_API - CREATE(INFINI_DEVICE_NVIDIA, nvidia); -#endif -#ifdef ENABLE_ILUVATAR_API - CREATE(INFINI_DEVICE_ILUVATAR, nvidia); -#endif -#ifdef ENABLE_QY_API - CREATE(INFINI_DEVICE_QY, nvidia); -#endif -#ifdef ENABLE_METAX_API - CREATE(INFINI_DEVICE_METAX, metax); -#endif -#ifdef ENABLE_KUNLUN_API - CREATE(INFINI_DEVICE_KUNLUN, kunlun); -#endif -#ifdef ENABLE_CAMBRICON_API - CREATE(INFINI_DEVICE_CAMBRICON, bang); -#endif -#ifdef ENABLE_MOORE_API - CREATE(INFINI_DEVICE_MOORE, moore); -#endif - - default: - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; - } - -#undef CREATE -} - -__C infiniStatus_t infiniopGetDivWorkspaceSize(infiniopDivDescriptor_t desc, size_t *size) { - -#define GET(CASE, NAMESPACE) \ - case CASE: \ - *size = reinterpret_cast(desc)->workspaceSize(); \ - return INFINI_STATUS_SUCCESS - - switch (desc->device_type) { -#ifdef ENABLE_CPU_API - GET(INFINI_DEVICE_CPU, cpu); -#endif -#ifdef ENABLE_NVIDIA_API - GET(INFINI_DEVICE_NVIDIA, nvidia); -#endif -#ifdef ENABLE_ILUVATAR_API - GET(INFINI_DEVICE_ILUVATAR, nvidia); -#endif -#ifdef ENABLE_QY_API - GET(INFINI_DEVICE_QY, nvidia); -#endif -#ifdef ENABLE_METAX_API - GET(INFINI_DEVICE_METAX, metax); -#endif -#ifdef ENABLE_KUNLUN_API - GET(INFINI_DEVICE_KUNLUN, kunlun); -#endif -#ifdef ENABLE_CAMBRICON_API - GET(INFINI_DEVICE_CAMBRICON, bang); -#endif -#ifdef 
ENABLE_MOORE_API - GET(INFINI_DEVICE_MOORE, moore); -#endif - default: - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; - } -#undef GET - - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; -} - -__C infiniStatus_t infiniopDiv( - infiniopDivDescriptor_t desc, - void *workspace, - size_t workspace_size, - void *c, - const void *a, - const void *b, - void *stream) { - -#define CALCULATE(CASE, NAMESPACE) \ - case CASE: \ - return reinterpret_cast(desc) \ - ->calculate(workspace, workspace_size, c, {a, b}, stream) - - switch (desc->device_type) { - -#ifdef ENABLE_CPU_API - CALCULATE(INFINI_DEVICE_CPU, cpu); -#endif -#ifdef ENABLE_NVIDIA_API - CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); -#endif -#ifdef ENABLE_ILUVATAR_API - CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); -#endif -#ifdef ENABLE_QY_API - CALCULATE(INFINI_DEVICE_QY, nvidia); -#endif -#ifdef ENABLE_METAX_API - CALCULATE(INFINI_DEVICE_METAX, metax); -#endif -#ifdef ENABLE_KUNLUN_API - CALCULATE(INFINI_DEVICE_KUNLUN, kunlun); -#endif -#ifdef ENABLE_CAMBRICON_API - CALCULATE(INFINI_DEVICE_CAMBRICON, bang); -#endif -#ifdef ENABLE_MOORE_API - CALCULATE(INFINI_DEVICE_MOORE, moore); -#endif - - default: - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; - } - -#undef CALCULATE -} - -__C infiniStatus_t -infiniopDestroyDivDescriptor(infiniopDivDescriptor_t desc) { - -#define DELETE(CASE, NAMESPACE) \ - case CASE: \ - delete reinterpret_cast(desc); \ - return INFINI_STATUS_SUCCESS - - switch (desc->device_type) { - -#ifdef ENABLE_CPU_API - DELETE(INFINI_DEVICE_CPU, cpu); -#endif -#ifdef ENABLE_NVIDIA_API - DELETE(INFINI_DEVICE_NVIDIA, nvidia); -#endif -#ifdef ENABLE_ILUVATAR_API - DELETE(INFINI_DEVICE_ILUVATAR, nvidia); -#endif -#ifdef ENABLE_QY_API - DELETE(INFINI_DEVICE_QY, nvidia); -#endif -#ifdef ENABLE_METAX_API - DELETE(INFINI_DEVICE_METAX, metax); -#endif -#ifdef ENABLE_KUNLUN_API - DELETE(INFINI_DEVICE_KUNLUN, kunlun); -#endif -#ifdef ENABLE_CAMBRICON_API - DELETE(INFINI_DEVICE_CAMBRICON, bang); -#endif -#ifdef ENABLE_MOORE_API - DELETE(INFINI_DEVICE_MOORE, moore); -#endif - - default: - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; - } -#undef DELETE -} +BINARY_OP_IMPL(div, Div) diff --git a/src/infiniop/ops/erf/cpu/erf_cpu.cc b/src/infiniop/ops/erf/cpu/erf_cpu.cc index 00b1897d1..d9119c697 100644 --- a/src/infiniop/ops/erf/cpu/erf_cpu.cc +++ b/src/infiniop/ops/erf/cpu/erf_cpu.cc @@ -1,48 +1,8 @@ #include "erf_cpu.h" +#include "../../../elementwise/cpu/elementwise_cpu_impl.h" namespace op::erf::cpu { -Descriptor::~Descriptor() = default; +ELEMENTWISE_CPU_IMPL_UNARY(erf) -infiniStatus_t Descriptor::create( - infiniopHandle_t handle_, - Descriptor **desc_ptr, - infiniopTensorDescriptor_t out_desc, - std::vector input_desc_vec) { - - auto handle = reinterpret_cast(handle_); - auto dtype = out_desc->dtype(); - - const auto &x_desc = input_desc_vec.at(0); - const auto &y_shape = out_desc->shape(); - const auto &x_shape = x_desc->shape(); - - CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32); - - CHECK_SAME_SHAPE(y_shape, x_shape); - - // create CPU elementwise descriptor - CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec); - - return INFINI_STATUS_SUCCESS; -} - -infiniStatus_t Descriptor::calculate( - void *workspace, - size_t workspace_size, - void *output, - std::vector inputs, - void *stream) const { - - switch (_dtype) { - case INFINI_DTYPE_F16: - return _device_info->calculate(_info, output, inputs, stream); - case INFINI_DTYPE_F32: - return _device_info->calculate(_info, output, inputs, stream); - default: - 
return INFINI_STATUS_BAD_TENSOR_DTYPE; - } - - return INFINI_STATUS_SUCCESS; -} } // namespace op::erf::cpu diff --git a/src/infiniop/ops/erf/cpu/erf_cpu.h b/src/infiniop/ops/erf/cpu/erf_cpu.h index c26f519cf..f50cd157d 100644 --- a/src/infiniop/ops/erf/cpu/erf_cpu.h +++ b/src/infiniop/ops/erf/cpu/erf_cpu.h @@ -1,22 +1,9 @@ #ifndef __ERF_CPU_H__ #define __ERF_CPU_H__ -#include - #include "../../../elementwise/cpu/elementwise_cpu.h" +#include "../../../elementwise/unary.h" -ELEMENTWISE_DESCRIPTOR(erf, cpu) - -namespace op::erf::cpu { -typedef struct ErfOp { -public: - static constexpr size_t num_inputs = 1; - - template - T operator()(const T &x) const { - return std::erf(x); - } -} ErfOp; -} // namespace op::erf::cpu +UNARY_ELEMENTWISE_DESCRIPTOR(erf, cpu, op::elementwise::unary::UnaryMode::Erf) #endif // __ERF_CPU_H__ diff --git a/src/infiniop/ops/erf/cuda/kernel.cuh b/src/infiniop/ops/erf/cuda/kernel.cuh index 820c10b19..978890cff 100644 --- a/src/infiniop/ops/erf/cuda/kernel.cuh +++ b/src/infiniop/ops/erf/cuda/kernel.cuh @@ -1,32 +1,10 @@ #ifndef __ERF_CUDA_H__ #define __ERF_CUDA_H__ -#include -#include +#include "../../../elementwise/unary.h" namespace op::erf::cuda { -typedef struct ErfOp { -public: - static constexpr size_t num_inputs = 1; - template - __device__ __forceinline__ T operator()(const T &x) const { - if constexpr (std::is_same_v) { - return __floats2half2_rn(erff(__half2float(__low2half(x))), erff(__half2float(__high2half(x)))); - } else if constexpr (std::is_same_v) { - return __float2half(erff(__half2float(x))); - } else if constexpr (std::is_same_v) { - float x0 = __bfloat162float(__low2bfloat16(x)); - float x1 = __bfloat162float(__high2bfloat16(x)); - return __floats2bfloat162_rn(erff(x0), erff(x1)); - } else if constexpr (std::is_same_v) { - return __float2bfloat16_rn(erff(__bfloat162float(x))); - } else if constexpr (std::is_same_v) { - return erff(x); - } else { - return std::erf(x); - } - } -} ErfOp; +using Op = op::elementwise::unary::cuda::UnaryOp; } // namespace op::erf::cuda #endif // __ERF_CUDA_H__ diff --git a/src/infiniop/ops/erf/nvidia/erf_nvidia.cu b/src/infiniop/ops/erf/nvidia/erf_nvidia.cu index 9080593de..0d743b538 100644 --- a/src/infiniop/ops/erf/nvidia/erf_nvidia.cu +++ b/src/infiniop/ops/erf/nvidia/erf_nvidia.cu @@ -1,54 +1,10 @@ -#include "../../../elementwise/nvidia/elementwise_nvidia.cuh" +#include "../../../elementwise/nvidia/elementwise_nvidia_impl.cuh" #include "../cuda/kernel.cuh" #include "erf_nvidia.cuh" namespace op::erf::nvidia { -Descriptor::~Descriptor() = default; +ELEMENTWISE_NVIDIA_IMPL_UNARY(erf) -infiniStatus_t Descriptor::create( - infiniopHandle_t handle_, - Descriptor **desc_ptr, - infiniopTensorDescriptor_t out_desc, - std::vector input_desc_vec) { - - auto handle = reinterpret_cast(handle_); - auto dtype = out_desc->dtype(); - - const auto &x_desc = input_desc_vec.at(0); - const auto &y_shape = out_desc->shape(); - const auto &x_shape = x_desc->shape(); - - CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32); - - CHECK_SAME_SHAPE(y_shape, x_shape); - - // create CUDA elementwise descriptor - CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec) - - return INFINI_STATUS_SUCCESS; -} - -infiniStatus_t Descriptor::calculate( - void *workspace, - size_t workspace_size, - void *output, - std::vector inputs, - void *stream) const { - if (workspace_size < _workspace_size) { - return INFINI_STATUS_INSUFFICIENT_WORKSPACE; - } - - switch (_dtype) { - case INFINI_DTYPE_F16: - return _device_info->calculate<256, 
cuda::ErfOp, half>(_info, workspace, output, inputs, stream); - case INFINI_DTYPE_F32: - return _device_info->calculate<256, cuda::ErfOp, float>(_info, workspace, output, inputs, stream); - default: - return INFINI_STATUS_BAD_TENSOR_DTYPE; - } - - return INFINI_STATUS_SUCCESS; -} } // namespace op::erf::nvidia diff --git a/src/infiniop/ops/erf/operator.cc b/src/infiniop/ops/erf/operator.cc index 1491cfa9a..9304cf525 100644 --- a/src/infiniop/ops/erf/operator.cc +++ b/src/infiniop/ops/erf/operator.cc @@ -1,5 +1,4 @@ -#include "../../operator.h" -#include "../../handle.h" +#include "../../operator_impl.h" #include "infiniop/ops/erf.h" #ifdef ENABLE_CPU_API @@ -9,131 +8,4 @@ #include "nvidia/erf_nvidia.cuh" #endif -__C infiniStatus_t infiniopCreateErfDescriptor( - infiniopHandle_t handle, - infiniopErfDescriptor_t *desc_ptr, - infiniopTensorDescriptor_t y_desc, - infiniopTensorDescriptor_t x_desc) { - -#define CREATE(CASE, NAMESPACE) \ - case CASE: \ - return op::erf::NAMESPACE::Descriptor::create( \ - handle, \ - reinterpret_cast(desc_ptr), \ - y_desc, \ - {x_desc}) - - switch (handle->device) { - -#ifdef ENABLE_CPU_API - CREATE(INFINI_DEVICE_CPU, cpu); -#endif -#ifdef ENABLE_NVIDIA_API - CREATE(INFINI_DEVICE_NVIDIA, nvidia); -#endif -#ifdef ENABLE_ILUVATAR_API - CREATE(INFINI_DEVICE_ILUVATAR, nvidia); -#endif -#ifdef ENABLE_QY_API - CREATE(INFINI_DEVICE_QY, nvidia); -#endif - - default: - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; - } - -#undef CREATE -} - -__C infiniStatus_t infiniopGetErfWorkspaceSize(infiniopErfDescriptor_t desc, size_t *size) { - -#define GET(CASE, NAMESPACE) \ - case CASE: \ - *size = reinterpret_cast(desc)->workspaceSize(); \ - return INFINI_STATUS_SUCCESS; - - switch (desc->device_type) { -#ifdef ENABLE_CPU_API - GET(INFINI_DEVICE_CPU, cpu) -#endif -#ifdef ENABLE_NVIDIA_API - GET(INFINI_DEVICE_NVIDIA, nvidia) -#endif -#ifdef ENABLE_ILUVATAR_API - GET(INFINI_DEVICE_ILUVATAR, nvidia) -#endif -#ifdef ENABLE_QY_API - GET(INFINI_DEVICE_QY, nvidia) -#endif - default: - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; - } -#undef GET - - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; -} - -__C infiniStatus_t infiniopErf( - infiniopErfDescriptor_t desc, - void *workspace, - size_t workspace_size, - void *y, - const void *x, - void *stream) { - -#define CALCULATE(CASE, NAMESPACE) \ - case CASE: \ - return reinterpret_cast(desc) \ - ->calculate(workspace, workspace_size, y, {x}, stream) - - switch (desc->device_type) { - -#ifdef ENABLE_CPU_API - CALCULATE(INFINI_DEVICE_CPU, cpu); -#endif -#ifdef ENABLE_NVIDIA_API - CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); -#endif -#ifdef ENABLE_ILUVATAR_API - CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); -#endif -#ifdef ENABLE_QY_API - CALCULATE(INFINI_DEVICE_QY, nvidia); -#endif - - default: - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; - } - -#undef CALCULATE -} - -__C infiniStatus_t -infiniopDestroyErfDescriptor(infiniopErfDescriptor_t desc) { - -#define DELETE(CASE, NAMESPACE) \ - case CASE: \ - delete reinterpret_cast(desc); \ - return INFINI_STATUS_SUCCESS - - switch (desc->device_type) { - -#ifdef ENABLE_CPU_API - DELETE(INFINI_DEVICE_CPU, cpu); -#endif -#ifdef ENABLE_NVIDIA_API - DELETE(INFINI_DEVICE_NVIDIA, nvidia); -#endif -#ifdef ENABLE_ILUVATAR_API - DELETE(INFINI_DEVICE_ILUVATAR, nvidia); -#endif -#ifdef ENABLE_QY_API - DELETE(INFINI_DEVICE_QY, nvidia); -#endif - - default: - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; - } - -#undef DELETE -} +UNARY_OP_IMPL(erf, Erf) diff --git a/src/infiniop/ops/floor/cpu/floor_cpu.cc 
b/src/infiniop/ops/floor/cpu/floor_cpu.cc index e809a02e2..cc717ac11 100644 --- a/src/infiniop/ops/floor/cpu/floor_cpu.cc +++ b/src/infiniop/ops/floor/cpu/floor_cpu.cc @@ -1,48 +1,8 @@ #include "floor_cpu.h" +#include "../../../elementwise/cpu/elementwise_cpu_impl.h" namespace op::floor::cpu { -Descriptor::~Descriptor() = default; +ELEMENTWISE_CPU_IMPL_UNARY(floor) -infiniStatus_t Descriptor::create( - infiniopHandle_t handle_, - Descriptor **desc_ptr, - infiniopTensorDescriptor_t out_desc, - std::vector input_desc_vec) { - - auto handle = reinterpret_cast(handle_); - auto dtype = out_desc->dtype(); - - const auto &x_desc = input_desc_vec.at(0); - const auto &y_shape = out_desc->shape(); - const auto &x_shape = x_desc->shape(); - - CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32); - - CHECK_SAME_SHAPE(y_shape, x_shape); - - // create CPU elementwise descriptor - CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec); - - return INFINI_STATUS_SUCCESS; -} - -infiniStatus_t Descriptor::calculate( - void *workspace, - size_t workspace_size, - void *output, - std::vector inputs, - void *stream) const { - - switch (_dtype) { - case INFINI_DTYPE_F16: - return _device_info->calculate(_info, output, inputs, stream); - case INFINI_DTYPE_F32: - return _device_info->calculate(_info, output, inputs, stream); - default: - return INFINI_STATUS_BAD_TENSOR_DTYPE; - } - - return INFINI_STATUS_SUCCESS; -} } // namespace op::floor::cpu diff --git a/src/infiniop/ops/floor/cpu/floor_cpu.h b/src/infiniop/ops/floor/cpu/floor_cpu.h index 91508a384..a246309e8 100644 --- a/src/infiniop/ops/floor/cpu/floor_cpu.h +++ b/src/infiniop/ops/floor/cpu/floor_cpu.h @@ -1,26 +1,9 @@ #ifndef __FLOOR_CPU_H__ #define __FLOOR_CPU_H__ -#include - #include "../../../elementwise/cpu/elementwise_cpu.h" +#include "../../../elementwise/unary.h" -ELEMENTWISE_DESCRIPTOR(floor, cpu) - -namespace op::floor::cpu { -typedef struct FloorOp { -public: - static constexpr size_t num_inputs = 1; - - template - T operator()(const T &x) const { - if constexpr (std::is_integral_v) { - return x; - } else { - return std::floor(x); - } - } -} FloorOp; -} // namespace op::floor::cpu +UNARY_ELEMENTWISE_DESCRIPTOR(floor, cpu, op::elementwise::unary::UnaryMode::Floor) #endif // __FLOOR_CPU_H__ diff --git a/src/infiniop/ops/floor/cuda/kernel.cuh b/src/infiniop/ops/floor/cuda/kernel.cuh index c89ce34f4..23a7a44e9 100644 --- a/src/infiniop/ops/floor/cuda/kernel.cuh +++ b/src/infiniop/ops/floor/cuda/kernel.cuh @@ -1,34 +1,10 @@ #ifndef __FLOOR_CUDA_H__ #define __FLOOR_CUDA_H__ -#include "../../../elementwise/nvidia/elementwise_nvidia.cuh" -#include +#include "../../../elementwise/unary.h" namespace op::floor::cuda { -typedef struct FloorOp { -public: - static constexpr size_t num_inputs = 1; - template - __device__ __forceinline__ T operator()(const T &x) const { - if constexpr (std::is_same_v) { - return h2floor(x); - } else if constexpr (std::is_same_v) { - return hfloor(x); - } else if constexpr (std::is_same_v) { - float x0 = __bfloat162float(__low2bfloat16(x)); - float x1 = __bfloat162float(__high2bfloat16(x)); - return __floats2bfloat162_rn(floorf(x0), floorf(x1)); - } else if constexpr (std::is_same_v) { - return __float2bfloat16_rn(floorf(__bfloat162float(x))); - } else if constexpr (std::is_same_v) { - return floorf(x); - } else if constexpr (std::is_integral_v) { - return x; - } else { - return std::floor(x); - } - } -} FloorOp; +using Op = op::elementwise::unary::cuda::UnaryOp; } // namespace op::floor::cuda #endif // 
__FLOOR_CUDA_H__ diff --git a/src/infiniop/ops/floor/nvidia/floor_nvidia.cu b/src/infiniop/ops/floor/nvidia/floor_nvidia.cu index 08305048a..cec304a1c 100644 --- a/src/infiniop/ops/floor/nvidia/floor_nvidia.cu +++ b/src/infiniop/ops/floor/nvidia/floor_nvidia.cu @@ -1,54 +1,10 @@ -#include "../../../elementwise/nvidia/elementwise_nvidia.cuh" +#include "../../../elementwise/nvidia/elementwise_nvidia_impl.cuh" #include "../cuda/kernel.cuh" #include "floor_nvidia.cuh" namespace op::floor::nvidia { -Descriptor::~Descriptor() = default; +ELEMENTWISE_NVIDIA_IMPL_UNARY(floor) -infiniStatus_t Descriptor::create( - infiniopHandle_t handle_, - Descriptor **desc_ptr, - infiniopTensorDescriptor_t out_desc, - std::vector input_desc_vec) { - - auto handle = reinterpret_cast(handle_); - auto dtype = out_desc->dtype(); - - const auto &x_desc = input_desc_vec.at(0); - const auto &y_shape = out_desc->shape(); - const auto &x_shape = x_desc->shape(); - - CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32); - - CHECK_SAME_SHAPE(y_shape, x_shape); - - // create CUDA elementwise descriptor - CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec) - - return INFINI_STATUS_SUCCESS; -} - -infiniStatus_t Descriptor::calculate( - void *workspace, - size_t workspace_size, - void *output, - std::vector inputs, - void *stream) const { - if (workspace_size < _workspace_size) { - return INFINI_STATUS_INSUFFICIENT_WORKSPACE; - } - - switch (_dtype) { - case INFINI_DTYPE_F16: - return _device_info->calculate<256, cuda::FloorOp, half>(_info, workspace, output, inputs, stream); - case INFINI_DTYPE_F32: - return _device_info->calculate<256, cuda::FloorOp, float>(_info, workspace, output, inputs, stream); - default: - return INFINI_STATUS_BAD_TENSOR_DTYPE; - } - - return INFINI_STATUS_SUCCESS; -} } // namespace op::floor::nvidia diff --git a/src/infiniop/ops/floor/operator.cc b/src/infiniop/ops/floor/operator.cc index 4e4ed2b5a..64e4a586b 100644 --- a/src/infiniop/ops/floor/operator.cc +++ b/src/infiniop/ops/floor/operator.cc @@ -1,5 +1,4 @@ -#include "../../operator.h" -#include "../../handle.h" +#include "../../operator_impl.h" #include "infiniop/ops/floor.h" #ifdef ENABLE_CPU_API @@ -9,131 +8,4 @@ #include "nvidia/floor_nvidia.cuh" #endif -__C infiniStatus_t infiniopCreateFloorDescriptor( - infiniopHandle_t handle, - infiniopFloorDescriptor_t *desc_ptr, - infiniopTensorDescriptor_t y_desc, - infiniopTensorDescriptor_t x_desc) { - -#define CREATE(CASE, NAMESPACE) \ - case CASE: \ - return op::floor::NAMESPACE::Descriptor::create( \ - handle, \ - reinterpret_cast(desc_ptr), \ - y_desc, \ - {x_desc}) - - switch (handle->device) { - -#ifdef ENABLE_CPU_API - CREATE(INFINI_DEVICE_CPU, cpu); -#endif -#ifdef ENABLE_NVIDIA_API - CREATE(INFINI_DEVICE_NVIDIA, nvidia); -#endif -#ifdef ENABLE_ILUVATAR_API - CREATE(INFINI_DEVICE_ILUVATAR, nvidia); -#endif -#ifdef ENABLE_QY_API - CREATE(INFINI_DEVICE_QY, nvidia); -#endif - - default: - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; - } - -#undef CREATE -} - -__C infiniStatus_t infiniopGetFloorWorkspaceSize(infiniopFloorDescriptor_t desc, size_t *size) { - -#define GET(CASE, NAMESPACE) \ - case CASE: \ - *size = reinterpret_cast(desc)->workspaceSize(); \ - return INFINI_STATUS_SUCCESS; - - switch (desc->device_type) { -#ifdef ENABLE_CPU_API - GET(INFINI_DEVICE_CPU, cpu) -#endif -#ifdef ENABLE_NVIDIA_API - GET(INFINI_DEVICE_NVIDIA, nvidia) -#endif -#ifdef ENABLE_ILUVATAR_API - GET(INFINI_DEVICE_ILUVATAR, nvidia) -#endif -#ifdef ENABLE_QY_API - GET(INFINI_DEVICE_QY, 
nvidia) -#endif - default: - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; - } -#undef GET - - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; -} - -__C infiniStatus_t infiniopFloor( - infiniopFloorDescriptor_t desc, - void *workspace, - size_t workspace_size, - void *y, - const void *x, - void *stream) { - -#define CALCULATE(CASE, NAMESPACE) \ - case CASE: \ - return reinterpret_cast(desc) \ - ->calculate(workspace, workspace_size, y, {x}, stream) - - switch (desc->device_type) { - -#ifdef ENABLE_CPU_API - CALCULATE(INFINI_DEVICE_CPU, cpu); -#endif -#ifdef ENABLE_NVIDIA_API - CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); -#endif -#ifdef ENABLE_ILUVATAR_API - CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); -#endif -#ifdef ENABLE_QY_API - CALCULATE(INFINI_DEVICE_QY, nvidia); -#endif - - default: - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; - } - -#undef CALCULATE -} - -__C infiniStatus_t -infiniopDestroyFloorDescriptor(infiniopFloorDescriptor_t desc) { - -#define DELETE(CASE, NAMESPACE) \ - case CASE: \ - delete reinterpret_cast(desc); \ - return INFINI_STATUS_SUCCESS - - switch (desc->device_type) { - -#ifdef ENABLE_CPU_API - DELETE(INFINI_DEVICE_CPU, cpu); -#endif -#ifdef ENABLE_NVIDIA_API - DELETE(INFINI_DEVICE_NVIDIA, nvidia); -#endif -#ifdef ENABLE_ILUVATAR_API - DELETE(INFINI_DEVICE_ILUVATAR, nvidia); -#endif -#ifdef ENABLE_QY_API - DELETE(INFINI_DEVICE_QY, nvidia); -#endif - - default: - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; - } - -#undef DELETE -} +UNARY_OP_IMPL(floor, Floor) diff --git a/src/infiniop/ops/log/cpu/log_cpu.cc b/src/infiniop/ops/log/cpu/log_cpu.cc index e7314c319..734ad1617 100644 --- a/src/infiniop/ops/log/cpu/log_cpu.cc +++ b/src/infiniop/ops/log/cpu/log_cpu.cc @@ -1,48 +1,8 @@ #include "log_cpu.h" +#include "../../../elementwise/cpu/elementwise_cpu_impl.h" namespace op::log::cpu { -Descriptor::~Descriptor() = default; +ELEMENTWISE_CPU_IMPL_UNARY(log) -infiniStatus_t Descriptor::create( - infiniopHandle_t handle_, - Descriptor **desc_ptr, - infiniopTensorDescriptor_t out_desc, - std::vector input_desc_vec) { - - auto handle = reinterpret_cast(handle_); - auto dtype = out_desc->dtype(); - - const auto &x_desc = input_desc_vec.at(0); - const auto &y_shape = out_desc->shape(); - const auto &x_shape = x_desc->shape(); - - CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32); - - CHECK_SAME_SHAPE(y_shape, x_shape); - - // create CPU elementwise descriptor - CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec); - - return INFINI_STATUS_SUCCESS; -} - -infiniStatus_t Descriptor::calculate( - void *workspace, - size_t workspace_size, - void *output, - std::vector inputs, - void *stream) const { - - switch (_dtype) { - case INFINI_DTYPE_F16: - return _device_info->calculate(_info, output, inputs, stream); - case INFINI_DTYPE_F32: - return _device_info->calculate(_info, output, inputs, stream); - default: - return INFINI_STATUS_BAD_TENSOR_DTYPE; - } - - return INFINI_STATUS_SUCCESS; -} } // namespace op::log::cpu diff --git a/src/infiniop/ops/log/cpu/log_cpu.h b/src/infiniop/ops/log/cpu/log_cpu.h index 535e681d3..b13d01442 100644 --- a/src/infiniop/ops/log/cpu/log_cpu.h +++ b/src/infiniop/ops/log/cpu/log_cpu.h @@ -1,22 +1,9 @@ #ifndef __LOG_CPU_H__ #define __LOG_CPU_H__ -#include - #include "../../../elementwise/cpu/elementwise_cpu.h" +#include "../../../elementwise/unary.h" -ELEMENTWISE_DESCRIPTOR(log, cpu) - -namespace op::log::cpu { -typedef struct LogOp { -public: - static constexpr size_t num_inputs = 1; - - template - T operator()(const T 
&x) const {
-        return std::log(x);
-    }
-} LogOp;
-} // namespace op::log::cpu
+UNARY_ELEMENTWISE_DESCRIPTOR(log, cpu, op::elementwise::unary::UnaryMode::Log)
 
 #endif // __LOG_CPU_H__
diff --git a/src/infiniop/ops/log/cuda/kernel.cuh b/src/infiniop/ops/log/cuda/kernel.cuh
index b1e46873c..80980ada1 100644
--- a/src/infiniop/ops/log/cuda/kernel.cuh
+++ b/src/infiniop/ops/log/cuda/kernel.cuh
@@ -1,32 +1,10 @@
 #ifndef __LOG_CUDA_H__
 #define __LOG_CUDA_H__
 
-#include "../../../elementwise/nvidia/elementwise_nvidia.cuh"
-#include <cmath>
+#include "../../../elementwise/unary.h"
 
 namespace op::log::cuda {
-typedef struct LogOp {
-public:
-    static constexpr size_t num_inputs = 1;
-    template <typename T>
-    __device__ __forceinline__ T operator()(const T &x) const {
-        if constexpr (std::is_same_v<T, half2>) {
-            return h2log(x);
-        } else if constexpr (std::is_same_v<T, half>) {
-            return __float2half(__logf(__half2float(x)));
-        } else if constexpr (std::is_same_v<T, cuda_bfloat162>) {
-            float x0 = __bfloat162float(__low2bfloat16(x));
-            float x1 = __bfloat162float(__high2bfloat16(x));
-            return __floats2bfloat162_rn(logf(x0), logf(x1));
-        } else if constexpr (std::is_same_v<T, cuda_bfloat16>) {
-            return __float2bfloat16_rn(logf(__bfloat162float(x)));
-        } else if constexpr (std::is_same_v<T, float>) {
-            return __logf(x);
-        } else {
-            return std::log(x);
-        }
-    }
-} LogOp;
+using Op = op::elementwise::unary::cuda::UnaryOp<op::elementwise::unary::UnaryMode::Log>;
 } // namespace op::log::cuda
 
 #endif // __LOG_CUDA_H__
diff --git a/src/infiniop/ops/log/nvidia/log_nvidia.cu b/src/infiniop/ops/log/nvidia/log_nvidia.cu
index 9e7bcafc4..87aaa0388 100644
--- a/src/infiniop/ops/log/nvidia/log_nvidia.cu
+++ b/src/infiniop/ops/log/nvidia/log_nvidia.cu
@@ -1,54 +1,10 @@
-#include "../../../elementwise/nvidia/elementwise_nvidia.cuh"
+#include "../../../elementwise/nvidia/elementwise_nvidia_impl.cuh"
 #include "../cuda/kernel.cuh"
 #include "log_nvidia.cuh"
 
 namespace op::log::nvidia {
-Descriptor::~Descriptor() = default;
+ELEMENTWISE_NVIDIA_IMPL_UNARY(log)
 
-infiniStatus_t Descriptor::create(
-    infiniopHandle_t handle_,
-    Descriptor **desc_ptr,
-    infiniopTensorDescriptor_t out_desc,
-    std::vector<infiniopTensorDescriptor_t> input_desc_vec) {
-
-    auto handle = reinterpret_cast<device::nvidia::Handle *>(handle_);
-    auto dtype = out_desc->dtype();
-
-    const auto &x_desc = input_desc_vec.at(0);
-    const auto &y_shape = out_desc->shape();
-    const auto &x_shape = x_desc->shape();
-
-    CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32);
-
-    CHECK_SAME_SHAPE(y_shape, x_shape);
-
-    // create CUDA elementwise descriptor
-    CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec)
-
-    return INFINI_STATUS_SUCCESS;
-}
-
-infiniStatus_t Descriptor::calculate(
-    void *workspace,
-    size_t workspace_size,
-    void *output,
-    std::vector<const void *> inputs,
-    void *stream) const {
-    if (workspace_size < _workspace_size) {
-        return INFINI_STATUS_INSUFFICIENT_WORKSPACE;
-    }
-
-    switch (_dtype) {
-    case INFINI_DTYPE_F16:
-        return _device_info->calculate<256, cuda::LogOp, half>(_info, workspace, output, inputs, stream);
-    case INFINI_DTYPE_F32:
-        return _device_info->calculate<256, cuda::LogOp, float>(_info, workspace, output, inputs, stream);
-    default:
-        return INFINI_STATUS_BAD_TENSOR_DTYPE;
-    }
-
-    return INFINI_STATUS_SUCCESS;
-}
 } // namespace op::log::nvidia
diff --git a/src/infiniop/ops/log/operator.cc b/src/infiniop/ops/log/operator.cc
index 8f2add408..9614a0861 100644
--- a/src/infiniop/ops/log/operator.cc
+++ b/src/infiniop/ops/log/operator.cc
@@ -1,5 +1,4 @@
-#include "../../operator.h"
-#include "../../handle.h"
+#include "../../operator_impl.h"
 #include "infiniop/ops/log.h"
 
 #ifdef ENABLE_CPU_API
@@ -9,131
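With this change each op's CUDA kernel header shrinks to a one-line alias of a shared, mode-dispatched functor. The template argument on UnaryOp is inferred from the CPU-side UNARY_ELEMENTWISE_DESCRIPTOR call; the patch does not show unary.h itself, so the following is a minimal sketch of how such a functor can reproduce the deleted per-op structs with no runtime cost (the mode is resolved by if constexpr at compile time). Names, enum values, and member layout are assumptions:

    #include <cmath>
    #include <cstddef>
    #include <cuda_fp16.h>
    #include <type_traits>

    namespace op::elementwise::unary {

    enum class UnaryMode { Log, Neg /* , Floor, Round, Reciprocal, ... */ };

    namespace cuda {
    // One functor replaces LogOp, NegOp, ...: the mode is a compile-time tag,
    // so each instantiation generates the same code as the hand-written struct.
    template <UnaryMode Mode>
    struct UnaryOp {
        static constexpr size_t num_inputs = 1;
        template <typename T>
        __device__ __forceinline__ T operator()(const T &x) const {
            if constexpr (Mode == UnaryMode::Log) {
                if constexpr (std::is_same_v<T, half2>) {
                    return h2log(x);  // vectorized half2 log, as in the old LogOp
                } else if constexpr (std::is_same_v<T, float>) {
                    return __logf(x); // fast-math float log
                } else {
                    return std::log(x);
                }
            } else if constexpr (Mode == UnaryMode::Neg) {
                return -x;            // half2 negation lowers to __hneg2 on sm_53+
            } else {
                static_assert(Mode == UnaryMode::Log || Mode == UnaryMode::Neg,
                              "sketch covers only two modes");
            }
        }
    };
    } // namespace cuda
    } // namespace op::elementwise::unary
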
+8,4 @@ #include "nvidia/log_nvidia.cuh" #endif -__C infiniStatus_t infiniopCreateLogDescriptor( - infiniopHandle_t handle, - infiniopLogDescriptor_t *desc_ptr, - infiniopTensorDescriptor_t y_desc, - infiniopTensorDescriptor_t x_desc) { - -#define CREATE(CASE, NAMESPACE) \ - case CASE: \ - return op::log::NAMESPACE::Descriptor::create( \ - handle, \ - reinterpret_cast(desc_ptr), \ - y_desc, \ - {x_desc}) - - switch (handle->device) { - -#ifdef ENABLE_CPU_API - CREATE(INFINI_DEVICE_CPU, cpu); -#endif -#ifdef ENABLE_NVIDIA_API - CREATE(INFINI_DEVICE_NVIDIA, nvidia); -#endif -#ifdef ENABLE_ILUVATAR_API - CREATE(INFINI_DEVICE_ILUVATAR, nvidia); -#endif -#ifdef ENABLE_QY_API - CREATE(INFINI_DEVICE_QY, nvidia); -#endif - - default: - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; - } - -#undef CREATE -} - -__C infiniStatus_t infiniopGetLogWorkspaceSize(infiniopLogDescriptor_t desc, size_t *size) { - -#define GET(CASE, NAMESPACE) \ - case CASE: \ - *size = reinterpret_cast(desc)->workspaceSize(); \ - return INFINI_STATUS_SUCCESS; - - switch (desc->device_type) { -#ifdef ENABLE_CPU_API - GET(INFINI_DEVICE_CPU, cpu) -#endif -#ifdef ENABLE_NVIDIA_API - GET(INFINI_DEVICE_NVIDIA, nvidia) -#endif -#ifdef ENABLE_ILUVATAR_API - GET(INFINI_DEVICE_ILUVATAR, nvidia) -#endif -#ifdef ENABLE_QY_API - GET(INFINI_DEVICE_QY, nvidia) -#endif - default: - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; - } -#undef GET - - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; -} - -__C infiniStatus_t infiniopLog( - infiniopLogDescriptor_t desc, - void *workspace, - size_t workspace_size, - void *y, - const void *x, - void *stream) { - -#define CALCULATE(CASE, NAMESPACE) \ - case CASE: \ - return reinterpret_cast(desc) \ - ->calculate(workspace, workspace_size, y, {x}, stream) - - switch (desc->device_type) { - -#ifdef ENABLE_CPU_API - CALCULATE(INFINI_DEVICE_CPU, cpu); -#endif -#ifdef ENABLE_NVIDIA_API - CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); -#endif -#ifdef ENABLE_ILUVATAR_API - CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); -#endif -#ifdef ENABLE_QY_API - CALCULATE(INFINI_DEVICE_QY, nvidia); -#endif - - default: - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; - } - -#undef CALCULATE -} - -__C infiniStatus_t -infiniopDestroyLogDescriptor(infiniopLogDescriptor_t desc) { - -#define DELETE(CASE, NAMESPACE) \ - case CASE: \ - delete reinterpret_cast(desc); \ - return INFINI_STATUS_SUCCESS - - switch (desc->device_type) { - -#ifdef ENABLE_CPU_API - DELETE(INFINI_DEVICE_CPU, cpu); -#endif -#ifdef ENABLE_NVIDIA_API - DELETE(INFINI_DEVICE_NVIDIA, nvidia); -#endif -#ifdef ENABLE_ILUVATAR_API - DELETE(INFINI_DEVICE_ILUVATAR, nvidia); -#endif -#ifdef ENABLE_QY_API - DELETE(INFINI_DEVICE_QY, nvidia); -#endif - - default: - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; - } - -#undef DELETE -} +UNARY_OP_IMPL(log, Log) diff --git a/src/infiniop/ops/max/cpu/max_cpu.cc b/src/infiniop/ops/max/cpu/max_cpu.cc index 1b30fa4e4..98e8a52a2 100644 --- a/src/infiniop/ops/max/cpu/max_cpu.cc +++ b/src/infiniop/ops/max/cpu/max_cpu.cc @@ -1,50 +1,8 @@ #include "max_cpu.h" +#include "../../../elementwise/cpu/elementwise_cpu_impl.h" namespace op::max::cpu { -Descriptor::~Descriptor() = default; +ELEMENTWISE_CPU_IMPL_BINARY(max) -infiniStatus_t Descriptor::create( - infiniopHandle_t handle_, - Descriptor **desc_ptr, - infiniopTensorDescriptor_t out_desc, - std::vector input_desc_vec) { - - auto handle = reinterpret_cast(handle_); - auto dtype = out_desc->dtype(); - - const auto &a_desc = input_desc_vec.at(0); - const auto &b_desc = 
input_desc_vec.at(1);
-    const auto &c_shape = out_desc->shape();
-    const auto &a_shape = a_desc->shape();
-    const auto &b_shape = b_desc->shape();
-
-    CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32);
-
-    CHECK_SAME_SHAPE(c_shape, a_shape, b_shape);
-
-    // create CPU elementwise descriptor
-    CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec);
-
-    return INFINI_STATUS_SUCCESS;
-}
-
-infiniStatus_t Descriptor::calculate(
-    void *workspace,
-    size_t workspace_size,
-    void *output,
-    std::vector<const void *> inputs,
-    void *stream) const {
-
-    switch (_dtype) {
-    case INFINI_DTYPE_F16:
-        return _device_info->calculate<MaxOp, fp16_t>(_info, output, inputs, stream);
-    case INFINI_DTYPE_F32:
-        return _device_info->calculate<MaxOp, float>(_info, output, inputs, stream);
-    default:
-        return INFINI_STATUS_BAD_TENSOR_DTYPE;
-    }
-
-    return INFINI_STATUS_SUCCESS;
-}
 } // namespace op::max::cpu
diff --git a/src/infiniop/ops/max/cpu/max_cpu.h b/src/infiniop/ops/max/cpu/max_cpu.h
index 4d085ed39..2219994d5 100644
--- a/src/infiniop/ops/max/cpu/max_cpu.h
+++ b/src/infiniop/ops/max/cpu/max_cpu.h
@@ -1,20 +1,9 @@
 #ifndef __MAX_CPU_H__
 #define __MAX_CPU_H__
 
+#include "../../../elementwise/binary.h"
 #include "../../../elementwise/cpu/elementwise_cpu.h"
-#include <algorithm>
 
-ELEMENTWISE_DESCRIPTOR(max, cpu)
-
-namespace op::max::cpu {
-typedef struct MaxOp {
-public:
-    static constexpr size_t num_inputs = 2;
-    template <typename T>
-    T operator()(const T &a, const T &b) const {
-        return std::max(a, b);
-    }
-} MaxOp;
-} // namespace op::max::cpu
+BINARY_ELEMENTWISE_DESCRIPTOR(max, cpu, op::elementwise::binary::BinaryMode::Max)
 
 #endif // __MAX_CPU_H__
diff --git a/src/infiniop/ops/max/cuda/kernel.cuh b/src/infiniop/ops/max/cuda/kernel.cuh
index bf3977a31..68f634559 100644
--- a/src/infiniop/ops/max/cuda/kernel.cuh
+++ b/src/infiniop/ops/max/cuda/kernel.cuh
@@ -1,23 +1,10 @@
 #ifndef __MAX_CUDA_H__
 #define __MAX_CUDA_H__
 
+#include "../../../elementwise/binary.h"
+
 namespace op::max::cuda {
-typedef struct MaxOp {
-public:
-    static constexpr size_t num_inputs = 2;
-    template <typename T>
-    __device__ __forceinline__ T operator()(const T &a, const T &b) const {
-        if constexpr (std::is_same_v<T, half2>) {
-            return __hmax2(a, b);
-        } else if constexpr (std::is_same_v<T, half> || std::is_same_v<T, cuda_bfloat16>) {
-            return a > b ? a : b;
-        } else if constexpr (std::is_same_v<T, float>) {
-            return fmaxf(a, b);
-        } else {
-            return a > b ?
a : b; - } - } -} MaxOp; +using Op = op::elementwise::binary::cuda::BinaryOp; } // namespace op::max::cuda #endif // __MAX_CUDA_H__ diff --git a/src/infiniop/ops/max/nvidia/max_nvidia.cu b/src/infiniop/ops/max/nvidia/max_nvidia.cu index 5e9fb13f4..ba4620f3b 100644 --- a/src/infiniop/ops/max/nvidia/max_nvidia.cu +++ b/src/infiniop/ops/max/nvidia/max_nvidia.cu @@ -1,57 +1,10 @@ -#include "../../../elementwise/nvidia/elementwise_nvidia.cuh" +#include "../../../elementwise/nvidia/elementwise_nvidia_impl.cuh" #include "../cuda/kernel.cuh" #include "max_nvidia.cuh" namespace op::max::nvidia { -Descriptor::~Descriptor() = default; +ELEMENTWISE_NVIDIA_IMPL_BINARY(max) -infiniStatus_t Descriptor::create( - infiniopHandle_t handle_, - Descriptor **desc_ptr, - infiniopTensorDescriptor_t out_desc, - std::vector input_desc_vec) { - - auto handle = reinterpret_cast(handle_); - auto dtype = out_desc->dtype(); - - const auto &a_desc = input_desc_vec.at(0); - const auto &b_desc = input_desc_vec.at(1); - const auto &c_shape = out_desc->shape(); - const auto &a_shape = a_desc->shape(); - const auto &b_shape = b_desc->shape(); - - CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32); - - CHECK_SAME_SHAPE(c_shape, a_shape, b_shape); - - // create CUDA elementwise descriptor - CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec) - - return INFINI_STATUS_SUCCESS; -} - -infiniStatus_t Descriptor::calculate( - void *workspace, - size_t workspace_size, - void *output, - std::vector inputs, - void *stream) const { - - if (workspace_size < _workspace_size) { - return INFINI_STATUS_INSUFFICIENT_WORKSPACE; - } - - switch (_dtype) { - case INFINI_DTYPE_F16: - return _device_info->calculate<256, cuda::MaxOp, half>(_info, workspace, output, inputs, stream); - case INFINI_DTYPE_F32: - return _device_info->calculate<256, cuda::MaxOp, float>(_info, workspace, output, inputs, stream); - default: - return INFINI_STATUS_BAD_TENSOR_DTYPE; - } - - return INFINI_STATUS_SUCCESS; -} } // namespace op::max::nvidia diff --git a/src/infiniop/ops/max/operator.cc b/src/infiniop/ops/max/operator.cc index e04368533..3e5299f52 100644 --- a/src/infiniop/ops/max/operator.cc +++ b/src/infiniop/ops/max/operator.cc @@ -1,5 +1,4 @@ -#include "../../operator.h" -#include "../../handle.h" +#include "../../operator_impl.h" #include "infiniop/ops/max.h" #ifdef ENABLE_CPU_API @@ -8,195 +7,5 @@ #if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API) #include "nvidia/max_nvidia.cuh" #endif -#ifdef ENABLE_METAX_API -#include "metax/max_metax.h" -#endif -#ifdef ENABLE_KUNLUN_API -#include "kunlun/max_kunlun.h" -#endif -#ifdef ENABLE_CAMBRICON_API -#include "bang/max_bang.h" -#endif -#ifdef ENABLE_MOORE_API -#include "moore/max_moore.h" -#endif - -__C infiniStatus_t infiniopCreateMaxDescriptor( - infiniopHandle_t handle, - infiniopMaxDescriptor_t *desc_ptr, - infiniopTensorDescriptor_t c_desc, - infiniopTensorDescriptor_t a_desc, - infiniopTensorDescriptor_t b_desc) { - -#define CREATE(CASE, NAMESPACE) \ - case CASE: \ - return op::max::NAMESPACE::Descriptor::create( \ - handle, \ - reinterpret_cast(desc_ptr), \ - c_desc, \ - {a_desc, \ - b_desc}) - - switch (handle->device) { - -#ifdef ENABLE_CPU_API - CREATE(INFINI_DEVICE_CPU, cpu); -#endif -#ifdef ENABLE_NVIDIA_API - CREATE(INFINI_DEVICE_NVIDIA, nvidia); -#endif -#ifdef ENABLE_ILUVATAR_API - CREATE(INFINI_DEVICE_ILUVATAR, nvidia); -#endif -#ifdef ENABLE_QY_API - CREATE(INFINI_DEVICE_QY, nvidia); -#endif -#ifdef ENABLE_METAX_API - 
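
The one-line ELEMENTWISE_NVIDIA_IMPL_BINARY(max) above stands in for the boilerplate this hunk deletes. The macro definition itself is not part of the patch, so the following is a sketch of what it presumably expands to, reconstructed from the deleted body (same shape/dtype checks, same 256-thread dispatch, with the per-op functor replaced by the cuda::Op alias from the kernel header):

    Descriptor::~Descriptor() = default;

    infiniStatus_t Descriptor::create(
        infiniopHandle_t handle_, Descriptor **desc_ptr,
        infiniopTensorDescriptor_t out_desc,
        std::vector<infiniopTensorDescriptor_t> input_desc_vec) {
        auto handle = reinterpret_cast<device::nvidia::Handle *>(handle_);
        auto dtype = out_desc->dtype();
        CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32);
        CHECK_SAME_SHAPE(out_desc->shape(),
                         input_desc_vec.at(0)->shape(),
                         input_desc_vec.at(1)->shape());
        CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec)
        return INFINI_STATUS_SUCCESS;
    }

    infiniStatus_t Descriptor::calculate(
        void *workspace, size_t workspace_size, void *output,
        std::vector<const void *> inputs, void *stream) const {
        if (workspace_size < _workspace_size) {
            return INFINI_STATUS_INSUFFICIENT_WORKSPACE;
        }
        switch (_dtype) {
        case INFINI_DTYPE_F16:
            return _device_info->calculate<256, cuda::Op, half>(_info, workspace, output, inputs, stream);
        case INFINI_DTYPE_F32:
            return _device_info->calculate<256, cuda::Op, float>(_info, workspace, output, inputs, stream);
        default:
            return INFINI_STATUS_BAD_TENSOR_DTYPE;
        }
    }
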
CREATE(INFINI_DEVICE_METAX, metax); -#endif -#ifdef ENABLE_KUNLUN_API - CREATE(INFINI_DEVICE_KUNLUN, kunlun); -#endif -#ifdef ENABLE_CAMBRICON_API - CREATE(INFINI_DEVICE_CAMBRICON, bang); -#endif -#ifdef ENABLE_MOORE_API - CREATE(INFINI_DEVICE_MOORE, moore); -#endif - - default: - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; - } - -#undef CREATE -} - -__C infiniStatus_t infiniopGetMaxWorkspaceSize(infiniopMaxDescriptor_t desc, size_t *size) { - -#define GET(CASE, NAMESPACE) \ - case CASE: \ - *size = reinterpret_cast(desc)->workspaceSize(); \ - return INFINI_STATUS_SUCCESS - - switch (desc->device_type) { -#ifdef ENABLE_CPU_API - GET(INFINI_DEVICE_CPU, cpu); -#endif -#ifdef ENABLE_NVIDIA_API - GET(INFINI_DEVICE_NVIDIA, nvidia); -#endif -#ifdef ENABLE_ILUVATAR_API - GET(INFINI_DEVICE_ILUVATAR, nvidia); -#endif -#ifdef ENABLE_QY_API - GET(INFINI_DEVICE_QY, nvidia); -#endif -#ifdef ENABLE_METAX_API - GET(INFINI_DEVICE_METAX, metax); -#endif -#ifdef ENABLE_KUNLUN_API - GET(INFINI_DEVICE_KUNLUN, kunlun); -#endif -#ifdef ENABLE_CAMBRICON_API - GET(INFINI_DEVICE_CAMBRICON, bang); -#endif -#ifdef ENABLE_MOORE_API - GET(INFINI_DEVICE_MOORE, moore); -#endif - default: - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; - } -#undef GET - - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; -} - -__C infiniStatus_t infiniopMax( - infiniopMaxDescriptor_t desc, - void *workspace, - size_t workspace_size, - void *c, - const void *a, - const void *b, - void *stream) { - -#define CALCULATE(CASE, NAMESPACE) \ - case CASE: \ - return reinterpret_cast(desc) \ - ->calculate(workspace, workspace_size, c, {a, b}, stream) - - switch (desc->device_type) { - -#ifdef ENABLE_CPU_API - CALCULATE(INFINI_DEVICE_CPU, cpu); -#endif -#ifdef ENABLE_NVIDIA_API - CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); -#endif -#ifdef ENABLE_ILUVATAR_API - CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); -#endif -#ifdef ENABLE_QY_API - CALCULATE(INFINI_DEVICE_QY, nvidia); -#endif -#ifdef ENABLE_METAX_API - CALCULATE(INFINI_DEVICE_METAX, metax); -#endif -#ifdef ENABLE_KUNLUN_API - CALCULATE(INFINI_DEVICE_KUNLUN, kunlun); -#endif -#ifdef ENABLE_CAMBRICON_API - CALCULATE(INFINI_DEVICE_CAMBRICON, bang); -#endif -#ifdef ENABLE_MOORE_API - CALCULATE(INFINI_DEVICE_MOORE, moore); -#endif - - default: - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; - } - -#undef CALCULATE -} - -__C infiniStatus_t -infiniopDestroyMaxDescriptor(infiniopMaxDescriptor_t desc) { - -#define DELETE(CASE, NAMESPACE) \ - case CASE: \ - delete reinterpret_cast(desc); \ - return INFINI_STATUS_SUCCESS - - switch (desc->device_type) { - -#ifdef ENABLE_CPU_API - DELETE(INFINI_DEVICE_CPU, cpu); -#endif -#ifdef ENABLE_NVIDIA_API - DELETE(INFINI_DEVICE_NVIDIA, nvidia); -#endif -#ifdef ENABLE_ILUVATAR_API - DELETE(INFINI_DEVICE_ILUVATAR, nvidia); -#endif -#ifdef ENABLE_QY_API - DELETE(INFINI_DEVICE_QY, nvidia); -#endif -#ifdef ENABLE_METAX_API - DELETE(INFINI_DEVICE_METAX, metax); -#endif -#ifdef ENABLE_KUNLUN_API - DELETE(INFINI_DEVICE_KUNLUN, kunlun); -#endif -#ifdef ENABLE_CAMBRICON_API - DELETE(INFINI_DEVICE_CAMBRICON, bang); -#endif -#ifdef ENABLE_MOORE_API - DELETE(INFINI_DEVICE_MOORE, moore); -#endif - - default: - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; - } -#undef DELETE -} +BINARY_OP_IMPL(max, Max) diff --git a/src/infiniop/ops/min/cpu/min_cpu.cc b/src/infiniop/ops/min/cpu/min_cpu.cc index dc30ee57f..1bac9ea61 100644 --- a/src/infiniop/ops/min/cpu/min_cpu.cc +++ b/src/infiniop/ops/min/cpu/min_cpu.cc @@ -1,50 +1,8 @@ #include "min_cpu.h" +#include 
"../../../elementwise/cpu/elementwise_cpu_impl.h" namespace op::min::cpu { -Descriptor::~Descriptor() = default; +ELEMENTWISE_CPU_IMPL_BINARY(min) -infiniStatus_t Descriptor::create( - infiniopHandle_t handle_, - Descriptor **desc_ptr, - infiniopTensorDescriptor_t out_desc, - std::vector input_desc_vec) { - - auto handle = reinterpret_cast(handle_); - auto dtype = out_desc->dtype(); - - const auto &a_desc = input_desc_vec.at(0); - const auto &b_desc = input_desc_vec.at(1); - const auto &c_shape = out_desc->shape(); - const auto &a_shape = a_desc->shape(); - const auto &b_shape = b_desc->shape(); - - CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32); - - CHECK_SAME_SHAPE(c_shape, a_shape, b_shape); - - // create CPU elementwise descriptor - CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec); - - return INFINI_STATUS_SUCCESS; -} - -infiniStatus_t Descriptor::calculate( - void *workspace, - size_t workspace_size, - void *output, - std::vector inputs, - void *stream) const { - - switch (_dtype) { - case INFINI_DTYPE_F16: - return _device_info->calculate(_info, output, inputs, stream); - case INFINI_DTYPE_F32: - return _device_info->calculate(_info, output, inputs, stream); - default: - return INFINI_STATUS_BAD_TENSOR_DTYPE; - } - - return INFINI_STATUS_SUCCESS; -} } // namespace op::min::cpu diff --git a/src/infiniop/ops/min/cpu/min_cpu.h b/src/infiniop/ops/min/cpu/min_cpu.h index 1c84d4fca..74042db50 100644 --- a/src/infiniop/ops/min/cpu/min_cpu.h +++ b/src/infiniop/ops/min/cpu/min_cpu.h @@ -1,20 +1,9 @@ #ifndef __MIN_CPU_H__ #define __MIN_CPU_H__ +#include "../../../elementwise/binary.h" #include "../../../elementwise/cpu/elementwise_cpu.h" -#include -ELEMENTWISE_DESCRIPTOR(min, cpu) - -namespace op::min::cpu { -typedef struct MinOp { -public: - static constexpr size_t num_inputs = 2; - template - T operator()(const T &a, const T &b) const { - return std::min(a, b); - } -} MinOp; -} // namespace op::min::cpu +BINARY_ELEMENTWISE_DESCRIPTOR(min, cpu, op::elementwise::binary::BinaryMode::Min) #endif // __MIN_CPU_H__ diff --git a/src/infiniop/ops/min/cuda/kernel.cuh b/src/infiniop/ops/min/cuda/kernel.cuh index aac14a0e8..75c6ab6b9 100644 --- a/src/infiniop/ops/min/cuda/kernel.cuh +++ b/src/infiniop/ops/min/cuda/kernel.cuh @@ -1,23 +1,10 @@ #ifndef __MIN_CUDA_H__ #define __MIN_CUDA_H__ +#include "../../../elementwise/binary.h" + namespace op::min::cuda { -typedef struct MinOp { -public: - static constexpr size_t num_inputs = 2; - template - __device__ __forceinline__ T operator()(const T &a, const T &b) const { - if constexpr (std::is_same_v) { - return __hmin2(a, b); - } else if constexpr (std::is_same_v || std::is_same_v) { - return a < b ? a : b; - } else if constexpr (std::is_same_v) { - return fminf(a, b); - } else { - return a < b ? 
a : b; - } - } -} MinOp; +using Op = op::elementwise::binary::cuda::BinaryOp; } // namespace op::min::cuda #endif // __MIN_CUDA_H__ diff --git a/src/infiniop/ops/min/nvidia/min_nvidia.cu b/src/infiniop/ops/min/nvidia/min_nvidia.cu index 419655e29..0708cbcaf 100644 --- a/src/infiniop/ops/min/nvidia/min_nvidia.cu +++ b/src/infiniop/ops/min/nvidia/min_nvidia.cu @@ -1,57 +1,10 @@ -#include "../../../elementwise/nvidia/elementwise_nvidia.cuh" +#include "../../../elementwise/nvidia/elementwise_nvidia_impl.cuh" #include "../cuda/kernel.cuh" #include "min_nvidia.cuh" namespace op::min::nvidia { -Descriptor::~Descriptor() = default; +ELEMENTWISE_NVIDIA_IMPL_BINARY(min) -infiniStatus_t Descriptor::create( - infiniopHandle_t handle_, - Descriptor **desc_ptr, - infiniopTensorDescriptor_t out_desc, - std::vector input_desc_vec) { - - auto handle = reinterpret_cast(handle_); - auto dtype = out_desc->dtype(); - - const auto &a_desc = input_desc_vec.at(0); - const auto &b_desc = input_desc_vec.at(1); - const auto &c_shape = out_desc->shape(); - const auto &a_shape = a_desc->shape(); - const auto &b_shape = b_desc->shape(); - - CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32); - - CHECK_SAME_SHAPE(c_shape, a_shape, b_shape); - - // create CUDA elementwise descriptor - CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec) - - return INFINI_STATUS_SUCCESS; -} - -infiniStatus_t Descriptor::calculate( - void *workspace, - size_t workspace_size, - void *output, - std::vector inputs, - void *stream) const { - - if (workspace_size < _workspace_size) { - return INFINI_STATUS_INSUFFICIENT_WORKSPACE; - } - - switch (_dtype) { - case INFINI_DTYPE_F16: - return _device_info->calculate<256, cuda::MinOp, half>(_info, workspace, output, inputs, stream); - case INFINI_DTYPE_F32: - return _device_info->calculate<256, cuda::MinOp, float>(_info, workspace, output, inputs, stream); - default: - return INFINI_STATUS_BAD_TENSOR_DTYPE; - } - - return INFINI_STATUS_SUCCESS; -} } // namespace op::min::nvidia diff --git a/src/infiniop/ops/min/operator.cc b/src/infiniop/ops/min/operator.cc index 8479feab4..6f67ecf87 100644 --- a/src/infiniop/ops/min/operator.cc +++ b/src/infiniop/ops/min/operator.cc @@ -1,5 +1,4 @@ -#include "../../operator.h" -#include "../../handle.h" +#include "../../operator_impl.h" #include "infiniop/ops/min.h" #ifdef ENABLE_CPU_API @@ -8,195 +7,5 @@ #if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API) #include "nvidia/min_nvidia.cuh" #endif -#ifdef ENABLE_METAX_API -#include "metax/min_metax.h" -#endif -#ifdef ENABLE_KUNLUN_API -#include "kunlun/min_kunlun.h" -#endif -#ifdef ENABLE_CAMBRICON_API -#include "bang/min_bang.h" -#endif -#ifdef ENABLE_MOORE_API -#include "moore/min_moore.h" -#endif - -__C infiniStatus_t infiniopCreateMinDescriptor( - infiniopHandle_t handle, - infiniopMinDescriptor_t *desc_ptr, - infiniopTensorDescriptor_t c_desc, - infiniopTensorDescriptor_t a_desc, - infiniopTensorDescriptor_t b_desc) { - -#define CREATE(CASE, NAMESPACE) \ - case CASE: \ - return op::min::NAMESPACE::Descriptor::create( \ - handle, \ - reinterpret_cast(desc_ptr), \ - c_desc, \ - {a_desc, \ - b_desc}) - - switch (handle->device) { - -#ifdef ENABLE_CPU_API - CREATE(INFINI_DEVICE_CPU, cpu); -#endif -#ifdef ENABLE_NVIDIA_API - CREATE(INFINI_DEVICE_NVIDIA, nvidia); -#endif -#ifdef ENABLE_ILUVATAR_API - CREATE(INFINI_DEVICE_ILUVATAR, nvidia); -#endif -#ifdef ENABLE_QY_API - CREATE(INFINI_DEVICE_QY, nvidia); -#endif -#ifdef ENABLE_METAX_API - 
CREATE(INFINI_DEVICE_METAX, metax); -#endif -#ifdef ENABLE_KUNLUN_API - CREATE(INFINI_DEVICE_KUNLUN, kunlun); -#endif -#ifdef ENABLE_CAMBRICON_API - CREATE(INFINI_DEVICE_CAMBRICON, bang); -#endif -#ifdef ENABLE_MOORE_API - CREATE(INFINI_DEVICE_MOORE, moore); -#endif - - default: - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; - } - -#undef CREATE -} - -__C infiniStatus_t infiniopGetMinWorkspaceSize(infiniopMinDescriptor_t desc, size_t *size) { - -#define GET(CASE, NAMESPACE) \ - case CASE: \ - *size = reinterpret_cast(desc)->workspaceSize(); \ - return INFINI_STATUS_SUCCESS - - switch (desc->device_type) { -#ifdef ENABLE_CPU_API - GET(INFINI_DEVICE_CPU, cpu); -#endif -#ifdef ENABLE_NVIDIA_API - GET(INFINI_DEVICE_NVIDIA, nvidia); -#endif -#ifdef ENABLE_ILUVATAR_API - GET(INFINI_DEVICE_ILUVATAR, nvidia); -#endif -#ifdef ENABLE_QY_API - GET(INFINI_DEVICE_QY, nvidia); -#endif -#ifdef ENABLE_METAX_API - GET(INFINI_DEVICE_METAX, metax); -#endif -#ifdef ENABLE_KUNLUN_API - GET(INFINI_DEVICE_KUNLUN, kunlun); -#endif -#ifdef ENABLE_CAMBRICON_API - GET(INFINI_DEVICE_CAMBRICON, bang); -#endif -#ifdef ENABLE_MOORE_API - GET(INFINI_DEVICE_MOORE, moore); -#endif - default: - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; - } -#undef GET - - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; -} - -__C infiniStatus_t infiniopMin( - infiniopMinDescriptor_t desc, - void *workspace, - size_t workspace_size, - void *c, - const void *a, - const void *b, - void *stream) { - -#define CALCULATE(CASE, NAMESPACE) \ - case CASE: \ - return reinterpret_cast(desc) \ - ->calculate(workspace, workspace_size, c, {a, b}, stream) - - switch (desc->device_type) { - -#ifdef ENABLE_CPU_API - CALCULATE(INFINI_DEVICE_CPU, cpu); -#endif -#ifdef ENABLE_NVIDIA_API - CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); -#endif -#ifdef ENABLE_ILUVATAR_API - CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); -#endif -#ifdef ENABLE_QY_API - CALCULATE(INFINI_DEVICE_QY, nvidia); -#endif -#ifdef ENABLE_METAX_API - CALCULATE(INFINI_DEVICE_METAX, metax); -#endif -#ifdef ENABLE_KUNLUN_API - CALCULATE(INFINI_DEVICE_KUNLUN, kunlun); -#endif -#ifdef ENABLE_CAMBRICON_API - CALCULATE(INFINI_DEVICE_CAMBRICON, bang); -#endif -#ifdef ENABLE_MOORE_API - CALCULATE(INFINI_DEVICE_MOORE, moore); -#endif - - default: - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; - } - -#undef CALCULATE -} - -__C infiniStatus_t -infiniopDestroyMinDescriptor(infiniopMinDescriptor_t desc) { - -#define DELETE(CASE, NAMESPACE) \ - case CASE: \ - delete reinterpret_cast(desc); \ - return INFINI_STATUS_SUCCESS - - switch (desc->device_type) { - -#ifdef ENABLE_CPU_API - DELETE(INFINI_DEVICE_CPU, cpu); -#endif -#ifdef ENABLE_NVIDIA_API - DELETE(INFINI_DEVICE_NVIDIA, nvidia); -#endif -#ifdef ENABLE_ILUVATAR_API - DELETE(INFINI_DEVICE_ILUVATAR, nvidia); -#endif -#ifdef ENABLE_QY_API - DELETE(INFINI_DEVICE_QY, nvidia); -#endif -#ifdef ENABLE_METAX_API - DELETE(INFINI_DEVICE_METAX, metax); -#endif -#ifdef ENABLE_KUNLUN_API - DELETE(INFINI_DEVICE_KUNLUN, kunlun); -#endif -#ifdef ENABLE_CAMBRICON_API - DELETE(INFINI_DEVICE_CAMBRICON, bang); -#endif -#ifdef ENABLE_MOORE_API - DELETE(INFINI_DEVICE_MOORE, moore); -#endif - - default: - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; - } -#undef DELETE -} +BINARY_OP_IMPL(min, Min) diff --git a/src/infiniop/ops/mod/cpu/mod_cpu.cc b/src/infiniop/ops/mod/cpu/mod_cpu.cc index 907d05166..609c2e76e 100644 --- a/src/infiniop/ops/mod/cpu/mod_cpu.cc +++ b/src/infiniop/ops/mod/cpu/mod_cpu.cc @@ -1,49 +1,8 @@ #include "mod_cpu.h" +#include 
"../../../elementwise/cpu/elementwise_cpu_impl.h" namespace op::mod::cpu { -Descriptor::~Descriptor() = default; +ELEMENTWISE_CPU_IMPL_BINARY(mod) -infiniStatus_t Descriptor::create( - infiniopHandle_t handle_, - Descriptor **desc_ptr, - infiniopTensorDescriptor_t out_desc, - std::vector input_desc_vec) { - - auto handle = reinterpret_cast(handle_); - auto dtype = out_desc->dtype(); - - const auto &a_desc = input_desc_vec.at(0); - const auto &b_desc = input_desc_vec.at(1); - const auto &out_shape = out_desc->shape(); - const auto &a_shape = a_desc->shape(); - const auto &b_shape = b_desc->shape(); - - CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32); - - CHECK_SAME_SHAPE(out_shape, a_shape, b_shape); - - // create CPU elementwise descriptor - CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec); - - return INFINI_STATUS_SUCCESS; -} - -infiniStatus_t Descriptor::calculate( - void *workspace, - size_t workspace_size, - void *output, - std::vector inputs, - void *stream) const { - - switch (_dtype) { - case INFINI_DTYPE_F16: - return _device_info->calculate(_info, output, inputs, stream); - case INFINI_DTYPE_F32: - return _device_info->calculate(_info, output, inputs, stream); - default: - return INFINI_STATUS_BAD_TENSOR_DTYPE; - } - return INFINI_STATUS_SUCCESS; -} } // namespace op::mod::cpu diff --git a/src/infiniop/ops/mod/cpu/mod_cpu.h b/src/infiniop/ops/mod/cpu/mod_cpu.h index 9e78adca6..72ea7dede 100644 --- a/src/infiniop/ops/mod/cpu/mod_cpu.h +++ b/src/infiniop/ops/mod/cpu/mod_cpu.h @@ -1,23 +1,9 @@ #ifndef __MOD_CPU_H__ #define __MOD_CPU_H__ +#include "../../../elementwise/binary.h" #include "../../../elementwise/cpu/elementwise_cpu.h" -ELEMENTWISE_DESCRIPTOR(mod, cpu) - -namespace op::mod::cpu { -typedef struct ModOp { -public: - static constexpr size_t num_inputs = 2; - template - T operator()(const T &a, const T &b) const { - if constexpr (std::is_floating_point_v) { - return std::fmod(a, b); - } else { - return a % b; - } - } -} ModOp; -} // namespace op::mod::cpu +BINARY_ELEMENTWISE_DESCRIPTOR(mod, cpu, op::elementwise::binary::BinaryMode::Mod) #endif // __MOD_CPU_H__ diff --git a/src/infiniop/ops/mod/cuda/kernel.cuh b/src/infiniop/ops/mod/cuda/kernel.cuh index 0dcb54136..164784081 100644 --- a/src/infiniop/ops/mod/cuda/kernel.cuh +++ b/src/infiniop/ops/mod/cuda/kernel.cuh @@ -1,30 +1,10 @@ #ifndef __MOD_CUDA_H__ #define __MOD_CUDA_H__ -#include -#include +#include "../../../elementwise/binary.h" namespace op::mod::cuda { -typedef struct ModOp { -public: - static constexpr size_t num_inputs = 2; - template - __device__ __forceinline__ T operator()(const T &a, const T &b) const { - if constexpr (std::is_same_v) { - float2 a_f2 = __half22float2(a); - float2 b_f2 = __half22float2(b); - return __float22half2_rn(make_float2(std::fmod(a_f2.x, b_f2.x), std::fmod(a_f2.y, b_f2.y))); - } else if constexpr (std::is_same_v) { - float a_ = __half2float(a); - float b_ = __half2float(b); - return __float2half(std::fmod(a_, b_)); - } else if constexpr (std::is_floating_point_v) { - return std::fmod(a, b); - } else { - return a % b; - } - } -} ModOp; +using Op = op::elementwise::binary::cuda::BinaryOp; } // namespace op::mod::cuda #endif // __MOD_CUDA_H__ diff --git a/src/infiniop/ops/mod/nvidia/mod_nvidia.cu b/src/infiniop/ops/mod/nvidia/mod_nvidia.cu index 64326d441..68b78ee70 100644 --- a/src/infiniop/ops/mod/nvidia/mod_nvidia.cu +++ b/src/infiniop/ops/mod/nvidia/mod_nvidia.cu @@ -1,57 +1,10 @@ -#include "../../../elementwise/nvidia/elementwise_nvidia.cuh" 
+#include "../../../elementwise/nvidia/elementwise_nvidia_impl.cuh" #include "../cuda/kernel.cuh" #include "mod_nvidia.cuh" namespace op::mod::nvidia { -Descriptor::~Descriptor() = default; +ELEMENTWISE_NVIDIA_IMPL_BINARY(mod) -infiniStatus_t Descriptor::create( - infiniopHandle_t handle_, - Descriptor **desc_ptr, - infiniopTensorDescriptor_t out_desc, - std::vector input_desc_vec) { - - auto handle = reinterpret_cast(handle_); - auto dtype = out_desc->dtype(); - - const auto &a_desc = input_desc_vec.at(0); - const auto &b_desc = input_desc_vec.at(1); - const auto &c_shape = out_desc->shape(); - const auto &a_shape = a_desc->shape(); - const auto &b_shape = b_desc->shape(); - - CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32); - - CHECK_SAME_SHAPE(c_shape, a_shape, b_shape); - - // create CUDA elementwise descriptor - CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec) - - return INFINI_STATUS_SUCCESS; -} - -infiniStatus_t Descriptor::calculate( - void *workspace, - size_t workspace_size, - void *output, - std::vector inputs, - void *stream) const { - - if (workspace_size < _workspace_size) { - return INFINI_STATUS_INSUFFICIENT_WORKSPACE; - } - - switch (_dtype) { - case INFINI_DTYPE_F16: - return _device_info->calculate<256, cuda::ModOp, half>(_info, workspace, output, inputs, stream); - case INFINI_DTYPE_F32: - return _device_info->calculate<256, cuda::ModOp, float>(_info, workspace, output, inputs, stream); - default: - return INFINI_STATUS_BAD_TENSOR_DTYPE; - } - - return INFINI_STATUS_SUCCESS; -} } // namespace op::mod::nvidia diff --git a/src/infiniop/ops/mod/operator.cc b/src/infiniop/ops/mod/operator.cc index 85810e794..aef892ce1 100644 --- a/src/infiniop/ops/mod/operator.cc +++ b/src/infiniop/ops/mod/operator.cc @@ -1,5 +1,4 @@ -#include "../../operator.h" -#include "../../handle.h" +#include "../../operator_impl.h" #include "infiniop/ops/mod.h" #ifdef ENABLE_CPU_API @@ -9,134 +8,4 @@ #include "nvidia/mod_nvidia.cuh" #endif -__C infiniStatus_t infiniopCreateModDescriptor( - infiniopHandle_t handle, - infiniopModDescriptor_t *desc_ptr, - infiniopTensorDescriptor_t c_desc, - infiniopTensorDescriptor_t a_desc, - infiniopTensorDescriptor_t b_desc) { - -#define CREATE(CASE, NAMESPACE) \ - case CASE: \ - return op::mod::NAMESPACE::Descriptor::create( \ - handle, \ - reinterpret_cast(desc_ptr), \ - c_desc, \ - {a_desc, \ - b_desc}) - - switch (handle->device) { - -#ifdef ENABLE_CPU_API - CREATE(INFINI_DEVICE_CPU, cpu); -#endif -#ifdef ENABLE_NVIDIA_API - CREATE(INFINI_DEVICE_NVIDIA, nvidia); -#endif -#ifdef ENABLE_ILUVATAR_API - CREATE(INFINI_DEVICE_ILUVATAR, nvidia); -#endif -#ifdef ENABLE_QY_API - CREATE(INFINI_DEVICE_QY, nvidia); -#endif - - default: - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; - } - -#undef CREATE -} - -__C infiniStatus_t infiniopGetModWorkspaceSize(infiniopModDescriptor_t desc, size_t *size) { - -#define GET(CASE, NAMESPACE) \ - case CASE: \ - *size = reinterpret_cast(desc)->workspaceSize(); \ - return INFINI_STATUS_SUCCESS - - switch (desc->device_type) { -#ifdef ENABLE_CPU_API - GET(INFINI_DEVICE_CPU, cpu); -#endif -#ifdef ENABLE_NVIDIA_API - GET(INFINI_DEVICE_NVIDIA, nvidia); -#endif -#ifdef ENABLE_ILUVATAR_API - GET(INFINI_DEVICE_ILUVATAR, nvidia); -#endif -#ifdef ENABLE_QY_API - GET(INFINI_DEVICE_QY, nvidia); -#endif - default: - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; - } -#undef GET - - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; -} - -__C infiniStatus_t infiniopMod( - infiniopModDescriptor_t desc, - void 
*workspace, - size_t workspace_size, - void *c, - const void *a, - const void *b, - void *stream) { - -#define CALCULATE(CASE, NAMESPACE) \ - case CASE: \ - return reinterpret_cast(desc) \ - ->calculate(workspace, workspace_size, c, {a, b}, stream) - - switch (desc->device_type) { - -#ifdef ENABLE_CPU_API - CALCULATE(INFINI_DEVICE_CPU, cpu); -#endif -#ifdef ENABLE_NVIDIA_API - CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); -#endif -#ifdef ENABLE_ILUVATAR_API - CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); -#endif -#ifdef ENABLE_QY_API - CALCULATE(INFINI_DEVICE_QY, nvidia); -#endif - - default: - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; - } - -#undef CALCULATE -} - -__C infiniStatus_t -infiniopDestroyModDescriptor(infiniopModDescriptor_t desc) { - -#define DELETE(CASE, NAMESPACE) \ - case CASE: \ - delete reinterpret_cast(desc); \ - return INFINI_STATUS_SUCCESS - - switch (desc->device_type) { - -#ifdef ENABLE_CPU_API - DELETE(INFINI_DEVICE_CPU, cpu); -#endif -#ifdef ENABLE_NVIDIA_API - DELETE(INFINI_DEVICE_NVIDIA, nvidia); -#endif -#ifdef ENABLE_ILUVATAR_API - DELETE(INFINI_DEVICE_ILUVATAR, nvidia); -#endif -#ifdef ENABLE_QY_API - DELETE(INFINI_DEVICE_QY, nvidia); -#endif - - default: - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; - } - -#undef DELETE -} +BINARY_OP_IMPL(mod, Mod) diff --git a/src/infiniop/ops/neg/cpu/neg_cpu.cc b/src/infiniop/ops/neg/cpu/neg_cpu.cc index 5da2ae4c3..47f4d2b2e 100644 --- a/src/infiniop/ops/neg/cpu/neg_cpu.cc +++ b/src/infiniop/ops/neg/cpu/neg_cpu.cc @@ -1,48 +1,8 @@ #include "neg_cpu.h" +#include "../../../elementwise/cpu/elementwise_cpu_impl.h" namespace op::neg::cpu { -Descriptor::~Descriptor() = default; +ELEMENTWISE_CPU_IMPL_UNARY(neg) -infiniStatus_t Descriptor::create( - infiniopHandle_t handle_, - Descriptor **desc_ptr, - infiniopTensorDescriptor_t out_desc, - std::vector input_desc_vec) { - - auto handle = reinterpret_cast(handle_); - auto dtype = out_desc->dtype(); - - const auto &x_desc = input_desc_vec.at(0); - const auto &y_shape = out_desc->shape(); - const auto &x_shape = x_desc->shape(); - - CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32); - - CHECK_SAME_SHAPE(y_shape, x_shape); - - // create CPU elementwise descriptor - CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec); - - return INFINI_STATUS_SUCCESS; -} - -infiniStatus_t Descriptor::calculate( - void *workspace, - size_t workspace_size, - void *output, - std::vector inputs, - void *stream) const { - - switch (_dtype) { - case INFINI_DTYPE_F16: - return _device_info->calculate(_info, output, inputs, stream); - case INFINI_DTYPE_F32: - return _device_info->calculate(_info, output, inputs, stream); - default: - return INFINI_STATUS_BAD_TENSOR_DTYPE; - } - - return INFINI_STATUS_SUCCESS; -} } // namespace op::neg::cpu diff --git a/src/infiniop/ops/neg/cpu/neg_cpu.h b/src/infiniop/ops/neg/cpu/neg_cpu.h index ea45989b3..f6778a6d3 100644 --- a/src/infiniop/ops/neg/cpu/neg_cpu.h +++ b/src/infiniop/ops/neg/cpu/neg_cpu.h @@ -2,19 +2,8 @@ #define __NEG_CPU_H__ #include "../../../elementwise/cpu/elementwise_cpu.h" +#include "../../../elementwise/unary.h" -ELEMENTWISE_DESCRIPTOR(neg, cpu) - -namespace op::neg::cpu { -typedef struct NegOp { -public: - static constexpr size_t num_inputs = 1; - - template - T operator()(const T &x) const { - return -x; - } -} NegOp; -} // namespace op::neg::cpu +UNARY_ELEMENTWISE_DESCRIPTOR(neg, cpu, op::elementwise::unary::UnaryMode::Neg) #endif // __NEG_CPU_H__ diff --git a/src/infiniop/ops/neg/cuda/kernel.cuh 
b/src/infiniop/ops/neg/cuda/kernel.cuh index 57904b3df..f5cf5a449 100644 --- a/src/infiniop/ops/neg/cuda/kernel.cuh +++ b/src/infiniop/ops/neg/cuda/kernel.cuh @@ -1,23 +1,10 @@ #ifndef __NEG_CUDA_H__ #define __NEG_CUDA_H__ -#include +#include "../../../elementwise/unary.h" namespace op::neg::cuda { -typedef struct NegOp { -public: - static constexpr size_t num_inputs = 1; - template - __device__ __forceinline__ T operator()(const T &x) const { - if constexpr (std::is_same_v) { - return __hneg2(x); - } else if constexpr (std::is_same_v) { - return __hneg(x); - } else { - return -x; - } - } -} NegOp; +using Op = op::elementwise::unary::cuda::UnaryOp; } // namespace op::neg::cuda #endif // __NEG_CUDA_H__ diff --git a/src/infiniop/ops/neg/nvidia/neg_nvidia.cu b/src/infiniop/ops/neg/nvidia/neg_nvidia.cu index d18b8bf25..f568585f0 100644 --- a/src/infiniop/ops/neg/nvidia/neg_nvidia.cu +++ b/src/infiniop/ops/neg/nvidia/neg_nvidia.cu @@ -1,54 +1,10 @@ -#include "../../../elementwise/nvidia/elementwise_nvidia.cuh" +#include "../../../elementwise/nvidia/elementwise_nvidia_impl.cuh" #include "../cuda/kernel.cuh" #include "neg_nvidia.cuh" namespace op::neg::nvidia { -Descriptor::~Descriptor() = default; +ELEMENTWISE_NVIDIA_IMPL_UNARY(neg) -infiniStatus_t Descriptor::create( - infiniopHandle_t handle_, - Descriptor **desc_ptr, - infiniopTensorDescriptor_t out_desc, - std::vector input_desc_vec) { - - auto handle = reinterpret_cast(handle_); - auto dtype = out_desc->dtype(); - - const auto &x_desc = input_desc_vec.at(0); - const auto &y_shape = out_desc->shape(); - const auto &x_shape = x_desc->shape(); - - CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32); - - CHECK_SAME_SHAPE(y_shape, x_shape); - - // create CUDA elementwise descriptor - CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec) - - return INFINI_STATUS_SUCCESS; -} - -infiniStatus_t Descriptor::calculate( - void *workspace, - size_t workspace_size, - void *output, - std::vector inputs, - void *stream) const { - if (workspace_size < _workspace_size) { - return INFINI_STATUS_INSUFFICIENT_WORKSPACE; - } - - switch (_dtype) { - case INFINI_DTYPE_F16: - return _device_info->calculate<256, cuda::NegOp, half>(_info, workspace, output, inputs, stream); - case INFINI_DTYPE_F32: - return _device_info->calculate<256, cuda::NegOp, float>(_info, workspace, output, inputs, stream); - default: - return INFINI_STATUS_BAD_TENSOR_DTYPE; - } - - return INFINI_STATUS_SUCCESS; -} } // namespace op::neg::nvidia diff --git a/src/infiniop/ops/neg/operator.cc b/src/infiniop/ops/neg/operator.cc index d4134df3e..c3945f4bb 100644 --- a/src/infiniop/ops/neg/operator.cc +++ b/src/infiniop/ops/neg/operator.cc @@ -1,5 +1,4 @@ -#include "../../operator.h" -#include "../../handle.h" +#include "../../operator_impl.h" #include "infiniop/ops/neg.h" #ifdef ENABLE_CPU_API @@ -9,131 +8,4 @@ #include "nvidia/neg_nvidia.cuh" #endif -__C infiniStatus_t infiniopCreateNegDescriptor( - infiniopHandle_t handle, - infiniopNegDescriptor_t *desc_ptr, - infiniopTensorDescriptor_t y_desc, - infiniopTensorDescriptor_t x_desc) { - -#define CREATE(CASE, NAMESPACE) \ - case CASE: \ - return op::neg::NAMESPACE::Descriptor::create( \ - handle, \ - reinterpret_cast(desc_ptr), \ - y_desc, \ - {x_desc}) - - switch (handle->device) { - -#ifdef ENABLE_CPU_API - CREATE(INFINI_DEVICE_CPU, cpu); -#endif -#ifdef ENABLE_NVIDIA_API - CREATE(INFINI_DEVICE_NVIDIA, nvidia); -#endif -#ifdef ENABLE_ILUVATAR_API - CREATE(INFINI_DEVICE_ILUVATAR, nvidia); -#endif -#ifdef ENABLE_QY_API - 
CREATE(INFINI_DEVICE_QY, nvidia); -#endif - - default: - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; - } - -#undef CREATE -} - -__C infiniStatus_t infiniopGetNegWorkspaceSize(infiniopNegDescriptor_t desc, size_t *size) { - -#define GET(CASE, NAMESPACE) \ - case CASE: \ - *size = reinterpret_cast(desc)->workspaceSize(); \ - return INFINI_STATUS_SUCCESS; - - switch (desc->device_type) { -#ifdef ENABLE_CPU_API - GET(INFINI_DEVICE_CPU, cpu) -#endif -#ifdef ENABLE_NVIDIA_API - GET(INFINI_DEVICE_NVIDIA, nvidia) -#endif -#ifdef ENABLE_ILUVATAR_API - GET(INFINI_DEVICE_ILUVATAR, nvidia) -#endif -#ifdef ENABLE_QY_API - GET(INFINI_DEVICE_QY, nvidia) -#endif - default: - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; - } -#undef GET - - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; -} - -__C infiniStatus_t infiniopNeg( - infiniopNegDescriptor_t desc, - void *workspace, - size_t workspace_size, - void *y, - const void *x, - void *stream) { - -#define CALCULATE(CASE, NAMESPACE) \ - case CASE: \ - return reinterpret_cast(desc) \ - ->calculate(workspace, workspace_size, y, {x}, stream) - - switch (desc->device_type) { - -#ifdef ENABLE_CPU_API - CALCULATE(INFINI_DEVICE_CPU, cpu); -#endif -#ifdef ENABLE_NVIDIA_API - CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); -#endif -#ifdef ENABLE_ILUVATAR_API - CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); -#endif -#ifdef ENABLE_QY_API - CALCULATE(INFINI_DEVICE_QY, nvidia); -#endif - - default: - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; - } - -#undef CALCULATE -} - -__C infiniStatus_t -infiniopDestroyNegDescriptor(infiniopNegDescriptor_t desc) { - -#define DELETE(CASE, NAMESPACE) \ - case CASE: \ - delete reinterpret_cast(desc); \ - return INFINI_STATUS_SUCCESS - - switch (desc->device_type) { - -#ifdef ENABLE_CPU_API - DELETE(INFINI_DEVICE_CPU, cpu); -#endif -#ifdef ENABLE_NVIDIA_API - DELETE(INFINI_DEVICE_NVIDIA, nvidia); -#endif -#ifdef ENABLE_ILUVATAR_API - DELETE(INFINI_DEVICE_ILUVATAR, nvidia); -#endif -#ifdef ENABLE_QY_API - DELETE(INFINI_DEVICE_QY, nvidia); -#endif - - default: - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; - } - -#undef DELETE -} +UNARY_OP_IMPL(neg, Neg) diff --git a/src/infiniop/ops/pow/cpu/pow_cpu.cc b/src/infiniop/ops/pow/cpu/pow_cpu.cc index 0c6fda0f7..1134d8aae 100644 --- a/src/infiniop/ops/pow/cpu/pow_cpu.cc +++ b/src/infiniop/ops/pow/cpu/pow_cpu.cc @@ -1,49 +1,8 @@ #include "pow_cpu.h" +#include "../../../elementwise/cpu/elementwise_cpu_impl.h" namespace op::pow::cpu { -Descriptor::~Descriptor() = default; +ELEMENTWISE_CPU_IMPL_BINARY(pow) -infiniStatus_t Descriptor::create( - infiniopHandle_t handle_, - Descriptor **desc_ptr, - infiniopTensorDescriptor_t out_desc, - std::vector input_desc_vec) { - - auto handle = reinterpret_cast(handle_); - auto dtype = out_desc->dtype(); - - const auto &a_desc = input_desc_vec.at(0); - const auto &b_desc = input_desc_vec.at(1); - const auto &out_shape = out_desc->shape(); - const auto &a_shape = a_desc->shape(); - const auto &b_shape = b_desc->shape(); - - CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32); - - CHECK_SAME_SHAPE(out_shape, a_shape, b_shape); - - // create CPU elementwise descriptor - CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec); - - return INFINI_STATUS_SUCCESS; -} - -infiniStatus_t Descriptor::calculate( - void *workspace, - size_t workspace_size, - void *output, - std::vector inputs, - void *stream) const { - - switch (_dtype) { - case INFINI_DTYPE_F16: - return _device_info->calculate(_info, output, inputs, stream); - case INFINI_DTYPE_F32: 
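
On the CPU side, ELEMENTWISE_CPU_IMPL_BINARY(pow) presumably regenerates the same dtype switch being deleted here, parameterized by the shared functor instead of the per-op PowOp. A sketch under that assumption; the cpu::Op alias and the fp16_t host half type are names inferred from the surrounding code, not shown by this patch:

    infiniStatus_t Descriptor::calculate(
        void *workspace, size_t workspace_size, void *output,
        std::vector<const void *> inputs, void *stream) const {
        switch (_dtype) {
        case INFINI_DTYPE_F16:
            // the elementwise loop applies the functor to each (a[i], b[i]) pair
            return _device_info->calculate<cpu::Op, fp16_t>(_info, output, inputs, stream);
        case INFINI_DTYPE_F32:
            return _device_info->calculate<cpu::Op, float>(_info, output, inputs, stream);
        default:
            return INFINI_STATUS_BAD_TENSOR_DTYPE;
        }
    }
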
- return _device_info->calculate(_info, output, inputs, stream); - default: - return INFINI_STATUS_BAD_TENSOR_DTYPE; - } - return INFINI_STATUS_SUCCESS; -} } // namespace op::pow::cpu diff --git a/src/infiniop/ops/pow/cpu/pow_cpu.h b/src/infiniop/ops/pow/cpu/pow_cpu.h index 21d9bb897..9c8e8a368 100644 --- a/src/infiniop/ops/pow/cpu/pow_cpu.h +++ b/src/infiniop/ops/pow/cpu/pow_cpu.h @@ -1,19 +1,9 @@ #ifndef __POW_CPU_H__ #define __POW_CPU_H__ +#include "../../../elementwise/binary.h" #include "../../../elementwise/cpu/elementwise_cpu.h" -ELEMENTWISE_DESCRIPTOR(pow, cpu) - -namespace op::pow::cpu { -typedef struct PowOp { -public: - static constexpr size_t num_inputs = 2; - template - T operator()(const T &a, const T &b) const { - return std::pow(a, b); - } -} PowOp; -} // namespace op::pow::cpu +BINARY_ELEMENTWISE_DESCRIPTOR(pow, cpu, op::elementwise::binary::BinaryMode::Pow) #endif // __POW_CPU_H__ diff --git a/src/infiniop/ops/pow/cuda/kernel.cuh b/src/infiniop/ops/pow/cuda/kernel.cuh index 3786e7a52..0637240e8 100644 --- a/src/infiniop/ops/pow/cuda/kernel.cuh +++ b/src/infiniop/ops/pow/cuda/kernel.cuh @@ -1,40 +1,10 @@ #ifndef __POW_CUDA_H__ #define __POW_CUDA_H__ -#include -#include -#include +#include "../../../elementwise/binary.h" namespace op::pow::cuda { -typedef struct PowOp { - static constexpr size_t num_inputs = 2; - template - __device__ __forceinline__ T operator()(const T &a, const T &b) const { - if constexpr (std::is_same_v) { - float2 a_f2 = __half22float2(a); - float2 b_f2 = __half22float2(b); - return __float22half2_rn(make_float2(__powf(a_f2.x, b_f2.x), __powf(a_f2.y, b_f2.y))); - } else if constexpr (std::is_same_v) { - float a_ = __half2float(a); - float b_ = __half2float(b); - float ans_f = __powf(a_, b_); - return __float2half(isnan(ans_f) ? 
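
// ^ Worth preserving in the shared BinaryOp: the fp16 path deleted here falls
// back from the fast __powf to the accurate std::pow whenever __powf returns
// NaN (e.g. a negative base with an integral exponent, where __powf is
// undefined but std::pow is well-defined: std::pow(-2.0f, 2.0f) == 4.0f).
// Sketch of the equivalent guard, assuming BinaryMode::Pow keeps it:
//
//     float ans_f = __powf(a_, b_);
//     if (isnan(ans_f)) { ans_f = std::pow(a_, b_); }
//     return __float2half(ans_f);
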
std::pow(a_, b_) : ans_f); - } else if constexpr (std::is_same_v) { - float2 a_f2 = __bfloat1622float2(a); - float2 b_f2 = __bfloat1622float2(b); - return __floats2bfloat162_rn(__powf(a_f2.x, b_f2.x), __powf(a_f2.y, b_f2.y)); - } else if constexpr (std::is_same_v) { - float a_ = __bfloat162float(a); - float b_ = __bfloat162float(b); - return __float2bfloat16_rn(__powf(a_, b_)); - } else if constexpr (std::is_same_v) { - return __powf(a, b); - } else { - return std::pow(a, b); - } - } -} PowOp; - +using Op = op::elementwise::binary::cuda::BinaryOp; } // namespace op::pow::cuda #endif // __POW_CUDA_H__ diff --git a/src/infiniop/ops/pow/nvidia/pow_nvidia.cu b/src/infiniop/ops/pow/nvidia/pow_nvidia.cu index 3cfd0cd2f..63a3d40a3 100644 --- a/src/infiniop/ops/pow/nvidia/pow_nvidia.cu +++ b/src/infiniop/ops/pow/nvidia/pow_nvidia.cu @@ -1,57 +1,10 @@ -#include "../../../elementwise/nvidia/elementwise_nvidia.cuh" +#include "../../../elementwise/nvidia/elementwise_nvidia_impl.cuh" #include "../cuda/kernel.cuh" #include "pow_nvidia.cuh" namespace op::pow::nvidia { -Descriptor::~Descriptor() = default; +ELEMENTWISE_NVIDIA_IMPL_BINARY(pow) -infiniStatus_t Descriptor::create( - infiniopHandle_t handle_, - Descriptor **desc_ptr, - infiniopTensorDescriptor_t out_desc, - std::vector input_desc_vec) { - - auto handle = reinterpret_cast(handle_); - auto dtype = out_desc->dtype(); - - const auto &a_desc = input_desc_vec.at(0); - const auto &b_desc = input_desc_vec.at(1); - const auto &c_shape = out_desc->shape(); - const auto &a_shape = a_desc->shape(); - const auto &b_shape = b_desc->shape(); - - CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32); - - CHECK_SAME_SHAPE(c_shape, a_shape, b_shape); - - // create CUDA elementwise descriptor - CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec) - - return INFINI_STATUS_SUCCESS; -} - -infiniStatus_t Descriptor::calculate( - void *workspace, - size_t workspace_size, - void *output, - std::vector inputs, - void *stream) const { - - if (workspace_size < _workspace_size) { - return INFINI_STATUS_INSUFFICIENT_WORKSPACE; - } - - switch (_dtype) { - case INFINI_DTYPE_F16: - return _device_info->calculate<256, cuda::PowOp, half>(_info, workspace, output, inputs, stream); - case INFINI_DTYPE_F32: - return _device_info->calculate<256, cuda::PowOp, float>(_info, workspace, output, inputs, stream); - default: - return INFINI_STATUS_BAD_TENSOR_DTYPE; - } - - return INFINI_STATUS_SUCCESS; -} } // namespace op::pow::nvidia diff --git a/src/infiniop/ops/pow/operator.cc b/src/infiniop/ops/pow/operator.cc index e90639f67..b1ddbc9c1 100644 --- a/src/infiniop/ops/pow/operator.cc +++ b/src/infiniop/ops/pow/operator.cc @@ -1,5 +1,4 @@ -#include "../../operator.h" -#include "../../handle.h" +#include "../../operator_impl.h" #include "infiniop/ops/pow.h" #ifdef ENABLE_CPU_API @@ -9,134 +8,4 @@ #include "nvidia/pow_nvidia.cuh" #endif -__C infiniStatus_t infiniopCreatePowDescriptor( - infiniopHandle_t handle, - infiniopPowDescriptor_t *desc_ptr, - infiniopTensorDescriptor_t c_desc, - infiniopTensorDescriptor_t a_desc, - infiniopTensorDescriptor_t b_desc) { - -#define CREATE(CASE, NAMESPACE) \ - case CASE: \ - return op::pow::NAMESPACE::Descriptor::create( \ - handle, \ - reinterpret_cast(desc_ptr), \ - c_desc, \ - {a_desc, \ - b_desc}) - - switch (handle->device) { - -#ifdef ENABLE_CPU_API - CREATE(INFINI_DEVICE_CPU, cpu); -#endif -#ifdef ENABLE_NVIDIA_API - CREATE(INFINI_DEVICE_NVIDIA, nvidia); -#endif -#ifdef ENABLE_ILUVATAR_API - 
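
Because BINARY_OP_IMPL(pow, Pow) regenerates these exported entry points with the same names and signatures, callers of the C API are unaffected by the refactor. A hedged usage sketch of the unchanged call sequence (workspace_alloc is a hypothetical allocator; error checking omitted):

    infiniopPowDescriptor_t desc = nullptr;
    infiniopCreatePowDescriptor(handle, &desc, c_desc, a_desc, b_desc);

    size_t workspace_size = 0;
    infiniopGetPowWorkspaceSize(desc, &workspace_size);
    void *workspace = workspace_alloc(workspace_size); // hypothetical allocator

    // c[i] = a[i] ** b[i], elementwise over identically-shaped tensors
    infiniopPow(desc, workspace, workspace_size, c, a, b, stream);
    infiniopDestroyPowDescriptor(desc);
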
CREATE(INFINI_DEVICE_ILUVATAR, nvidia); -#endif -#ifdef ENABLE_QY_API - CREATE(INFINI_DEVICE_QY, nvidia); -#endif - - default: - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; - } - -#undef CREATE -} - -__C infiniStatus_t infiniopGetPowWorkspaceSize(infiniopPowDescriptor_t desc, size_t *size) { - -#define GET(CASE, NAMESPACE) \ - case CASE: \ - *size = reinterpret_cast(desc)->workspaceSize(); \ - return INFINI_STATUS_SUCCESS - - switch (desc->device_type) { -#ifdef ENABLE_CPU_API - GET(INFINI_DEVICE_CPU, cpu); -#endif -#ifdef ENABLE_NVIDIA_API - GET(INFINI_DEVICE_NVIDIA, nvidia); -#endif -#ifdef ENABLE_ILUVATAR_API - GET(INFINI_DEVICE_ILUVATAR, nvidia); -#endif -#ifdef ENABLE_QY_API - GET(INFINI_DEVICE_QY, nvidia); -#endif - default: - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; - } -#undef GET - - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; -} - -__C infiniStatus_t infiniopPow( - infiniopPowDescriptor_t desc, - void *workspace, - size_t workspace_size, - void *c, - const void *a, - const void *b, - void *stream) { - -#define CALCULATE(CASE, NAMESPACE) \ - case CASE: \ - return reinterpret_cast(desc) \ - ->calculate(workspace, workspace_size, c, {a, b}, stream) - - switch (desc->device_type) { - -#ifdef ENABLE_CPU_API - CALCULATE(INFINI_DEVICE_CPU, cpu); -#endif -#ifdef ENABLE_NVIDIA_API - CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); -#endif -#ifdef ENABLE_ILUVATAR_API - CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); -#endif -#ifdef ENABLE_QY_API - CALCULATE(INFINI_DEVICE_QY, nvidia); -#endif - - default: - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; - } - -#undef CALCULATE -} - -__C infiniStatus_t -infiniopDestroyPowDescriptor(infiniopPowDescriptor_t desc) { - -#define DELETE(CASE, NAMESPACE) \ - case CASE: \ - delete reinterpret_cast(desc); \ - return INFINI_STATUS_SUCCESS; - - switch (desc->device_type) { - -#ifdef ENABLE_CPU_API - DELETE(INFINI_DEVICE_CPU, cpu); -#endif -#ifdef ENABLE_NVIDIA_API - DELETE(INFINI_DEVICE_NVIDIA, nvidia); -#endif -#ifdef ENABLE_ILUVATAR_API - DELETE(INFINI_DEVICE_ILUVATAR, nvidia); -#endif -#ifdef ENABLE_QY_API - DELETE(INFINI_DEVICE_QY, nvidia); -#endif - - default: - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; - } - -#undef DELETE -} +BINARY_OP_IMPL(pow, Pow) diff --git a/src/infiniop/ops/reciprocal/cpu/reciprocal_cpu.cc b/src/infiniop/ops/reciprocal/cpu/reciprocal_cpu.cc index 52874c8b3..0b66eca64 100644 --- a/src/infiniop/ops/reciprocal/cpu/reciprocal_cpu.cc +++ b/src/infiniop/ops/reciprocal/cpu/reciprocal_cpu.cc @@ -1,48 +1,8 @@ #include "reciprocal_cpu.h" +#include "../../../elementwise/cpu/elementwise_cpu_impl.h" namespace op::reciprocal::cpu { -Descriptor::~Descriptor() = default; +ELEMENTWISE_CPU_IMPL_UNARY(reciprocal) -infiniStatus_t Descriptor::create( - infiniopHandle_t handle_, - Descriptor **desc_ptr, - infiniopTensorDescriptor_t out_desc, - std::vector input_desc_vec) { - - auto handle = reinterpret_cast(handle_); - auto dtype = out_desc->dtype(); - - const auto &x_desc = input_desc_vec.at(0); - const auto &y_shape = out_desc->shape(); - const auto &x_shape = x_desc->shape(); - - CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32); - - CHECK_SAME_SHAPE(y_shape, x_shape); - - // create CPU elementwise descriptor - CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec); - - return INFINI_STATUS_SUCCESS; -} - -infiniStatus_t Descriptor::calculate( - void *workspace, - size_t workspace_size, - void *output, - std::vector inputs, - void *stream) const { - - switch (_dtype) { - case INFINI_DTYPE_F16: - return 
_device_info->calculate(_info, output, inputs, stream); - case INFINI_DTYPE_F32: - return _device_info->calculate(_info, output, inputs, stream); - default: - return INFINI_STATUS_BAD_TENSOR_DTYPE; - } - - return INFINI_STATUS_SUCCESS; -} } // namespace op::reciprocal::cpu diff --git a/src/infiniop/ops/reciprocal/cpu/reciprocal_cpu.h b/src/infiniop/ops/reciprocal/cpu/reciprocal_cpu.h index 0a0f223f0..9af583ab7 100644 --- a/src/infiniop/ops/reciprocal/cpu/reciprocal_cpu.h +++ b/src/infiniop/ops/reciprocal/cpu/reciprocal_cpu.h @@ -2,19 +2,8 @@ #define __RECIPROCAL_CPU_H__ #include "../../../elementwise/cpu/elementwise_cpu.h" +#include "../../../elementwise/unary.h" -ELEMENTWISE_DESCRIPTOR(reciprocal, cpu) - -namespace op::reciprocal::cpu { -typedef struct ReciprocalOp { -public: - static constexpr size_t num_inputs = 1; - - template - T operator()(const T &x) const { - return T(1) / x; - } -} ReciprocalOp; -} // namespace op::reciprocal::cpu +UNARY_ELEMENTWISE_DESCRIPTOR(reciprocal, cpu, op::elementwise::unary::UnaryMode::Reciprocal) #endif // __RECIPROCAL_CPU_H__ diff --git a/src/infiniop/ops/reciprocal/cuda/kernel.cuh b/src/infiniop/ops/reciprocal/cuda/kernel.cuh index 94c71de90..8c29a8e9e 100644 --- a/src/infiniop/ops/reciprocal/cuda/kernel.cuh +++ b/src/infiniop/ops/reciprocal/cuda/kernel.cuh @@ -1,32 +1,10 @@ #ifndef __RECIPROCAL_CUDA_H__ #define __RECIPROCAL_CUDA_H__ -#include "../../../elementwise/nvidia/elementwise_nvidia.cuh" -#include +#include "../../../elementwise/unary.h" namespace op::reciprocal::cuda { -typedef struct ReciprocalOp { -public: - static constexpr size_t num_inputs = 1; - template - __device__ __forceinline__ T operator()(const T &x) const { - if constexpr (std::is_same_v) { - return h2rcp(x); - } else if constexpr (std::is_same_v) { - return hrcp(x); - } else if constexpr (std::is_same_v) { - float x0 = __bfloat162float(__low2bfloat16(x)); - float x1 = __bfloat162float(__high2bfloat16(x)); - return __floats2bfloat162_rn(__frcp_rn(x0), __frcp_rn(x1)); - } else if constexpr (std::is_same_v) { - return __float2bfloat16_rn(__frcp_rn(__bfloat162float(x))); - } else if constexpr (std::is_same_v) { - return __frcp_rn(x); - } else { - return T(1) / x; - } - } -} ReciprocalOp; +using Op = op::elementwise::unary::cuda::UnaryOp; } // namespace op::reciprocal::cuda #endif // __RECIPROCAL_CUDA_H__ diff --git a/src/infiniop/ops/reciprocal/nvidia/reciprocal_nvidia.cu b/src/infiniop/ops/reciprocal/nvidia/reciprocal_nvidia.cu index 45b74e25e..39a41b583 100644 --- a/src/infiniop/ops/reciprocal/nvidia/reciprocal_nvidia.cu +++ b/src/infiniop/ops/reciprocal/nvidia/reciprocal_nvidia.cu @@ -1,54 +1,10 @@ -#include "../../../elementwise/nvidia/elementwise_nvidia.cuh" +#include "../../../elementwise/nvidia/elementwise_nvidia_impl.cuh" #include "../cuda/kernel.cuh" #include "reciprocal_nvidia.cuh" namespace op::reciprocal::nvidia { -Descriptor::~Descriptor() = default; +ELEMENTWISE_NVIDIA_IMPL_UNARY(reciprocal) -infiniStatus_t Descriptor::create( - infiniopHandle_t handle_, - Descriptor **desc_ptr, - infiniopTensorDescriptor_t out_desc, - std::vector input_desc_vec) { - - auto handle = reinterpret_cast(handle_); - auto dtype = out_desc->dtype(); - - const auto &x_desc = input_desc_vec.at(0); - const auto &y_shape = out_desc->shape(); - const auto &x_shape = x_desc->shape(); - - CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32); - - CHECK_SAME_SHAPE(y_shape, x_shape); - - // create CUDA elementwise descriptor - CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, dtype, out_desc, 
input_desc_vec) - - return INFINI_STATUS_SUCCESS; -} - -infiniStatus_t Descriptor::calculate( - void *workspace, - size_t workspace_size, - void *output, - std::vector inputs, - void *stream) const { - if (workspace_size < _workspace_size) { - return INFINI_STATUS_INSUFFICIENT_WORKSPACE; - } - - switch (_dtype) { - case INFINI_DTYPE_F16: - return _device_info->calculate<256, cuda::ReciprocalOp, half>(_info, workspace, output, inputs, stream); - case INFINI_DTYPE_F32: - return _device_info->calculate<256, cuda::ReciprocalOp, float>(_info, workspace, output, inputs, stream); - default: - return INFINI_STATUS_BAD_TENSOR_DTYPE; - } - - return INFINI_STATUS_SUCCESS; -} } // namespace op::reciprocal::nvidia diff --git a/src/infiniop/ops/reciprocal/operator.cc b/src/infiniop/ops/reciprocal/operator.cc index 033286024..966bd72d8 100644 --- a/src/infiniop/ops/reciprocal/operator.cc +++ b/src/infiniop/ops/reciprocal/operator.cc @@ -1,5 +1,4 @@ -#include "../../operator.h" -#include "../../handle.h" +#include "../../operator_impl.h" #include "infiniop/ops/reciprocal.h" #ifdef ENABLE_CPU_API @@ -9,131 +8,4 @@ #include "nvidia/reciprocal_nvidia.cuh" #endif -__C infiniStatus_t infiniopCreateReciprocalDescriptor( - infiniopHandle_t handle, - infiniopReciprocalDescriptor_t *desc_ptr, - infiniopTensorDescriptor_t y_desc, - infiniopTensorDescriptor_t x_desc) { - -#define CREATE(CASE, NAMESPACE) \ - case CASE: \ - return op::reciprocal::NAMESPACE::Descriptor::create( \ - handle, \ - reinterpret_cast(desc_ptr), \ - y_desc, \ - {x_desc}) - - switch (handle->device) { - -#ifdef ENABLE_CPU_API - CREATE(INFINI_DEVICE_CPU, cpu); -#endif -#ifdef ENABLE_NVIDIA_API - CREATE(INFINI_DEVICE_NVIDIA, nvidia); -#endif -#ifdef ENABLE_ILUVATAR_API - CREATE(INFINI_DEVICE_ILUVATAR, nvidia); -#endif -#ifdef ENABLE_QY_API - CREATE(INFINI_DEVICE_QY, nvidia); -#endif - - default: - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; - } - -#undef CREATE -} - -__C infiniStatus_t infiniopGetReciprocalWorkspaceSize(infiniopReciprocalDescriptor_t desc, size_t *size) { - -#define GET(CASE, NAMESPACE) \ - case CASE: \ - *size = reinterpret_cast(desc)->workspaceSize(); \ - return INFINI_STATUS_SUCCESS; - - switch (desc->device_type) { -#ifdef ENABLE_CPU_API - GET(INFINI_DEVICE_CPU, cpu) -#endif -#ifdef ENABLE_NVIDIA_API - GET(INFINI_DEVICE_NVIDIA, nvidia) -#endif -#ifdef ENABLE_ILUVATAR_API - GET(INFINI_DEVICE_ILUVATAR, nvidia) -#endif -#ifdef ENABLE_QY_API - GET(INFINI_DEVICE_QY, nvidia) -#endif - default: - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; - } -#undef GET - - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; -} - -__C infiniStatus_t infiniopReciprocal( - infiniopReciprocalDescriptor_t desc, - void *workspace, - size_t workspace_size, - void *y, - const void *x, - void *stream) { - -#define CALCULATE(CASE, NAMESPACE) \ - case CASE: \ - return reinterpret_cast(desc) \ - ->calculate(workspace, workspace_size, y, {x}, stream) - - switch (desc->device_type) { - -#ifdef ENABLE_CPU_API - CALCULATE(INFINI_DEVICE_CPU, cpu); -#endif -#ifdef ENABLE_NVIDIA_API - CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); -#endif -#ifdef ENABLE_ILUVATAR_API - CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); -#endif -#ifdef ENABLE_QY_API - CALCULATE(INFINI_DEVICE_QY, nvidia); -#endif - - default: - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; - } - -#undef CALCULATE -} - -__C infiniStatus_t -infiniopDestroyReciprocalDescriptor(infiniopReciprocalDescriptor_t desc) { - -#define DELETE(CASE, NAMESPACE) \ - case CASE: \ - delete reinterpret_cast(desc); \ - return 
diff --git a/src/infiniop/ops/reciprocal/operator.cc b/src/infiniop/ops/reciprocal/operator.cc
index 033286024..966bd72d8 100644
--- a/src/infiniop/ops/reciprocal/operator.cc
+++ b/src/infiniop/ops/reciprocal/operator.cc
@@ -1,5 +1,4 @@
-#include "../../operator.h"
-#include "../../handle.h"
+#include "../../operator_impl.h"
 #include "infiniop/ops/reciprocal.h"
 
 #ifdef ENABLE_CPU_API
@@ -9,131 +8,4 @@
 #include "nvidia/reciprocal_nvidia.cuh"
 #endif
 
-__C infiniStatus_t infiniopCreateReciprocalDescriptor(
-    infiniopHandle_t handle,
-    infiniopReciprocalDescriptor_t *desc_ptr,
-    infiniopTensorDescriptor_t y_desc,
-    infiniopTensorDescriptor_t x_desc) {
-
-#define CREATE(CASE, NAMESPACE) \
-    case CASE: \
-        return op::reciprocal::NAMESPACE::Descriptor::create( \
-            handle, \
-            reinterpret_cast<op::reciprocal::NAMESPACE::Descriptor **>(desc_ptr), \
-            y_desc, \
-            {x_desc})
-
-    switch (handle->device) {
-
-#ifdef ENABLE_CPU_API
-        CREATE(INFINI_DEVICE_CPU, cpu);
-#endif
-#ifdef ENABLE_NVIDIA_API
-        CREATE(INFINI_DEVICE_NVIDIA, nvidia);
-#endif
-#ifdef ENABLE_ILUVATAR_API
-        CREATE(INFINI_DEVICE_ILUVATAR, nvidia);
-#endif
-#ifdef ENABLE_QY_API
-        CREATE(INFINI_DEVICE_QY, nvidia);
-#endif
-
-    default:
-        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
-    }
-
-#undef CREATE
-}
-
-__C infiniStatus_t infiniopGetReciprocalWorkspaceSize(infiniopReciprocalDescriptor_t desc, size_t *size) {
-
-#define GET(CASE, NAMESPACE) \
-    case CASE: \
-        *size = reinterpret_cast<op::reciprocal::NAMESPACE::Descriptor *>(desc)->workspaceSize(); \
-        return INFINI_STATUS_SUCCESS;
-
-    switch (desc->device_type) {
-#ifdef ENABLE_CPU_API
-        GET(INFINI_DEVICE_CPU, cpu)
-#endif
-#ifdef ENABLE_NVIDIA_API
-        GET(INFINI_DEVICE_NVIDIA, nvidia)
-#endif
-#ifdef ENABLE_ILUVATAR_API
-        GET(INFINI_DEVICE_ILUVATAR, nvidia)
-#endif
-#ifdef ENABLE_QY_API
-        GET(INFINI_DEVICE_QY, nvidia)
-#endif
-    default:
-        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
-    }
-#undef GET
-
-    return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
-}
-
-__C infiniStatus_t infiniopReciprocal(
-    infiniopReciprocalDescriptor_t desc,
-    void *workspace,
-    size_t workspace_size,
-    void *y,
-    const void *x,
-    void *stream) {
-
-#define CALCULATE(CASE, NAMESPACE) \
-    case CASE: \
-        return reinterpret_cast<const op::reciprocal::NAMESPACE::Descriptor *>(desc) \
-            ->calculate(workspace, workspace_size, y, {x}, stream)
-
-    switch (desc->device_type) {
-
-#ifdef ENABLE_CPU_API
-        CALCULATE(INFINI_DEVICE_CPU, cpu);
-#endif
-#ifdef ENABLE_NVIDIA_API
-        CALCULATE(INFINI_DEVICE_NVIDIA, nvidia);
-#endif
-#ifdef ENABLE_ILUVATAR_API
-        CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia);
-#endif
-#ifdef ENABLE_QY_API
-        CALCULATE(INFINI_DEVICE_QY, nvidia);
-#endif
-
-    default:
-        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
-    }
-
-#undef CALCULATE
-}
-
-__C infiniStatus_t
-infiniopDestroyReciprocalDescriptor(infiniopReciprocalDescriptor_t desc) {
-
-#define DELETE(CASE, NAMESPACE) \
-    case CASE: \
-        delete reinterpret_cast<const op::reciprocal::NAMESPACE::Descriptor *>(desc); \
-        return INFINI_STATUS_SUCCESS
-
-    switch (desc->device_type) {
-
-#ifdef ENABLE_CPU_API
-        DELETE(INFINI_DEVICE_CPU, cpu);
-#endif
-#ifdef ENABLE_NVIDIA_API
-        DELETE(INFINI_DEVICE_NVIDIA, nvidia);
-#endif
-#ifdef ENABLE_ILUVATAR_API
-        DELETE(INFINI_DEVICE_ILUVATAR, nvidia);
-#endif
-#ifdef ENABLE_QY_API
-        DELETE(INFINI_DEVICE_QY, nvidia);
-#endif
-
-    default:
-        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
-    }
-
-#undef DELETE
-}
+UNARY_OP_IMPL(reciprocal, Reciprocal)
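The exported C surface is unchanged by this refactor: callers still go through the same four entry points the removed code spelled out, which `UNARY_OP_IMPL(reciprocal, Reciprocal)` must now generate. For orientation, a typical host-side call sequence looks roughly like the sketch below; the handle and tensor-descriptor creation signatures are assumed from the rest of the library, not from this patch, and error handling is elided:

    // Hypothetical usage sketch of the reciprocal C API (CPU path assumed).
    #include <infiniop.h>
    #include <vector>

    int main() {
        infiniopHandle_t handle;
        infiniopCreateHandle(&handle); // assumed signature

        size_t shape[2] = {4, 4};
        infiniopTensorDescriptor_t y, x;
        // assumed signature; nullptr strides taken to mean contiguous
        infiniopCreateTensorDescriptor(handle, &y, 2, shape, nullptr, INFINI_DTYPE_F32);
        infiniopCreateTensorDescriptor(handle, &x, 2, shape, nullptr, INFINI_DTYPE_F32);

        infiniopReciprocalDescriptor_t desc;
        infiniopCreateReciprocalDescriptor(handle, &desc, y, x);

        size_t workspace_size = 0;
        infiniopGetReciprocalWorkspaceSize(desc, &workspace_size);
        std::vector<char> workspace(workspace_size);

        std::vector<float> in(16, 2.0f), out(16);
        infiniopReciprocal(desc, workspace.data(), workspace_size,
                           out.data(), in.data(), /*stream=*/nullptr);

        infiniopDestroyReciprocalDescriptor(desc);
        return 0;
    }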
diff --git a/src/infiniop/ops/round/cpu/round_cpu.cc b/src/infiniop/ops/round/cpu/round_cpu.cc
index 0b0cea7b7..20ae304bd 100644
--- a/src/infiniop/ops/round/cpu/round_cpu.cc
+++ b/src/infiniop/ops/round/cpu/round_cpu.cc
@@ -1,48 +1,8 @@
 #include "round_cpu.h"
+#include "../../../elementwise/cpu/elementwise_cpu_impl.h"
 
 namespace op::round::cpu {
-Descriptor::~Descriptor() = default;
+ELEMENTWISE_CPU_IMPL_UNARY(round)
 
-infiniStatus_t Descriptor::create(
-    infiniopHandle_t handle_,
-    Descriptor **desc_ptr,
-    infiniopTensorDescriptor_t out_desc,
-    std::vector<infiniopTensorDescriptor_t> input_desc_vec) {
-
-    auto handle = reinterpret_cast<device::cpu::Handle *>(handle_);
-    auto dtype = out_desc->dtype();
-
-    const auto &x_desc = input_desc_vec.at(0);
-    const auto &y_shape = out_desc->shape();
-    const auto &x_shape = x_desc->shape();
-
-    CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32);
-
-    CHECK_SAME_SHAPE(y_shape, x_shape);
-
-    // create CPU elementwise descriptor
-    CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec);
-
-    return INFINI_STATUS_SUCCESS;
-}
-
-infiniStatus_t Descriptor::calculate(
-    void *workspace,
-    size_t workspace_size,
-    void *output,
-    std::vector<const void *> inputs,
-    void *stream) const {
-
-    switch (_dtype) {
-    case INFINI_DTYPE_F16:
-        return _device_info->calculate<RoundOp, fp16_t>(_info, output, inputs, stream);
-    case INFINI_DTYPE_F32:
-        return _device_info->calculate<RoundOp, float>(_info, output, inputs, stream);
-    default:
-        return INFINI_STATUS_BAD_TENSOR_DTYPE;
-    }
-
-    return INFINI_STATUS_SUCCESS;
-}
 } // namespace op::round::cpu
diff --git a/src/infiniop/ops/round/cpu/round_cpu.h b/src/infiniop/ops/round/cpu/round_cpu.h
index eccd6df0f..1a755dbf8 100644
--- a/src/infiniop/ops/round/cpu/round_cpu.h
+++ b/src/infiniop/ops/round/cpu/round_cpu.h
@@ -2,24 +2,8 @@
 #define __ROUND_CPU_H__
 
 #include "../../../elementwise/cpu/elementwise_cpu.h"
-#include <cmath>
+#include "../../../elementwise/unary.h"
 
-ELEMENTWISE_DESCRIPTOR(round, cpu)
-
-namespace op::round::cpu {
-typedef struct RoundOp {
-public:
-    static constexpr size_t num_inputs = 1;
-
-    template <typename T>
-    T operator()(const T &x) const {
-        if constexpr (std::is_integral_v<T>) {
-            return x;
-        } else {
-            return std::nearbyint(x);
-        }
-    }
-} RoundOp;
-} // namespace op::round::cpu
+UNARY_ELEMENTWISE_DESCRIPTOR(round, cpu, op::elementwise::unary::UnaryMode::Round)
 
 #endif // __ROUND_CPU_H__
diff --git a/src/infiniop/ops/round/cuda/kernel.cuh b/src/infiniop/ops/round/cuda/kernel.cuh
index c52a10716..f4de9c772 100644
--- a/src/infiniop/ops/round/cuda/kernel.cuh
+++ b/src/infiniop/ops/round/cuda/kernel.cuh
@@ -1,34 +1,10 @@
 #ifndef __ROUND_CUDA_H__
 #define __ROUND_CUDA_H__
 
-#include "../../../elementwise/nvidia/elementwise_nvidia.cuh"
-#include <cuda_fp16.h>
+#include "../../../elementwise/unary.h"
 
 namespace op::round::cuda {
-typedef struct RoundOp {
-public:
-    static constexpr size_t num_inputs = 1;
-    template <typename T>
-    __device__ __forceinline__ T operator()(const T &x) const {
-        if constexpr (std::is_same_v<T, half2>) {
-            return h2rint(x);
-        } else if constexpr (std::is_same_v<T, half>) {
-            return hrint(x);
-        } else if constexpr (std::is_same_v<T, cuda_bfloat162>) {
-            float x0 = __bfloat162float(__low2bfloat16(x));
-            float x1 = __bfloat162float(__high2bfloat16(x));
-            return __floats2bfloat162_rn(rintf(x0), rintf(x1));
-        } else if constexpr (std::is_same_v<T, cuda_bfloat16>) {
-            return __float2bfloat16_rn(rintf(__bfloat162float(x)));
-        } else if constexpr (std::is_same_v<T, float>) {
-            return rintf(x);
-        } else if constexpr (std::is_integral_v<T>) {
-            return x;
-        } else {
-            return std::nearbyint(x);
-        }
-    }
-} RoundOp;
+using Op = op::elementwise::unary::cuda::UnaryOp<op::elementwise::unary::UnaryMode::Round>;
 } // namespace op::round::cuda
 
 #endif // __ROUND_CUDA_H__
diff --git a/src/infiniop/ops/round/nvidia/round_nvidia.cu b/src/infiniop/ops/round/nvidia/round_nvidia.cu
index c1fabc885..dc84388a3 100644
--- a/src/infiniop/ops/round/nvidia/round_nvidia.cu
+++ b/src/infiniop/ops/round/nvidia/round_nvidia.cu
@@ -1,54 +1,10 @@
-#include "../../../elementwise/nvidia/elementwise_nvidia.cuh"
+#include "../../../elementwise/nvidia/elementwise_nvidia_impl.cuh"
 #include "../cuda/kernel.cuh"
 #include "round_nvidia.cuh"
 
 namespace op::round::nvidia {
-Descriptor::~Descriptor() = default;
+ELEMENTWISE_NVIDIA_IMPL_UNARY(round)
 
-infiniStatus_t Descriptor::create(
-    infiniopHandle_t handle_,
-    Descriptor **desc_ptr,
-    infiniopTensorDescriptor_t out_desc,
-    std::vector<infiniopTensorDescriptor_t> input_desc_vec) {
-
-    auto handle = reinterpret_cast<device::nvidia::Handle *>(handle_);
-    auto dtype = out_desc->dtype();
-
-    const auto &x_desc = input_desc_vec.at(0);
-    const auto &y_shape = out_desc->shape();
-    const auto &x_shape = x_desc->shape();
-
-    CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32);
-
-    CHECK_SAME_SHAPE(y_shape, x_shape);
-
-    // create CUDA elementwise descriptor
-    CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec)
-
-    return INFINI_STATUS_SUCCESS;
-}
-
-infiniStatus_t Descriptor::calculate(
-    void *workspace,
-    size_t workspace_size,
-    void *output,
-    std::vector<const void *> inputs,
-    void *stream) const {
-    if (workspace_size < _workspace_size) {
-        return INFINI_STATUS_INSUFFICIENT_WORKSPACE;
-    }
-
-    switch (_dtype) {
-    case INFINI_DTYPE_F16:
-        return _device_info->calculate<256, cuda::RoundOp, half>(_info, workspace, output, inputs, stream);
-    case INFINI_DTYPE_F32:
-        return _device_info->calculate<256, cuda::RoundOp, float>(_info, workspace, output, inputs, stream);
-    default:
-        return INFINI_STATUS_BAD_TENSOR_DTYPE;
-    }
-
-    return INFINI_STATUS_SUCCESS;
-}
 } // namespace op::round::nvidia
diff --git a/src/infiniop/ops/round/operator.cc b/src/infiniop/ops/round/operator.cc
index 9468803c8..a20fbcb17 100644
--- a/src/infiniop/ops/round/operator.cc
+++ b/src/infiniop/ops/round/operator.cc
@@ -1,5 +1,4 @@
-#include "../../operator.h"
-#include "../../handle.h"
+#include "../../operator_impl.h"
 #include "infiniop/ops/round.h"
 
 #ifdef ENABLE_CPU_API
@@ -9,131 +8,4 @@
 #include "nvidia/round_nvidia.cuh"
 #endif
 
-__C infiniStatus_t infiniopCreateRoundDescriptor(
-    infiniopHandle_t handle,
-    infiniopRoundDescriptor_t *desc_ptr,
-    infiniopTensorDescriptor_t y_desc,
-    infiniopTensorDescriptor_t x_desc) {
-
-#define CREATE(CASE, NAMESPACE) \
-    case CASE: \
-        return op::round::NAMESPACE::Descriptor::create( \
-            handle, \
-            reinterpret_cast<op::round::NAMESPACE::Descriptor **>(desc_ptr), \
-            y_desc, \
-            {x_desc})
-
-    switch (handle->device) {
-
-#ifdef ENABLE_CPU_API
-        CREATE(INFINI_DEVICE_CPU, cpu);
-#endif
-#ifdef ENABLE_NVIDIA_API
-        CREATE(INFINI_DEVICE_NVIDIA, nvidia);
-#endif
-#ifdef ENABLE_ILUVATAR_API
-        CREATE(INFINI_DEVICE_ILUVATAR, nvidia);
-#endif
-#ifdef ENABLE_QY_API
-        CREATE(INFINI_DEVICE_QY, nvidia);
-#endif
-
-    default:
-        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
-    }
-
-#undef CREATE
-}
-
-__C infiniStatus_t infiniopGetRoundWorkspaceSize(infiniopRoundDescriptor_t desc, size_t *size) {
-
-#define GET(CASE, NAMESPACE) \
-    case CASE: \
-        *size = reinterpret_cast<op::round::NAMESPACE::Descriptor *>(desc)->workspaceSize(); \
-        return INFINI_STATUS_SUCCESS;
-
-    switch (desc->device_type) {
-#ifdef ENABLE_CPU_API
-        GET(INFINI_DEVICE_CPU, cpu)
-#endif
-#ifdef ENABLE_NVIDIA_API
-        GET(INFINI_DEVICE_NVIDIA, nvidia)
-#endif
-#ifdef ENABLE_ILUVATAR_API
-        GET(INFINI_DEVICE_ILUVATAR, nvidia)
-#endif
-#ifdef ENABLE_QY_API
-        GET(INFINI_DEVICE_QY, nvidia)
-#endif
-    default:
-        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
-    }
-#undef GET
-
-    return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
-}
-
-__C infiniStatus_t infiniopRound(
-    infiniopRoundDescriptor_t desc,
-    void *workspace,
-    size_t workspace_size,
-    void *y,
-    const void *x,
-    void *stream) {
-
-#define CALCULATE(CASE, NAMESPACE) \
-    case CASE: \
-        return reinterpret_cast<const op::round::NAMESPACE::Descriptor *>(desc) \
-            ->calculate(workspace, workspace_size, y, {x}, stream)
-
-    switch (desc->device_type) {
-
-#ifdef ENABLE_CPU_API
-        CALCULATE(INFINI_DEVICE_CPU, cpu);
-#endif
-#ifdef ENABLE_NVIDIA_API
-        CALCULATE(INFINI_DEVICE_NVIDIA, nvidia);
-#endif
-#ifdef ENABLE_ILUVATAR_API
-        CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia);
-#endif
-#ifdef ENABLE_QY_API
-        CALCULATE(INFINI_DEVICE_QY, nvidia);
-#endif
-
-    default:
-        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
-    }
-
-#undef CALCULATE
-}
-
-__C infiniStatus_t
-infiniopDestroyRoundDescriptor(infiniopRoundDescriptor_t desc) {
-
-#define DELETE(CASE, NAMESPACE) \
-    case CASE: \
-        delete reinterpret_cast<const op::round::NAMESPACE::Descriptor *>(desc); \
-        return INFINI_STATUS_SUCCESS
-
-    switch (desc->device_type) {
-
-#ifdef ENABLE_CPU_API
-        DELETE(INFINI_DEVICE_CPU, cpu);
-#endif
-#ifdef ENABLE_NVIDIA_API
-        DELETE(INFINI_DEVICE_NVIDIA, nvidia);
-#endif
-#ifdef ENABLE_ILUVATAR_API
-        DELETE(INFINI_DEVICE_ILUVATAR, nvidia);
-#endif
-#ifdef ENABLE_QY_API
-        DELETE(INFINI_DEVICE_QY, nvidia);
-#endif
-
-    default:
-        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
-    }
-
-#undef DELETE
-}
+UNARY_OP_IMPL(round, Round)
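One subtlety worth preserving when the deleted RoundOp bodies are folded into the shared functor: they round with `nearbyint`/`hrint`/`rintf`, which under the default floating-point environment send ties to the nearest even value, whereas `round`/`roundf` always send halves away from zero. A small standalone illustration (plain C++, hypothetical file, not part of the patch):

    #include <cmath>
    #include <cstdio>

    int main() {
        // nearbyint honors the current rounding mode (round-to-nearest-even
        // by default); round always sends halves away from zero.
        std::printf("%g %g\n", std::nearbyint(0.5), std::round(0.5)); // 0 1
        std::printf("%g %g\n", std::nearbyint(1.5), std::round(1.5)); // 2 2
        std::printf("%g %g\n", std::nearbyint(2.5), std::round(2.5)); // 2 3
        return 0;
    }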
diff --git a/src/infiniop/ops/sign/cpu/sign_cpu.cc b/src/infiniop/ops/sign/cpu/sign_cpu.cc
index 1f3430e73..c65868d09 100644
--- a/src/infiniop/ops/sign/cpu/sign_cpu.cc
+++ b/src/infiniop/ops/sign/cpu/sign_cpu.cc
@@ -1,48 +1,8 @@
 #include "sign_cpu.h"
+#include "../../../elementwise/cpu/elementwise_cpu_impl.h"
 
 namespace op::sign::cpu {
-Descriptor::~Descriptor() = default;
+ELEMENTWISE_CPU_IMPL_UNARY(sign)
 
-infiniStatus_t Descriptor::create(
-    infiniopHandle_t handle_,
-    Descriptor **desc_ptr,
-    infiniopTensorDescriptor_t out_desc,
-    std::vector<infiniopTensorDescriptor_t> input_desc_vec) {
-
-    auto handle = reinterpret_cast<device::cpu::Handle *>(handle_);
-    auto dtype = out_desc->dtype();
-
-    const auto &x_desc = input_desc_vec.at(0);
-    const auto &y_shape = out_desc->shape();
-    const auto &x_shape = x_desc->shape();
-
-    CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32);
-
-    CHECK_SAME_SHAPE(y_shape, x_shape);
-
-    // create CPU elementwise descriptor
-    CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec);
-
-    return INFINI_STATUS_SUCCESS;
-}
-
-infiniStatus_t Descriptor::calculate(
-    void *workspace,
-    size_t workspace_size,
-    void *output,
-    std::vector<const void *> inputs,
-    void *stream) const {
-
-    switch (_dtype) {
-    case INFINI_DTYPE_F16:
-        return _device_info->calculate<SignOp, fp16_t>(_info, output, inputs, stream);
-    case INFINI_DTYPE_F32:
-        return _device_info->calculate<SignOp, float>(_info, output, inputs, stream);
-    default:
-        return INFINI_STATUS_BAD_TENSOR_DTYPE;
-    }
-
-    return INFINI_STATUS_SUCCESS;
-}
 } // namespace op::sign::cpu
diff --git a/src/infiniop/ops/sign/cpu/sign_cpu.h b/src/infiniop/ops/sign/cpu/sign_cpu.h
index 505194c85..7ddeec543 100644
--- a/src/infiniop/ops/sign/cpu/sign_cpu.h
+++ b/src/infiniop/ops/sign/cpu/sign_cpu.h
@@ -2,19 +2,8 @@
 #define __SIGN_CPU_H__
 
 #include "../../../elementwise/cpu/elementwise_cpu.h"
+#include "../../../elementwise/unary.h"
 
-ELEMENTWISE_DESCRIPTOR(sign, cpu)
-
-namespace op::sign::cpu {
-typedef struct SignOp {
-public:
-    static constexpr size_t num_inputs = 1;
-
-    template <typename T>
-    T operator()(const T &x) const {
-        return x > T(0) ? T(1) : (x == T(0) ? T(0) : T(-1));
-    }
-} SignOp;
-} // namespace op::sign::cpu
+UNARY_ELEMENTWISE_DESCRIPTOR(sign, cpu, op::elementwise::unary::UnaryMode::Sign)
 
 #endif // __SIGN_CPU_H__
diff --git a/src/infiniop/ops/sign/cuda/kernel.cuh b/src/infiniop/ops/sign/cuda/kernel.cuh
index 3737282b0..a1216fb82 100644
--- a/src/infiniop/ops/sign/cuda/kernel.cuh
+++ b/src/infiniop/ops/sign/cuda/kernel.cuh
@@ -1,25 +1,10 @@
 #ifndef __SIGN_CUDA_H__
 #define __SIGN_CUDA_H__
 
-#include "../../../elementwise/nvidia/elementwise_nvidia.cuh"
-#include <cuda_fp16.h>
+#include "../../../elementwise/unary.h"
 
 namespace op::sign::cuda {
-typedef struct SignOp {
-public:
-    static constexpr size_t num_inputs = 1;
-    template <typename T>
-    __device__ __forceinline__ T operator()(const T &x) const {
-        if constexpr (std::is_same_v<T, half2>) {
-            const auto lt_mask = __hlt2(x, __floats2half2_rn(0.0f, 0.0f));
-            return __hadd2(__hneg2(lt_mask), __hsub2(__floats2half2_rn(1.0f, 1.0f), lt_mask));
-        } else if constexpr (std::is_same_v<T, half>) {
-            return x > half(0) ? half(1) : (x == half(0) ? half(0) : half(-1));
-        } else {
-            return x > T(0) ? T(1) : (x == T(0) ? T(0) : T(-1));
-        }
-    }
-} SignOp;
+using Op = op::elementwise::unary::cuda::UnaryOp<op::elementwise::unary::UnaryMode::Sign>;
 } // namespace op::sign::cuda
 
 #endif // __SIGN_CUDA_H__
diff --git a/src/infiniop/ops/sign/nvidia/sign_nvidia.cu b/src/infiniop/ops/sign/nvidia/sign_nvidia.cu
index 6a3152e41..2a11f9e23 100644
--- a/src/infiniop/ops/sign/nvidia/sign_nvidia.cu
+++ b/src/infiniop/ops/sign/nvidia/sign_nvidia.cu
@@ -1,54 +1,10 @@
-#include "../../../elementwise/nvidia/elementwise_nvidia.cuh"
+#include "../../../elementwise/nvidia/elementwise_nvidia_impl.cuh"
 #include "../cuda/kernel.cuh"
 #include "sign_nvidia.cuh"
 
 namespace op::sign::nvidia {
-Descriptor::~Descriptor() = default;
+ELEMENTWISE_NVIDIA_IMPL_UNARY(sign)
 
-infiniStatus_t Descriptor::create(
-    infiniopHandle_t handle_,
-    Descriptor **desc_ptr,
-    infiniopTensorDescriptor_t out_desc,
-    std::vector<infiniopTensorDescriptor_t> input_desc_vec) {
-
-    auto handle = reinterpret_cast<device::nvidia::Handle *>(handle_);
-    auto dtype = out_desc->dtype();
-
-    const auto &x_desc = input_desc_vec.at(0);
-    const auto &y_shape = out_desc->shape();
-    const auto &x_shape = x_desc->shape();
-
-    CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32);
-
-    CHECK_SAME_SHAPE(y_shape, x_shape);
-
-    // create CUDA elementwise descriptor
-    CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec)
-
-    return INFINI_STATUS_SUCCESS;
-}
-
-infiniStatus_t Descriptor::calculate(
-    void *workspace,
-    size_t workspace_size,
-    void *output,
-    std::vector<const void *> inputs,
-    void *stream) const {
-    if (workspace_size < _workspace_size) {
-        return INFINI_STATUS_INSUFFICIENT_WORKSPACE;
-    }
-
-    switch (_dtype) {
-    case INFINI_DTYPE_F16:
-        return _device_info->calculate<256, cuda::SignOp, half>(_info, workspace, output, inputs, stream);
-    case INFINI_DTYPE_F32:
-        return _device_info->calculate<256, cuda::SignOp, float>(_info, workspace, output, inputs, stream);
-    default:
-        return INFINI_STATUS_BAD_TENSOR_DTYPE;
-    }
-
-    return INFINI_STATUS_SUCCESS;
-}
 } // namespace op::sign::nvidia
diff --git a/src/infiniop/ops/sign/operator.cc b/src/infiniop/ops/sign/operator.cc
index 8f658a9b3..1a4599d5d 100644
--- a/src/infiniop/ops/sign/operator.cc
+++ b/src/infiniop/ops/sign/operator.cc
@@ -1,5 +1,4 @@
-#include "../../operator.h"
-#include "../../handle.h"
+#include "../../operator_impl.h"
 #include "infiniop/ops/sign.h"
 
 #ifdef ENABLE_CPU_API
@@ -9,131 +8,4 @@
 #include "nvidia/sign_nvidia.cuh"
 #endif
 
-__C infiniStatus_t infiniopCreateSignDescriptor(
-    infiniopHandle_t handle,
-    infiniopSignDescriptor_t *desc_ptr,
-    infiniopTensorDescriptor_t y_desc,
-    infiniopTensorDescriptor_t x_desc) {
-
-#define CREATE(CASE, NAMESPACE) \
-    case CASE: \
-        return op::sign::NAMESPACE::Descriptor::create( \
-            handle, \
-            reinterpret_cast<op::sign::NAMESPACE::Descriptor **>(desc_ptr), \
-            y_desc, \
-            {x_desc})
-
-    switch (handle->device) {
-
-#ifdef ENABLE_CPU_API
-        CREATE(INFINI_DEVICE_CPU, cpu);
-#endif
-#ifdef ENABLE_NVIDIA_API
-        CREATE(INFINI_DEVICE_NVIDIA, nvidia);
-#endif
-#ifdef ENABLE_ILUVATAR_API
-        CREATE(INFINI_DEVICE_ILUVATAR, nvidia);
-#endif
-#ifdef ENABLE_QY_API
-        CREATE(INFINI_DEVICE_QY, nvidia);
-#endif
-
-    default:
-        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
-    }
-
-#undef CREATE
-}
-
-__C infiniStatus_t infiniopGetSignWorkspaceSize(infiniopSignDescriptor_t desc, size_t *size) {
-
-#define GET(CASE, NAMESPACE) \
-    case CASE: \
-        *size = reinterpret_cast<op::sign::NAMESPACE::Descriptor *>(desc)->workspaceSize(); \
-        return INFINI_STATUS_SUCCESS;
-
-    switch (desc->device_type) {
-#ifdef ENABLE_CPU_API
-        GET(INFINI_DEVICE_CPU, cpu)
-#endif
-#ifdef ENABLE_NVIDIA_API
-        GET(INFINI_DEVICE_NVIDIA, nvidia)
-#endif
-#ifdef ENABLE_ILUVATAR_API
-        GET(INFINI_DEVICE_ILUVATAR, nvidia)
-#endif
-#ifdef ENABLE_QY_API
-        GET(INFINI_DEVICE_QY, nvidia)
-#endif
-    default:
-        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
-    }
-#undef GET
-
-    return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
-}
-
-__C infiniStatus_t infiniopSign(
-    infiniopSignDescriptor_t desc,
-    void *workspace,
-    size_t workspace_size,
-    void *y,
-    const void *x,
-    void *stream) {
-
-#define CALCULATE(CASE, NAMESPACE) \
-    case CASE: \
-        return reinterpret_cast<const op::sign::NAMESPACE::Descriptor *>(desc) \
-            ->calculate(workspace, workspace_size, y, {x}, stream)
-
-    switch (desc->device_type) {
-
-#ifdef ENABLE_CPU_API
-        CALCULATE(INFINI_DEVICE_CPU, cpu);
-#endif
-#ifdef ENABLE_NVIDIA_API
-        CALCULATE(INFINI_DEVICE_NVIDIA, nvidia);
-#endif
-#ifdef ENABLE_ILUVATAR_API
-        CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia);
-#endif
-#ifdef ENABLE_QY_API
-        CALCULATE(INFINI_DEVICE_QY, nvidia);
-#endif
-
-    default:
-        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
-    }
-
-#undef CALCULATE
-}
-
-__C infiniStatus_t
-infiniopDestroySignDescriptor(infiniopSignDescriptor_t desc) {
-
-#define DELETE(CASE, NAMESPACE) \
-    case CASE: \
-        delete reinterpret_cast<const op::sign::NAMESPACE::Descriptor *>(desc); \
-        return INFINI_STATUS_SUCCESS
-
-    switch (desc->device_type) {
-
-#ifdef ENABLE_CPU_API
-        DELETE(INFINI_DEVICE_CPU, cpu);
-#endif
-#ifdef ENABLE_NVIDIA_API
-        DELETE(INFINI_DEVICE_NVIDIA, nvidia);
-#endif
-#ifdef ENABLE_ILUVATAR_API
-        DELETE(INFINI_DEVICE_ILUVATAR, nvidia);
-#endif
-#ifdef ENABLE_QY_API
-        DELETE(INFINI_DEVICE_QY, nvidia);
-#endif
-
-    default:
-        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
-    }
-
-#undef DELETE
-}
+UNARY_OP_IMPL(sign, Sign)
"../../../elementwise/cpu/elementwise_cpu_impl.h" namespace op::sinh::cpu { -Descriptor::~Descriptor() = default; +ELEMENTWISE_CPU_IMPL_UNARY(sinh) -infiniStatus_t Descriptor::create( - infiniopHandle_t handle_, - Descriptor **desc_ptr, - infiniopTensorDescriptor_t out_desc, - std::vector input_desc_vec) { - - auto handle = reinterpret_cast(handle_); - auto dtype = out_desc->dtype(); - - const auto &x_desc = input_desc_vec.at(0); - const auto &y_shape = out_desc->shape(); - const auto &x_shape = x_desc->shape(); - - CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32); - - CHECK_SAME_SHAPE(y_shape, x_shape); - - // create CPU elementwise descriptor - CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec); - - return INFINI_STATUS_SUCCESS; -} - -infiniStatus_t Descriptor::calculate( - void *workspace, - size_t workspace_size, - void *output, - std::vector inputs, - void *stream) const { - - switch (_dtype) { - case INFINI_DTYPE_F16: - return _device_info->calculate(_info, output, inputs, stream); - case INFINI_DTYPE_F32: - return _device_info->calculate(_info, output, inputs, stream); - default: - return INFINI_STATUS_BAD_TENSOR_DTYPE; - } - - return INFINI_STATUS_SUCCESS; -} } // namespace op::sinh::cpu diff --git a/src/infiniop/ops/sinh/cpu/sinh_cpu.h b/src/infiniop/ops/sinh/cpu/sinh_cpu.h index dbc8f3c7e..573027ee3 100644 --- a/src/infiniop/ops/sinh/cpu/sinh_cpu.h +++ b/src/infiniop/ops/sinh/cpu/sinh_cpu.h @@ -1,22 +1,9 @@ #ifndef __SINH_CPU_H__ #define __SINH_CPU_H__ -#include - #include "../../../elementwise/cpu/elementwise_cpu.h" +#include "../../../elementwise/unary.h" -ELEMENTWISE_DESCRIPTOR(sinh, cpu) - -namespace op::sinh::cpu { -typedef struct SinhOp { -public: - static constexpr size_t num_inputs = 1; - - template - T operator()(const T &x) const { - return std::sinh(x); - } -} SinhOp; -} // namespace op::sinh::cpu +UNARY_ELEMENTWISE_DESCRIPTOR(sinh, cpu, op::elementwise::unary::UnaryMode::Sinh) #endif // __SINH_CPU_H__ diff --git a/src/infiniop/ops/sinh/cuda/kernel.cuh b/src/infiniop/ops/sinh/cuda/kernel.cuh index c09150666..d5bb7491f 100644 --- a/src/infiniop/ops/sinh/cuda/kernel.cuh +++ b/src/infiniop/ops/sinh/cuda/kernel.cuh @@ -1,32 +1,10 @@ #ifndef __SINH_CUDA_H__ #define __SINH_CUDA_H__ -#include -#include +#include "../../../elementwise/unary.h" namespace op::sinh::cuda { -typedef struct SinhOp { -public: - static constexpr size_t num_inputs = 1; - template - __device__ __forceinline__ T operator()(const T &x) const { - if constexpr (std::is_same_v) { - return __floats2half2_rn(sinhf(__half2float(__low2half(x))), sinhf(__half2float(__high2half(x)))); - } else if constexpr (std::is_same_v) { - return __float2half(sinhf(__half2float(x))); - } else if constexpr (std::is_same_v) { - float x0 = __bfloat162float(__low2bfloat16(x)); - float x1 = __bfloat162float(__high2bfloat16(x)); - return __floats2bfloat162_rn(sinhf(x0), sinhf(x1)); - } else if constexpr (std::is_same_v) { - return __float2bfloat16_rn(sinhf(__bfloat162float(x))); - } else if constexpr (std::is_same_v) { - return sinhf(x); - } else { - return std::sinh(x); - } - } -} SinhOp; +using Op = op::elementwise::unary::cuda::UnaryOp; } // namespace op::sinh::cuda #endif // __SINH_CUDA_H__ diff --git a/src/infiniop/ops/sinh/nvidia/sinh_nvidia.cu b/src/infiniop/ops/sinh/nvidia/sinh_nvidia.cu index d4c3fd165..3abfc2973 100644 --- a/src/infiniop/ops/sinh/nvidia/sinh_nvidia.cu +++ b/src/infiniop/ops/sinh/nvidia/sinh_nvidia.cu @@ -1,54 +1,10 @@ -#include 
"../../../elementwise/nvidia/elementwise_nvidia.cuh" +#include "../../../elementwise/nvidia/elementwise_nvidia_impl.cuh" #include "../cuda/kernel.cuh" #include "sinh_nvidia.cuh" namespace op::sinh::nvidia { -Descriptor::~Descriptor() = default; +ELEMENTWISE_NVIDIA_IMPL_UNARY(sinh) -infiniStatus_t Descriptor::create( - infiniopHandle_t handle_, - Descriptor **desc_ptr, - infiniopTensorDescriptor_t out_desc, - std::vector input_desc_vec) { - - auto handle = reinterpret_cast(handle_); - auto dtype = out_desc->dtype(); - - const auto &x_desc = input_desc_vec.at(0); - const auto &y_shape = out_desc->shape(); - const auto &x_shape = x_desc->shape(); - - CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32); - - CHECK_SAME_SHAPE(y_shape, x_shape); - - // create CUDA elementwise descriptor - CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec) - - return INFINI_STATUS_SUCCESS; -} - -infiniStatus_t Descriptor::calculate( - void *workspace, - size_t workspace_size, - void *output, - std::vector inputs, - void *stream) const { - if (workspace_size < _workspace_size) { - return INFINI_STATUS_INSUFFICIENT_WORKSPACE; - } - - switch (_dtype) { - case INFINI_DTYPE_F16: - return _device_info->calculate<256, cuda::SinhOp, half>(_info, workspace, output, inputs, stream); - case INFINI_DTYPE_F32: - return _device_info->calculate<256, cuda::SinhOp, float>(_info, workspace, output, inputs, stream); - default: - return INFINI_STATUS_BAD_TENSOR_DTYPE; - } - - return INFINI_STATUS_SUCCESS; -} } // namespace op::sinh::nvidia diff --git a/src/infiniop/ops/sinh/operator.cc b/src/infiniop/ops/sinh/operator.cc index 1636ce2c8..41940d235 100644 --- a/src/infiniop/ops/sinh/operator.cc +++ b/src/infiniop/ops/sinh/operator.cc @@ -1,5 +1,4 @@ -#include "../../operator.h" -#include "../../handle.h" +#include "../../operator_impl.h" #include "infiniop/ops/sinh.h" #ifdef ENABLE_CPU_API @@ -9,131 +8,4 @@ #include "nvidia/sinh_nvidia.cuh" #endif -__C infiniStatus_t infiniopCreateSinhDescriptor( - infiniopHandle_t handle, - infiniopSinhDescriptor_t *desc_ptr, - infiniopTensorDescriptor_t y_desc, - infiniopTensorDescriptor_t x_desc) { - -#define CREATE(CASE, NAMESPACE) \ - case CASE: \ - return op::sinh::NAMESPACE::Descriptor::create( \ - handle, \ - reinterpret_cast(desc_ptr), \ - y_desc, \ - {x_desc}) - - switch (handle->device) { - -#ifdef ENABLE_CPU_API - CREATE(INFINI_DEVICE_CPU, cpu); -#endif -#ifdef ENABLE_NVIDIA_API - CREATE(INFINI_DEVICE_NVIDIA, nvidia); -#endif -#ifdef ENABLE_ILUVATAR_API - CREATE(INFINI_DEVICE_ILUVATAR, nvidia); -#endif -#ifdef ENABLE_QY_API - CREATE(INFINI_DEVICE_QY, nvidia); -#endif - - default: - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; - } - -#undef CREATE -} - -__C infiniStatus_t infiniopGetSinhWorkspaceSize(infiniopSinhDescriptor_t desc, size_t *size) { - -#define GET(CASE, NAMESPACE) \ - case CASE: \ - *size = reinterpret_cast(desc)->workspaceSize(); \ - return INFINI_STATUS_SUCCESS; - - switch (desc->device_type) { -#ifdef ENABLE_CPU_API - GET(INFINI_DEVICE_CPU, cpu) -#endif -#ifdef ENABLE_NVIDIA_API - GET(INFINI_DEVICE_NVIDIA, nvidia) -#endif -#ifdef ENABLE_ILUVATAR_API - GET(INFINI_DEVICE_ILUVATAR, nvidia) -#endif -#ifdef ENABLE_QY_API - GET(INFINI_DEVICE_QY, nvidia) -#endif - default: - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; - } -#undef GET - - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; -} - -__C infiniStatus_t infiniopSinh( - infiniopSinhDescriptor_t desc, - void *workspace, - size_t workspace_size, - void *y, - const void *x, - void *stream) { - 
diff --git a/src/infiniop/ops/sinh/operator.cc b/src/infiniop/ops/sinh/operator.cc
index 1636ce2c8..41940d235 100644
--- a/src/infiniop/ops/sinh/operator.cc
+++ b/src/infiniop/ops/sinh/operator.cc
@@ -1,5 +1,4 @@
-#include "../../operator.h"
-#include "../../handle.h"
+#include "../../operator_impl.h"
 #include "infiniop/ops/sinh.h"
 
 #ifdef ENABLE_CPU_API
@@ -9,131 +8,4 @@
 #include "nvidia/sinh_nvidia.cuh"
 #endif
 
-__C infiniStatus_t infiniopCreateSinhDescriptor(
-    infiniopHandle_t handle,
-    infiniopSinhDescriptor_t *desc_ptr,
-    infiniopTensorDescriptor_t y_desc,
-    infiniopTensorDescriptor_t x_desc) {
-
-#define CREATE(CASE, NAMESPACE) \
-    case CASE: \
-        return op::sinh::NAMESPACE::Descriptor::create( \
-            handle, \
-            reinterpret_cast<op::sinh::NAMESPACE::Descriptor **>(desc_ptr), \
-            y_desc, \
-            {x_desc})
-
-    switch (handle->device) {
-
-#ifdef ENABLE_CPU_API
-        CREATE(INFINI_DEVICE_CPU, cpu);
-#endif
-#ifdef ENABLE_NVIDIA_API
-        CREATE(INFINI_DEVICE_NVIDIA, nvidia);
-#endif
-#ifdef ENABLE_ILUVATAR_API
-        CREATE(INFINI_DEVICE_ILUVATAR, nvidia);
-#endif
-#ifdef ENABLE_QY_API
-        CREATE(INFINI_DEVICE_QY, nvidia);
-#endif
-
-    default:
-        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
-    }
-
-#undef CREATE
-}
-
-__C infiniStatus_t infiniopGetSinhWorkspaceSize(infiniopSinhDescriptor_t desc, size_t *size) {
-
-#define GET(CASE, NAMESPACE) \
-    case CASE: \
-        *size = reinterpret_cast<op::sinh::NAMESPACE::Descriptor *>(desc)->workspaceSize(); \
-        return INFINI_STATUS_SUCCESS;
-
-    switch (desc->device_type) {
-#ifdef ENABLE_CPU_API
-        GET(INFINI_DEVICE_CPU, cpu)
-#endif
-#ifdef ENABLE_NVIDIA_API
-        GET(INFINI_DEVICE_NVIDIA, nvidia)
-#endif
-#ifdef ENABLE_ILUVATAR_API
-        GET(INFINI_DEVICE_ILUVATAR, nvidia)
-#endif
-#ifdef ENABLE_QY_API
-        GET(INFINI_DEVICE_QY, nvidia)
-#endif
-    default:
-        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
-    }
-#undef GET
-
-    return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
-}
-
-__C infiniStatus_t infiniopSinh(
-    infiniopSinhDescriptor_t desc,
-    void *workspace,
-    size_t workspace_size,
-    void *y,
-    const void *x,
-    void *stream) {
-
-#define CALCULATE(CASE, NAMESPACE) \
-    case CASE: \
-        return reinterpret_cast<const op::sinh::NAMESPACE::Descriptor *>(desc) \
-            ->calculate(workspace, workspace_size, y, {x}, stream)
-
-    switch (desc->device_type) {
-
-#ifdef ENABLE_CPU_API
-        CALCULATE(INFINI_DEVICE_CPU, cpu);
-#endif
-#ifdef ENABLE_NVIDIA_API
-        CALCULATE(INFINI_DEVICE_NVIDIA, nvidia);
-#endif
-#ifdef ENABLE_ILUVATAR_API
-        CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia);
-#endif
-#ifdef ENABLE_QY_API
-        CALCULATE(INFINI_DEVICE_QY, nvidia);
-#endif
-
-    default:
-        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
-    }
-
-#undef CALCULATE
-}
-
-__C infiniStatus_t
-infiniopDestroySinhDescriptor(infiniopSinhDescriptor_t desc) {
-
-#define DELETE(CASE, NAMESPACE) \
-    case CASE: \
-        delete reinterpret_cast<const op::sinh::NAMESPACE::Descriptor *>(desc); \
-        return INFINI_STATUS_SUCCESS
-
-    switch (desc->device_type) {
-
-#ifdef ENABLE_CPU_API
-        DELETE(INFINI_DEVICE_CPU, cpu);
-#endif
-#ifdef ENABLE_NVIDIA_API
-        DELETE(INFINI_DEVICE_NVIDIA, nvidia);
-#endif
-#ifdef ENABLE_ILUVATAR_API
-        DELETE(INFINI_DEVICE_ILUVATAR, nvidia);
-#endif
-#ifdef ENABLE_QY_API
-        DELETE(INFINI_DEVICE_QY, nvidia);
-#endif
-
-    default:
-        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
-    }
-
-#undef DELETE
-}
+UNARY_OP_IMPL(sinh, Sinh)
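The deleted SinhOp above (and SqrtOp below) illustrate the recurring f16/bf16 pattern the shared CUDA functor has to preserve: convert to float, evaluate the float intrinsic, convert back with a single final rounding. Isolated as helpers it is just this (a sketch; the helper names are mine, not from the patch):

    #include <cuda_bf16.h>
    #include <cuda_fp16.h>

    // Promote to f32, compute, round back once — exactly what the removed
    // per-op functors did inline for half and bfloat16 inputs.
    __device__ __forceinline__ __half half_sinh(__half x) {
        return __float2half(sinhf(__half2float(x)));
    }

    __device__ __forceinline__ __nv_bfloat16 bf16_sinh(__nv_bfloat16 x) {
        return __float2bfloat16_rn(sinhf(__bfloat162float(x)));
    }

Doing the math in f32 costs two conversions per element but avoids accumulating error in the narrow format, and it is the only option for functions like sinh that have no native half-precision intrinsic.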
diff --git a/src/infiniop/ops/sqrt/cpu/sqrt_cpu.cc b/src/infiniop/ops/sqrt/cpu/sqrt_cpu.cc
index 99e723126..eb9ac4d66 100644
--- a/src/infiniop/ops/sqrt/cpu/sqrt_cpu.cc
+++ b/src/infiniop/ops/sqrt/cpu/sqrt_cpu.cc
@@ -1,48 +1,8 @@
 #include "sqrt_cpu.h"
+#include "../../../elementwise/cpu/elementwise_cpu_impl.h"
 
 namespace op::sqrt::cpu {
-Descriptor::~Descriptor() = default;
+ELEMENTWISE_CPU_IMPL_UNARY(sqrt)
 
-infiniStatus_t Descriptor::create(
-    infiniopHandle_t handle_,
-    Descriptor **desc_ptr,
-    infiniopTensorDescriptor_t out_desc,
-    std::vector<infiniopTensorDescriptor_t> input_desc_vec) {
-
-    auto handle = reinterpret_cast<device::cpu::Handle *>(handle_);
-    auto dtype = out_desc->dtype();
-
-    const auto &x_desc = input_desc_vec.at(0);
-    const auto &y_shape = out_desc->shape();
-    const auto &x_shape = x_desc->shape();
-
-    CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32);
-
-    CHECK_SAME_SHAPE(y_shape, x_shape);
-
-    // create CPU elementwise descriptor
-    CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec);
-
-    return INFINI_STATUS_SUCCESS;
-}
-
-infiniStatus_t Descriptor::calculate(
-    void *workspace,
-    size_t workspace_size,
-    void *output,
-    std::vector<const void *> inputs,
-    void *stream) const {
-
-    switch (_dtype) {
-    case INFINI_DTYPE_F16:
-        return _device_info->calculate<SqrtOp, fp16_t>(_info, output, inputs, stream);
-    case INFINI_DTYPE_F32:
-        return _device_info->calculate<SqrtOp, float>(_info, output, inputs, stream);
-    default:
-        return INFINI_STATUS_BAD_TENSOR_DTYPE;
-    }
-
-    return INFINI_STATUS_SUCCESS;
-}
 } // namespace op::sqrt::cpu
diff --git a/src/infiniop/ops/sqrt/cpu/sqrt_cpu.h b/src/infiniop/ops/sqrt/cpu/sqrt_cpu.h
index 3d026cf63..ed6217e1f 100644
--- a/src/infiniop/ops/sqrt/cpu/sqrt_cpu.h
+++ b/src/infiniop/ops/sqrt/cpu/sqrt_cpu.h
@@ -1,22 +1,9 @@
 #ifndef __SQRT_CPU_H__
 #define __SQRT_CPU_H__
 
-#include <cmath>
-
 #include "../../../elementwise/cpu/elementwise_cpu.h"
+#include "../../../elementwise/unary.h"
 
-ELEMENTWISE_DESCRIPTOR(sqrt, cpu)
-
-namespace op::sqrt::cpu {
-typedef struct SqrtOp {
-public:
-    static constexpr size_t num_inputs = 1;
-
-    template <typename T>
-    T operator()(const T &x) const {
-        return std::sqrt(x);
-    }
-} SqrtOp;
-} // namespace op::sqrt::cpu
+UNARY_ELEMENTWISE_DESCRIPTOR(sqrt, cpu, op::elementwise::unary::UnaryMode::Sqrt)
 
 #endif // __SQRT_CPU_H__
diff --git a/src/infiniop/ops/sqrt/cuda/kernel.cuh b/src/infiniop/ops/sqrt/cuda/kernel.cuh
index c82cd7dd5..40ab9708f 100644
--- a/src/infiniop/ops/sqrt/cuda/kernel.cuh
+++ b/src/infiniop/ops/sqrt/cuda/kernel.cuh
@@ -1,32 +1,10 @@
 #ifndef __SQRT_CUDA_H__
 #define __SQRT_CUDA_H__
 
-#include "../../../elementwise/nvidia/elementwise_nvidia.cuh"
-#include <cuda_fp16.h>
+#include "../../../elementwise/unary.h"
 
 namespace op::sqrt::cuda {
-typedef struct SqrtOp {
-public:
-    static constexpr size_t num_inputs = 1;
-    template <typename T>
-    __device__ __forceinline__ T operator()(const T &x) const {
-        if constexpr (std::is_same_v<T, half2>) {
-            return h2sqrt(x);
-        } else if constexpr (std::is_same_v<T, half>) {
-            return hsqrt(x);
-        } else if constexpr (std::is_same_v<T, cuda_bfloat162>) {
-            float x0 = __bfloat162float(__low2bfloat16(x));
-            float x1 = __bfloat162float(__high2bfloat16(x));
-            return __floats2bfloat162_rn(__fsqrt_rn(x0), __fsqrt_rn(x1));
-        } else if constexpr (std::is_same_v<T, cuda_bfloat16>) {
-            return __float2bfloat16_rn(__fsqrt_rn(__bfloat162float(x)));
-        } else if constexpr (std::is_same_v<T, float>) {
-            return __fsqrt_rn(x);
-        } else {
-            return std::sqrt(x);
-        }
-    }
-} SqrtOp;
+using Op = op::elementwise::unary::cuda::UnaryOp<op::elementwise::unary::UnaryMode::Sqrt>;
 } // namespace op::sqrt::cuda
 
 #endif // __SQRT_CUDA_H__
diff --git a/src/infiniop/ops/sqrt/nvidia/sqrt_nvidia.cu b/src/infiniop/ops/sqrt/nvidia/sqrt_nvidia.cu
index 519d06e89..4d6c70d72 100644
--- a/src/infiniop/ops/sqrt/nvidia/sqrt_nvidia.cu
+++ b/src/infiniop/ops/sqrt/nvidia/sqrt_nvidia.cu
@@ -1,54 +1,10 @@
-#include "../../../elementwise/nvidia/elementwise_nvidia.cuh"
+#include "../../../elementwise/nvidia/elementwise_nvidia_impl.cuh"
 #include "../cuda/kernel.cuh"
 #include "sqrt_nvidia.cuh"
 
 namespace op::sqrt::nvidia {
-Descriptor::~Descriptor() = default;
+ELEMENTWISE_NVIDIA_IMPL_UNARY(sqrt)
 
-infiniStatus_t Descriptor::create(
-    infiniopHandle_t handle_,
-    Descriptor **desc_ptr,
-    infiniopTensorDescriptor_t out_desc,
-    std::vector<infiniopTensorDescriptor_t> input_desc_vec) {
-
-    auto handle = reinterpret_cast<device::nvidia::Handle *>(handle_);
-    auto dtype = out_desc->dtype();
-
-    const auto &x_desc = input_desc_vec.at(0);
-    const auto &y_shape = out_desc->shape();
-    const auto &x_shape = x_desc->shape();
-
-    CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32);
-
-    CHECK_SAME_SHAPE(y_shape, x_shape);
-
-    // create CUDA elementwise descriptor
-    CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec)
-
-    return INFINI_STATUS_SUCCESS;
-}
-
-infiniStatus_t Descriptor::calculate(
-    void *workspace,
-    size_t workspace_size,
-    void *output,
-    std::vector<const void *> inputs,
-    void *stream) const {
-    if (workspace_size < _workspace_size) {
-        return INFINI_STATUS_INSUFFICIENT_WORKSPACE;
-    }
-
-    switch (_dtype) {
-    case INFINI_DTYPE_F16:
-        return _device_info->calculate<256, cuda::SqrtOp, half>(_info, workspace, output, inputs, stream);
-    case INFINI_DTYPE_F32:
-        return _device_info->calculate<256, cuda::SqrtOp, float>(_info, workspace, output, inputs, stream);
-    default:
-        return INFINI_STATUS_BAD_TENSOR_DTYPE;
-    }
-
-    return INFINI_STATUS_SUCCESS;
-}
 } // namespace op::sqrt::nvidia
diff --git a/src/infiniop/ops/sqrt/operator.cc b/src/infiniop/ops/sqrt/operator.cc
index b11c8a4b5..fe999f58f 100644
--- a/src/infiniop/ops/sqrt/operator.cc
+++ b/src/infiniop/ops/sqrt/operator.cc
@@ -1,5 +1,4 @@
-#include "../../operator.h"
-#include "../../handle.h"
+#include "../../operator_impl.h"
 #include "infiniop/ops/sqrt.h"
 
 #ifdef ENABLE_CPU_API
@@ -9,131 +8,4 @@
 #include "nvidia/sqrt_nvidia.cuh"
 #endif
 
-__C infiniStatus_t infiniopCreateSqrtDescriptor(
-    infiniopHandle_t handle,
-    infiniopSqrtDescriptor_t *desc_ptr,
-    infiniopTensorDescriptor_t y_desc,
-    infiniopTensorDescriptor_t x_desc) {
-
-#define CREATE(CASE, NAMESPACE) \
-    case CASE: \
-        return op::sqrt::NAMESPACE::Descriptor::create( \
-            handle, \
-            reinterpret_cast<op::sqrt::NAMESPACE::Descriptor **>(desc_ptr), \
-            y_desc, \
-            {x_desc})
-
-    switch (handle->device) {
-
-#ifdef ENABLE_CPU_API
-        CREATE(INFINI_DEVICE_CPU, cpu);
-#endif
-#ifdef ENABLE_NVIDIA_API
-        CREATE(INFINI_DEVICE_NVIDIA, nvidia);
-#endif
-#ifdef ENABLE_ILUVATAR_API
-        CREATE(INFINI_DEVICE_ILUVATAR, nvidia);
-#endif
-#ifdef ENABLE_QY_API
-        CREATE(INFINI_DEVICE_QY, nvidia);
-#endif
-
-    default:
-        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
-    }
-
-#undef CREATE
-}
-
-__C infiniStatus_t infiniopGetSqrtWorkspaceSize(infiniopSqrtDescriptor_t desc, size_t *size) {
-
-#define GET(CASE, NAMESPACE) \
-    case CASE: \
-        *size = reinterpret_cast<op::sqrt::NAMESPACE::Descriptor *>(desc)->workspaceSize(); \
-        return INFINI_STATUS_SUCCESS;
-
-    switch (desc->device_type) {
-#ifdef ENABLE_CPU_API
-        GET(INFINI_DEVICE_CPU, cpu)
-#endif
-#ifdef ENABLE_NVIDIA_API
-        GET(INFINI_DEVICE_NVIDIA, nvidia)
-#endif
-#ifdef ENABLE_ILUVATAR_API
-        GET(INFINI_DEVICE_ILUVATAR, nvidia)
-#endif
-#ifdef ENABLE_QY_API
-        GET(INFINI_DEVICE_QY, nvidia)
-#endif
-    default:
-        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
-    }
-#undef GET
-
-    return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
-}
-
-__C infiniStatus_t infiniopSqrt(
-    infiniopSqrtDescriptor_t desc,
-    void *workspace,
-    size_t workspace_size,
-    void *y,
-    const void *x,
-    void *stream) {
-
-#define CALCULATE(CASE, NAMESPACE) \
-    case CASE: \
-        return reinterpret_cast<const op::sqrt::NAMESPACE::Descriptor *>(desc) \
-            ->calculate(workspace, workspace_size, y, {x}, stream)
-
-    switch (desc->device_type) {
-
-#ifdef ENABLE_CPU_API
-        CALCULATE(INFINI_DEVICE_CPU, cpu);
-#endif
-#ifdef ENABLE_NVIDIA_API
-        CALCULATE(INFINI_DEVICE_NVIDIA, nvidia);
-#endif
-#ifdef ENABLE_ILUVATAR_API
-        CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia);
-#endif
-#ifdef ENABLE_QY_API
-        CALCULATE(INFINI_DEVICE_QY, nvidia);
-#endif
-
-    default:
-        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
-    }
-
-#undef CALCULATE
-}
-
-__C infiniStatus_t
-infiniopDestroySqrtDescriptor(infiniopSqrtDescriptor_t desc) {
-
-#define DELETE(CASE, NAMESPACE) \
-    case CASE: \
-        delete reinterpret_cast<const op::sqrt::NAMESPACE::Descriptor *>(desc); \
-        return INFINI_STATUS_SUCCESS
-
-    switch (desc->device_type) {
-
-#ifdef ENABLE_CPU_API
-        DELETE(INFINI_DEVICE_CPU, cpu);
-#endif
-#ifdef ENABLE_NVIDIA_API
-        DELETE(INFINI_DEVICE_NVIDIA, nvidia);
-#endif
-#ifdef ENABLE_ILUVATAR_API
-        DELETE(INFINI_DEVICE_ILUVATAR, nvidia);
-#endif
-#ifdef ENABLE_QY_API
-        DELETE(INFINI_DEVICE_QY, nvidia);
-#endif
-
-    default:
-        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
-    }
-
-#undef DELETE
-}
+UNARY_OP_IMPL(sqrt, Sqrt)
diff --git a/src/infiniop/ops/tan/cpu/tan_cpu.cc b/src/infiniop/ops/tan/cpu/tan_cpu.cc
index 2947dfc5e..5166cf64f 100644
--- a/src/infiniop/ops/tan/cpu/tan_cpu.cc
+++ b/src/infiniop/ops/tan/cpu/tan_cpu.cc
@@ -1,48 +1,8 @@
 #include "tan_cpu.h"
+#include "../../../elementwise/cpu/elementwise_cpu_impl.h"
 
 namespace op::tan::cpu {
-Descriptor::~Descriptor() = default;
+ELEMENTWISE_CPU_IMPL_UNARY(tan)
 
-infiniStatus_t Descriptor::create(
-    infiniopHandle_t handle_,
-    Descriptor **desc_ptr,
-    infiniopTensorDescriptor_t out_desc,
-    std::vector<infiniopTensorDescriptor_t> input_desc_vec) {
-
-    auto handle = reinterpret_cast<device::cpu::Handle *>(handle_);
-    auto dtype = out_desc->dtype();
-
-    const auto &x_desc = input_desc_vec.at(0);
-    const auto &y_shape = out_desc->shape();
-    const auto &x_shape = x_desc->shape();
-
-    CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32);
-
-    CHECK_SAME_SHAPE(y_shape, x_shape);
-
-    // create CPU elementwise descriptor
-    CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec);
-
-    return INFINI_STATUS_SUCCESS;
-}
-
-infiniStatus_t Descriptor::calculate(
-    void *workspace,
-    size_t workspace_size,
-    void *output,
-    std::vector<const void *> inputs,
-    void *stream) const {
-
-    switch (_dtype) {
-    case INFINI_DTYPE_F16:
-        return _device_info->calculate<TanOp, fp16_t>(_info, output, inputs, stream);
-    case INFINI_DTYPE_F32:
-        return _device_info->calculate<TanOp, float>(_info, output, inputs, stream);
-    default:
-        return INFINI_STATUS_BAD_TENSOR_DTYPE;
-    }
-
-    return INFINI_STATUS_SUCCESS;
-}
 } // namespace op::tan::cpu
diff --git a/src/infiniop/ops/tan/cpu/tan_cpu.h b/src/infiniop/ops/tan/cpu/tan_cpu.h
index c3a22456c..6c697c311 100644
--- a/src/infiniop/ops/tan/cpu/tan_cpu.h
+++ b/src/infiniop/ops/tan/cpu/tan_cpu.h
@@ -1,22 +1,9 @@
 #ifndef __TAN_CPU_H__
 #define __TAN_CPU_H__
 
-#include <cmath>
-
 #include "../../../elementwise/cpu/elementwise_cpu.h"
+#include "../../../elementwise/unary.h"
 
-ELEMENTWISE_DESCRIPTOR(tan, cpu)
-
-namespace op::tan::cpu {
-typedef struct TanOp {
-public:
-    static constexpr size_t num_inputs = 1;
-
-    template <typename T>
-    T operator()(const T &x) const {
-        return std::tan(x);
-    }
-} TanOp;
-} // namespace op::tan::cpu
+UNARY_ELEMENTWISE_DESCRIPTOR(tan, cpu, op::elementwise::unary::UnaryMode::Tan)
 
 #endif // __TAN_CPU_H__
diff --git a/src/infiniop/ops/tan/cuda/kernel.cuh b/src/infiniop/ops/tan/cuda/kernel.cuh
index bbd8facaa..c3cf45350 100644
--- a/src/infiniop/ops/tan/cuda/kernel.cuh
+++ b/src/infiniop/ops/tan/cuda/kernel.cuh
@@ -1,55 +1,10 @@
 #ifndef __TAN_CUDA_H__
 #define __TAN_CUDA_H__
 
-#include "../../../elementwise/nvidia/elementwise_nvidia.cuh"
-#include <cuda_bf16.h>
-#include <cuda_fp16.h>
-
-#define TAN_THRESHOLD 15000
+#include "../../../elementwise/unary.h"
 
 namespace op::tan::cuda {
-typedef struct TanOp {
-public:
-    static constexpr size_t num_inputs = 1;
-    template <typename T>
-    __device__ __forceinline__ T operator()(const T &x) const {
-        if constexpr (std::is_same_v<T, half2>) {
-            return h2sin(x) / h2cos(x);
-        } else if constexpr (std::is_same_v<T, half>) {
-            float tan_f = __tanf(__half2float(x));
-            if (std::fabs(tan_f) > TAN_THRESHOLD) {
-                return __float2half(tanf(__half2float(x)));
-            }
-            return __float2half(tan_f);
-        } else if constexpr (std::is_same_v<T, cuda_bfloat162>) {
-            float x0 = __bfloat162float(__low2bfloat16(x));
-            float x1 = __bfloat162float(__high2bfloat16(x));
-            float tan_f0 = __tanf(x0);
-            float tan_f1 = __tanf(x1);
-            if (std::fabs(tan_f0) > TAN_THRESHOLD) {
-                tan_f0 = tanf(x0);
-            }
-            if (std::fabs(tan_f1) > TAN_THRESHOLD) {
-                tan_f1 = tanf(x1);
-            }
-            return __floats2bfloat162_rn(tan_f0, tan_f1);
-        } else if constexpr (std::is_same_v<T, cuda_bfloat16>) {
-            float tan_f = __tanf(__bfloat162float(x));
-            if (std::fabs(tan_f) > TAN_THRESHOLD) {
-                return __float2bfloat16_rn(tanf(__bfloat162float(x)));
-            }
-            return __float2bfloat16_rn(tan_f);
-        } else if constexpr (std::is_same_v<T, float>) {
-            float tan_f = __tanf(x);
-            if (std::fabs(tan_f) > TAN_THRESHOLD) {
-                return tanf(x);
-            }
-            return tan_f;
-        } else {
-            return std::tan(x);
-        }
-    }
-} TanOp;
+using Op = op::elementwise::unary::cuda::UnaryOp<op::elementwise::unary::UnaryMode::Tan>;
 } // namespace op::tan::cuda
 
 #endif // __TAN_CUDA_H__
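The deleted TanOp is the one functor in this series that carried real numerical logic rather than plain type plumbing: it evaluated the fast `__tanf` intrinsic and fell back to the slower, accurate `tanf` whenever the result magnitude exceeded `TAN_THRESHOLD` (15000), i.e. near the poles where `__tanf`'s error grows. If the shared `UnaryOp` is to be drop-in equivalent, it presumably keeps this two-tier evaluation; isolated, the pattern is:

    #define TAN_THRESHOLD 15000

    // Fast-path/accurate-path tangent, as in the removed TanOp: __tanf is
    // cheap but loses accuracy near odd multiples of pi/2, where |tan| is
    // huge, so large results are recomputed with the precise tanf.
    __device__ __forceinline__ float fast_tan(float x) {
        float t = __tanf(x);
        if (fabsf(t) > TAN_THRESHOLD) {
            t = tanf(x);
        }
        return t;
    }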
diff --git a/src/infiniop/ops/tan/nvidia/tan_nvidia.cu b/src/infiniop/ops/tan/nvidia/tan_nvidia.cu
index b4c24e2fe..5f56dcb6f 100644
--- a/src/infiniop/ops/tan/nvidia/tan_nvidia.cu
+++ b/src/infiniop/ops/tan/nvidia/tan_nvidia.cu
@@ -1,54 +1,10 @@
-#include "../../../elementwise/nvidia/elementwise_nvidia.cuh"
+#include "../../../elementwise/nvidia/elementwise_nvidia_impl.cuh"
 #include "../cuda/kernel.cuh"
 #include "tan_nvidia.cuh"
 
 namespace op::tan::nvidia {
-Descriptor::~Descriptor() = default;
+ELEMENTWISE_NVIDIA_IMPL_UNARY(tan)
 
-infiniStatus_t Descriptor::create(
-    infiniopHandle_t handle_,
-    Descriptor **desc_ptr,
-    infiniopTensorDescriptor_t out_desc,
-    std::vector<infiniopTensorDescriptor_t> input_desc_vec) {
-
-    auto handle = reinterpret_cast<device::nvidia::Handle *>(handle_);
-    auto dtype = out_desc->dtype();
-
-    const auto &x_desc = input_desc_vec.at(0);
-    const auto &y_shape = out_desc->shape();
-    const auto &x_shape = x_desc->shape();
-
-    CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32);
-
-    CHECK_SAME_SHAPE(y_shape, x_shape);
-
-    // create CUDA elementwise descriptor
-    CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec)
-
-    return INFINI_STATUS_SUCCESS;
-}
-
-infiniStatus_t Descriptor::calculate(
-    void *workspace,
-    size_t workspace_size,
-    void *output,
-    std::vector<const void *> inputs,
-    void *stream) const {
-    if (workspace_size < _workspace_size) {
-        return INFINI_STATUS_INSUFFICIENT_WORKSPACE;
-    }
-
-    switch (_dtype) {
-    case INFINI_DTYPE_F16:
-        return _device_info->calculate<256, cuda::TanOp, half>(_info, workspace, output, inputs, stream);
-    case INFINI_DTYPE_F32:
-        return _device_info->calculate<256, cuda::TanOp, float>(_info, workspace, output, inputs, stream);
-    default:
-        return INFINI_STATUS_BAD_TENSOR_DTYPE;
-    }
-
-    return INFINI_STATUS_SUCCESS;
-}
 } // namespace op::tan::nvidia
diff --git a/src/infiniop/ops/tan/operator.cc b/src/infiniop/ops/tan/operator.cc
index 48ae8d48e..ae506dcd8 100644
--- a/src/infiniop/ops/tan/operator.cc
+++ b/src/infiniop/ops/tan/operator.cc
@@ -1,5 +1,4 @@
-#include "../../operator.h"
-#include "../../handle.h"
+#include "../../operator_impl.h"
 #include "infiniop/ops/tan.h"
 
 #ifdef ENABLE_CPU_API
@@ -9,131 +8,4 @@
 #include "nvidia/tan_nvidia.cuh"
 #endif
 
-__C infiniStatus_t infiniopCreateTanDescriptor(
-    infiniopHandle_t handle,
-    infiniopTanDescriptor_t *desc_ptr,
-    infiniopTensorDescriptor_t y_desc,
-    infiniopTensorDescriptor_t x_desc) {
-
-#define CREATE(CASE, NAMESPACE) \
-    case CASE: \
-        return op::tan::NAMESPACE::Descriptor::create( \
-            handle, \
-            reinterpret_cast<op::tan::NAMESPACE::Descriptor **>(desc_ptr), \
-            y_desc, \
-            {x_desc})
-
-    switch (handle->device) {
-
-#ifdef ENABLE_CPU_API
-        CREATE(INFINI_DEVICE_CPU, cpu);
-#endif
-#ifdef ENABLE_NVIDIA_API
-        CREATE(INFINI_DEVICE_NVIDIA, nvidia);
-#endif
-#ifdef ENABLE_ILUVATAR_API
-        CREATE(INFINI_DEVICE_ILUVATAR, nvidia);
-#endif
-#ifdef ENABLE_QY_API
-        CREATE(INFINI_DEVICE_QY, nvidia);
-#endif
-
-    default:
-        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
-    }
-
-#undef CREATE
-}
-
-__C infiniStatus_t infiniopGetTanWorkspaceSize(infiniopTanDescriptor_t desc, size_t *size) {
-
-#define GET(CASE, NAMESPACE) \
-    case CASE: \
-        *size = reinterpret_cast<op::tan::NAMESPACE::Descriptor *>(desc)->workspaceSize(); \
-        return INFINI_STATUS_SUCCESS;
-
-    switch (desc->device_type) {
-#ifdef ENABLE_CPU_API
-        GET(INFINI_DEVICE_CPU, cpu)
-#endif
-#ifdef ENABLE_NVIDIA_API
-        GET(INFINI_DEVICE_NVIDIA, nvidia)
-#endif
-#ifdef ENABLE_ILUVATAR_API
-        GET(INFINI_DEVICE_ILUVATAR, nvidia)
-#endif
-#ifdef ENABLE_QY_API
-        GET(INFINI_DEVICE_QY, nvidia)
-#endif
-    default:
-        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
-    }
-#undef GET
-
-    return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
-}
-
-__C infiniStatus_t infiniopTan(
-    infiniopTanDescriptor_t desc,
-    void *workspace,
-    size_t workspace_size,
-    void *y,
-    const void *x,
-    void *stream) {
-
-#define CALCULATE(CASE, NAMESPACE) \
-    case CASE: \
-        return reinterpret_cast<const op::tan::NAMESPACE::Descriptor *>(desc) \
-            ->calculate(workspace, workspace_size, y, {x}, stream)
-
-    switch (desc->device_type) {
-
-#ifdef ENABLE_CPU_API
-        CALCULATE(INFINI_DEVICE_CPU, cpu);
-#endif
-#ifdef ENABLE_NVIDIA_API
-        CALCULATE(INFINI_DEVICE_NVIDIA, nvidia);
-#endif
-#ifdef ENABLE_ILUVATAR_API
-        CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia);
-#endif
-#ifdef ENABLE_QY_API
-        CALCULATE(INFINI_DEVICE_QY, nvidia);
-#endif
-
-    default:
-        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
-    }
-
-#undef CALCULATE
-}
-
-__C infiniStatus_t
-infiniopDestroyTanDescriptor(infiniopTanDescriptor_t desc) {
-
-#define DELETE(CASE, NAMESPACE) \
-    case CASE: \
-        delete reinterpret_cast<const op::tan::NAMESPACE::Descriptor *>(desc); \
-        return INFINI_STATUS_SUCCESS
-
-    switch (desc->device_type) {
-
-#ifdef ENABLE_CPU_API
-        DELETE(INFINI_DEVICE_CPU, cpu);
-#endif
-#ifdef ENABLE_NVIDIA_API
-        DELETE(INFINI_DEVICE_NVIDIA, nvidia);
-#endif
-#ifdef ENABLE_ILUVATAR_API
-        DELETE(INFINI_DEVICE_ILUVATAR, nvidia);
-#endif
-#ifdef ENABLE_QY_API
-        DELETE(INFINI_DEVICE_QY, nvidia);
-#endif
-
-    default:
-        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
-    }
-
-#undef DELETE
-}
+UNARY_OP_IMPL(tan, Tan)
diff --git a/src/infiniop/ops/tanh/cuda/kernel.cuh b/src/infiniop/ops/tanh/cuda/kernel.cuh
index e336a4995..d987ac7c5 100644
--- a/src/infiniop/ops/tanh/cuda/kernel.cuh
+++ b/src/infiniop/ops/tanh/cuda/kernel.cuh
@@ -1,44 +1,10 @@
 #ifndef __TANH_CUDA_H__
 #define __TANH_CUDA_H__
 
-#include <cuda_fp16.h>
+#include "../../../elementwise/unary.h"
 
 namespace op::tanh::cuda {
-typedef struct TanhOp {
-    static constexpr size_t num_inputs = 1;
-
-    __device__ __forceinline__ float tanh_f32_func(float x) const {
-        return tanhf(x);
-    }
-    template <typename T>
-    __device__ __forceinline__ T operator()(const T &input) const {
-        if constexpr (std::is_same_v<T, half2>) {
-            float2 vf = __half22float2(input);
-            float2 vr = make_float2(tanh_f32_func(vf.x), tanh_f32_func(vf.y));
-            return __float22half2_rn(vr);
-        } else if constexpr (std::is_same_v<T, half>) {
-            float xf = __half2float(input);
-            float yf = tanh_f32_func(xf);
-            return __float2half_rn(yf);
-        } else if constexpr (std::is_same_v<T, cuda_bfloat162>) {
-            float f0 = __bfloat162float(__low2bfloat16(input));
-            float f1 = __bfloat162float(__high2bfloat16(input));
-            float r0 = tanh_f32_func(f0);
-            float r1 = tanh_f32_func(f1);
-            return __floats2bfloat162_rn(r0, r1);
-        } else if constexpr (std::is_same_v<T, cuda_bfloat16>) {
-            float xf = __bfloat162float(input);
-            float rf = tanh_f32_func(xf);
-            return __float2bfloat16_rn(rf);
-        } else if constexpr (std::is_same_v<T, float>) {
-            return tanh_f32_func(input);
-        } else if constexpr (std::is_same_v<T, double>) {
-            return std::tanh(input);
-        } else {
-            return std::tanh(input);
-        }
-    }
-} TanhOp;
+using Op = op::elementwise::unary::cuda::UnaryOp<op::elementwise::unary::UnaryMode::Tanh>;
 } // namespace op::tanh::cuda
 
 #endif // __TANH_CUDA_H__
diff --git a/src/infiniop/ops/tanh/nvidia/tanh_nvidia.cu b/src/infiniop/ops/tanh/nvidia/tanh_nvidia.cu
index a2c36551c..62f02da67 100644
--- a/src/infiniop/ops/tanh/nvidia/tanh_nvidia.cu
+++ b/src/infiniop/ops/tanh/nvidia/tanh_nvidia.cu
@@ -1,59 +1,10 @@
-#include "../../../elementwise/nvidia/elementwise_nvidia.cuh"
+#include "../../../elementwise/nvidia/elementwise_nvidia_impl.cuh"
 #include "../cuda/kernel.cuh"
 #include "tanh_nvidia.cuh"
 
 namespace op::tanh::nvidia {
-Descriptor::~Descriptor() = default;
+ELEMENTWISE_NVIDIA_IMPL_UNARY(tanh)
 
-infiniStatus_t Descriptor::create(
-    infiniopHandle_t handle_,
-    Descriptor **desc_ptr,
-    infiniopTensorDescriptor_t out_desc,
-    std::vector<infiniopTensorDescriptor_t> input_desc_vec) {
-
-    auto handle = reinterpret_cast<device::nvidia::Handle *>(handle_);
-    auto dtype = out_desc->dtype();
-
-    const auto &input_desc = input_desc_vec.at(0);
-    const auto &output_shape = out_desc->shape();
-    const auto &input_shape = input_desc->shape();
-
-    CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16);
-
-    CHECK_SAME_SHAPE(output_shape, input_shape);
-
-    // create CUDA elementwise descriptor
-    CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec)
-
-    return INFINI_STATUS_SUCCESS;
-}
-
-infiniStatus_t Descriptor::calculate(
-    void *workspace,
-    size_t workspace_size,
-    void *output,
-    std::vector<const void *> inputs,
-    void *stream) const {
-
-    if (workspace_size < _workspace_size) {
-        return INFINI_STATUS_INSUFFICIENT_WORKSPACE;
-    }
-
-    switch (_dtype) {
-    case INFINI_DTYPE_F16:
-        return _device_info->calculate<256, cuda::TanhOp, half>(_info, workspace, output, inputs, stream);
-    case INFINI_DTYPE_BF16:
-        return _device_info->calculate<256, cuda::TanhOp, cuda_bfloat16>(_info, workspace, output, inputs, stream);
-    case INFINI_DTYPE_F32:
-        return _device_info->calculate<256, cuda::TanhOp, float>(_info, workspace, output, inputs, stream);
-    case INFINI_DTYPE_F64:
-        return _device_info->calculate<256, cuda::TanhOp, double>(_info, workspace, output, inputs, stream);
-    default:
-        return INFINI_STATUS_BAD_TENSOR_DTYPE;
-    }
-
-    return INFINI_STATUS_SUCCESS;
-}
 } // namespace op::tanh::nvidia
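Net effect for each unary operator touched here: the four files that used to carry a few hundred lines of near-identical boilerplate now reduce to one macro or alias line apiece. Taking round as the representative (each line below is taken from the + side of the hunks above):

    // cpu/round_cpu.h
    UNARY_ELEMENTWISE_DESCRIPTOR(round, cpu, op::elementwise::unary::UnaryMode::Round)

    // cpu/round_cpu.cc
    ELEMENTWISE_CPU_IMPL_UNARY(round)

    // nvidia/round_nvidia.cu
    ELEMENTWISE_NVIDIA_IMPL_UNARY(round)

    // operator.cc
    UNARY_OP_IMPL(round, Round)

together with the `using Op = ...` alias in `cuda/kernel.cuh`. Adding the next unary operator should therefore only require a new `UnaryMode` entry plus these stamped-out invocations, rather than another copy of the descriptor, dispatch, and entry-point boilerplate.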