From d42061d2fb9e3707b399c271883aeb8c0f409bdd Mon Sep 17 00:00:00 2001 From: suss <1152623206@qq.com> Date: Fri, 16 Jan 2026 00:06:23 +0800 Subject: [PATCH] Add quickgelu/gelutanh ops needed by KV compression --- include/infiniop.h | 2 + include/infiniop/ops/gelutanh.h | 43 ++++++ include/infiniop/ops/quickgelu.h | 42 +++++ src/infiniop/ops/add/operator.cc | 14 +- src/infiniop/ops/conv/operator.cc | 14 +- src/infiniop/ops/gelu/operator.cc | 14 +- src/infiniop/ops/gelutanh/cpu/gelutanh_cpu.cc | 53 +++++++ src/infiniop/ops/gelutanh/cpu/gelutanh_cpu.h | 27 ++++ src/infiniop/ops/gelutanh/cuda/kernel.cuh | 59 +++++++ .../ops/gelutanh/nvidia/gelutanh_nvidia.cu | 71 +++++++++ .../ops/gelutanh/nvidia/gelutanh_nvidia.cuh | 9 ++ src/infiniop/ops/gelutanh/operator.cc | 144 ++++++++++++++++++ src/infiniop/ops/layer_norm/operator.cc | 14 +- .../ops/quickgelu/cpu/quickgelu_cpu.cc | 53 +++++++ .../ops/quickgelu/cpu/quickgelu_cpu.h | 26 ++++ src/infiniop/ops/quickgelu/cuda/kernel.cuh | 61 ++++++++ .../ops/quickgelu/nvidia/quickgelu_nvidia.cu | 71 +++++++++ .../ops/quickgelu/nvidia/quickgelu_nvidia.cuh | 9 ++ src/infiniop/ops/quickgelu/operator.cc | 144 ++++++++++++++++++ src/infiniop/ops/relu/operator.cc | 14 +- src/infiniop/ops/sigmoid/operator.cc | 14 +- src/infiniop/ops/tanh/operator.cc | 14 +- xmake/hygon.lua | 10 ++ xmake/nvidia.lua | 8 +- 24 files changed, 919 insertions(+), 11 deletions(-) create mode 100644 include/infiniop/ops/gelutanh.h create mode 100644 include/infiniop/ops/quickgelu.h create mode 100644 src/infiniop/ops/gelutanh/cpu/gelutanh_cpu.cc create mode 100644 src/infiniop/ops/gelutanh/cpu/gelutanh_cpu.h create mode 100644 src/infiniop/ops/gelutanh/cuda/kernel.cuh create mode 100644 src/infiniop/ops/gelutanh/nvidia/gelutanh_nvidia.cu create mode 100644 src/infiniop/ops/gelutanh/nvidia/gelutanh_nvidia.cuh create mode 100644 src/infiniop/ops/gelutanh/operator.cc create mode 100644 src/infiniop/ops/quickgelu/cpu/quickgelu_cpu.cc create mode 100644 src/infiniop/ops/quickgelu/cpu/quickgelu_cpu.h create mode 100644 src/infiniop/ops/quickgelu/cuda/kernel.cuh create mode 100644 src/infiniop/ops/quickgelu/nvidia/quickgelu_nvidia.cu create mode 100644 src/infiniop/ops/quickgelu/nvidia/quickgelu_nvidia.cuh create mode 100644 src/infiniop/ops/quickgelu/operator.cc diff --git a/include/infiniop.h b/include/infiniop.h index c0a09fcb4..f0d682863 100644 --- a/include/infiniop.h +++ b/include/infiniop.h @@ -10,6 +10,7 @@ #include "infiniop/ops/conv.h" #include "infiniop/ops/dequantize_awq.h" #include "infiniop/ops/gelu.h" +#include "infiniop/ops/gelutanh.h" #include "infiniop/ops/gemm.h" #include "infiniop/ops/layer_norm.h" #include "infiniop/ops/logsoftmax.h" @@ -20,6 +21,7 @@ #include "infiniop/ops/paged_attention_prefill.h" #include "infiniop/ops/paged_caching.h" #include "infiniop/ops/random_sample.h" +#include "infiniop/ops/quickgelu.h" #include "infiniop/ops/rearrange.h" #include "infiniop/ops/relu.h" #include "infiniop/ops/rms_norm.h" diff --git a/include/infiniop/ops/gelutanh.h b/include/infiniop/ops/gelutanh.h new file mode 100644 index 000000000..5ff6dad23 --- /dev/null +++ b/include/infiniop/ops/gelutanh.h @@ -0,0 +1,43 @@ +#ifndef __INFINIOP_GELUTANH_API_H__ +#define __INFINIOP_GELUTANH_API_H__ + +#include "../operator_descriptor.h" + +typedef struct InfiniopDescriptor *infiniopGeluTanhDescriptor_t; + +/** + * Create GELU-Tanh descriptor + * + * y = x * 0.5 * (1 + tanh(sqrt(2/pi) * (x + 0.044715 * x^3))) + */ +__C __export infiniStatus_t infiniopCreateGeluTanhDescriptor( + infiniopHandle_t handle, + infiniopGeluTanhDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y, + infiniopTensorDescriptor_t x); + +/** + * Query workspace size + */ +__C __export infiniStatus_t infiniopGetGeluTanhWorkspaceSize( + infiniopGeluTanhDescriptor_t desc, + size_t *size); + +/** + * Launch GELU-Tanh operator + */ +__C __export infiniStatus_t infiniopGeluTanh( + infiniopGeluTanhDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *y, + const void *x, + void *stream); + +/** + * Destroy descriptor + */ +__C __export infiniStatus_t infiniopDestroyGeluTanhDescriptor( + infiniopGeluTanhDescriptor_t desc); + +#endif diff --git a/include/infiniop/ops/quickgelu.h b/include/infiniop/ops/quickgelu.h new file mode 100644 index 000000000..1ea19ccf1 --- /dev/null +++ b/include/infiniop/ops/quickgelu.h @@ -0,0 +1,42 @@ +#ifndef __INFINIOP_QUICKGELU_API_H__ +#define __INFINIOP_QUICKGELU_API_H__ + +#include "../operator_descriptor.h" + +typedef struct InfiniopDescriptor *infiniopQuickGeluDescriptor_t; + +/** + * Create QuickGELU descriptor + * y = x * sigmoid(1.702 * x) + */ +__C __export infiniStatus_t infiniopCreateQuickGeluDescriptor( + infiniopHandle_t handle, + infiniopQuickGeluDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y, + infiniopTensorDescriptor_t x); + +/** + * Query workspace size + */ +__C __export infiniStatus_t infiniopGetQuickGeluWorkspaceSize( + infiniopQuickGeluDescriptor_t desc, + size_t *size); + +/** + * Launch QuickGELU operator + */ +__C __export infiniStatus_t infiniopQuickGelu( + infiniopQuickGeluDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *y, + const void *x, + void *stream); + +/** + * Destroy descriptor + */ +__C __export infiniStatus_t infiniopDestroyQuickGeluDescriptor( + infiniopQuickGeluDescriptor_t desc); + +#endif diff --git a/src/infiniop/ops/add/operator.cc b/src/infiniop/ops/add/operator.cc index eba226421..87528c4b3 100644 --- a/src/infiniop/ops/add/operator.cc +++ b/src/infiniop/ops/add/operator.cc @@ -5,7 +5,7 @@ #ifdef ENABLE_CPU_API #include "cpu/add_cpu.h" #endif -#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API) +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API) || defined(ENABLE_HYGON_API) #include "nvidia/add_nvidia.cuh" #endif #ifdef ENABLE_METAX_API @@ -51,6 +51,9 @@ __C infiniStatus_t infiniopCreateAddDescriptor( #ifdef ENABLE_QY_API CREATE(INFINI_DEVICE_QY, nvidia); #endif +#ifdef ENABLE_HYGON_API + CREATE(INFINI_DEVICE_HYGON, nvidia); +#endif #ifdef ENABLE_METAX_API CREATE(INFINI_DEVICE_METAX, metax); #endif @@ -91,6 +94,9 @@ __C infiniStatus_t infiniopGetAddWorkspaceSize(infiniopAddDescriptor_t desc, siz #ifdef ENABLE_QY_API GET(INFINI_DEVICE_QY, nvidia); #endif +#ifdef ENABLE_HYGON_API + GET(INFINI_DEVICE_HYGON, nvidia); +#endif #ifdef ENABLE_METAX_API GET(INFINI_DEVICE_METAX, metax); #endif @@ -139,6 +145,9 @@ __C infiniStatus_t infiniopAdd( #ifdef ENABLE_QY_API CALCULATE(INFINI_DEVICE_QY, nvidia); #endif +#ifdef ENABLE_HYGON_API + CALCULATE(INFINI_DEVICE_HYGON, nvidia); +#endif #ifdef ENABLE_METAX_API CALCULATE(INFINI_DEVICE_METAX, metax); #endif @@ -181,6 +190,9 @@ infiniopDestroyAddDescriptor(infiniopAddDescriptor_t desc) { #ifdef ENABLE_QY_API DELETE(INFINI_DEVICE_QY, nvidia); #endif +#ifdef ENABLE_HYGON_API + DELETE(INFINI_DEVICE_HYGON, nvidia); +#endif #ifdef ENABLE_METAX_API DELETE(INFINI_DEVICE_METAX, metax); #endif diff --git a/src/infiniop/ops/conv/operator.cc b/src/infiniop/ops/conv/operator.cc index 4c974febc..ed30767be 100644 --- a/src/infiniop/ops/conv/operator.cc +++ b/src/infiniop/ops/conv/operator.cc @@ -5,7 +5,7 @@ #ifdef ENABLE_CPU_API #include "cpu/conv_cpu.h" #endif -#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API) +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API) || defined(ENABLE_HYGON_API) #include "nvidia/conv_nvidia.cuh" #endif @@ -45,6 +45,9 @@ __C __export infiniStatus_t infiniopCreateConvDescriptor(infiniopHandle_t handle #ifdef ENABLE_QY_API CREATE(INFINI_DEVICE_QY, nvidia); #endif +#ifdef ENABLE_HYGON_API + CREATE(INFINI_DEVICE_HYGON, nvidia); +#endif default: return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; @@ -76,6 +79,9 @@ infiniopGetConvWorkspaceSize( #ifdef ENABLE_QY_API GET(INFINI_DEVICE_QY, nvidia); #endif +#ifdef ENABLE_HYGON_API + GET(INFINI_DEVICE_HYGON, nvidia); +#endif default: return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; @@ -115,6 +121,9 @@ __C infiniStatus_t infiniopConv( #ifdef ENABLE_QY_API CALCULATE(INFINI_DEVICE_QY, nvidia); #endif +#ifdef ENABLE_HYGON_API + CALCULATE(INFINI_DEVICE_HYGON, nvidia); +#endif default: return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; @@ -142,6 +151,9 @@ infiniopDestroyConvDescriptor(infiniopConvDescriptor_t desc) { #ifdef ENABLE_QY_API DELETE(INFINI_DEVICE_QY, nvidia); #endif +#ifdef ENABLE_HYGON_API + DELETE(INFINI_DEVICE_HYGON, nvidia); +#endif default: return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; diff --git a/src/infiniop/ops/gelu/operator.cc b/src/infiniop/ops/gelu/operator.cc index 262808ff0..0d2ff00f5 100644 --- a/src/infiniop/ops/gelu/operator.cc +++ b/src/infiniop/ops/gelu/operator.cc @@ -5,7 +5,7 @@ #ifdef ENABLE_CPU_API #include "cpu/gelu_cpu.h" #endif -#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API) +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API) || defined(ENABLE_HYGON_API) #include "nvidia/gelu_nvidia.cuh" #endif #ifdef ENABLE_METAX_API @@ -43,6 +43,9 @@ __C infiniStatus_t infiniopCreateGeluDescriptor( #ifdef ENABLE_QY_API CREATE(INFINI_DEVICE_QY, nvidia); #endif +#ifdef ENABLE_HYGON_API + CREATE(INFINI_DEVICE_HYGON, nvidia); +#endif #ifdef ENABLE_METAX_API CREATE(INFINI_DEVICE_METAX, metax); #endif @@ -77,6 +80,9 @@ __C infiniStatus_t infiniopGetGeluWorkspaceSize(infiniopGeluDescriptor_t desc, s #ifdef ENABLE_QY_API GET(INFINI_DEVICE_QY, nvidia); #endif +#ifdef ENABLE_HYGON_API + GET(INFINI_DEVICE_HYGON, nvidia); +#endif #ifdef ENABLE_METAX_API GET(INFINI_DEVICE_METAX, metax); #endif @@ -118,6 +124,9 @@ __C infiniStatus_t infiniopGelu( #ifdef ENABLE_QY_API CALCULATE(INFINI_DEVICE_QY, nvidia); #endif +#ifdef ENABLE_HYGON_API + CALCULATE(INFINI_DEVICE_HYGON, nvidia); +#endif #ifdef ENABLE_METAX_API CALCULATE(INFINI_DEVICE_METAX, metax); #endif @@ -154,6 +163,9 @@ infiniopDestroyGeluDescriptor(infiniopGeluDescriptor_t desc) { #ifdef ENABLE_QY_API DELETE(INFINI_DEVICE_QY, nvidia); #endif +#ifdef ENABLE_HYGON_API + DELETE(INFINI_DEVICE_HYGON, nvidia); +#endif #ifdef ENABLE_METAX_API DELETE(INFINI_DEVICE_METAX, metax); #endif diff --git a/src/infiniop/ops/gelutanh/cpu/gelutanh_cpu.cc b/src/infiniop/ops/gelutanh/cpu/gelutanh_cpu.cc new file mode 100644 index 000000000..6d7631e91 --- /dev/null +++ b/src/infiniop/ops/gelutanh/cpu/gelutanh_cpu.cc @@ -0,0 +1,53 @@ +#include "gelutanh_cpu.h" + +namespace op::gelutanh::cpu { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &x_desc = input_desc_vec.at(0); + const auto &y_shape = out_desc->shape(); + const auto &x_shape = x_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16); + CHECK_SAME_SHAPE(y_shape, x_shape); + + CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec); + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + (void)workspace; + (void)workspace_size; + + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_F64: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_BF16: + return _device_info->calculate(_info, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } +} + +} // namespace op::gelutanh::cpu + diff --git a/src/infiniop/ops/gelutanh/cpu/gelutanh_cpu.h b/src/infiniop/ops/gelutanh/cpu/gelutanh_cpu.h new file mode 100644 index 000000000..713a2d0c5 --- /dev/null +++ b/src/infiniop/ops/gelutanh/cpu/gelutanh_cpu.h @@ -0,0 +1,27 @@ +#ifndef __GELUTANH_CPU_H__ +#define __GELUTANH_CPU_H__ + +#include "../../../elementwise/cpu/elementwise_cpu.h" + +#include + +ELEMENTWISE_DESCRIPTOR(gelutanh, cpu) + +namespace op::gelutanh::cpu { +typedef struct GeluTanhOp { +public: + static constexpr size_t num_inputs = 1; + + template + T operator()(const T &x) const { + // y = x * 0.5 * (1 + tanh(sqrt(2/pi) * (x + 0.044715 * x^3))) + constexpr T alpha = static_cast(0.7978845608); // sqrt(2/pi) + constexpr T beta = static_cast(0.044715); + T inner = alpha * (x + beta * x * x * x); + return x * static_cast(0.5) * (static_cast(1) + std::tanh(inner)); + } +} GeluTanhOp; +} // namespace op::gelutanh::cpu + +#endif // __GELUTANH_CPU_H__ + diff --git a/src/infiniop/ops/gelutanh/cuda/kernel.cuh b/src/infiniop/ops/gelutanh/cuda/kernel.cuh new file mode 100644 index 000000000..d52c344fb --- /dev/null +++ b/src/infiniop/ops/gelutanh/cuda/kernel.cuh @@ -0,0 +1,59 @@ +#ifndef __GELUTANH_CUDA_H__ +#define __GELUTANH_CUDA_H__ + +#include "../../../elementwise/nvidia/elementwise_nvidia.cuh" +#include +#include +#include + +namespace op::gelutanh::cuda { + +typedef struct GeluTanhOp { +public: + static constexpr size_t num_inputs = 1; + + // GELU-Tanh constants + // static constexpr float alpha = std::sqrt(2.0 / M_PI); + // static constexpr float beta = 0.044715f; + static constexpr float alpha = 0.7978845608f; // sqrt(2/pi) + static constexpr float beta = 0.044715f; + // f32 tanh helper + __device__ __forceinline__ float tanh_f32_func(float x) const { + return tanhf(x); + } + + template + __device__ __forceinline__ T operator()(const T &x) const { + if constexpr (std::is_same_v) { + // half2 -> float2 + float2 vf = __half22float2(x); + float inner_x0 = alpha * (vf.x + beta * vf.x * vf.x * vf.x); + float inner_x1 = alpha * (vf.y + beta * vf.y * vf.y * vf.y); + float2 vr = make_float2(tanh_f32_func(inner_x0) * 0.5f + 0.5f, + tanh_f32_func(inner_x1) * 0.5f + 0.5f); + return __hmul2(x, __float22half2_rn(vr)); // y = x * 0.5 * (1 + tanh(...)) + } else if constexpr (std::is_same_v) { + float xf = __half2float(x); + float inner = alpha * (xf + beta * xf * xf * xf); + float yf = xf * 0.5f * (1.0f + tanh_f32_func(inner)); + return __float2half_rn(yf); + } else if constexpr (std::is_same_v) { + float xf = __bfloat162float(x); + float inner = alpha * (xf + beta * xf * xf * xf); + float yf = xf * 0.5f * (1.0f + tanh_f32_func(inner)); + return __float2bfloat16(yf); + } else if constexpr (std::is_same_v) { + float inner = alpha * (x + beta * x * x * x); + return x * 0.5f * (1.0f + tanh_f32_func(inner)); + } else { // double + double inner = alpha * (x + beta * x * x * x); + return x * 0.5 * (1.0 + std::tanh(inner)); + } + } + +} GeluTanhOp; + +} // namespace op::gelutanh::cuda + +#endif // __GELUTANH_CUDA_H__ + diff --git a/src/infiniop/ops/gelutanh/nvidia/gelutanh_nvidia.cu b/src/infiniop/ops/gelutanh/nvidia/gelutanh_nvidia.cu new file mode 100644 index 000000000..00f6cebcb --- /dev/null +++ b/src/infiniop/ops/gelutanh/nvidia/gelutanh_nvidia.cu @@ -0,0 +1,71 @@ +#include "../cuda/kernel.cuh" +#include "gelutanh_nvidia.cuh" + +namespace op::gelutanh::nvidia { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &x_desc = input_desc_vec.at(0); + const auto &y_shape = out_desc->shape(); + const auto &x_shape = x_desc->shape(); + + CHECK_DTYPE(dtype, + INFINI_DTYPE_F16, + INFINI_DTYPE_F32, + INFINI_DTYPE_F64, + INFINI_DTYPE_BF16); + + CHECK_SAME_SHAPE(y_shape, x_shape); + + // create CUDA elementwise descriptor + CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec) + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate<256, cuda::GeluTanhOp, half>( + _info, workspace, output, inputs, stream); + + case INFINI_DTYPE_BF16: + return _device_info->calculate<256, cuda::GeluTanhOp, __nv_bfloat16>( + _info, workspace, output, inputs, stream); + + case INFINI_DTYPE_F32: + return _device_info->calculate<256, cuda::GeluTanhOp, float>( + _info, workspace, output, inputs, stream); + + case INFINI_DTYPE_F64: + return _device_info->calculate<256, cuda::GeluTanhOp, double>( + _info, workspace, output, inputs, stream); + + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} + +} // namespace op::gelutanh::nvidia + diff --git a/src/infiniop/ops/gelutanh/nvidia/gelutanh_nvidia.cuh b/src/infiniop/ops/gelutanh/nvidia/gelutanh_nvidia.cuh new file mode 100644 index 000000000..e8d8d8c31 --- /dev/null +++ b/src/infiniop/ops/gelutanh/nvidia/gelutanh_nvidia.cuh @@ -0,0 +1,9 @@ +#ifndef __GELUTANH_CUDA_API_H__ +#define __GELUTANH_CUDA_API_H__ + +#include "../../../elementwise/nvidia/elementwise_nvidia_api.cuh" + +ELEMENTWISE_DESCRIPTOR(gelutanh, nvidia) + +#endif // __GELUTANH_CUDA_API_H__ + diff --git a/src/infiniop/ops/gelutanh/operator.cc b/src/infiniop/ops/gelutanh/operator.cc new file mode 100644 index 000000000..f329f667e --- /dev/null +++ b/src/infiniop/ops/gelutanh/operator.cc @@ -0,0 +1,144 @@ +#include "../../operator.h" +#include "../../handle.h" +#include "infiniop/ops/gelutanh.h" + +#ifdef ENABLE_CPU_API +#include "cpu/gelutanh_cpu.h" +#endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API) || defined(ENABLE_HYGON_API) +#include "nvidia/gelutanh_nvidia.cuh" +#endif + +__C infiniStatus_t infiniopCreateGeluTanhDescriptor( + infiniopHandle_t handle, + infiniopGeluTanhDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y_desc, + infiniopTensorDescriptor_t x_desc) { + +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return op::gelutanh::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast(desc_ptr), \ + y_desc, \ + {x_desc}) + + switch (handle->device) { +#ifdef ENABLE_CPU_API + CREATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CREATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CREATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + CREATE(INFINI_DEVICE_QY, nvidia); +#endif +#ifdef ENABLE_HYGON_API + CREATE(INFINI_DEVICE_HYGON, nvidia); +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CREATE +} + +__C infiniStatus_t infiniopGetGeluTanhWorkspaceSize(infiniopGeluTanhDescriptor_t desc, size_t *size) { + +#define GET(CASE, NAMESPACE) \ + case CASE: \ + *size = reinterpret_cast(desc)->workspaceSize(); \ + return INFINI_STATUS_SUCCESS; + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + GET(INFINI_DEVICE_CPU, cpu) +#endif +#ifdef ENABLE_NVIDIA_API + GET(INFINI_DEVICE_NVIDIA, nvidia) +#endif +#ifdef ENABLE_ILUVATAR_API + GET(INFINI_DEVICE_ILUVATAR, nvidia) +#endif +#ifdef ENABLE_QY_API + GET(INFINI_DEVICE_QY, nvidia) +#endif +#ifdef ENABLE_HYGON_API + GET(INFINI_DEVICE_HYGON, nvidia) +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef GET +} + +__C infiniStatus_t infiniopGeluTanh( + infiniopGeluTanhDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *y, + const void *x, + void *stream) { + +#define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ + return reinterpret_cast(desc) \ + ->calculate(workspace, workspace_size, y, {x}, stream) + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + CALCULATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + CALCULATE(INFINI_DEVICE_QY, nvidia); +#endif +#ifdef ENABLE_HYGON_API + CALCULATE(INFINI_DEVICE_HYGON, nvidia); +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CALCULATE +} + +__C infiniStatus_t infiniopDestroyGeluTanhDescriptor(infiniopGeluTanhDescriptor_t desc) { + +#define DELETE(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast(desc); \ + return INFINI_STATUS_SUCCESS; + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + DELETE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + DELETE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + DELETE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + DELETE(INFINI_DEVICE_QY, nvidia); +#endif +#ifdef ENABLE_HYGON_API + DELETE(INFINI_DEVICE_HYGON, nvidia); +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef DELETE +} + diff --git a/src/infiniop/ops/layer_norm/operator.cc b/src/infiniop/ops/layer_norm/operator.cc index 3dbbdcb21..594f942cd 100644 --- a/src/infiniop/ops/layer_norm/operator.cc +++ b/src/infiniop/ops/layer_norm/operator.cc @@ -5,7 +5,7 @@ #ifdef ENABLE_CPU_API #include "cpu/layer_norm_cpu.h" #endif -#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API) +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API) || defined(ENABLE_HYGON_API) #include "nvidia/layer_norm_nvidia.cuh" #endif #ifdef ENABLE_METAX_API @@ -49,6 +49,9 @@ __C infiniStatus_t infiniopCreateLayerNormDescriptor( #ifdef ENABLE_QY_API CREATE(INFINI_DEVICE_QY, nvidia); #endif +#ifdef ENABLE_HYGON_API + CREATE(INFINI_DEVICE_HYGON, nvidia); +#endif #ifdef ENABLE_METAX_API CREATE(INFINI_DEVICE_METAX, metax); #endif @@ -79,6 +82,9 @@ __C infiniStatus_t infiniopGetLayerNormWorkspaceSize(infiniopLayerNormDescriptor #ifdef ENABLE_QY_API GET(INFINI_DEVICE_QY, nvidia); #endif +#ifdef ENABLE_HYGON_API + GET(INFINI_DEVICE_HYGON, nvidia); +#endif #ifdef ENABLE_METAX_API GET(INFINI_DEVICE_METAX, metax); #endif @@ -129,6 +135,9 @@ __C infiniStatus_t infiniopLayerNorm( #ifdef ENABLE_QY_API CALCULATE(INFINI_DEVICE_QY, nvidia); #endif +#ifdef ENABLE_HYGON_API + CALCULATE(INFINI_DEVICE_HYGON, nvidia); +#endif #ifdef ENABLE_METAX_API CALCULATE(INFINI_DEVICE_METAX, metax); #endif @@ -159,6 +168,9 @@ infiniopDestroyLayerNormDescriptor(infiniopLayerNormDescriptor_t desc) { #ifdef ENABLE_QY_API DELETE(INFINI_DEVICE_QY, nvidia); #endif +#ifdef ENABLE_HYGON_API + DELETE(INFINI_DEVICE_HYGON, nvidia); +#endif #ifdef ENABLE_METAX_API DELETE(INFINI_DEVICE_METAX, metax); #endif diff --git a/src/infiniop/ops/quickgelu/cpu/quickgelu_cpu.cc b/src/infiniop/ops/quickgelu/cpu/quickgelu_cpu.cc new file mode 100644 index 000000000..dfb64da0a --- /dev/null +++ b/src/infiniop/ops/quickgelu/cpu/quickgelu_cpu.cc @@ -0,0 +1,53 @@ +#include "quickgelu_cpu.h" + +namespace op::quickgelu::cpu { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &x_desc = input_desc_vec.at(0); + const auto &y_shape = out_desc->shape(); + const auto &x_shape = x_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16); + CHECK_SAME_SHAPE(y_shape, x_shape); + + CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec); + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + (void)workspace; + (void)workspace_size; + + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_F64: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_BF16: + return _device_info->calculate(_info, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } +} + +} // namespace op::quickgelu::cpu + diff --git a/src/infiniop/ops/quickgelu/cpu/quickgelu_cpu.h b/src/infiniop/ops/quickgelu/cpu/quickgelu_cpu.h new file mode 100644 index 000000000..3418bdd25 --- /dev/null +++ b/src/infiniop/ops/quickgelu/cpu/quickgelu_cpu.h @@ -0,0 +1,26 @@ +#ifndef __QUICKGELU_CPU_H__ +#define __QUICKGELU_CPU_H__ + +#include "../../../elementwise/cpu/elementwise_cpu.h" + +#include + +ELEMENTWISE_DESCRIPTOR(quickgelu, cpu) + +namespace op::quickgelu::cpu { +typedef struct QuickGeluOp { +public: + static constexpr size_t num_inputs = 1; + + template + T operator()(const T &x) const { + // quickgelu(x) = x * sigmoid(1.702 * x) + constexpr T alpha = static_cast(1.702); + T ax = alpha * x; + return x / (static_cast(1) + std::exp(-ax)); + } +} QuickGeluOp; +} // namespace op::quickgelu::cpu + +#endif // __QUICKGELU_CPU_H__ + diff --git a/src/infiniop/ops/quickgelu/cuda/kernel.cuh b/src/infiniop/ops/quickgelu/cuda/kernel.cuh new file mode 100644 index 000000000..5d678a350 --- /dev/null +++ b/src/infiniop/ops/quickgelu/cuda/kernel.cuh @@ -0,0 +1,61 @@ +#ifndef __QUICKGELU_CUDA_H__ +#define __QUICKGELU_CUDA_H__ + +#include "../../../elementwise/nvidia/elementwise_nvidia.cuh" +#include +#include + +namespace op::quickgelu::cuda { + +typedef struct QuickGeluOp { +public: + static constexpr size_t num_inputs = 1; + + template + __device__ __forceinline__ T operator()(const T &x) const { + // quickgelu(x) = x * sigmoid(1.702 * x) + + constexpr float alpha = 1.702f; + + if constexpr (std::is_same_v) { + half2 ax = __hmul2(make_half2(alpha, alpha), x); + half2 denominator = __hadd2(make_half2(1, 1), h2exp(__hneg2(ax))); + half2 sigmoid = h2rcp(denominator); + return __hmul2(x, sigmoid); + + } else if constexpr (std::is_same_v) { + half ax = __hmul(__float2half(alpha), x); + half denominator = __hadd(__float2half(1.0f), hexp(__hneg(ax))); + half sigmoid = hrcp(denominator); + return __hmul(x, sigmoid); + + } else if constexpr (std::is_same_v) { + float xf = __bfloat162float(x); + float ax = alpha * xf; + float s = 1.0f / (1.0f + __expf(-ax)); + return __float2bfloat16(xf * s); + + } else if constexpr (std::is_same_v) { + float ax = alpha * x; + float s; + if (ax >= 0.0f) { + float z = expf(-ax); + s = 1.0f / (1.0f + z); + } else { + float z = expf(ax); + s = z / (1.0f + z); + } + return x * s; + + } else { // double + double ax = static_cast(alpha) * x; + return x / (1.0 + exp(-ax)); + } + } + +} QuickGeluOp; + +} // namespace op::quickgelu::cuda + +#endif // __QUICKGELU_CUDA_H__ + diff --git a/src/infiniop/ops/quickgelu/nvidia/quickgelu_nvidia.cu b/src/infiniop/ops/quickgelu/nvidia/quickgelu_nvidia.cu new file mode 100644 index 000000000..e4bcae1a7 --- /dev/null +++ b/src/infiniop/ops/quickgelu/nvidia/quickgelu_nvidia.cu @@ -0,0 +1,71 @@ +#include "../cuda/kernel.cuh" +#include "quickgelu_nvidia.cuh" + +namespace op::quickgelu::nvidia { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &x_desc = input_desc_vec.at(0); + const auto &y_shape = out_desc->shape(); + const auto &x_shape = x_desc->shape(); + + CHECK_DTYPE(dtype, + INFINI_DTYPE_F16, + INFINI_DTYPE_F32, + INFINI_DTYPE_F64, + INFINI_DTYPE_BF16); + + CHECK_SAME_SHAPE(y_shape, x_shape); + + // create CUDA elementwise descriptor + CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec) + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate<256, cuda::QuickGeluOp, half>( + _info, workspace, output, inputs, stream); + + case INFINI_DTYPE_BF16: + return _device_info->calculate<256, cuda::QuickGeluOp, __nv_bfloat16>( + _info, workspace, output, inputs, stream); + + case INFINI_DTYPE_F32: + return _device_info->calculate<256, cuda::QuickGeluOp, float>( + _info, workspace, output, inputs, stream); + + case INFINI_DTYPE_F64: + return _device_info->calculate<256, cuda::QuickGeluOp, double>( + _info, workspace, output, inputs, stream); + + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} + +} // namespace op::quickgelu::nvidia + diff --git a/src/infiniop/ops/quickgelu/nvidia/quickgelu_nvidia.cuh b/src/infiniop/ops/quickgelu/nvidia/quickgelu_nvidia.cuh new file mode 100644 index 000000000..935c86758 --- /dev/null +++ b/src/infiniop/ops/quickgelu/nvidia/quickgelu_nvidia.cuh @@ -0,0 +1,9 @@ +#ifndef __QUICKGELU_CUDA_API_H__ +#define __QUICKGELU_CUDA_API_H__ + +#include "../../../elementwise/nvidia/elementwise_nvidia_api.cuh" + +ELEMENTWISE_DESCRIPTOR(quickgelu, nvidia) + +#endif // __QUICKGELU_CUDA_API_H__ + diff --git a/src/infiniop/ops/quickgelu/operator.cc b/src/infiniop/ops/quickgelu/operator.cc new file mode 100644 index 000000000..158e21cf3 --- /dev/null +++ b/src/infiniop/ops/quickgelu/operator.cc @@ -0,0 +1,144 @@ +#include "../../operator.h" +#include "../../handle.h" +#include "infiniop/ops/quickgelu.h" + +#ifdef ENABLE_CPU_API +#include "cpu/quickgelu_cpu.h" +#endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API) || defined(ENABLE_HYGON_API) +#include "nvidia/quickgelu_nvidia.cuh" +#endif + +__C infiniStatus_t infiniopCreateQuickGeluDescriptor( + infiniopHandle_t handle, + infiniopQuickGeluDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y_desc, + infiniopTensorDescriptor_t x_desc) { + +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return op::quickgelu::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast(desc_ptr), \ + y_desc, \ + {x_desc}) + + switch (handle->device) { +#ifdef ENABLE_CPU_API + CREATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CREATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CREATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + CREATE(INFINI_DEVICE_QY, nvidia); +#endif +#ifdef ENABLE_HYGON_API + CREATE(INFINI_DEVICE_HYGON, nvidia); +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CREATE +} + +__C infiniStatus_t infiniopGetQuickGeluWorkspaceSize(infiniopQuickGeluDescriptor_t desc, size_t *size) { + +#define GET(CASE, NAMESPACE) \ + case CASE: \ + *size = reinterpret_cast(desc)->workspaceSize(); \ + return INFINI_STATUS_SUCCESS; + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + GET(INFINI_DEVICE_CPU, cpu) +#endif +#ifdef ENABLE_NVIDIA_API + GET(INFINI_DEVICE_NVIDIA, nvidia) +#endif +#ifdef ENABLE_ILUVATAR_API + GET(INFINI_DEVICE_ILUVATAR, nvidia) +#endif +#ifdef ENABLE_QY_API + GET(INFINI_DEVICE_QY, nvidia) +#endif +#ifdef ENABLE_HYGON_API + GET(INFINI_DEVICE_HYGON, nvidia) +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef GET +} + +__C infiniStatus_t infiniopQuickGelu( + infiniopQuickGeluDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *y, + const void *x, + void *stream) { + +#define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ + return reinterpret_cast(desc) \ + ->calculate(workspace, workspace_size, y, {x}, stream) + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + CALCULATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + CALCULATE(INFINI_DEVICE_QY, nvidia); +#endif +#ifdef ENABLE_HYGON_API + CALCULATE(INFINI_DEVICE_HYGON, nvidia); +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CALCULATE +} + +__C infiniStatus_t infiniopDestroyQuickGeluDescriptor(infiniopQuickGeluDescriptor_t desc) { + +#define DELETE(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast(desc); \ + return INFINI_STATUS_SUCCESS; + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + DELETE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + DELETE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + DELETE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + DELETE(INFINI_DEVICE_QY, nvidia); +#endif +#ifdef ENABLE_HYGON_API + DELETE(INFINI_DEVICE_HYGON, nvidia); +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef DELETE +} + diff --git a/src/infiniop/ops/relu/operator.cc b/src/infiniop/ops/relu/operator.cc index 093674de6..8992ca56b 100644 --- a/src/infiniop/ops/relu/operator.cc +++ b/src/infiniop/ops/relu/operator.cc @@ -5,7 +5,7 @@ #ifdef ENABLE_CPU_API #include "cpu/relu_cpu.h" #endif -#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API) +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API) || defined(ENABLE_HYGON_API) #include "nvidia/relu_nvidia.cuh" #endif #ifdef ENABLE_METAX_API @@ -42,6 +42,9 @@ __C infiniStatus_t infiniopCreateReluDescriptor( #ifdef ENABLE_QY_API CREATE(INFINI_DEVICE_QY, nvidia); #endif +#ifdef ENABLE_HYGON_API + CREATE(INFINI_DEVICE_HYGON, nvidia); +#endif #ifdef ENABLE_METAX_API #ifdef ENABLE_NINETOOTHED CREATE(INFINI_DEVICE_METAX, metax); @@ -75,6 +78,9 @@ __C infiniStatus_t infiniopGetReluWorkspaceSize(infiniopReluDescriptor_t desc, s #ifdef ENABLE_QY_API GET(INFINI_DEVICE_QY, nvidia) #endif +#ifdef ENABLE_HYGON_API + GET(INFINI_DEVICE_HYGON, nvidia) +#endif #ifdef ENABLE_METAX_API #ifdef ENABLE_NINETOOTHED GET(INFINI_DEVICE_METAX, metax) @@ -115,6 +121,9 @@ __C infiniStatus_t infiniopRelu( #ifdef ENABLE_QY_API CALCULATE(INFINI_DEVICE_QY, nvidia); #endif +#ifdef ENABLE_HYGON_API + CALCULATE(INFINI_DEVICE_HYGON, nvidia); +#endif #ifdef ENABLE_METAX_API #ifdef ENABLE_NINETOOTHED CALCULATE(INFINI_DEVICE_METAX, metax); @@ -150,6 +159,9 @@ infiniopDestroyReluDescriptor(infiniopReluDescriptor_t desc) { #ifdef ENABLE_QY_API DELETE(INFINI_DEVICE_QY, nvidia); #endif +#ifdef ENABLE_HYGON_API + DELETE(INFINI_DEVICE_HYGON, nvidia); +#endif #ifdef ENABLE_METAX_API #ifdef ENABLE_NINETOOTHED DELETE(INFINI_DEVICE_METAX, metax); diff --git a/src/infiniop/ops/sigmoid/operator.cc b/src/infiniop/ops/sigmoid/operator.cc index c86fc91d6..7bab01a49 100644 --- a/src/infiniop/ops/sigmoid/operator.cc +++ b/src/infiniop/ops/sigmoid/operator.cc @@ -5,7 +5,7 @@ #ifdef ENABLE_CPU_API #include "cpu/sigmoid_cpu.h" #endif -#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_QY_API) +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_QY_API) || defined(ENABLE_HYGON_API) #include "nvidia/sigmoid_nvidia.cuh" #endif @@ -34,6 +34,9 @@ __C infiniStatus_t infiniopCreateSigmoidDescriptor( #ifdef ENABLE_QY_API CREATE(INFINI_DEVICE_QY, nvidia); #endif +#ifdef ENABLE_HYGON_API + CREATE(INFINI_DEVICE_HYGON, nvidia); +#endif default: return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; @@ -58,6 +61,9 @@ __C infiniStatus_t infiniopGetSigmoidWorkspaceSize(infiniopSigmoidDescriptor_t d #endif #ifdef ENABLE_QY_API GET(INFINI_DEVICE_QY, nvidia) +#endif +#ifdef ENABLE_HYGON_API + GET(INFINI_DEVICE_HYGON, nvidia) #endif default: return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; @@ -91,6 +97,9 @@ __C infiniStatus_t infiniopSigmoid( #ifdef ENABLE_QY_API CALCULATE(INFINI_DEVICE_QY, nvidia); #endif +#ifdef ENABLE_HYGON_API + CALCULATE(INFINI_DEVICE_HYGON, nvidia); +#endif default: return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; @@ -118,6 +127,9 @@ infiniopDestroySigmoidDescriptor(infiniopSigmoidDescriptor_t desc) { #ifdef ENABLE_QY_API DELETE(INFINI_DEVICE_QY, nvidia); #endif +#ifdef ENABLE_HYGON_API + DELETE(INFINI_DEVICE_HYGON, nvidia); +#endif default: return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; diff --git a/src/infiniop/ops/tanh/operator.cc b/src/infiniop/ops/tanh/operator.cc index 7dcc9b303..a727f2084 100644 --- a/src/infiniop/ops/tanh/operator.cc +++ b/src/infiniop/ops/tanh/operator.cc @@ -5,7 +5,7 @@ #ifdef ENABLE_CPU_API #include "cpu/tanh_cpu.h" #endif -#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API) +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API) || defined(ENABLE_HYGON_API) #include "nvidia/tanh_nvidia.cuh" #endif // #ifdef ENABLE_METAX_API @@ -39,6 +39,9 @@ __C infiniStatus_t infiniopCreateTanhDescriptor( #endif #ifdef ENABLE_QY_API CREATE(INFINI_DEVICE_QY, nvidia); +#endif +#ifdef ENABLE_HYGON_API + CREATE(INFINI_DEVICE_HYGON, nvidia); #endif // #ifdef ENABLE_METAX_API // CREATE(INFINI_DEVICE_METAX, metax); @@ -70,6 +73,9 @@ __C infiniStatus_t infiniopGetTanhWorkspaceSize(infiniopTanhDescriptor_t desc, s #endif #ifdef ENABLE_QY_API GET(INFINI_DEVICE_QY, nvidia); +#endif +#ifdef ENABLE_HYGON_API + GET(INFINI_DEVICE_HYGON, nvidia); #endif // #ifdef ENABLE_METAX_API // GET(INFINI_DEVICE_METAX, metax); @@ -108,6 +114,9 @@ __C infiniStatus_t infiniopTanh( #endif #ifdef ENABLE_QY_API CALCULATE(INFINI_DEVICE_QY, nvidia); +#endif +#ifdef ENABLE_HYGON_API + CALCULATE(INFINI_DEVICE_HYGON, nvidia); #endif // #ifdef ENABLE_METAX_API // CALCULATE(INFINI_DEVICE_METAX, metax); @@ -141,6 +150,9 @@ infiniopDestroyTanhDescriptor(infiniopTanhDescriptor_t desc) { #endif #ifdef ENABLE_QY_API DELETE(INFINI_DEVICE_QY, nvidia); +#endif +#ifdef ENABLE_HYGON_API + DELETE(INFINI_DEVICE_HYGON, nvidia); #endif // #ifdef ENABLE_METAX_API // DELETE(INFINI_DEVICE_METAX, metax); diff --git a/xmake/hygon.lua b/xmake/hygon.lua index ed4b91f0e..a5dc646c9 100644 --- a/xmake/hygon.lua +++ b/xmake/hygon.lua @@ -74,6 +74,16 @@ target("infiniop-hygon") add_files("../src/infiniop/ops/rearrange/nvidia/*.cu") add_files("../src/infiniop/ops/rms_norm/nvidia/*.cu") add_files("../src/infiniop/ops/swiglu/nvidia/*.cu") + add_files("../src/infiniop/ops/conv/nvidia/*.cu") + add_files("../src/infiniop/ops/add/nvidia/*.cu") + add_files("../src/infiniop/ops/layer_norm/nvidia/*.cu") + add_files("../src/infiniop/ops/relu/nvidia/*.cu") + add_files("../src/infiniop/ops/softmax/nvidia/*.cu") + add_files("../src/infiniop/ops/sigmoid/nvidia/*.cu") + add_files("../src/infiniop/ops/gelu/nvidia/*.cu") + add_files("../src/infiniop/ops/tanh/nvidia/*.cu") + add_files("../src/infiniop/ops/quickgelu/nvidia/*.cu") + add_files("../src/infiniop/ops/gelutanh/nvidia/*.cu") if has_config("ninetoothed") then add_files("../build/ninetoothed/*.c", {cxflags = {"-Wno-return-type"}}) diff --git a/xmake/nvidia.lua b/xmake/nvidia.lua index a86090776..5e292170b 100644 --- a/xmake/nvidia.lua +++ b/xmake/nvidia.lua @@ -45,9 +45,9 @@ target("infiniop-nvidia") end else add_cuflags("-Xcompiler=-Wall", "-Xcompiler=-Werror") - add_cuflags("-Xcompiler=-fPIC") + add_cuflags("-Xcompiler=-fPIC", {force = true}) add_cuflags("--extended-lambda") - add_culdflags("-Xcompiler=-fPIC") + add_culdflags("-Xcompiler=-fPIC", {force = true}) add_cxxflags("-fPIC") add_cuflags("--expt-relaxed-constexpr") if CUDNN_ROOT ~= nil then @@ -89,8 +89,8 @@ target("infinirt-nvidia") add_cuflags("-Xcompiler=/utf-8", "--expt-relaxed-constexpr", "--allow-unsupported-compiler") add_cxxflags("/FS") else - add_cuflags("-Xcompiler=-fPIC") - add_culdflags("-Xcompiler=-fPIC") + add_cuflags("-Xcompiler=-fPIC", {force = true}) + add_culdflags("-Xcompiler=-fPIC", {force = true}) add_cxflags("-fPIC") end