1 change: 1 addition & 0 deletions include/infiniop.h
@@ -9,6 +9,7 @@
#include "infiniop/ops/clip.h"
#include "infiniop/ops/conv.h"
#include "infiniop/ops/dequantize_awq.h"
#include "infiniop/ops/dequantize_gptq.h"
#include "infiniop/ops/gelu.h"
#include "infiniop/ops/gemm.h"
#include "infiniop/ops/layer_norm.h"
30 changes: 30 additions & 0 deletions include/infiniop/ops/dequantize_gptq.h
@@ -0,0 +1,30 @@
#ifndef __INFINIOP_DEQUANTIZE_GPTQ_API_H__
#define __INFINIOP_DEQUANTIZE_GPTQ_API_H__

#include "../operator_descriptor.h"

typedef struct InfiniopDescriptor *infiniopDequantizeGPTQDescriptor_t;

__C __export infiniStatus_t infiniopCreateDequantizeGPTQDescriptor(infiniopHandle_t handle,
infiniopDequantizeGPTQDescriptor_t *desc_ptr,
infiniopTensorDescriptor_t out_desc,
infiniopTensorDescriptor_t qweight_desc,
infiniopTensorDescriptor_t scales_desc,
infiniopTensorDescriptor_t zeros_desc,
infiniopTensorDescriptor_t g_idx_desc); // add g_idx

__C __export infiniStatus_t infiniopGetDequantizeGPTQWorkspaceSize(infiniopDequantizeGPTQDescriptor_t desc, size_t *size);

__C __export infiniStatus_t infiniopDequantizeGPTQ(infiniopDequantizeGPTQDescriptor_t desc,
void *workspace,
size_t workspace_size,
void *out,
const void *qweight,
const void *scales,
const void *zeros,
const void *g_idx, // add g_idx
void *stream);

__C __export infiniStatus_t infiniopDestroyDequantizeGPTQDescriptor(infiniopDequantizeGPTQDescriptor_t desc);

#endif
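
A minimal host-side usage sketch for the four entry points declared above. Handle creation, tensor-descriptor setup, the CHECK error macro, and the device_malloc allocator are placeholders assumed for illustration; only the call sequence comes from the header.

// Sketch only: assumes `handle` and the five tensor descriptors already exist.
infiniopDequantizeGPTQDescriptor_t desc;
CHECK(infiniopCreateDequantizeGPTQDescriptor(handle, &desc, out_desc,
                                             qweight_desc, scales_desc,
                                             zeros_desc, g_idx_desc));

size_t workspace_size = 0;
CHECK(infiniopGetDequantizeGPTQWorkspaceSize(desc, &workspace_size));
void *workspace = device_malloc(workspace_size); // hypothetical allocator

CHECK(infiniopDequantizeGPTQ(desc, workspace, workspace_size,
                             out, qweight, scales, zeros, g_idx, stream));
CHECK(infiniopDestroyDequantizeGPTQDescriptor(desc));
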
3 changes: 2 additions & 1 deletion scripts/python_test.py
@@ -17,7 +17,8 @@ def run_tests(args):
"causal_softmax.py",
"clip.py",
"conv.py",
#"dequantize_awq.py",
"dequantize_awq.py",
"dequantize_gptq.py",
"gelu.py",
"gemm.py",
#"layer_norm.py",
@@ -7,7 +7,7 @@
#include <cuda_fp16.h>

__global__ void __launch_bounds__(64)
-dequantize_weights(int *__restrict__ B, half *__restrict__ scaling_factors,
+dequantize_weights_awq(int *__restrict__ B, half *__restrict__ scaling_factors,
int *__restrict__ zeros, half *__restrict__ C, int G) {
// static constexpr uint32_t ZERO = 0x0;
half B_shared[32 * (128 + 8)];
@@ -29,11 +29,11 @@ __global__ void __launch_bounds__(64)
half *scaling_factors_ptr2 = scaling_factors + index4;

uint32_t zeros_loaded = *(uint32_t *)(zeros_ptr2);
-uint4 B_loaded_zero = dequantize_s4_to_fp16x2(zeros_loaded);
+uint4 B_loaded_zero = dequantize_s4_to_fp16x2_awq(zeros_loaded);
uint4 B_loaded_scale = *(uint4 *)(scaling_factors_ptr2);

uint32_t B_loaded = *(uint32_t *)B_ptr2;
-uint4 B_loaded_fp16 = dequantize_s4_to_fp16x2(B_loaded);
+uint4 B_loaded_fp16 = dequantize_s4_to_fp16x2_awq(B_loaded);

// Reinterpret uint4 components as __half2
__half2 *B_loaded_fp16_h2 = reinterpret_cast<__half2 *>(&B_loaded_fp16);
@@ -119,7 +119,7 @@ Descriptor::calculate(
half *scales_ = const_cast<half *>(reinterpret_cast<const half *>(scales));
int *zeros_ = const_cast<int *>(reinterpret_cast<const int *>(zeros));

-dequantize_weights<<<num_blocks, threads_per_block, 0, reinterpret_cast<cudaStream_t>(stream)>>>(
+dequantize_weights_awq<<<num_blocks, threads_per_block, 0, reinterpret_cast<cudaStream_t>(stream)>>>(
qweight_, scales_, zeros_, out_, group_size);
return INFINI_STATUS_SUCCESS;
}
@@ -11,7 +11,7 @@
* @param source The input 32-bit unsigned integer, which packs eight 4-bit values.
* @return A uint4 containing the eight dequantized half values.
*/
-__device__ __forceinline__ uint4 dequantize_s4_to_fp16x2(uint32_t const &source) {
+__device__ __forceinline__ uint4 dequantize_s4_to_fp16x2_awq(uint32_t const &source) {
// Step 1: Unpack eight 4-bit unsigned integers from a single 32-bit source word.
// The source memory layout is assumed to be [v7, v6, v5, v4, v3, v2, v1, v0],
// where each 'v' is a 4-bit nibble.
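
For reference, a scalar equivalent of the unpack-and-widen step documented above: a sketch under the stated [v7 ... v0] nibble layout (low nibble first), not the SIMD bit-twiddling path the helper actually uses.

#include <cuda_fp16.h>

// Scalar sketch: unpack eight 4-bit unsigned values from one 32-bit word
// and widen each to half. The optimized helper produces the same eight
// values packed into a uint4 (four fp16x2 pairs).
__device__ inline void unpack_s4x8_reference(uint32_t source, half out[8]) {
    for (int i = 0; i < 8; ++i) {
        out[i] = __uint2half_rn((source >> (4 * i)) & 0xFu);
    }
}
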
@@ -11,7 +11,7 @@
* @param source The input 32-bit unsigned integer, which packs eight 4-bit values.
* @return A uint4 containing the eight dequantized half values.
*/
-__device__ __forceinline__ uint4 dequantize_s4_to_fp16x2(uint32_t const &source) {
+__device__ __forceinline__ uint4 dequantize_s4_to_fp16x2_awq(uint32_t const &source) {
// Step 1: Unpack eight 4-bit unsigned integers from a single 32-bit source word.
// The source memory layout is assumed to be [v7, v6, v5, v4, v3, v2, v1, v0],
// where each 'v' is a 4-bit nibble.
@@ -7,7 +7,7 @@
#include <musa_fp16.h>

__global__ void __launch_bounds__(64)
-dequantize_weights(int *__restrict__ B, half *__restrict__ scaling_factors,
+dequantize_weights_awq(int *__restrict__ B, half *__restrict__ scaling_factors,
int *__restrict__ zeros, half *__restrict__ C, int G) {
// static constexpr uint32_t ZERO = 0x0;
half B_shared[32 * (128 + 8)];
@@ -29,11 +29,11 @@ __global__ void __launch_bounds__(64)
half *scaling_factors_ptr2 = scaling_factors + index4;

uint32_t zeros_loaded = *(uint32_t *)(zeros_ptr2);
-uint4 B_loaded_zero = dequantize_s4_to_fp16x2(zeros_loaded);
+uint4 B_loaded_zero = dequantize_s4_to_fp16x2_awq(zeros_loaded);
uint4 B_loaded_scale = *(uint4 *)(scaling_factors_ptr2);

uint32_t B_loaded = *(uint32_t *)B_ptr2;
-uint4 B_loaded_fp16 = dequantize_s4_to_fp16x2(B_loaded);
+uint4 B_loaded_fp16 = dequantize_s4_to_fp16x2_awq(B_loaded);

// Reinterpret uint4 components as __half2
__half2 *B_loaded_fp16_h2 = reinterpret_cast<__half2 *>(&B_loaded_fp16);
@@ -119,7 +119,7 @@ Descriptor::calculate(
half *scales_ = const_cast<half *>(reinterpret_cast<const half *>(scales));
int *zeros_ = const_cast<int *>(reinterpret_cast<const int *>(zeros));

-dequantize_weights<<<num_blocks, threads_per_block, 0, reinterpret_cast<musaStream_t>(stream)>>>(
+dequantize_weights_awq<<<num_blocks, threads_per_block, 0, reinterpret_cast<musaStream_t>(stream)>>>(
qweight_, scales_, zeros_, out_, group_size);
return INFINI_STATUS_SUCCESS;
}
@@ -1,6 +1,6 @@
#pragma once

-__device__ uint4 dequantize_s4_to_fp16x2(uint32_t const &source) {
+__device__ uint4 dequantize_s4_to_fp16x2_awq(uint32_t const &source) {
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 750
// Step 1: Unpack eight 4-bit unsigned integers from a single 32-bit source word.
// The source memory layout is assumed to be [v7, v6, v5, v4, v3, v2, v1, v0],
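
Downstream of this helper, the kernels combine the unpacked weights with the per-group zeros and scales. In both the pre-sm_75 intrinsic path and the sm_75+ inline-PTX path (sub.f16x2 then fma.f16x2), the per-element math is plain AWQ dequantization, w = (q - z) * s. A scalar sketch of that step (function name hypothetical):

// w = (q - z) * s: q is the unpacked 4-bit weight, z the unpacked zero point,
// s the half-precision scale; z and s come from the quantization group that
// the element's row belongs to (row / group_size).
__device__ inline half dequant_one_awq(half q, half z, half s) {
    return __hmul(__hsub(q, z), s);
}
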
71 changes: 38 additions & 33 deletions src/infiniop/ops/dequantize_awq/nvidia/dequantize_w42f16_nvidia.cu
@@ -10,33 +10,36 @@

#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ < 750)
__global__ void __launch_bounds__(64)
-dequantize_weights(int *__restrict__ B, half *__restrict__ scaling_factors,
-                   int *__restrict__ zeros, half *__restrict__ C, int G) {
+dequantize_weights_awq(int *__restrict__ B, half *__restrict__ scaling_factors,
+                       int *__restrict__ zeros, half *__restrict__ C, int G,
+                       int out_features, int in_features) {
// static constexpr uint32_t ZERO = 0x0;
half B_shared[32 * (128 + 8)];

half *B_shared_ptr2 = B_shared;

-int N = blockDim.x * gridDim.x; // 2
int col = (blockIdx.x * blockDim.x + threadIdx.x);
int row = (blockIdx.y * blockDim.y + threadIdx.y);
-int index1 = 8 * col + 8 * row * N;
+// Bounds check to avoid out-of-range accesses
+if (col >= out_features || row >= in_features) return;
+
+// Start offset of this element in the output: row-major, 8 consecutive halves
+int index1 = 8 * col + 8 * row * out_features;
half *C_ptr2 = C + index1;

-int index2 = col + row * N;
+int index2 = col + row * out_features;
int *B_ptr2 = B + index2;

-int index3 = col + (int)(row / G) * N;
+int index3 = col + (int)(row / G) * out_features;
int *zeros_ptr2 = zeros + index3;
-int index4 = 8 * col + (int)(row / G) * N * 8;
+int index4 = 8 * col + (int)(row / G) * out_features * 8;
half *scaling_factors_ptr2 = scaling_factors + index4;

uint32_t zeros_loaded = *(uint32_t *)(zeros_ptr2);
-uint4 B_loaded_zero = dequantize_s4_to_fp16x2(zeros_loaded);
+uint4 B_loaded_zero = dequantize_s4_to_fp16x2_awq(zeros_loaded);
uint4 B_loaded_scale = *(uint4 *)(scaling_factors_ptr2);

uint32_t B_loaded = *(uint32_t *)B_ptr2;
-uint4 B_loaded_fp16 = dequantize_s4_to_fp16x2(B_loaded);
+uint4 B_loaded_fp16 = dequantize_s4_to_fp16x2_awq(B_loaded);

// Reinterpret uint4 components as __half2
__half2 *B_loaded_fp16_h2 = reinterpret_cast<__half2 *>(&B_loaded_fp16);
@@ -55,42 +58,43 @@ __global__ void __launch_bounds__(64)
B_loaded_fp16_h2[2] = __hfma2(B_loaded_fp16_h2[2], B_loaded_scale_h2[2], __float2half2_rn(0.0f));
B_loaded_fp16_h2[3] = __hfma2(B_loaded_fp16_h2[3], B_loaded_scale_h2[3], __float2half2_rn(0.0f));

-// Store back to shared memory
-*(uint4 *)B_shared_ptr2 = B_loaded_fp16;
+// Write the result directly back to the global-memory output
+half *out_vec = reinterpret_cast<half *>(&B_loaded_fp16);
#pragma unroll
for (int i = 0; i < 8; ++i) {
-*(C_ptr2 + i) = B_shared[i];
+C_ptr2[i] = out_vec[i];
}
}
#else
__global__ void __launch_bounds__(64)
-dequantize_weights(int *__restrict__ B, half *__restrict__ scaling_factors,
-                   int *__restrict__ zeros, half *__restrict__ C, int group_size) {
+dequantize_weights_awq(int *__restrict__ B, half *__restrict__ scaling_factors,
+                       int *__restrict__ zeros, half *__restrict__ C, int group_size,
+                       int out_features, int in_features) {
static constexpr uint32_t ZERO = 0x0;
half B_shared[32 * (128 + 8)];

half *B_shared_ptr2 = B_shared;

-int N = blockDim.x * gridDim.x; // 2
int col = (blockIdx.x * blockDim.x + threadIdx.x);
int row = blockIdx.y * blockDim.y + threadIdx.y;
-int index1 = 8 * col + 8 * row * N;
+// Bounds check to avoid out-of-range accesses
+if (col >= out_features || row >= in_features) return;
+
+int index1 = 8 * col + 8 * row * out_features;
half *C_ptr2 = C + index1;

-int index2 = col + row * N;
+int index2 = col + row * out_features;
int *B_ptr2 = B + index2;

-int index3 = col + (int)(row / group_size) * N;
+int index3 = col + (int)(row / group_size) * out_features;
int *zeros_ptr2 = zeros + index3;
-int index4 = 8 * col + (int)(row / group_size) * N * 8;
+int index4 = 8 * col + (int)(row / group_size) * out_features * 8;
half *scaling_factors_ptr2 = scaling_factors + index4;

uint32_t zeros_loaded = *(uint32_t *)(zeros_ptr2);
-uint4 B_loaded_zero = dequantize_s4_to_fp16x2(zeros_loaded);
+uint4 B_loaded_zero = dequantize_s4_to_fp16x2_awq(zeros_loaded);
uint4 B_loaded_scale = *(uint4 *)(scaling_factors_ptr2);

uint32_t B_loaded = *(uint32_t *)B_ptr2;
-uint4 B_loaded_fp16 = dequantize_s4_to_fp16x2(B_loaded);
+uint4 B_loaded_fp16 = dequantize_s4_to_fp16x2_awq(B_loaded);
asm volatile("sub.f16x2 %0, %1, %2;\n"
: "=r"(B_loaded_fp16.x)
: "r"(B_loaded_fp16.x), "r"(B_loaded_zero.x));
Expand All @@ -116,10 +120,11 @@ __global__ void __launch_bounds__(64)
: "=r"(B_loaded_fp16.w)
: "r"(B_loaded_fp16.w), "r"(B_loaded_scale.w), "r"(ZERO));

-*(uint4 *)B_shared_ptr2 = B_loaded_fp16;
+// Write the result directly back to the global-memory output
+half *out_vec = reinterpret_cast<half *>(&B_loaded_fp16);
#pragma unroll
for (int i = 0; i < 8; ++i) {
-*(C_ptr2 + i) = B_shared[i];
+C_ptr2[i] = out_vec[i];
}
}
#endif
@@ -183,8 +188,8 @@ Descriptor::calculate(
half *scales_ = const_cast<half *>(reinterpret_cast<const half *>(scales));
int *zeros_ = const_cast<int *>(reinterpret_cast<const int *>(zeros));

-dequantize_weights<<<num_blocks, threads_per_block, 0, reinterpret_cast<cudaStream_t>(stream)>>>(
-    qweight_, scales_, zeros_, out_, group_size);
+dequantize_weights_awq<<<num_blocks, threads_per_block, 0, reinterpret_cast<cudaStream_t>(stream)>>>(
+    qweight_, scales_, zeros_, out_, group_size, out_features, in_features);

return INFINI_STATUS_SUCCESS;
}
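
The new out_features/in_features parameters fix a real indexing hazard: the old kernels derived the row stride from the launch geometry (N = blockDim.x * gridDim.x), which is only correct when the grid exactly tiles the weight matrix. With the added bounds check the grid may round up, so the true tensor extents must be passed in and used for all strides. Below is a host-side sketch of a matching launch-geometry computation; it is an assumption for illustration, since Descriptor::calculate is only partially shown in this diff.

// Round the grid up so every (col, row) is covered and let the kernel's
// `if (col >= out_features || row >= in_features) return;` guard handle
// the ragged edge. All in-kernel strides use out_features, never
// blockDim.x * gridDim.x.
dim3 threads_per_block(8, 8); // 64 threads, matching __launch_bounds__(64)
dim3 num_blocks((out_features + threads_per_block.x - 1) / threads_per_block.x,
                (in_features + threads_per_block.y - 1) / threads_per_block.y);
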
55 changes: 55 additions & 0 deletions src/infiniop/ops/dequantize_gptq/dequantize_gptq.h
@@ -0,0 +1,55 @@
#ifndef __DEQUANTIZE_GPTQ_H__
#define __DEQUANTIZE_GPTQ_H__

#include "../../../utils.h"
#include "../../operator.h"
#include "../../tensor.h"
#include "info.h"

#define DESCRIPTOR(NAMESPACE) \
\
namespace op::dequantize_gptq::NAMESPACE { \
class Descriptor final : public InfiniopDescriptor { \
struct Opaque; \
Opaque *_opaque; \
DequantizeGPTQInfo _info; \
size_t _workspace_size; \
\
Descriptor( \
size_t workspace_size_, \
Opaque *opaque, \
DequantizeGPTQInfo info, \
infiniDevice_t device_type, \
int device_id) \
: InfiniopDescriptor{device_type, device_id}, \
_opaque(opaque), \
_info(info), \
_workspace_size(workspace_size_) {} \
\
public: \
~Descriptor(); \
\
size_t workspaceSize() const { return _workspace_size; } \
\
static infiniStatus_t create( \
infiniopHandle_t handle, \
Descriptor **desc_ptr, \
infiniopTensorDescriptor_t out_desc, \
infiniopTensorDescriptor_t qweight_desc, \
infiniopTensorDescriptor_t scales_desc, \
infiniopTensorDescriptor_t zeros_desc, \
infiniopTensorDescriptor_t g_idx_desc); \
\
infiniStatus_t calculate( \
void *workspace, \
size_t workspace_size, \
void *out, \
const void *qweight, \
const void *scales, \
const void *zeros, \
const void *g_idx, \
void *stream) const; \
}; \
}

#endif //__DEQUANTIZE_GPTQ_H__
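
Each backend presumably instantiates this macro in its own header (e.g. DESCRIPTOR(nvidia)), following the existing dequantize_awq pattern; the backend headers are not part of this diff. For orientation, here is a scalar sketch of what GPTQ dequantization computes and where g_idx comes in: instead of deriving a row's group as row / group_size, act-order GPTQ looks the group up in g_idx. The packing details below (8 values per int32, low nibble first, w = (q - z) * s) are assumptions about the common GPTQ layout, not a statement of this operator's exact format.

#include <cuda_fp16.h>
#include <cstdint>

// Hypothetical scalar reference for GPTQ dequantization with act-order.
// qweight packs 8 4-bit values per int32 along the in_features axis;
// zeros packs 8 4-bit values per int32 along the out_features axis;
// g_idx[row] names the quantization group of input row `row`.
void dequantize_gptq_reference(const int32_t *qweight, const half *scales,
                               const int32_t *zeros, const int32_t *g_idx,
                               half *out, int in_features, int out_features) {
    for (int row = 0; row < in_features; ++row) {
        int g = g_idx[row]; // group lookup replaces row / group_size
        for (int col = 0; col < out_features; ++col) {
            uint32_t wq = (uint32_t)qweight[(row / 8) * out_features + col];
            int q = (wq >> (4 * (row % 8))) & 0xF;
            uint32_t zq = (uint32_t)zeros[g * (out_features / 8) + col / 8];
            int z = (zq >> (4 * (col % 8))) & 0xF;
            float s = __half2float(scales[g * out_features + col]);
            out[row * out_features + col] = __float2half((q - z) * s);
        }
    }
}
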