1 change: 1 addition & 0 deletions include/infiniop.h
@@ -9,6 +9,7 @@
#include "infiniop/ops/clip.h"
#include "infiniop/ops/conv.h"
#include "infiniop/ops/dequantize_awq.h"
#include "infiniop/ops/dequantize_gptq.h"
#include "infiniop/ops/gelu.h"
#include "infiniop/ops/gemm.h"
#include "infiniop/ops/layer_norm.h"
30 changes: 30 additions & 0 deletions include/infiniop/ops/dequantize_gptq.h
@@ -0,0 +1,30 @@
#ifndef __INFINIOP_DEQUANTIZE_GPTQ_API_H__
#define __INFINIOP_DEQUANTIZE_GPTQ_API_H__

#include "../operator_descriptor.h"

typedef struct InfiniopDescriptor *infiniopDequantizeGPTQDescriptor_t;

__C __export infiniStatus_t infiniopCreateDequantizeGPTQDescriptor(infiniopHandle_t handle,
infiniopDequantizeGPTQDescriptor_t *desc_ptr,
infiniopTensorDescriptor_t out_desc,
infiniopTensorDescriptor_t qweight_desc,
infiniopTensorDescriptor_t scales_desc,
infiniopTensorDescriptor_t zeros_desc,
infiniopTensorDescriptor_t g_idx_desc); // add g_idx

__C __export infiniStatus_t infiniopGetDequantizeGPTQWorkspaceSize(infiniopDequantizeGPTQDescriptor_t desc, size_t *size);

__C __export infiniStatus_t infiniopDequantizeGPTQ(infiniopDequantizeGPTQDescriptor_t desc,
void *workspace,
size_t workspace_size,
void *out,
const void *qweight,
const void *scales,
const void *zeros,
const void *g_idx, // add g_idx
void *stream);

__C __export infiniStatus_t infiniopDestroyDequantizeGPTQDescriptor(infiniopDequantizeGPTQDescriptor_t desc);

#endif
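
A minimal host-side usage sketch for the four entry points declared above. Handle creation, tensor-descriptor setup, the CHECK error macro, and the device_malloc allocator are placeholders assumed for illustration; only the call sequence comes from the header.

// Sketch only: assumes `handle` and the five tensor descriptors already exist.
infiniopDequantizeGPTQDescriptor_t desc;
CHECK(infiniopCreateDequantizeGPTQDescriptor(handle, &desc, out_desc,
                                             qweight_desc, scales_desc,
                                             zeros_desc, g_idx_desc));

size_t workspace_size = 0;
CHECK(infiniopGetDequantizeGPTQWorkspaceSize(desc, &workspace_size));
void *workspace = device_malloc(workspace_size); // hypothetical allocator

CHECK(infiniopDequantizeGPTQ(desc, workspace, workspace_size,
                             out, qweight, scales, zeros, g_idx, stream));
CHECK(infiniopDestroyDequantizeGPTQDescriptor(desc));
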
3 changes: 2 additions & 1 deletion scripts/python_test.py
@@ -17,7 +17,8 @@ def run_tests(args):
"causal_softmax.py",
"clip.py",
"conv.py",
#"dequantize_awq.py",
"dequantize_awq.py",
"dequantize_gptq.py",
"gelu.py",
"gemm.py",
#"layer_norm.py",
@@ -7,7 +7,7 @@
#include <cuda_fp16.h>

__global__ void __launch_bounds__(64)
-dequantize_weights(int *__restrict__ B, half *__restrict__ scaling_factors,
+dequantize_weights_awq(int *__restrict__ B, half *__restrict__ scaling_factors,
int *__restrict__ zeros, half *__restrict__ C, int G) {
// static constexpr uint32_t ZERO = 0x0;
half B_shared[32 * (128 + 8)];
@@ -29,11 +29,11 @@ __global__ void __launch_bounds__(64)
half *scaling_factors_ptr2 = scaling_factors + index4;

uint32_t zeros_loaded = *(uint32_t *)(zeros_ptr2);
-uint4 B_loaded_zero = dequantize_s4_to_fp16x2(zeros_loaded);
+uint4 B_loaded_zero = dequantize_s4_to_fp16x2_awq(zeros_loaded);
uint4 B_loaded_scale = *(uint4 *)(scaling_factors_ptr2);

uint32_t B_loaded = *(uint32_t *)B_ptr2;
-uint4 B_loaded_fp16 = dequantize_s4_to_fp16x2(B_loaded);
+uint4 B_loaded_fp16 = dequantize_s4_to_fp16x2_awq(B_loaded);

// Reinterpret uint4 components as __half2
__half2 *B_loaded_fp16_h2 = reinterpret_cast<__half2 *>(&B_loaded_fp16);
@@ -119,7 +119,7 @@ Descriptor::calculate(
half *scales_ = const_cast<half *>(reinterpret_cast<const half *>(scales));
int *zeros_ = const_cast<int *>(reinterpret_cast<const int *>(zeros));

-dequantize_weights<<<num_blocks, threads_per_block, 0, reinterpret_cast<cudaStream_t>(stream)>>>(
+dequantize_weights_awq<<<num_blocks, threads_per_block, 0, reinterpret_cast<cudaStream_t>(stream)>>>(
qweight_, scales_, zeros_, out_, group_size);
return INFINI_STATUS_SUCCESS;
}
@@ -11,7 +11,7 @@
* @param source The input 32-bit unsigned integer, which packs eight 4-bit values.
* @return A uint4 containing the eight dequantized half values.
*/
-__device__ __forceinline__ uint4 dequantize_s4_to_fp16x2(uint32_t const &source) {
+__device__ __forceinline__ uint4 dequantize_s4_to_fp16x2_awq(uint32_t const &source) {
// Step 1: Unpack eight 4-bit unsigned integers from a single 32-bit source word.
// The source memory layout is assumed to be [v7, v6, v5, v4, v3, v2, v1, v0],
// where each 'v' is a 4-bit nibble.
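
For reference, a scalar equivalent of the unpack-and-widen step documented above: a sketch under the stated [v7 ... v0] nibble layout (low nibble first), not the SIMD bit-twiddling path the helper actually uses.

#include <cuda_fp16.h>

// Scalar sketch: unpack eight 4-bit unsigned values from one 32-bit word
// and widen each to half. The optimized helper produces the same eight
// values packed into a uint4 (four fp16x2 pairs).
__device__ inline void unpack_s4x8_reference(uint32_t source, half out[8]) {
    for (int i = 0; i < 8; ++i) {
        out[i] = __uint2half_rn((source >> (4 * i)) & 0xFu);
    }
}
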
@@ -11,7 +11,7 @@
* @param source The input 32-bit unsigned integer, which packs eight 4-bit values.
* @return A uint4 containing the eight dequantized half values.
*/
-__device__ __forceinline__ uint4 dequantize_s4_to_fp16x2(uint32_t const &source) {
+__device__ __forceinline__ uint4 dequantize_s4_to_fp16x2_awq(uint32_t const &source) {
// Step 1: Unpack eight 4-bit unsigned integers from a single 32-bit source word.
// The source memory layout is assumed to be [v7, v6, v5, v4, v3, v2, v1, v0],
// where each 'v' is a 4-bit nibble.
@@ -7,7 +7,7 @@
#include <musa_fp16.h>

__global__ void __launch_bounds__(64)
-dequantize_weights(int *__restrict__ B, half *__restrict__ scaling_factors,
+dequantize_weights_awq(int *__restrict__ B, half *__restrict__ scaling_factors,
int *__restrict__ zeros, half *__restrict__ C, int G) {
// static constexpr uint32_t ZERO = 0x0;
half B_shared[32 * (128 + 8)];
@@ -29,11 +29,11 @@ __global__ void __launch_bounds__(64)
half *scaling_factors_ptr2 = scaling_factors + index4;

uint32_t zeros_loaded = *(uint32_t *)(zeros_ptr2);
-uint4 B_loaded_zero = dequantize_s4_to_fp16x2(zeros_loaded);
+uint4 B_loaded_zero = dequantize_s4_to_fp16x2_awq(zeros_loaded);
uint4 B_loaded_scale = *(uint4 *)(scaling_factors_ptr2);

uint32_t B_loaded = *(uint32_t *)B_ptr2;
-uint4 B_loaded_fp16 = dequantize_s4_to_fp16x2(B_loaded);
+uint4 B_loaded_fp16 = dequantize_s4_to_fp16x2_awq(B_loaded);

// Reinterpret uint4 components as __half2
__half2 *B_loaded_fp16_h2 = reinterpret_cast<__half2 *>(&B_loaded_fp16);
@@ -119,7 +119,7 @@ Descriptor::calculate(
half *scales_ = const_cast<half *>(reinterpret_cast<const half *>(scales));
int *zeros_ = const_cast<int *>(reinterpret_cast<const int *>(zeros));

-dequantize_weights<<<num_blocks, threads_per_block, 0, reinterpret_cast<musaStream_t>(stream)>>>(
+dequantize_weights_awq<<<num_blocks, threads_per_block, 0, reinterpret_cast<musaStream_t>(stream)>>>(
qweight_, scales_, zeros_, out_, group_size);
return INFINI_STATUS_SUCCESS;
}
@@ -1,6 +1,6 @@
#pragma once

-__device__ uint4 dequantize_s4_to_fp16x2(uint32_t const &source) {
+__device__ uint4 dequantize_s4_to_fp16x2_awq(uint32_t const &source) {
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 750
// Step 1: Unpack eight 4-bit unsigned integers from a single 32-bit source word.
// The source memory layout is assumed to be [v7, v6, v5, v4, v3, v2, v1, v0],
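
Downstream of this helper, the kernels combine the unpacked weights with the per-group zeros and scales. In both the pre-sm_75 intrinsic path and the sm_75+ inline-PTX path (sub.f16x2 then fma.f16x2), the per-element math is plain AWQ dequantization, w = (q - z) * s. A scalar sketch of that step (function name hypothetical):

// w = (q - z) * s: q is the unpacked 4-bit weight, z the unpacked zero point,
// s the half-precision scale; z and s come from the quantization group that
// the element's row belongs to (row / group_size).
__device__ inline half dequant_one_awq(half q, half z, half s) {
    return __hmul(__hsub(q, z), s);
}
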
71 changes: 38 additions & 33 deletions src/infiniop/ops/dequantize_awq/nvidia/dequantize_w42f16_nvidia.cu
@@ -10,33 +10,36 @@

#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ < 750)
__global__ void __launch_bounds__(64)
-dequantize_weights(int *__restrict__ B, half *__restrict__ scaling_factors,
-                   int *__restrict__ zeros, half *__restrict__ C, int G) {
+dequantize_weights_awq(int *__restrict__ B, half *__restrict__ scaling_factors,
+                       int *__restrict__ zeros, half *__restrict__ C, int G,
+                       int out_features, int in_features) {
// static constexpr uint32_t ZERO = 0x0;
half B_shared[32 * (128 + 8)];

half *B_shared_ptr2 = B_shared;

-int N = blockDim.x * gridDim.x; // 2
int col = (blockIdx.x * blockDim.x + threadIdx.x);
int row = (blockIdx.y * blockDim.y + threadIdx.y);
-int index1 = 8 * col + 8 * row * N;
+// Bounds check to avoid out-of-range accesses
+if (col >= out_features || row >= in_features) return;
+
+// Start offset of this element in the output: row-major, 8 consecutive halves
+int index1 = 8 * col + 8 * row * out_features;
half *C_ptr2 = C + index1;

-int index2 = col + row * N;
+int index2 = col + row * out_features;
int *B_ptr2 = B + index2;

-int index3 = col + (int)(row / G) * N;
+int index3 = col + (int)(row / G) * out_features;
int *zeros_ptr2 = zeros + index3;
-int index4 = 8 * col + (int)(row / G) * N * 8;
+int index4 = 8 * col + (int)(row / G) * out_features * 8;
half *scaling_factors_ptr2 = scaling_factors + index4;

uint32_t zeros_loaded = *(uint32_t *)(zeros_ptr2);
-uint4 B_loaded_zero = dequantize_s4_to_fp16x2(zeros_loaded);
+uint4 B_loaded_zero = dequantize_s4_to_fp16x2_awq(zeros_loaded);
uint4 B_loaded_scale = *(uint4 *)(scaling_factors_ptr2);

uint32_t B_loaded = *(uint32_t *)B_ptr2;
-uint4 B_loaded_fp16 = dequantize_s4_to_fp16x2(B_loaded);
+uint4 B_loaded_fp16 = dequantize_s4_to_fp16x2_awq(B_loaded);

// Reinterpret uint4 components as __half2
__half2 *B_loaded_fp16_h2 = reinterpret_cast<__half2 *>(&B_loaded_fp16);
@@ -55,42 +58,43 @@ __global__ void __launch_bounds__(64)
B_loaded_fp16_h2[2] = __hfma2(B_loaded_fp16_h2[2], B_loaded_scale_h2[2], __float2half2_rn(0.0f));
B_loaded_fp16_h2[3] = __hfma2(B_loaded_fp16_h2[3], B_loaded_scale_h2[3], __float2half2_rn(0.0f));

-// Store back to shared memory
-*(uint4 *)B_shared_ptr2 = B_loaded_fp16;
+// Write the result directly back to the global-memory output
+half *out_vec = reinterpret_cast<half *>(&B_loaded_fp16);
#pragma unroll
for (int i = 0; i < 8; ++i) {
-*(C_ptr2 + i) = B_shared[i];
+C_ptr2[i] = out_vec[i];
}
}
#else
__global__ void __launch_bounds__(64)
-dequantize_weights(int *__restrict__ B, half *__restrict__ scaling_factors,
-                   int *__restrict__ zeros, half *__restrict__ C, int group_size) {
+dequantize_weights_awq(int *__restrict__ B, half *__restrict__ scaling_factors,
+                       int *__restrict__ zeros, half *__restrict__ C, int group_size,
+                       int out_features, int in_features) {
static constexpr uint32_t ZERO = 0x0;
half B_shared[32 * (128 + 8)];

half *B_shared_ptr2 = B_shared;

-int N = blockDim.x * gridDim.x; // 2
int col = (blockIdx.x * blockDim.x + threadIdx.x);
int row = blockIdx.y * blockDim.y + threadIdx.y;
-int index1 = 8 * col + 8 * row * N;
+// Bounds check to avoid out-of-range accesses
+if (col >= out_features || row >= in_features) return;
+
+int index1 = 8 * col + 8 * row * out_features;
half *C_ptr2 = C + index1;

-int index2 = col + row * N;
+int index2 = col + row * out_features;
int *B_ptr2 = B + index2;

-int index3 = col + (int)(row / group_size) * N;
+int index3 = col + (int)(row / group_size) * out_features;
int *zeros_ptr2 = zeros + index3;
-int index4 = 8 * col + (int)(row / group_size) * N * 8;
+int index4 = 8 * col + (int)(row / group_size) * out_features * 8;
half *scaling_factors_ptr2 = scaling_factors + index4;

uint32_t zeros_loaded = *(uint32_t *)(zeros_ptr2);
-uint4 B_loaded_zero = dequantize_s4_to_fp16x2(zeros_loaded);
+uint4 B_loaded_zero = dequantize_s4_to_fp16x2_awq(zeros_loaded);
uint4 B_loaded_scale = *(uint4 *)(scaling_factors_ptr2);

uint32_t B_loaded = *(uint32_t *)B_ptr2;
-uint4 B_loaded_fp16 = dequantize_s4_to_fp16x2(B_loaded);
+uint4 B_loaded_fp16 = dequantize_s4_to_fp16x2_awq(B_loaded);
asm volatile("sub.f16x2 %0, %1, %2;\n"
: "=r"(B_loaded_fp16.x)
: "r"(B_loaded_fp16.x), "r"(B_loaded_zero.x));
Expand All @@ -116,10 +120,11 @@ __global__ void __launch_bounds__(64)
: "=r"(B_loaded_fp16.w)
: "r"(B_loaded_fp16.w), "r"(B_loaded_scale.w), "r"(ZERO));

-*(uint4 *)B_shared_ptr2 = B_loaded_fp16;
+// Write the result directly back to the global-memory output
+half *out_vec = reinterpret_cast<half *>(&B_loaded_fp16);
#pragma unroll
for (int i = 0; i < 8; ++i) {
-*(C_ptr2 + i) = B_shared[i];
+C_ptr2[i] = out_vec[i];
}
}
#endif
@@ -183,8 +188,8 @@ Descriptor::calculate(
half *scales_ = const_cast<half *>(reinterpret_cast<const half *>(scales));
int *zeros_ = const_cast<int *>(reinterpret_cast<const int *>(zeros));

-dequantize_weights<<<num_blocks, threads_per_block, 0, reinterpret_cast<cudaStream_t>(stream)>>>(
-    qweight_, scales_, zeros_, out_, group_size);
+dequantize_weights_awq<<<num_blocks, threads_per_block, 0, reinterpret_cast<cudaStream_t>(stream)>>>(
+    qweight_, scales_, zeros_, out_, group_size, out_features, in_features);

return INFINI_STATUS_SUCCESS;
}
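
The new out_features/in_features parameters fix a real indexing hazard: the old kernels derived the row stride from the launch geometry (N = blockDim.x * gridDim.x), which is only correct when the grid exactly tiles the weight matrix. With the added bounds check the grid may round up, so the true tensor extents must be passed in and used for all strides. Below is a host-side sketch of a matching launch-geometry computation; it is an assumption for illustration, since Descriptor::calculate is only partially shown in this diff.

// Round the grid up so every (col, row) is covered and let the kernel's
// `if (col >= out_features || row >= in_features) return;` guard handle
// the ragged edge. All in-kernel strides use out_features, never
// blockDim.x * gridDim.x.
dim3 threads_per_block(8, 8); // 64 threads, matching __launch_bounds__(64)
dim3 num_blocks((out_features + threads_per_block.x - 1) / threads_per_block.x,
                (in_features + threads_per_block.y - 1) / threads_per_block.y);
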
55 changes: 55 additions & 0 deletions src/infiniop/ops/dequantize_gptq/dequantize_gptq.h
@@ -0,0 +1,55 @@
#ifndef __DEQUANTIZE_GPTQ_H__
#define __DEQUANTIZE_GPTQ_H__

#include "../../../utils.h"
#include "../../operator.h"
#include "../../tensor.h"
#include "info.h"

#define DESCRIPTOR(NAMESPACE) \
\
namespace op::dequantize_gptq::NAMESPACE { \
class Descriptor final : public InfiniopDescriptor { \
struct Opaque; \
Opaque *_opaque; \
DequantizeGPTQInfo _info; \
size_t _workspace_size; \
\
Descriptor( \
size_t workspace_size_, \
Opaque *opaque, \
DequantizeGPTQInfo info, \
infiniDevice_t device_type, \
int device_id) \
: InfiniopDescriptor{device_type, device_id}, \
_opaque(opaque), \
_info(info), \
_workspace_size(workspace_size_) {} \
\
public: \
~Descriptor(); \
\
size_t workspaceSize() const { return _workspace_size; } \
\
static infiniStatus_t create( \
infiniopHandle_t handle, \
Descriptor **desc_ptr, \
infiniopTensorDescriptor_t out_desc, \
infiniopTensorDescriptor_t qweight_desc, \
infiniopTensorDescriptor_t scales_desc, \
infiniopTensorDescriptor_t zeros_desc, \
infiniopTensorDescriptor_t g_idx_desc); \
\
infiniStatus_t calculate( \
void *workspace, \
size_t workspace_size, \
void *out, \
const void *qweight, \
const void *scales, \
const void *zeros, \
const void *g_idx, \
void *stream) const; \
}; \
}

#endif //__DEQUANTIZE_GPTQ_H__
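
Each backend presumably instantiates this macro in its own header (e.g. DESCRIPTOR(nvidia)), following the existing dequantize_awq pattern; the backend headers are not part of this diff. For orientation, here is a scalar sketch of what GPTQ dequantization computes and where g_idx comes in: instead of deriving a row's group as row / group_size, act-order GPTQ looks the group up in g_idx. The packing details below (8 values per int32, low nibble first, w = (q - z) * s) are assumptions about the common GPTQ layout, not a statement of this operator's exact format.

#include <cuda_fp16.h>
#include <cstdint>

// Hypothetical scalar reference for GPTQ dequantization with act-order.
// qweight packs 8 4-bit values per int32 along the in_features axis;
// zeros packs 8 4-bit values per int32 along the out_features axis;
// g_idx[row] names the quantization group of input row `row`.
void dequantize_gptq_reference(const int32_t *qweight, const half *scales,
                               const int32_t *zeros, const int32_t *g_idx,
                               half *out, int in_features, int out_features) {
    for (int row = 0; row < in_features; ++row) {
        int g = g_idx[row]; // group lookup replaces row / group_size
        for (int col = 0; col < out_features; ++col) {
            uint32_t wq = (uint32_t)qweight[(row / 8) * out_features + col];
            int q = (wq >> (4 * (row % 8))) & 0xF;
            uint32_t zq = (uint32_t)zeros[g * (out_features / 8) + col / 8];
            int z = (zq >> (4 * (col % 8))) & 0xF;
            float s = __half2float(scales[g * out_features + col]);
            out[row * out_features + col] = __float2half((q - z) * s);
        }
    }
}
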