Merged · 31 commits
a1e7c81  Refactor: Rename tensor core related classes and imports to use tile … (LeiWang1999, Feb 2, 2025)
566fc67  lint fix (LeiWang1999, Feb 2, 2025)
563a704  Fix: Update Matmul initialization to specify backend and clean up imp… (LeiWang1999, Feb 2, 2025)
d95fb42  lint fix (LeiWang1999, Feb 2, 2025)
97ecf57  Update subproject commit for TVM dependency (LeiWang1999, Feb 2, 2025)
fe44dd7  Refactor: Update imports to use tilelang instead of tvm.tl.language (LeiWang1999, Feb 3, 2025)
7dc068c  Refactor: Clean up import statements and formatting in bitblas module (LeiWang1999, Feb 3, 2025)
691a0dc  Fix: Add newline injection to .bashrc if the last line is not empty i… (LeiWang1999, Feb 3, 2025)
fd4c1a6  Update submodule URLs and branches for TVM and TileLang (LeiWang1999, Feb 3, 2025)
e7ad6a9  Update tilelang submodule URL and add new subproject commit (LeiWang1999, Feb 3, 2025)
6da2f5f  Update cutlass submodule URL to point to tile-ai repository (LeiWang1999, Feb 3, 2025)
30f1a95  Merge branch 'main' of https://github.com/microsoft/BitBLAS into sepa… (LeiWang1999, Feb 3, 2025)
8f68896  Refactor: Split class definition for MatmulINT4DequantizeMMAWeightPro… (LeiWang1999, Feb 3, 2025)
af0a134  Enhance environment variable handling for TVM and TileLang paths in i… (LeiWang1999, Feb 3, 2025)
b9bb657  Remove unnecessary blank line in initialization of TILELANG_IMPORT_PA… (LeiWang1999, Feb 3, 2025)
9f7d4c6  Add build_tilelang function to setup.py for TILELANG integration (LeiWang1999, Feb 4, 2025)
1cc6886  Add TILELANG build step in setup.py (LeiWang1999, Feb 4, 2025)
8a53cd4  Update TileLang subproject and improve type mappings in TLCUDASourceW… (LeiWang1999, Feb 4, 2025)
3cac227  Refactor line continuation for better readability in gemv_simt.py (LeiWang1999, Feb 4, 2025)
7ce4860  Merge branch 'main' of https://github.com/microsoft/BitBLAS into sepa… (LeiWang1999, Feb 4, 2025)
494434e  Update TileLang subproject to latest commit (LeiWang1999, Feb 4, 2025)
7bf057b  Merge branch 'main' of https://github.com/microsoft/BitBLAS into sepa… (LeiWang1999, Feb 4, 2025)
6d8e05d  Update subproject commit for TVM to the latest version (LeiWang1999, Feb 4, 2025)
34d77e4  Update TileLang subproject and fix CUDA architecture checks (LeiWang1999, Feb 5, 2025)
895080e  Disable tuning for Matmul in FP8 tests and update backend configuration (LeiWang1999, Feb 5, 2025)
ab990ef  Update TileLang subproject to latest commit (LeiWang1999, Feb 6, 2025)
9d4bcc9  Refactor code for improved readability by adjusting line breaks and f… (LeiWang1999, Feb 6, 2025)
b6edc52  Update float16 type mapping to use 'half_t' in TLCUDASourceWrapper (LeiWang1999, Feb 7, 2025)
167c82b  Update TileLang subproject and improve error logging in tuner.py (LeiWang1999, Feb 7, 2025)
9b86b6d  Remove unnecessary whitespace in tuner.py for cleaner code (LeiWang1999, Feb 8, 2025)
4fd0670  Update bfloat16 type mapping to use 'bfloat16_t' in TLCUDASourceWrapper (LeiWang1999, Feb 8, 2025)
2 changes: 1 addition & 1 deletion 3rdparty/tvm
2 changes: 1 addition & 1 deletion bitblas/base/arch/cuda.py
@@ -27,7 +27,7 @@ def is_volta_arch(arch: TileDevice) -> bool:
 def is_ampere_arch(arch: TileDevice) -> bool:
     conditions = [True]
     conditions.append(is_cuda_arch(arch))
-    conditions.append(arch.sm_version >= 80 and arch.sm_version < 90)
+    conditions.append(arch.sm_version >= 80 and arch.sm_version < 89)
     return all(conditions)


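To make the effect of the tightened bound concrete, here is a hedged sketch of how the sm_version ranges partition across the arch predicates after this change. `is_ada_arch` and `is_hopper_arch` are taken from the matmul.py diff below; the cutoffs shown follow NVIDIA's published compute capabilities, and `TileDevice` is stubbed rather than imported.

```python
# Hedged sketch, not the BitBLAS source: how the arch predicates partition
# sm_version once Ampere stops at 88. TileDevice is stubbed for brevity.
class TileDevice:

    def __init__(self, sm_version: int):
        self.sm_version = sm_version


def is_ampere_arch(arch: TileDevice) -> bool:
    # sm_80..sm_88: Ampere (A100 is sm_80, RTX 30-series is sm_86)
    return 80 <= arch.sm_version < 89


def is_ada_arch(arch: TileDevice) -> bool:
    # sm_89: Ada Lovelace (RTX 40-series, L40); previously swallowed by < 90
    return arch.sm_version == 89


def is_hopper_arch(arch: TileDevice) -> bool:
    # sm_90: Hopper (H100)
    return arch.sm_version >= 90


# With the old `< 90` bound, sm_89 was misclassified as Ampere.
assert is_ada_arch(TileDevice(89)) and not is_ampere_arch(TileDevice(89))
```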
4 changes: 2 additions & 2 deletions bitblas/builder/wrapper/tl.py
@@ -18,8 +18,8 @@
 class TLCUDASourceWrapper(object):
     _TYPE_MAP = {
         "float32": "float",
-        "float16": "half",
-        "bfloat16": "__nv_bfloat16",
+        "float16": "half_t",
+        "bfloat16": "bfloat16_t",
         "e4m3_float8": "__nv_fp8_e4m3",
         "e5m2_float8": "__nv_fp8_e5m2",
         "float64": "double",
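For context, a minimal sketch of how a wrapper consumes this mapping when emitting a CUDA kernel signature; `half_t` and `bfloat16_t` are TileLang/CUTLASS-style aliases rather than the raw `half`/`__nv_bfloat16` CUDA types. The `emit_param` helper is hypothetical, not part of the BitBLAS API.

```python
# Minimal sketch: consuming the dtype -> C++ type map when generating wrapper
# source. `emit_param` is a hypothetical helper, not BitBLAS API.
_TYPE_MAP = {
    "float32": "float",
    "float16": "half_t",        # TileLang/CUTLASS alias instead of raw `half`
    "bfloat16": "bfloat16_t",   # alias instead of `__nv_bfloat16`
    "e4m3_float8": "__nv_fp8_e4m3",
    "e5m2_float8": "__nv_fp8_e5m2",
    "float64": "double",
}


def emit_param(name: str, dtype: str) -> str:
    if dtype not in _TYPE_MAP:
        raise ValueError(f"no CUDA type mapping for dtype {dtype!r}")
    return f"{_TYPE_MAP[dtype]}* __restrict__ {name}"


print(emit_param("B", "bfloat16"))  # bfloat16_t* __restrict__ B
```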
4 changes: 3 additions & 1 deletion bitblas/gpu/matmul_analysis.py
@@ -649,7 +649,9 @@ def check_last_trait(region: List[Range]):
     if target.kind.name == "cuda" and check_sm_version(target.arch) >= 70:
         in_dtype, out_dtype = get_in_out_dtypes(block_stmt)
         if not is_tensorcore_supported_precision(in_dtype, out_dtype, arch=get_arch(target)):
-            logger.debug("The input and output dtype is not supported by tensorcore")
+            logger.debug(
+                f"The input and output dtype ({in_dtype}, {out_dtype}) is not supported by tensorcore"
+            )
             return func, None
 
     # reindex and transform functions
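For reference, a toy stand-in for the gate this message reports on, using an illustrative subset of the (in_dtype, out_dtype) pairs Ampere tensor cores accept. The authoritative table lives in the arch module, and the real function also takes an `arch` argument; both are simplified away here.

```python
# Toy stand-in for is_tensorcore_supported_precision: an illustrative subset
# of dtype pairs, not the full BitBLAS table (which is also arch-dependent).
SUPPORTED_PAIRS = {
    ("float16", "float16"),
    ("float16", "float32"),
    ("bfloat16", "float32"),
    ("int8", "int32"),
}


def is_tensorcore_supported_precision(in_dtype: str, out_dtype: str) -> bool:
    return (in_dtype, out_dtype) in SUPPORTED_PAIRS


# The improved message names the offending pair instead of a generic complaint:
in_dtype, out_dtype = "float64", "float64"
if not is_tensorcore_supported_precision(in_dtype, out_dtype):
    print(f"The input and output dtype ({in_dtype}, {out_dtype}) "
          "is not supported by tensorcore")
```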
8 changes: 4 additions & 4 deletions bitblas/ops/general_matmul/tilelang/dense/__init__.py
@@ -102,7 +102,7 @@ def ampere_select_scheduler(
 
     trans_A, trans_B = parse_layout(layout)
 
-    def can_apply_fine_grain_scheduler(trans_A, trans_B, propagate_a, propagate_b):
+    def can_apply_mma_scheduler(trans_A, trans_B, propagate_a, propagate_b):
         conditions = []
         conditions.append(trans_A is False)
         conditions.append(trans_B is True)
@@ -116,7 +116,7 @@ def can_apply_block_scheduler(propagate_a, propagate_b):
         conditions.append(propagate_b == TransformKind.NonTransform)
         return all(conditions)
 
-    def can_apply_weight_propagation_scheduler(trans_A, trans_B, propagate_a, propagate_b):
+    def can_apply_mma_weight_propagation_scheduler(trans_A, trans_B, propagate_a, propagate_b):
         conditions = []
         conditions.append(trans_A is False)
         conditions.append(trans_B is True)
@@ -127,7 +127,7 @@ def can_apply_weight_propagation_scheduler(trans_A, trans_B, propagate_a, propag
     def is_int4_dtype(dtype):
        return dtype == "int4" or dtype == "uint4"
 
-    if can_apply_weight_propagation_scheduler(trans_A, trans_B, propagate_a, propagate_b):
+    if can_apply_mma_weight_propagation_scheduler(trans_A, trans_B, propagate_a, propagate_b):
         Scheduler = MatmulMMAWeightPropagationScheduler if not is_int4_dtype(
             in_dtype) else MatmulINT4MMAWeightPropagationScheduler
         return Scheduler(
@@ -141,7 +141,7 @@ def is_int4_dtype(dtype):
             accum_dtype=accum_dtype,
             with_bias=with_bias,
         )
-    if can_apply_fine_grain_scheduler(trans_A, trans_B, propagate_a, propagate_b):
+    if can_apply_mma_scheduler(trans_A, trans_B, propagate_a, propagate_b):
         Scheduler = MatmulMMAScheduler if not is_int4_dtype(in_dtype) else MatmulINT4MMAScheduler
         return Scheduler(
             M=M,
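Taken together, the renamed predicates choose between the plain MMA scheduler and the weight-propagation variant from the layout and transform flags. Below is a condensed, self-contained sketch under stated assumptions: the `TransformKind` stub and the exact propagate conditions for the weight-propagation path are assumptions, since part of the real condition list is folded out of the diff above.

```python
# Condensed sketch of the renamed selection predicates; simplified and partly
# assumed, since some condition lines are folded out of the diff above.
from enum import Enum


class TransformKind(Enum):  # stand-in for bitblas's TransformKind
    NonTransform = 0
    LDMatrixTransform = 2


def can_apply_mma_scheduler(trans_A, trans_B, propagate_a, propagate_b):
    # Plain MMA path: A not transposed, B transposed, no layout propagation.
    return (trans_A is False and trans_B is True and
            propagate_a == TransformKind.NonTransform and
            propagate_b == TransformKind.NonTransform)


def can_apply_mma_weight_propagation_scheduler(trans_A, trans_B, propagate_a,
                                               propagate_b):
    # Weight-propagation path: same layout, but B carries a transform
    # (assumed condition; the exact check is hidden behind the fold).
    return (trans_A is False and trans_B is True and
            propagate_a == TransformKind.NonTransform and
            propagate_b != TransformKind.NonTransform)


# NT layout with a transformed weight picks the weight-propagation scheduler.
assert can_apply_mma_weight_propagation_scheduler(
    False, True, TransformKind.NonTransform, TransformKind.LDMatrixTransform)
```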
64 changes: 35 additions & 29 deletions bitblas/ops/general_matmul/tilelang/dense/matmul.py
@@ -9,8 +9,11 @@
     TileDevice,
     is_ampere_arch,
     is_volta_arch,
+    is_ada_arch,
+    is_hopper_arch,
     is_tensorcore_supported_precision,
 )
+from tilelang.intrinsics.utils import get_mma_micro_size
 from dataclasses import dataclass
 from bitblas.tl.base_hint import BaseTLHint
 
@@ -39,20 +42,20 @@ class MatmulScheduler(MatmulBaseParams):
     gemv_scheduler: Optional[GemvFineGrainSIMTScheduler] = None
     matmul_simt_scheduler: Optional[MatmulFineGrainSIMTScheduler] = None
     matmul_block_scheduler: Optional[MatmulTileLibraryScheduler] = None
-    matmul_fine_grain_scheduler: Optional[MatmulMMAScheduler] = None
-    matmul_weight_propagation_scheduler: Optional[MatmulMMAWeightPropagationScheduler] = None
-    matmul_int4_fine_grain_scheduler: Optional[MatmulINT4MMAScheduler] = None
-    matmul_int4_weight_propagation_scheduler: Optional[
+    matmul_mma_scheduler: Optional[MatmulMMAScheduler] = None
+    matmul_mma_weight_propagation_scheduler: Optional[MatmulMMAWeightPropagationScheduler] = None
+    matmul_int4_mma_scheduler: Optional[MatmulINT4MMAScheduler] = None
+    matmul_int4_mma_weight_propagation_scheduler: Optional[
         MatmulINT4MMAWeightPropagationScheduler] = None
 
     def __init__(self, **kwargs):
         self.gemv_scheduler = GemvFineGrainSIMTScheduler(**kwargs)
         self.matmul_simt_scheduler = MatmulFineGrainSIMTScheduler(**kwargs)
         self.matmul_block_scheduler = MatmulTileLibraryScheduler(**kwargs)
-        self.matmul_fine_grain_scheduler = MatmulMMAScheduler(**kwargs)
-        self.matmul_weight_propagation_scheduler = MatmulMMAWeightPropagationScheduler(**kwargs)
-        self.matmul_int4_fine_grain_scheduler = MatmulINT4MMAScheduler(**kwargs)
-        self.matmul_int4_weight_propagation_scheduler = MatmulINT4MMAWeightPropagationScheduler(
+        self.matmul_mma_scheduler = MatmulMMAScheduler(**kwargs)
+        self.matmul_mma_weight_propagation_scheduler = MatmulMMAWeightPropagationScheduler(**kwargs)
+        self.matmul_int4_mma_scheduler = MatmulINT4MMAScheduler(**kwargs)
+        self.matmul_int4_mma_weight_propagation_scheduler = MatmulINT4MMAWeightPropagationScheduler(
            **kwargs)
        super().__init__(**kwargs)

@@ -72,14 +75,13 @@ def dispatch_ampere_scheduler(self, arch: TileDevice) -> BaseScheduler:
             if is_tensorcore_supported_precision(in_dtype, accum_dtype, arch):
                 if weight_transform_kind != TransformKind.NonTransform:
                     # INT4 Can be fused into general dequantize
-                    return self.matmul_int4_weight_propagation_scheduler if in_dtype == "int4" else self.matmul_weight_propagation_scheduler
-                return self.matmul_int4_fine_grain_scheduler if in_dtype == "int4" else self.matmul_fine_grain_scheduler
+                    return self.matmul_int4_mma_weight_propagation_scheduler if in_dtype == "int4" else self.matmul_mma_weight_propagation_scheduler
+                return self.matmul_int4_mma_scheduler if in_dtype == "int4" else self.matmul_mma_scheduler
             else:
                 return self.matmul_simt_scheduler
         else:
-            minimal_tensorcore_threshold: List[int, int,
-                                               int] = [8, 16, 32
-                                                      ] if accum_dtype == "int32" else [8, 16, 16]
+            _, _, micro_size_k = get_mma_micro_size(in_dtype)
+            minimal_tensorcore_threshold: List[int, int, int] = [8, 16, micro_size_k]
             if minimal_tensorcore_threshold[0] > M or minimal_tensorcore_threshold[
                     1] > N or minimal_tensorcore_threshold[2] > K:
                 if in_dtype == "int4":
@@ -90,10 +92,11 @@ def dispatch_ampere_scheduler(self, arch: TileDevice) -> BaseScheduler:
                 return self.gemv_scheduler
             elif is_tensorcore_supported_precision(in_dtype, accum_dtype, arch):
                 if self.weight_transform_kind != TransformKind.NonTransform:
-                    return (self.matmul_int4_weight_propagation_scheduler
-                            if in_dtype == "int4" else self.matmul_weight_propagation_scheduler)
+                    return (self.matmul_int4_mma_weight_propagation_scheduler
+                            if in_dtype == "int4" else self.matmul_mma_weight_propagation_scheduler)
                 else:
-                    return self.matmul_int4_fine_grain_scheduler if in_dtype == "int4" else self.matmul_block_scheduler
+                    # by default, use the mma_scheduler
+                    return self.matmul_int4_mma_scheduler if in_dtype == "int4" else self.matmul_mma_scheduler
             else:
                 return self.matmul_simt_scheduler

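The threshold change above swaps the hard-coded K minimum (`32` for int32 accumulation, else `16`) for the dtype's own MMA micro-shape. A hedged sketch of the idea, with illustrative micro sizes standing in for whatever `tilelang.intrinsics.utils.get_mma_micro_size` actually returns:

```python
# Hedged sketch: dtype-driven micro-size lookup replacing hard-coded
# tensor-core thresholds. Micro shapes are illustrative (m16n8k16 for half
# precision, m16n8k32 for int8), not read from tilelang.
def get_mma_micro_size(in_dtype: str):
    if in_dtype == "int8":
        return 16, 8, 32
    return 16, 8, 16  # float16 / bfloat16


def meets_tensorcore_threshold(M: int, N: int, K: int, in_dtype: str) -> bool:
    _, _, micro_size_k = get_mma_micro_size(in_dtype)
    minimal_tensorcore_threshold = [8, 16, micro_size_k]
    return (M >= minimal_tensorcore_threshold[0] and
            N >= minimal_tensorcore_threshold[1] and
            K >= minimal_tensorcore_threshold[2])


assert meets_tensorcore_threshold(8, 16, 32, "int8")
assert not meets_tensorcore_threshold(8, 16, 16, "int8")  # K under int8 micro-k
```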
@@ -131,7 +134,10 @@ def dispatch_volta_scheduler(self, arch: TileDevice) -> BaseScheduler:
         return self.matmul_simt_scheduler
 
     def dispatch_scheduler(self, arch: TileDevice) -> BaseScheduler:
-        if is_ampere_arch(arch):
+        if is_hopper_arch(arch):
+            logger.warning("Hopper architecture is not fully supported yet, fallback to Ada")
+            return self.dispatch_ampere_scheduler(arch)
+        elif is_ampere_arch(arch) or is_ada_arch(arch):
             return self.dispatch_ampere_scheduler(arch)
         elif is_volta_arch(arch):
             return self.dispatch_volta_scheduler(arch)
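The new dispatch order reads: Hopper warns and falls back to the Ampere path, Ada shares the Ampere path outright, and Volta keeps its own dispatcher. A minimal sketch with the arch predicates reduced to sm_version checks (the returned strings are placeholders for the scheduler objects):

```python
# Minimal sketch of the new dispatch order; arch predicates reduced to
# sm_version checks and schedulers reduced to placeholder strings.
import logging

logger = logging.getLogger(__name__)


def dispatch_scheduler(sm_version: int) -> str:
    if sm_version >= 90:  # Hopper: no dedicated path yet
        logger.warning(
            "Hopper architecture is not fully supported yet, fallback to Ada")
        return "ampere_scheduler"
    elif 80 <= sm_version <= 89:  # Ampere (80-88) and Ada (89) share one path
        return "ampere_scheduler"
    elif 70 <= sm_version < 80:  # Volta
        return "volta_scheduler"
    raise ValueError(f"Unsupported architecture: sm_{sm_version}")


assert dispatch_scheduler(90) == "ampere_scheduler"  # warns, then falls back
```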
@@ -143,10 +149,10 @@ def detect_scheduler_from_hint(self, hint: BaseTLHint) -> BaseScheduler:
             self.gemv_scheduler,
             self.matmul_simt_scheduler,
             self.matmul_block_scheduler,
-            self.matmul_fine_grain_scheduler,
-            self.matmul_weight_propagation_scheduler,
-            self.matmul_int4_fine_grain_scheduler,
-            self.matmul_int4_weight_propagation_scheduler,
+            self.matmul_mma_scheduler,
+            self.matmul_mma_weight_propagation_scheduler,
+            self.matmul_int4_mma_scheduler,
+            self.matmul_int4_mma_weight_propagation_scheduler,
         ]:
             try:
                 scheduler_hint_type = scheduler.get_hint_type()
@@ -213,10 +219,10 @@ def set_dynamic_range(self, dynamic_range: Dict[str, int]) -> "BaseScheduler":
             self.gemv_scheduler,
             self.matmul_simt_scheduler,
             self.matmul_block_scheduler,
-            self.matmul_fine_grain_scheduler,
-            self.matmul_weight_propagation_scheduler,
-            self.matmul_int4_fine_grain_scheduler,
-            self.matmul_int4_weight_propagation_scheduler,
+            self.matmul_mma_scheduler,
+            self.matmul_mma_weight_propagation_scheduler,
+            self.matmul_int4_mma_scheduler,
+            self.matmul_int4_mma_weight_propagation_scheduler,
         ]:
             scheduler.set_dynamic_range(dynamic_range)
         return self
@@ -227,10 +233,10 @@ def with_arch(self, arch):
             self.gemv_scheduler,
             self.matmul_simt_scheduler,
             self.matmul_block_scheduler,
-            self.matmul_fine_grain_scheduler,
-            self.matmul_weight_propagation_scheduler,
-            self.matmul_int4_fine_grain_scheduler,
-            self.matmul_int4_weight_propagation_scheduler,
+            self.matmul_mma_scheduler,
+            self.matmul_mma_weight_propagation_scheduler,
+            self.matmul_int4_mma_scheduler,
+            self.matmul_int4_mma_weight_propagation_scheduler,
         ]:
             scheduler.with_arch(arch)
         return self
2 changes: 1 addition & 1 deletion bitblas/ops/general_matmul/tilelang/dense/matmul_mma.py
@@ -7,7 +7,7 @@
 from tvm import DataType
 import tilelang.language as T
 from typing import Optional, List
-from bitblas.tl.utils import (
+from tilelang.intrinsics.utils import (
     get_mma_micro_size,
     make_mma_swizzle_layout as make_swizzle_layout,
 )
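This file and matmul_tile.py below now pull the MMA helpers from TileLang itself rather than from bitblas.tl.utils, consistent with the TileLang split this PR carries out. A small usage sketch (requires a tilelang installation; only the third element of the tuple, the k micro size, is confirmed by the matmul.py diff above, so the first two positions are an assumption):

```python
# Usage sketch for the relocated imports; requires tilelang to be installed.
from tilelang.intrinsics.utils import (
    get_mma_micro_size,
    make_mma_swizzle_layout as make_swizzle_layout,
)

# Per the matmul.py diff, the third returned value is the k micro size;
# the (m, n) ordering of the first two is assumed here.
micro_m, micro_n, micro_k = get_mma_micro_size("float16")
print(micro_m, micro_n, micro_k)  # e.g. 16 8 16 for half precision
```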
2 changes: 1 addition & 1 deletion bitblas/ops/general_matmul/tilelang/dense/matmul_tile.py
@@ -6,7 +6,7 @@
 from bitblas import tilelang as tilelang
 import tilelang.language as T
 from typing import Optional, List
-from bitblas.tl.utils import (
+from tilelang.intrinsics.utils import (
     get_mma_micro_size,
     make_mma_swizzle_layout as make_swizzle_layout,
 )