Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
17 commits
Select commit Hold shift + click to select a range
a1e7c81
Refactor: Rename tensor core related classes and imports to use tile …
LeiWang1999 Feb 2, 2025
566fc67
lint fix
LeiWang1999 Feb 2, 2025
563a704
Fix: Update Matmul initialization to specify backend and clean up imp…
LeiWang1999 Feb 2, 2025
d95fb42
lint fix
LeiWang1999 Feb 2, 2025
97ecf57
Update subproject commit for TVM dependency
LeiWang1999 Feb 2, 2025
fe44dd7
Refactor: Update imports to use tilelang instead of tvm.tl.language
LeiWang1999 Feb 3, 2025
7dc068c
Refactor: Clean up import statements and formatting in bitblas module
LeiWang1999 Feb 3, 2025
691a0dc
Fix: Add newline injection to .bashrc if the last line is not empty i…
LeiWang1999 Feb 3, 2025
fd4c1a6
Update submodule URLs and branches for TVM and TileLang
LeiWang1999 Feb 3, 2025
e7ad6a9
Update tilelang submodule URL and add new subproject commit
LeiWang1999 Feb 3, 2025
6da2f5f
Update cutlass submodule URL to point to tile-ai repository
LeiWang1999 Feb 3, 2025
30f1a95
Merge branch 'main' of https://github.com/microsoft/BitBLAS into sepa…
LeiWang1999 Feb 3, 2025
8f68896
Refactor: Split class definition for MatmulINT4DequantizeMMAWeightPro…
LeiWang1999 Feb 3, 2025
af0a134
Enhance environment variable handling for TVM and TileLang paths in i…
LeiWang1999 Feb 3, 2025
b9bb657
Remove unnecessary blank line in initialization of TILELANG_IMPORT_PA…
LeiWang1999 Feb 3, 2025
9f7d4c6
Add build_tilelang function to setup.py for TILELANG integration
LeiWang1999 Feb 4, 2025
1cc6886
Add TILELANG build step in setup.py
LeiWang1999 Feb 4, 2025
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 7 additions & 3 deletions .gitmodules
Original file line number Diff line number Diff line change
@@ -1,8 +1,12 @@
[submodule "3rdparty/tvm"]
path = 3rdparty/tvm
url = https://github.com/TileLang/tvm.git
branch = upstream
url = https://github.com/tile-ai/tvm.git
branch = tilelang_codebase
[submodule "3rdparty/tilelang"]
path = 3rdparty/tilelang
url = https://github.com/tile-ai/tilelang
branch = bitblas
[submodule "3rdparty/cutlass"]
path = 3rdparty/cutlass
url = https://github.com/TileLang/cutlass
url = https://github.com/tile-ai/cutlass
branch = tldev
1 change: 1 addition & 0 deletions 3rdparty/tilelang
Submodule tilelang added at e3b185
2 changes: 1 addition & 1 deletion 3rdparty/tvm
Submodule tvm updated 82 files
+0 −20 python/tvm/tl/__init__.py
+0 −21 python/tvm/tl/_ffi_api.py
+0 −184 python/tvm/tl/autotuner.py
+0 −291 python/tvm/tl/engine.py
+0 −359 python/tvm/tl/language.py
+0 −94 python/tvm/tl/layout.py
+0 −173 python/tvm/tl/transform.py
+0 −392 python/tvm/tl/utils.py
+22 −4 src/script/ir_builder/tir/ir.cc
+0 −18 src/tir/analysis/block_access_region_detector.cc
+0 −381 src/tir/transforms/thread_partial_sync.cc
+0 −4 src/tir/transforms/thread_storage_sync.cc
+0 −162 src/tl/ir.cc
+0 −475 src/tl/layout/gemm_layouts.cc
+0 −417 src/tl/layout/layout.cc
+0 −177 src/tl/layout/layout.h
+0 −116 src/tl/layout/swizzle.cc
+0 −91 src/tl/layout/swizzle.h
+0 −262 src/tl/layout/utils.cc
+0 −76 src/tl/layout/utils.h
+0 −106 src/tl/op/builtin.cc
+0 −168 src/tl/op/builtin.h
+0 −393 src/tl/op/bulk_copy.cc
+0 −82 src/tl/op/bulk_copy.h
+0 −370 src/tl/op/elem.cc
+0 −83 src/tl/op/elem.h
+0 −272 src/tl/op/gemm.cc
+0 −64 src/tl/op/gemm.h
+0 −102 src/tl/op/op.cc
+0 −114 src/tl/op/op.h
+0 −247 src/tl/op/parallel.cc
+0 −88 src/tl/op/parallel.h
+0 −230 src/tl/op/reduce.cc
+0 −62 src/tl/op/reduce.h
+0 −203 src/tl/runtime/runtime.cc
+0 −37 src/tl/runtime/runtime.h
+0 −1,556 src/tl/target/codegen_cuda.cc
+0 −107 src/tl/target/codegen_cuda.h
+0 −1,279 src/tl/target/codegen_hip.cc
+0 −105 src/tl/target/codegen_hip.h
+0 −24,359 src/tl/target/cuda.h
+0 −109 src/tl/target/rt_mod_cuda.cc
+0 −190 src/tl/target/rt_mod_hip.cc
+0 −109 src/tl/target/utils.cc
+0 −50 src/tl/target/utils.h
+0 −78 src/tl/tl_templates/cuda/common.h
+0 −73 src/tl/tl_templates/cuda/copy.h
+0 −227 src/tl/tl_templates/cuda/copy_sm90.h
+0 −10 src/tl/tl_templates/cuda/gemm.h
+0 −160 src/tl/tl_templates/cuda/gemm_sm70.h
+0 −314 src/tl/tl_templates/cuda/gemm_sm80.h
+0 −218 src/tl/tl_templates/cuda/gemm_sm90.h
+0 −100 src/tl/tl_templates/cuda/ldsm.h
+0 −55 src/tl/tl_templates/cuda/reduce.h
+0 −39 src/tl/tl_templates/cuda/threadblock_swizzle.h
+0 −53 src/tl/tl_templates/hip/common.h
+0 −101 src/tl/tl_templates/hip/copy.h
+0 −218 src/tl/tl_templates/hip/gemm.h
+0 −3 src/tl/tl_templates/hip/ldsm.h
+0 −56 src/tl/tl_templates/hip/reduce.h
+0 −41 src/tl/tl_templates/hip/threadblock_swizzle.h
+0 −133 src/tl/transform/cluster_planning.cc
+0 −205 src/tl/transform/common/loop_fusion_utils.h
+0 −731 src/tl/transform/common/loop_vectorization_utils.h
+0 −94 src/tl/transform/frontend_legalize.cc
+0 −174 src/tl/transform/inject_fence_proxy.cc
+0 −242 src/tl/transform/inject_mbarrier.cc
+0 −934 src/tl/transform/inject_pipeline.cc
+0 −300 src/tl/transform/layout_inference.cc
+0 −282 src/tl/transform/legalize_safe_memory_access.cc
+0 −93 src/tl/transform/legalize_vectorized_loop.cc
+0 −164 src/tl/transform/loop_partition.cc
+0 −46 src/tl/transform/loop_partition.h
+0 −305 src/tl/transform/loop_vectorize.cc
+0 −45 src/tl/transform/loop_vectorize.h
+0 −157 src/tl/transform/lower_hopper_intrin.cc
+0 −327 src/tl/transform/lower_tile_op.cc
+0 −321 src/tl/transform/multi_version_buffer_rewriter.cc
+0 −249 src/tl/transform/pipeline_planning.cc
+0 −475 src/tl/transform/simplify.cc
+0 −849 src/tl/transform/warp_specialized_pipeline.cc
+0 −941 src/tl/transform/warp_specialized_rewriter.cc
21 changes: 20 additions & 1 deletion bitblas/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -90,7 +90,7 @@ def new_func(*args, **kwargs):

if TVM_IMPORT_PYTHON_PATH is not None:
os.environ["PYTHONPATH"] = (TVM_IMPORT_PYTHON_PATH + ":" + os.environ.get("PYTHONPATH", ""))
sys.path.insert(0, TVM_IMPORT_PYTHON_PATH + "/python")
sys.path.insert(0, TVM_IMPORT_PYTHON_PATH)
else:
# remove the existing tvm path in PYTHONPATH
def remove_tvm_path(path):
Expand All @@ -107,6 +107,7 @@ def remove_tvm_path(path):
os.environ["PYTHONPATH"] = (
install_tvm_path + "/python:" + os.environ.get("PYTHONPATH", ""))
sys.path.insert(0, install_tvm_path + "/python")
os.environ["TVM_IMPORT_PYTHON_PATH"] = install_tvm_path + "/python"

# developed 3rdparty tvm
develop_tvm_path = os.path.join(
Expand All @@ -119,6 +120,22 @@ def remove_tvm_path(path):
os.environ["PYTHONPATH"] = (
develop_tvm_path + "/python:" + os.environ.get("PYTHONPATH", ""))
sys.path.insert(0, develop_tvm_path + "/python")
os.environ["TVM_IMPORT_PYTHON_PATH"] = develop_tvm_path + "/python"

# TILELANG PATH
if os.environ.get("TILELANG_IMPORT_PATH", None) is None:
install_tilelang_path = os.path.join(
os.path.dirname(os.path.abspath(__file__)), "3rdparty", "tilelang")
develop_tilelang_path = os.path.join(
os.path.dirname(os.path.abspath(__file__)), "..", "3rdparty", "tilelang")
if os.path.exists(install_tilelang_path):
os.environ["PYTHONPATH"] = install_tilelang_path + ":" + os.environ.get("PYTHONPATH", "")
sys.path.insert(0, install_tilelang_path)
elif (os.path.exists(develop_tilelang_path) and develop_tilelang_path not in sys.path):
os.environ["PYTHONPATH"] = develop_tilelang_path + ":" + os.environ.get("PYTHONPATH", "")
sys.path.insert(0, develop_tilelang_path)
else:
logger.warning(TL_TEMPLATE_NOT_FOUND_MESSAGE)

if os.environ.get("TL_CUTLASS_PATH", None) is None:
install_cutlass_path = os.path.join(
Expand All @@ -133,6 +150,8 @@ def remove_tvm_path(path):
logger.warning(CUTLASS_NOT_FOUND_MESSAGE)

import tvm as tvm # noqa: E402
import tilelang as tilelang # noqa: E402

from .base import (
TileDevice, # noqa: F401
fast_tune, # noqa: F401
Expand Down
6 changes: 5 additions & 1 deletion bitblas/base/base_scheduler.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,13 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.
from bitblas import tvm as tvm
from bitblas import tilelang as tilelang
from tvm import te
from tvm import IRModule
from tvm.tir import PrimFunc
from typing import Optional, Union, Callable, List, Dict
from dataclasses import dataclass, field
from tvm.tl.transform import Simplify
from tilelang.transform import Simplify
from abc import ABC, abstractmethod
from bitblas.base.arch import TileDevice, is_volta_arch, is_ampere_arch, is_cdna_arch, auto_infer_current_arch
from bitblas.base.roller.hint import Hint
Expand Down
18 changes: 9 additions & 9 deletions bitblas/builder/lib_generator/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -67,24 +67,24 @@ def compile_lib(self, timeout: float = None, with_tl: bool = False):
raise ValueError(f"Unsupported platform: {platform}")

if with_tl:
install_tvm_path = os.path.join(
os.path.dirname(os.path.abspath(__file__)), "../..", "3rdparty", "tvm")
develop_tvm_path = os.path.join(
os.path.dirname(os.path.abspath(__file__)), "../../..", "3rdparty", "tvm")
install_tilelang_path = os.path.join(
os.path.dirname(os.path.abspath(__file__)), "../..", "3rdparty", "tilelang")
develop_tilelang_path = os.path.join(
os.path.dirname(os.path.abspath(__file__)), "../../..", "3rdparty", "tilelang")

tvm_root = next((path for path in [install_tvm_path, develop_tvm_path]
if os.path.exists(path) and path not in sys.path), None)
tilelang_root = next((path for path in [install_tilelang_path, develop_tilelang_path]
if os.path.exists(path) and path not in sys.path), None)

if "TL_TEMPLATE_PATH " in os.environ:
tl_template_path = os.environ["TL_TEMPLATE_PATH"]
else:
tl_template_path = osp.abspath(osp.join(tvm_root, "src/tl"))
tl_template_path = osp.abspath(osp.join(tilelang_root, "src"))

tl_template_path = osp.abspath(osp.join(tvm_root, "src/tl"))
tl_template_path = osp.abspath(osp.join(tilelang_root, "src"))
if "TL_CUTLASS_PATH" in os.environ:
cutlass_path = os.environ["TL_CUTLASS_PATH"]
else:
cutlass_path = osp.abspath(osp.join(tvm_root, "3rdparty/cutlass/include"))
cutlass_path = osp.abspath(osp.join(tilelang_root, "3rdparty/cutlass/include"))

command += [
"-I" + tl_template_path,
Expand Down
2 changes: 2 additions & 0 deletions bitblas/gpu/intrin/hip.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.
from bitblas import tvm as tvm
from bitblas import tilelang as tilelang
from tvm.runtime import convert
from tvm.tir.expr import Cast, IntImm
from tvm.tir.function import TensorIntrin
Expand Down
3 changes: 2 additions & 1 deletion bitblas/ops/general_flashatten/tilelang/flashatten.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,8 @@

from bitblas import tvm as tvm
from bitblas.base.base_scheduler import BaseScheduler
import tvm.tl.language as T
from bitblas import tilelang as tilelang
import tilelang.language as T
from dataclasses import dataclass
from typing import Optional
import logging
Expand Down
3 changes: 2 additions & 1 deletion bitblas/ops/general_matmul/tilelang/dense/gemv_simt.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,10 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.
from bitblas import tvm as tvm
from bitblas import tilelang as tilelang
from functools import reduce
from typing import Optional, List
import tvm.tl.language as T
import tilelang.language as T
from tvm import DataType
from tvm.tir import PrimFunc

Expand Down
3 changes: 2 additions & 1 deletion bitblas/ops/general_matmul/tilelang/dense/matmul_mma.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,9 @@
# tile represents tile library

from bitblas import tvm as tvm
from bitblas import tilelang as tilelang
from tvm import DataType
import tvm.tl.language as T
import tilelang.language as T
from typing import Optional, List
from bitblas.tl.utils import (
get_mma_micro_size,
Expand Down
3 changes: 2 additions & 1 deletion bitblas/ops/general_matmul/tilelang/dense/matmul_simt.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,9 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.
from bitblas import tvm as tvm
from bitblas import tilelang as tilelang
from typing import Optional, List
import tvm.tl.language as T
import tilelang.language as T
from tvm import DataType
from tvm.tir import PrimFunc

Expand Down
3 changes: 2 additions & 1 deletion bitblas/ops/general_matmul/tilelang/dense/matmul_tile.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,8 @@
# tile represents tile library

from bitblas import tvm as tvm
import tvm.tl.language as T
from bitblas import tilelang as tilelang
import tilelang.language as T
from typing import Optional, List
from bitblas.tl.utils import (
get_mma_micro_size,
Expand Down
Original file line number Diff line number Diff line change
@@ -1,9 +1,10 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.
from bitblas import tvm as tvm
from bitblas import tilelang as tilelang
from functools import reduce
from typing import Optional, List
import tvm.tl.language as T
import tilelang.language as T
from tvm import DataType
from tvm.tir import PrimFunc

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,8 @@
# Licensed under the MIT License.
from bitblas import tvm as tvm
from tvm import DataType
import tvm.tl.language as T
from bitblas import tilelang as tilelang
import tilelang.language as T
from typing import Optional, List
from bitblas.tl.utils import (
get_mma_micro_size, # noqa: F401
Expand Down
Original file line number Diff line number Diff line change
@@ -1,8 +1,9 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.
from bitblas import tvm as tvm
from bitblas import tilelang as tilelang
from tvm import DataType
import tvm.tl.language as T
import tilelang.language as T
from typing import Optional, List
from bitblas.tl.utils import (
get_mma_micro_size, # noqa: F401
Expand Down Expand Up @@ -680,7 +681,8 @@ def is_b_smooth(self):


@dataclass
class MatmulINT4DequantizeMMAWeightPropagationScheduler(MatmulDequantizeMMAWeightPropagationScheduler):
class MatmulINT4DequantizeMMAWeightPropagationScheduler(
MatmulDequantizeMMAWeightPropagationScheduler):

class TLHint(MatmulDequantizeMMAWeightPropagationScheduler.TLHint):
hint_type: str = "MatmulINT4DequantizeMMAWeightPropagationScheduler"
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,8 @@
from bitblas import tvm as tvm
from tvm import DataType
from tvm.tir import PrimFunc
import tvm.tl.language as T
from bitblas import tilelang as tilelang
import tilelang.language as T
from typing import Optional, List
from bitblas.base.arch import TileDevice
from bitblas.base.roller.hint import Hint
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,8 @@

from bitblas import tvm as tvm
from tvm import DataType
import tvm.tl.language as T
from bitblas import tilelang as tilelang
import tilelang.language as T
from typing import Optional, List
from bitblas.base.arch import TileDevice
from bitblas.base.roller.hint import Hint
Expand Down
6 changes: 3 additions & 3 deletions bitblas/ops/operator.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.
from abc import ABC, abstractmethod
from bitblas import tvm
from tvm import tl
from bitblas import tvm as tvm
from bitblas import tilelang as tilelang
from tvm import IRModule
from tvm.runtime.module import Module
from tvm.target import Target
Expand Down Expand Up @@ -192,7 +192,7 @@ def tvm_callback_hip_postproc(code, _):
if self.is_tir_backend():
rt_mod = tvm.build(self.scheduled_ir_module, target=target)
elif self.is_tilelang_backend():
rt_mod = tl.lower(
rt_mod = tilelang.lower(
self.scheduled_ir_module, target=target, runtime_only=True)
else:
raise ValueError(f"Unsupported backend: {self.backend}")
Expand Down
4 changes: 2 additions & 2 deletions bitblas/tl/mfma_layout.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.

from bitblas import tilelang as tilelang
from tvm import DataType
import tvm.tl.language as T
import tilelang.language as T
from tvm.runtime import convert


Expand Down
3 changes: 2 additions & 1 deletion bitblas/tl/mfma_macro_generator.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,8 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.

import tvm.tl.language as T
from bitblas import tilelang as tilelang
import tilelang.language as T
from typing import Tuple
from tvm import DataType
from tvm.tir import PrimExpr
Expand Down
3 changes: 2 additions & 1 deletion bitblas/tl/mma_layout.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,8 @@

from typing import Union
from tvm import arith, DataType
import tvm.tl.language as T
from bitblas import tilelang as tilelang
import tilelang.language as T


def ldmatrix_32x8_to_shared_16x16_layout(thread_id, local_id):
Expand Down
3 changes: 2 additions & 1 deletion bitblas/tl/mma_macro_generator.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,8 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.

import tvm.tl.language as T
from bitblas import tilelang as tilelang
import tilelang.language as T
from typing import Union, Tuple, Optional
from bitblas.base.operator_common import TransformKind
from tvm import DataType
Expand Down
6 changes: 3 additions & 3 deletions bitblas/tl/tuner.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,8 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.

from bitblas import tvm
from bitblas import tvm as tvm
from bitblas import tilelang as tilelang
import os
import logging
import tempfile
Expand All @@ -10,7 +11,6 @@
from tvm import IRModule
from tvm.runtime import Module
from tvm.tir import Schedule
import tvm.tl as tl
from bitblas.tl.base_hint import BaseTLHint
from bitblas.base.arch import TileDevice
from bitblas.base.utils import get_dummy_input_arrays
Expand Down Expand Up @@ -122,7 +122,7 @@ def tvm_callback_cuda_postproc(code, _):
"tir.disable_cse_tir": True,
**(config.pass_context if config.pass_context else {})
}):
rt_mod = tl.lower(tl_prim_func, arch.target, runtime_only=True)
rt_mod = tilelang.lower(tl_prim_func, arch.target, runtime_only=True)

from tvm.contrib.tar import tar # Import the tar module

Expand Down
3 changes: 2 additions & 1 deletion bitblas/tl/wmma_macro_generator.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,8 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.

import tvm.tl.language as T
from bitblas import tilelang as tilelang
import tilelang.language as T
from typing import Tuple, Optional
from tvm import DataType
from tvm.tir import PrimExpr
Expand Down
18 changes: 9 additions & 9 deletions bitblas/utils/rtmod_analysis.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,27 +7,27 @@
from tvm.target import Target
from typing import Tuple, List
from tvm import tir
from tvm import tl
from tvm.tl.engine import is_device_call
from bitblas import tilelang as tilelang
from tilelang.engine import is_device_call


def get_annotated_device_mod_from_tl(mod: IRModule, target: Target) -> "IRModule":
target_host = tvm.target.Target("llvm -keys=cpu")
target = tvm.target.Target(target, target_host)
mod = tir.transform.BindTarget(target)(mod)

mod = tl.transform.FrontendLegalize()(mod)
mod = tilelang.transform.FrontendLegalize()(mod)
mod = tir.transform.Simplify()(mod)
mod = tl.transform.LayoutInference()(mod)
mod = tl.transform.LowerTileOp()(mod)
mod = tilelang.transform.LayoutInference()(mod)
mod = tilelang.transform.LowerTileOp()(mod)
mod = tir.transform.Simplify()(mod)

if target.arch == "sm_90":
mod = tl.transform.WarpSpecializedPipeline()(mod)
mod = tilelang.transform.WarpSpecializedPipeline()(mod)
else:
mod = tir.transform.PlanAndUpdateBufferAllocationLocation()(mod)
mod = tl.transform.PipelinePlanning()(mod)
mod = tl.transform.InjectSoftwarePipeline()(mod)
mod = tilelang.transform.PipelinePlanning()(mod)
mod = tilelang.transform.InjectSoftwarePipeline()(mod)

mod = tir.transform.LowerOpaqueBlock()(mod)
mod = tir.transform.FlattenBuffer()(mod)
Expand Down Expand Up @@ -57,7 +57,7 @@ def get_annotated_device_mod_from_tl(mod: IRModule, target: Target) -> "IRModule
# the Legalization.
mod = tir.transform.LowerThreadAllreduce()(mod)
mod = tir.transform.ThreadSync("shared.dyn")(mod)
mod = tl.transform.LowerHopperIntrin()(mod)
mod = tilelang.transform.LowerHopperIntrin()(mod)
mod = tir.transform.InjectPTXAsyncCopy()(mod)

mod = tir.transform.AnnotateDeviceRegions()(mod)
Expand Down
Loading