From f43df8473bfad06050a61205b819855f63fb11be Mon Sep 17 00:00:00 2001
From: PanZezhong
Date: Wed, 18 Dec 2024 11:33:16 +0800
Subject: [PATCH 01/15] fix: random sample test uses a deterministic
 distribution
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .github/workflows/main.yaml        |  1 +
 operatorspy/tests/random_sample.py | 36 ++++++++++++++----------------
 2 files changed, 18 insertions(+), 19 deletions(-)

diff --git a/.github/workflows/main.yaml b/.github/workflows/main.yaml
index 65731dd1..84108c51 100644
--- a/.github/workflows/main.yaml
+++ b/.github/workflows/main.yaml
@@ -23,6 +23,7 @@ jobs:

       - name: Install Python dependencies
         run: |
+          pip install numpy
           pip install torch

       - name: Install xmake
diff --git a/operatorspy/tests/random_sample.py b/operatorspy/tests/random_sample.py
index 795c2c1a..ea680c57 100644
--- a/operatorspy/tests/random_sample.py
+++ b/operatorspy/tests/random_sample.py
@@ -63,8 +63,6 @@ def random_sample(data, random_val, topp, topk, voc, temperature, torch_device):
     else:
         end = topk
-
-
     sum_s = 0
     for i in range(end):
         sum_s += dataNp[i]
@@ -78,12 +76,14 @@ def random_sample(data, random_val, topp, topk, voc, temperature, torch_device):
 def random_sample_0(data):
     return torch.argmax(data)

+
 def test(lib, handle, torch_device, voc, random_val, topp, topk, temperature, x_dtype=torch.float16):
     print(
         f"Testing RandomSample on {torch_device} with voc:{voc} dtype:{x_dtype}"
     )
-
-    data = torch.rand((voc), dtype=x_dtype).to(torch_device)
+    data = torch.arange(voc).float() * 0.0001
+    _perm = torch.randperm(voc)
+    data = data[_perm].to(x_dtype).to(torch_device)
     if(topp > 0 and topk > 1):
         ans = random_sample(data.to("cpu"), random_val, topp, topk, voc, temperature, "cpu")
     else:
@@ -130,12 +130,9 @@ def test(lib, handle, torch_device, voc, random_val, topp, topk, temperature, x_
     if torch_device == "npu":
         torch.npu.synchronize()
-    assert indices[0].type(ans.dtype) == ans or abs(data[indices[0]] - data[ans]) == 0.0, "compute error"
-
-
-
+    assert indices[0].type(ans.dtype) == ans or data[ans] == data[indices[0]]
     check_error(lib.infiniopDestroyRandomSampleDescriptor(descriptor))
-
+    print("Test passed!")

 def test_cpu(lib, test_cases):
     device = DeviceEnum.DEVICE_CPU
@@ -176,15 +173,16 @@ def test_ascend(lib, test_cases):
 if __name__ == "__main__":
     test_cases = [
         # voc, random_val, topp, topk, temperature
-        (512, 0.92, 0.8, 3, 0.5),
-        (4096, 0.95, 0.9, 5, 1.0),
-        (16384, 0.85, 0.85, 10, 2.0),
-        (512, 0.92, 0, 3, 0.5),
-        (4096, 0.95, 0.9, 1, 1.0),
-        (16384, 0.85, 0, 1, 2.0),
-        (16384, 0.85, 0, 1, 2.0),
-        (32000, 0.8, 0.8, 50, 1.0),
-        (32000, 0.8, 1.0, 25, 1.0),
+        (512, 0.8, 0.8, 3, 0.5),
+        (4096, 0.05, 0.9, 5, 1.0),
+        (16384, 0.15, 0.85, 10, 2.0),
+        (512, 0.08, 0, 3, 0.5),
+        (4096, 0.5, 0.9, 1, 1.0),
+        (16384, 0.15, 0, 1, 2.0),
+        (16384, 0.15, 0, 1, 2.0),
+        (32000, 0.08, 0.8, 50, 1.0),
+        (32000, 0.08, 1.0, 25, 1.0),
+        # (119696, 0.01, 1.0, 100, 1.0),
     ]

     args = get_args()
@@ -228,4 +226,4 @@ def test_ascend(lib, test_cases):
         test_ascend(lib, test_cases)
     if not (args.cpu or args.cuda or args.bang or args.ascend):
         test_cpu(lib, test_cases)
-    print("Test passed!")
+    print("\033[92mTest passed!\033[0m")

From 4ed33fe9d71777777b4d5fa6cf7ea4ca10d38d0c Mon Sep 17 00:00:00 2001
From: PanZezhong
Date: Fri, 20 Dec 2024 14:57:29 +0800
Subject: [PATCH 02/15] fix: add set device id for cuda rope and swiglu

---
 src/ops/rotary_embedding/cuda/rotary_embedding.cu | 2 ++
src/ops/swiglu/cuda/swiglu.cu | 2 ++ src/ops/swiglu/cuda/swiglu.cuh | 1 + src/ops/swiglu/cuda/swiglu_cuda.cc | 1 + 4 files changed, 6 insertions(+) diff --git a/src/ops/rotary_embedding/cuda/rotary_embedding.cu b/src/ops/rotary_embedding/cuda/rotary_embedding.cu index a5f32a97..62579c3d 100644 --- a/src/ops/rotary_embedding/cuda/rotary_embedding.cu +++ b/src/ops/rotary_embedding/cuda/rotary_embedding.cu @@ -53,6 +53,8 @@ infiniopStatus_t cudaRoPE(RoPECudaDescriptor_t desc, if (t == nullptr || pos_ids == nullptr || sin_table == nullptr || cos_table == nullptr) return STATUS_BAD_PARAM; + checkCudaError(cudaSetDevice(desc->device_id)); + if (dtype_eq(desc->dtype, F16)) { rotary_embedding_nv_gpu_f16(desc, reinterpret_cast(t), diff --git a/src/ops/swiglu/cuda/swiglu.cu b/src/ops/swiglu/cuda/swiglu.cu index a17e994b..c02ce186 100644 --- a/src/ops/swiglu/cuda/swiglu.cu +++ b/src/ops/swiglu/cuda/swiglu.cu @@ -59,6 +59,8 @@ infiniopStatus_t cudaSwiGLU(SwiGLUCudaDescriptor_t desc, void const *a, void const *b, void *stream) { + checkCudaError(cudaSetDevice(desc->device_id)); + if (dtype_eq(desc->dtype, F16)) { swiglu_nv_gpu_f16(desc, c, a, b, stream); return STATUS_SUCCESS; diff --git a/src/ops/swiglu/cuda/swiglu.cuh b/src/ops/swiglu/cuda/swiglu.cuh index eed0be5b..9b3bdcb5 100644 --- a/src/ops/swiglu/cuda/swiglu.cuh +++ b/src/ops/swiglu/cuda/swiglu.cuh @@ -6,6 +6,7 @@ struct SwiGLUCudaDescriptor { Device device; + int device_id; DT dtype; uint64_t seq_len; uint64_t di; diff --git a/src/ops/swiglu/cuda/swiglu_cuda.cc b/src/ops/swiglu/cuda/swiglu_cuda.cc index 1f5eb944..16d70503 100644 --- a/src/ops/swiglu/cuda/swiglu_cuda.cc +++ b/src/ops/swiglu/cuda/swiglu_cuda.cc @@ -35,6 +35,7 @@ infiniopStatus_t cudaCreateSwiGLUDescriptor(CudaHandle_t handle, } *desc_ptr = new SwiGLUCudaDescriptor{DevNvGpu, + handle->device_id, dtype, seq_len, di, From 83d28d2f278d1e42b4e4ce4a78decf7539c7a003 Mon Sep 17 00:00:00 2001 From: crapromer Date: Fri, 20 Dec 2024 16:41:23 +0800 Subject: [PATCH 03/15] implement matmul --- include/device.h | 1 + operatorspy/devices.py | 1 + operatorspy/liboperators.py | 2 +- operatorspy/tests/matmul.py | 41 +++++++++++++++++- operatorspy/tests/test_utils.py | 5 +++ src/devices/handle.cc | 13 ++++++ src/devices/teco/common_teco.cc | 1 + src/devices/teco/common_teco.h | 19 ++++++++ src/devices/teco/teco_handle.cc | 25 +++++++++++ src/devices/teco/teco_handle.h | 18 ++++++++ src/devices/teco/tensor_teco.cpp | 23 ++++++++++ src/devices/teco/tensor_teco.h | 24 +++++++++++ src/ops/matmul/operator.cc | 37 ++++++++++++++++ src/ops/matmul/teco/matmul_tecoblas.cc | 44 +++++++++++++++++++ src/ops/matmul/teco/matmul_tecoblas.h | 42 ++++++++++++++++++ src/ops/utils.h | 1 + wget-log | 11 +++++ xmake.lua | 60 +++++++++++++++++++++++++- 18 files changed, 364 insertions(+), 4 deletions(-) create mode 100644 src/devices/teco/common_teco.cc create mode 100644 src/devices/teco/common_teco.h create mode 100644 src/devices/teco/teco_handle.cc create mode 100644 src/devices/teco/teco_handle.h create mode 100644 src/devices/teco/tensor_teco.cpp create mode 100644 src/devices/teco/tensor_teco.h create mode 100644 src/ops/matmul/teco/matmul_tecoblas.cc create mode 100644 src/ops/matmul/teco/matmul_tecoblas.h create mode 100644 wget-log diff --git a/include/device.h b/include/device.h index 701b6632..4f922fc4 100644 --- a/include/device.h +++ b/include/device.h @@ -6,6 +6,7 @@ enum DeviceEnum { DevNvGpu, DevCambriconMlu, DevAscendNpu, + DevTecoSDAA, }; typedef enum DeviceEnum Device; diff --git 
a/operatorspy/devices.py b/operatorspy/devices.py index 4984502a..25c3e96a 100644 --- a/operatorspy/devices.py +++ b/operatorspy/devices.py @@ -3,3 +3,4 @@ class DeviceEnum: DEVICE_CUDA = 1 DEVICE_BANG = 2 DEVICE_ASCEND = 3 + DEVICE_TECO = 4 diff --git a/operatorspy/liboperators.py b/operatorspy/liboperators.py index b1e78fe6..2231e2c0 100644 --- a/operatorspy/liboperators.py +++ b/operatorspy/liboperators.py @@ -43,6 +43,7 @@ def find_library_in_ld_path(library_name): paths = ld_library_path.split(os.pathsep) for path in paths: full_path = os.path.join(path, library_name) + print(full_path) if os.path.isfile(full_path): return full_path return None @@ -53,7 +54,6 @@ def find_library_in_ld_path(library_name): library_path = find_library_in_ld_path("operators.dll") elif system_name == "Linux": library_path = find_library_in_ld_path("liboperators.so") - assert ( library_path is not None ), f"Cannot find operators.dll or liboperators.so. Check if {LIB_OPERATORS_DIR} is set correctly." diff --git a/operatorspy/tests/matmul.py b/operatorspy/tests/matmul.py index 3dc2a9ce..b84644f1 100644 --- a/operatorspy/tests/matmul.py +++ b/operatorspy/tests/matmul.py @@ -59,13 +59,15 @@ def test( b = torch.rand(b_shape, dtype=dtype).to(torch_device) c = torch.zeros(c_shape, dtype=dtype).to(torch_device) + if a_stride is not None: a = rearrange_tensor(a, a_stride) if b_stride is not None: + print(b) b = rearrange_tensor(b, b_stride) + print(b) if c_stride is not None: c = rearrange_tensor(c, c_stride) - ans = matmul(c, beta, a, b, alpha) a_tensor = to_tensor(a, lib) @@ -101,7 +103,6 @@ def test( None, ) ) - assert torch.allclose(c, ans, atol=0, rtol=1e-2) check_error(lib.infiniopDestroyMatmulDescriptor(descriptor)) @@ -240,6 +241,40 @@ def test_ascend(lib, test_cases): destroy_handle(lib, handle) +def test_sdaa(lib, test_cases): + import torch_sdaa + + device = DeviceEnum.DEVICE_TECO + handle = create_handle(lib, device) + + for ( + alpha, + beta, + a_shape, + b_shape, + c_shape, + a_stride, + b_stride, + c_stride, + dtype, + ) in test_cases: + test( + lib, + handle, + "sdaa", + alpha, + beta, + a_shape, + b_shape, + c_shape, + a_stride, + b_stride, + c_stride, + dtype, + ) + + destroy_handle(lib, handle) + if __name__ == "__main__": test_cases = [ # alpha, beta, a_shape, b_shape, c_shape, a_stride, b_stride, c_stride, dtype @@ -313,4 +348,6 @@ def test_ascend(lib, test_cases): test_ascend(lib, test_cases) if not (args.cpu or args.cuda or args.bang or args.ascend): test_cpu(lib, test_cases) + if args.teco: + test_sdaa(lib,test_cases) print("Test passed!") diff --git a/operatorspy/tests/test_utils.py b/operatorspy/tests/test_utils.py index a00a91ec..471f2326 100644 --- a/operatorspy/tests/test_utils.py +++ b/operatorspy/tests/test_utils.py @@ -22,5 +22,10 @@ def get_args(): action="store_true", help="Run ASCEND NPU test", ) + parser.add_argument( + "--teco", + action="store_true", + help="Run TECO SDAA test", + ) return parser.parse_args() diff --git a/src/devices/handle.cc b/src/devices/handle.cc index 97126a9d..6f726bcf 100644 --- a/src/devices/handle.cc +++ b/src/devices/handle.cc @@ -11,6 +11,9 @@ #ifdef ENABLE_ASCEND_NPU #include "./ascend/ascend_handle.h" #endif +#ifdef ENABLE_TECO_SDAA +#include "./teco/teco_handle.h" +#endif __C infiniopStatus_t infiniopCreateHandle(infiniopHandle_t *handle_ptr, Device device, int device_id) { @@ -40,6 +43,11 @@ __C infiniopStatus_t infiniopCreateHandle(infiniopHandle_t *handle_ptr, Device d case DevAscendNpu: { return createAscendHandle((AscendHandle_t *) handle_ptr, 
device_id); }
+#endif
+#ifdef ENABLE_TECO_SDAA
+        case DevTecoSDAA: {
+            return createTecoHandle((TecoHandle_t *) handle_ptr, device_id);
+        }
 #endif
     }
     return STATUS_BAD_DEVICE;
@@ -68,6 +76,11 @@ __C infiniopStatus_t infiniopDestroyHandle(infiniopHandle_t handle) {
         case DevAscendNpu: {
             return deleteAscendHandle((AscendHandle_t) handle);
         }
+#endif
+#ifdef ENABLE_TECO_SDAA
+        case DevTecoSDAA: {
+            return deleteTecoHandle((TecoHandle_t) handle);
+        }
 #endif
     }
     return STATUS_BAD_DEVICE;
diff --git a/src/devices/teco/common_teco.cc b/src/devices/teco/common_teco.cc
new file mode 100644
index 00000000..7724ba5c
--- /dev/null
+++ b/src/devices/teco/common_teco.cc
@@ -0,0 +1 @@
+#include "common_teco.h"
diff --git a/src/devices/teco/common_teco.h b/src/devices/teco/common_teco.h
new file mode 100644
index 00000000..565d9841
--- /dev/null
+++ b/src/devices/teco/common_teco.h
@@ -0,0 +1,19 @@
+#ifndef _COMMON_TECO_
+#define _COMMON_TECO_
+
+#include
+#include
+#include
+#include
+#include "device.h"
+#define CHECK_TECOBLAS(expression) \
+    { \
+        tecoblasStatus_t status = (expression); \
+        if (status != TECOBLAS_STATUS_SUCCESS) { \
+            fprintf(stderr, "Error at line %d: %s\n", __LINE__, tecoblasGetErrorString(status)); \
+            exit(EXIT_FAILURE); \
+        } \
+    }
+
+
+#endif
\ No newline at end of file
diff --git a/src/devices/teco/teco_handle.cc b/src/devices/teco/teco_handle.cc
new file mode 100644
index 00000000..ea3694f1
--- /dev/null
+++ b/src/devices/teco/teco_handle.cc
@@ -0,0 +1,25 @@
+#include "teco_handle.h"
+
+infiniopStatus_t createTecoHandle(TecoHandle_t *handle_ptr, int device_id) {
+    uint32_t device_count;
+    sdaaGetDeviceCount(reinterpret_cast<int *>(&device_count));
+    if (device_id >= static_cast<int>(device_count)) {
+        return STATUS_BAD_DEVICE;
+    }
+
+    sdaaSetDevice(device_id);
+
+    *handle_ptr = new TecoContext{DevTecoSDAA, device_id};
+    tecoblasCreate(&(*handle_ptr)->handle);
+    sdaaStreamCreate(&(*handle_ptr)->stream);
+    tecoblasSetStream((*handle_ptr)->handle,(*handle_ptr)->stream);
+
+    return STATUS_SUCCESS;
+}
+
+infiniopStatus_t deleteTecoHandle(TecoHandle_t handle_ptr) {
+    sdaaStreamDestroy(handle_ptr->stream);
+    tecoblasDestroy(handle_ptr->handle);
+    delete handle_ptr;
+    return STATUS_SUCCESS;
+}
diff --git a/src/devices/teco/teco_handle.h b/src/devices/teco/teco_handle.h
new file mode 100644
index 00000000..583534ec
--- /dev/null
+++ b/src/devices/teco/teco_handle.h
@@ -0,0 +1,18 @@
+#ifndef __TECO_HANDLE__
+#define __TECO_HANDLE__
+#include "common_teco.h"
+#include "status.h"
+#include "../pool.h"
+struct TecoContext {
+    Device device;
+    int device_id;
+    tecoblasHandle_t handle;
+    sdaaStream_t stream;
+};
+typedef struct TecoContext *TecoHandle_t;
+
+infiniopStatus_t createTecoHandle(TecoHandle_t *handle_ptr, int device_id);
+
+infiniopStatus_t deleteTecoHandle(TecoHandle_t handle_ptr);
+
+#endif
diff --git a/src/devices/teco/tensor_teco.cpp b/src/devices/teco/tensor_teco.cpp
new file mode 100644
index 00000000..6b3b7575
--- /dev/null
+++ b/src/devices/teco/tensor_teco.cpp
@@ -0,0 +1,23 @@
+#include "tensor_teco.h"
+
+
+infiniopStatus_t tecoTensorDescriptor::fromInfiniOpTensorDescriptor(infiniopTensorDescriptor_t y_desc) {
+    ndim = y_desc->ndim;
+    // Copy shape and strides into the descriptor's own int64_t storage
+    shape = new int64_t[ndim];
+    strides = new int64_t[ndim];
+    for (uint64_t i = 0; i < ndim; ++i) {
+        shape[i] = static_cast<int64_t>(y_desc->shape[i]);
+        strides[i] = y_desc->strides[i];
+    }
+    tecoblasDataType_t dt;
+    if (dtype_eq(y_desc->dt, F16)) {
+        dt = tecoblasDataType_t::TECOBLAS_DATA_HALF;
+    } else if (dtype_eq(y_desc->dt, F32)) {
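+        // 32-bit floats use tecoblas' FLOAT type; TECOBLAS_DATA_DOUBLE is the 64-bit descriptor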
+        dt = tecoblasDataType_t::TECOBLAS_DATA_FLOAT;
+    } else {
+        return STATUS_BAD_TENSOR_DTYPE;
+    }
+    data_type = dt;
+
+    return STATUS_SUCCESS;
+}
\ No newline at end of file
diff --git a/src/devices/teco/tensor_teco.h b/src/devices/teco/tensor_teco.h
new file mode 100644
index 00000000..afb88e6d
--- /dev/null
+++ b/src/devices/teco/tensor_teco.h
@@ -0,0 +1,24 @@
+#ifndef __TECO_TENSOR__
+#define __TECO_TENSOR__
+
+#include "operators.h"
+#include "tensor.h"
+#include
+#include
+#include
+
+struct tecoTensorDescriptor {
+    uint64_t ndim;
+    int64_t *shape;
+    int64_t *strides;
+    tecoblasDataType_t data_type;
+    infiniopStatus_t fromInfiniOpTensorDescriptor(infiniopTensorDescriptor_t y_desc);
+    infiniopStatus_t createTensor();
+    infiniopStatus_t destroyTensor();
+    ~tecoTensorDescriptor();
+
+};
+
+typedef tecoTensorDescriptor *tecoTensorDescriptor_t;
+
+#endif
\ No newline at end of file
diff --git a/src/ops/matmul/operator.cc b/src/ops/matmul/operator.cc
index 444168b6..52a99e81 100644
--- a/src/ops/matmul/operator.cc
+++ b/src/ops/matmul/operator.cc
@@ -14,6 +14,9 @@
 #ifdef ENABLE_ASCEND_NPU
 #include "ascend/matmul_aclnn.h"
 #endif
+#ifdef ENABLE_TECO_SDAA
+#include "teco/matmul_tecoblas.h"
+#endif

 __C infiniopStatus_t infiniopCreateMatmulDescriptor(infiniopHandle_t handle,
                                                     infiniopMatmulDescriptor_t *desc_ptr,
@@ -48,6 +51,17 @@ __C infiniopStatus_t infiniopCreateMatmulDescriptor(infiniopHandle_t handle,
                                               beta,
                                               1);
         }
+#endif
+#ifdef ENABLE_TECO_SDAA
+        case DevTecoSDAA: {
+            return tecoCreateMatmulDescriptor((TecoHandle_t) handle,
+                                              (MatmulTecoDescriptor_t *) desc_ptr,
+                                              c_desc,
+                                              alpha,
+                                              a_desc,
+                                              b_desc,
+                                              beta);
+        }
 #endif
     }
     return STATUS_BAD_DEVICE;
@@ -75,8 +89,15 @@ __C infiniopStatus_t infiniopGetMatmulWorkspaceSize(infiniopMatmulDescriptor_t d
             return aclnnGetMatmulWorkspaceSize((MatmulAclnnDescriptor_t) desc,
                                                size);
         }
+#endif
+#ifdef ENABLE_TECO_SDAA
+        case DevTecoSDAA: {
+            return tecoGetMatmulWorkspaceSize((MatmulTecoDescriptor_t) desc,
+                                              size);
+        }
 #endif
     }
+
     return STATUS_BAD_DEVICE;
 }
@@ -104,6 +125,17 @@ __C infiniopStatus_t infiniopMatmul(infiniopMatmulDescriptor_t desc, void *works
                           a,
                           b,
                           stream);
+#endif
+#ifdef ENABLE_TECO_SDAA
+        case DevTecoSDAA: {
+            return tecoMatmul((MatmulTecoDescriptor_t) desc,
+                              workspace,
+                              workspace_size,
+                              c,
+                              a,
+                              b,
+                              stream);
+        }
 #endif
     }
     return STATUS_BAD_DEVICE;
@@ -130,6 +162,11 @@ __C infiniopStatus_t infiniopDestroyMatmulDescriptor(infiniopMatmulDescriptor_t
         case DevAscendNpu: {
             return aclnnDestroyMatmulDescriptor((MatmulAclnnDescriptor_t) desc);
         }
+#endif
+#ifdef ENABLE_TECO_SDAA
+        case DevTecoSDAA: {
+            return tecoDestroyMatmulDescriptor((MatmulTecoDescriptor_t) desc);
+        }
 #endif
     }
     return STATUS_BAD_DEVICE;
diff --git a/src/ops/matmul/teco/matmul_tecoblas.cc b/src/ops/matmul/teco/matmul_tecoblas.cc
new file mode 100644
index 00000000..cd7dcf39
--- /dev/null
+++ b/src/ops/matmul/teco/matmul_tecoblas.cc
@@ -0,0 +1,44 @@
+#include "matmul_tecoblas.h"
+
+infiniopStatus_t tecoCreateMatmulDescriptor(TecoHandle_t handle, MatmulTecoDescriptor_t *desc_ptr, infiniopTensorDescriptor_t c_desc, float alpha, infiniopTensorDescriptor_t a_desc, infiniopTensorDescriptor_t b_desc, float beta) {
+    *desc_ptr = new MatmulTecoDescriptor{handle->device};
+    (*desc_ptr)->handle = handle->handle;
+    (*desc_ptr)->device = handle->device;
+    (*desc_ptr)->stream = handle->stream;
+    (*desc_ptr)->m = a_desc->shape[0];
+    (*desc_ptr)->k = a_desc->shape[1];
+    (*desc_ptr)->n = b_desc->shape[1];
+    (*desc_ptr)->transa = TECOBLAS_OP_N;
+    (*desc_ptr)->transb = TECOBLAS_OP_N;
+    (*desc_ptr)->lda =
a_desc->shape[1]; + (*desc_ptr)->ldb = b_desc->shape[1]; + (*desc_ptr)->ldc = c_desc->shape[1]; + (*desc_ptr)->alpha = 1.0f; + (*desc_ptr)->beta = 0.0f; + return STATUS_SUCCESS; +} + +infiniopStatus_t tecoGetMatmulWorkspaceSize(MatmulTecoDescriptor_t desc, uint64_t *size) { + tecoblasStatus_t status = tecoblasGetWorkspaceSize(desc->handle, desc->transa, desc->transb, desc->m, desc->n, desc->k, desc->alpha, TECOBLAS_DATA_HALF,desc->lda, 1, TECOBLAS_DATA_HALF, desc->ldb, 1, desc->beta, TECOBLAS_DATA_HALF, desc->ldc, 1, 1, TECOBLAS_HGEMM,reinterpret_cast(size)); + if (status != TECOBLAS_STATUS_SUCCESS) { + return STATUS_EXECUTION_FAILED; + }else{ + return STATUS_SUCCESS; + } +} + +infiniopStatus_t tecoMatmul(MatmulTecoDescriptor_t desc, void *workspace, uint64_t workspace_size, void *c, const void *a, const void *b, void *stream) { + tecoblasSetStream(desc->handle, desc->stream); + tecoblasSetWorkspace(desc->handle, workspace, workspace_size); + tecoblasStatus_t status = tecoblasHgemm(desc->handle, desc->transa, desc->transb, desc->m, desc->n, desc->k, 1.0f, a, desc->lda, b, desc->ldb, 0.0f, c, desc->ldc); + sdaaStreamSynchronize(desc->stream); + if (status != TECOBLAS_STATUS_SUCCESS) { + return STATUS_EXECUTION_FAILED; + }else{ + return STATUS_SUCCESS; + } +} + +infiniopStatus_t tecoDestroyMatmulDescriptor(MatmulTecoDescriptor_t desc) { + return STATUS_SUCCESS; +} diff --git a/src/ops/matmul/teco/matmul_tecoblas.h b/src/ops/matmul/teco/matmul_tecoblas.h new file mode 100644 index 00000000..b10c91ea --- /dev/null +++ b/src/ops/matmul/teco/matmul_tecoblas.h @@ -0,0 +1,42 @@ +#ifndef __TECO_MATMUL_H__ +#define __TECO_MATMUL_H__ +#include "operators.h" +#include +#include +#include "../../../devices/teco/teco_handle.h" +struct MatmulTecoDescriptor { + Device device; + int device_id; + tecoblasHandle_t handle; + sdaaStream_t stream; + tecoblasOperation_t transa,transb; + int m,n,k; + float alpha,beta; + int lda,ldb,ldc; +}; + +typedef struct MatmulTecoDescriptor *MatmulTecoDescriptor_t; + +infiniopStatus_t tecoCreateMatmulDescriptor(TecoHandle_t handle, + MatmulTecoDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t c_desc, + float alpha, + infiniopTensorDescriptor_t a_desc, + infiniopTensorDescriptor_t b_desc, + float beta); + +infiniopStatus_t tecoGetMatmulWorkspaceSize(MatmulTecoDescriptor_t desc, + uint64_t *size); + +infiniopStatus_t tecoMatmul(MatmulTecoDescriptor_t desc, + void *workspace, + uint64_t workspace_size, + void *c, + const void *a, + const void *b, + void *stream); + +infiniopStatus_t tecoDestroyMatmulDescriptor(MatmulTecoDescriptor_t desc); + + +#endif \ No newline at end of file diff --git a/src/ops/utils.h b/src/ops/utils.h index fd2afcf0..57e7ac31 100644 --- a/src/ops/utils.h +++ b/src/ops/utils.h @@ -4,6 +4,7 @@ #include "data_type.h" #include "tensor.h" #include +#include #include #include #include diff --git a/wget-log b/wget-log new file mode 100644 index 00000000..e6c76450 --- /dev/null +++ b/wget-log @@ -0,0 +1,11 @@ +--2024-11-25 07:56:20-- https://raw.githubusercontent.com/tboox/xmake/master/scripts/get.sh +Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.109.133, 185.199.108.133, 185.199.111.133, ... +Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.109.133|:443... connected. +HTTP request sent, awaiting response... 
200 OK
+Length: 8113 (7.9K) [text/plain]
+Saving to: 'STDOUT'
+
+-          0%[                    ]       0  --.-KB/s    -        100%[============================>]   7.92K  --.-KB/s    in 0.07s
+
+2024-11-25 07:56:21 (119 KB/s) - written to stdout [8113/8113]
+
diff --git a/xmake.lua b/xmake.lua
index ce5e1172..fd579ae5 100644
--- a/xmake.lua
+++ b/xmake.lua
@@ -30,6 +30,14 @@ option("ascend-npu")
     add_defines("ENABLE_ASCEND_NPU")
 option_end()

+option("teco")
+    set_default(false)
+    set_showmenu(true)
+    set_description("Enable or disable Teco kernel")
+    add_defines("ENABLE_TECO_SDAA")
+option_end()
+
+
 if is_mode("debug") then
     add_cxflags("-g -O0")
     add_defines("DEBUG_MODE")
@@ -115,7 +123,7 @@ if has_config("cambricon-mlu") then
             table.insert(target:objectfiles(), objectfile)
         end)

-rule_end()
+    rule_end()

     target("cambricon-mlu")
@@ -156,6 +164,53 @@ if has_config("ascend-npu") then
     target_end()
 end

+
+
+if has_config("teco") then
+
+    add_defines("ENABLE_TECO_SDAA")
+    add_includedirs("/opt/tecoai/include")
+    add_linkdirs("/opt/tecoai/lib64")
+    add_links("libsdaart.so")
+    add_links("libtecoblas.so")
+
+    rule("scpp")
+        set_extensions(".scpp")
+
+        on_load(function (target)
+            target:add("includedirs", "include")
+        end)
+
+        on_build_file(function (target, sourcefile)
+            local objectfile = target:objectfile(sourcefile)
+            os.mkdir(path.directory(objectfile))
+
+            local cc = "/opt/tecoai/bin/tecocc"
+
+            local includedirs = table.concat(target:get("includedirs"), " ")
+            local args = {sourcefile, "-o", objectfile}
+
+            for _, includedir in ipairs(target:get("includedirs")) do
+                table.insert(args, "-I" .. includedir)
+            end
+
+            os.execv(cc, args)
+            table.insert(target:objectfiles(), objectfile)
+        end)
+
+    rule_end()
+
+
+    target("teco")
+        set_kind("static")
+        set_languages("cxx17")
+        add_files("src/devices/teco/*.cc", "src/ops/*/teco/*.cc")
+        add_files("src/ops/*/teco/*.scpp", {rule = "scpp"})
+        add_cxflags("-lstdc++ -Wall -Werror -fPIC")
+    target_end()
+
+end
+
 target("operators")
     set_kind("shared")
@@ -171,6 +226,9 @@ target("operators")
     if has_config("ascend-npu") then
         add_deps("ascend-npu")
     end
+    if has_config("teco") then
+        add_deps("teco")
+    end
     set_languages("cxx17")
     add_files("src/devices/handle.cc")
     add_files("src/ops/*/operator.cc")

From 1eeefde3ae30384777983baab946f9aac68ef929 Mon Sep 17 00:00:00 2001
From: crapromer
Date: Wed, 25 Dec 2024 15:55:08 +0800
Subject: [PATCH 04/15] update matmul

---
 src/ops/matmul/teco/matmul_tecoblas.cc | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/ops/matmul/teco/matmul_tecoblas.cc b/src/ops/matmul/teco/matmul_tecoblas.cc
index cd7dcf39..ab94ef27 100644
--- a/src/ops/matmul/teco/matmul_tecoblas.cc
+++ b/src/ops/matmul/teco/matmul_tecoblas.cc
@@ -1,6 +1,7 @@
 #include "matmul_tecoblas.h"

 infiniopStatus_t tecoCreateMatmulDescriptor(TecoHandle_t handle, MatmulTecoDescriptor_t *desc_ptr, infiniopTensorDescriptor_t c_desc, float alpha, infiniopTensorDescriptor_t a_desc, infiniopTensorDescriptor_t b_desc, float beta) {
+    // TODO: inspect the dimensions to decide whether this is a batched matmul
     *desc_ptr = new MatmulTecoDescriptor{handle->device};
     (*desc_ptr)->handle = handle->handle;
     (*desc_ptr)->device = handle->device;

From 0e564588b0223cc85797fcfbbeeb1fb987cc99de Mon Sep 17 00:00:00 2001
From: crapromer
Date: Thu, 26 Dec 2024 11:35:31 +0800
Subject: [PATCH 05/15] finish matmul

---
 operatorspy/tests/matmul.py            |  2 -
 src/devices/teco/common_teco.cc        | 13 ++++++
 src/devices/teco/common_teco.h         |  3 +-
 src/ops/matmul/teco/matmul_tecoblas.cc | 65 +++++++++++++++++++-------
 src/ops/matmul/teco/matmul_tecoblas.h  |  2 +
 5 files changed, 65 insertions(+), 20 deletions(-)
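What patch 05 has to get right is the mapping from row-major tensors onto tecoblas GEMM parameters. The reference the tests compare against is the textbook c = alpha * (a @ b) + beta * c; a naive row-major sketch (not part of the patch, and matmul_ref is a name invented here) shows why the leading dimensions below are read from the tensors' strides rather than their shapes:

    // c = alpha * (a @ b) + beta * c for the 2-D case.
    // lda/ldb/ldc are row strides; they equal the row width only when
    // the tensor is contiguous, which is why the descriptor takes them
    // from strides[] instead of shape[].
    static void matmul_ref(const float *a, const float *b, float *c,
                           int m, int n, int k, int lda, int ldb, int ldc,
                           float alpha, float beta) {
        for (int i = 0; i < m; ++i)
            for (int j = 0; j < n; ++j) {
                float acc = 0.0f;
                for (int p = 0; p < k; ++p)
                    acc += a[i * lda + p] * b[p * ldb + j];
                c[i * ldc + j] = alpha * acc + beta * c[i * ldc + j];
            }
    }

The batched 3-D case repeats this per slice, offsetting each operand by its outermost stride, which is what the strided-batched tecoblas call added below is for.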
diff --git a/operatorspy/tests/matmul.py b/operatorspy/tests/matmul.py index b84644f1..e2045acd 100644 --- a/operatorspy/tests/matmul.py +++ b/operatorspy/tests/matmul.py @@ -63,9 +63,7 @@ def test( if a_stride is not None: a = rearrange_tensor(a, a_stride) if b_stride is not None: - print(b) b = rearrange_tensor(b, b_stride) - print(b) if c_stride is not None: c = rearrange_tensor(c, c_stride) ans = matmul(c, beta, a, b, alpha) diff --git a/src/devices/teco/common_teco.cc b/src/devices/teco/common_teco.cc index 7724ba5c..0e22c0df 100644 --- a/src/devices/teco/common_teco.cc +++ b/src/devices/teco/common_teco.cc @@ -1 +1,14 @@ #include "common_teco.h" +void** convertToBatch(void* data, int batch, int m, int n, size_t typeSize){ + // Dynamically allocate memory for the output array of pointers + void** output = new void*[batch]; + + // Treat the void* data as a pointer to raw memory and use pointer arithmetic + for (int i = 0; i < batch; i++) { + // Output[i] will point to the i-th 2D slice (this is done in raw pointer arithmetic) + output[i] = static_cast(static_cast(data) + i * m * n * typeSize); + } + + // Return the output array of pointers + return output; +} \ No newline at end of file diff --git a/src/devices/teco/common_teco.h b/src/devices/teco/common_teco.h index 565d9841..157969e5 100644 --- a/src/devices/teco/common_teco.h +++ b/src/devices/teco/common_teco.h @@ -6,6 +6,7 @@ #include #include #include "device.h" +#include #define CHECK_TECOBLAS(expression) \ { \ tecoblasStatus_t status = (expression); \ @@ -14,6 +15,6 @@ exit(EXIT_FAILURE); \ } \ } - +void** convertToBatch(void* data, int batch, int m, int n, size_t typeSize); #endif \ No newline at end of file diff --git a/src/ops/matmul/teco/matmul_tecoblas.cc b/src/ops/matmul/teco/matmul_tecoblas.cc index ab94ef27..f448fd47 100644 --- a/src/ops/matmul/teco/matmul_tecoblas.cc +++ b/src/ops/matmul/teco/matmul_tecoblas.cc @@ -1,22 +1,49 @@ #include "matmul_tecoblas.h" infiniopStatus_t tecoCreateMatmulDescriptor(TecoHandle_t handle, MatmulTecoDescriptor_t *desc_ptr, infiniopTensorDescriptor_t c_desc, float alpha, infiniopTensorDescriptor_t a_desc, infiniopTensorDescriptor_t b_desc, float beta) { - //TODO:添加维度判断是否为batch - *desc_ptr = new MatmulTecoDescriptor{handle->device}; - (*desc_ptr)->handle = handle->handle; - (*desc_ptr)->device = handle->device; - (*desc_ptr)->stream = handle->stream; - (*desc_ptr)->m = a_desc->shape[0]; - (*desc_ptr)->k = a_desc->shape[1]; - (*desc_ptr)->n = b_desc->shape[1]; - (*desc_ptr)->transa = TECOBLAS_OP_N; - (*desc_ptr)->transb = TECOBLAS_OP_N; - (*desc_ptr)->lda = a_desc->shape[1]; - (*desc_ptr)->ldb = b_desc->shape[1]; - (*desc_ptr)->ldc = c_desc->shape[1]; - (*desc_ptr)->alpha = 1.0f; - (*desc_ptr)->beta = 0.0f; - return STATUS_SUCCESS; + if (a_desc->ndim == 2 && b_desc->ndim == 2){ + + *desc_ptr = new MatmulTecoDescriptor{handle->device}; + (*desc_ptr)->batch = -1; + (*desc_ptr)->handle = handle->handle; + (*desc_ptr)->device = handle->device; + (*desc_ptr)->stream = handle->stream; + (*desc_ptr)->m = a_desc->shape[0]; + (*desc_ptr)->k = a_desc->shape[1]; + (*desc_ptr)->n = b_desc->shape[1]; + (*desc_ptr)->transa = TECOBLAS_OP_N; + (*desc_ptr)->transb = TECOBLAS_OP_N; + (*desc_ptr)->lda = a_desc->strides[0]; + (*desc_ptr)->ldb = b_desc->strides[0]; + (*desc_ptr)->ldc = c_desc->strides[0]; + (*desc_ptr)->alpha = 1.0f; + (*desc_ptr)->beta = 0.0f; + return STATUS_SUCCESS; + } + if (a_desc->ndim == 3 && b_desc->ndim == 3){ + *desc_ptr = new MatmulTecoDescriptor{handle->device}; + 
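+        // 3-D case: dim 0 is the batch count; the leading dimensions and the
+        // per-matrix strides below are all read from the tensors' stride metadata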
(*desc_ptr)->batch = a_desc->shape[0]; + (*desc_ptr)->handle = handle->handle; + (*desc_ptr)->device = handle->device; + (*desc_ptr)->stream = handle->stream; + (*desc_ptr)->m = a_desc->shape[1]; + (*desc_ptr)->k = a_desc->shape[2]; + (*desc_ptr)->n = b_desc->shape[2]; + (*desc_ptr)->transa = TECOBLAS_OP_N; + (*desc_ptr)->transb = TECOBLAS_OP_N; + (*desc_ptr)->lda = a_desc->strides[1]; + (*desc_ptr)->ldb = b_desc->strides[1]; + (*desc_ptr)->ldc = c_desc->strides[1]; + (*desc_ptr)->strideA = a_desc->strides[0]; + (*desc_ptr)->strideB = b_desc->strides[0]; + (*desc_ptr)->strideC = c_desc->strides[0]; + (*desc_ptr)->alpha = 1.0f; + (*desc_ptr)->beta = 0.0f; + return STATUS_SUCCESS; + } + return STATUS_BAD_PARAM; + + } infiniopStatus_t tecoGetMatmulWorkspaceSize(MatmulTecoDescriptor_t desc, uint64_t *size) { @@ -31,7 +58,11 @@ infiniopStatus_t tecoGetMatmulWorkspaceSize(MatmulTecoDescriptor_t desc, uint64_ infiniopStatus_t tecoMatmul(MatmulTecoDescriptor_t desc, void *workspace, uint64_t workspace_size, void *c, const void *a, const void *b, void *stream) { tecoblasSetStream(desc->handle, desc->stream); tecoblasSetWorkspace(desc->handle, workspace, workspace_size); - tecoblasStatus_t status = tecoblasHgemm(desc->handle, desc->transa, desc->transb, desc->m, desc->n, desc->k, 1.0f, a, desc->lda, b, desc->ldb, 0.0f, c, desc->ldc); + tecoblasStatus_t status; + if(desc->batch<0) + status = tecoblasHgemm(desc->handle, desc->transa, desc->transb, desc->m, desc->n, desc->k, 1.0f, a, desc->lda, b, desc->ldb, 0.0f, c, desc->ldc); + else + status = tecoblasHgemmStridedBatched(desc->handle, desc->transa, desc->transb, desc->m, desc->n, desc->k, 1.0f, a, desc->lda,desc->strideA, b, desc->ldb,desc->strideB, 0.0f, c, desc->ldc,desc->strideC,desc->batch); sdaaStreamSynchronize(desc->stream); if (status != TECOBLAS_STATUS_SUCCESS) { return STATUS_EXECUTION_FAILED; diff --git a/src/ops/matmul/teco/matmul_tecoblas.h b/src/ops/matmul/teco/matmul_tecoblas.h index b10c91ea..9aa0b665 100644 --- a/src/ops/matmul/teco/matmul_tecoblas.h +++ b/src/ops/matmul/teco/matmul_tecoblas.h @@ -13,6 +13,8 @@ struct MatmulTecoDescriptor { int m,n,k; float alpha,beta; int lda,ldb,ldc; + int batch; + long long int strideA,strideB,strideC; }; typedef struct MatmulTecoDescriptor *MatmulTecoDescriptor_t; From e23006fdf68c1bb390b2b5922f365e0ef6d546f6 Mon Sep 17 00:00:00 2001 From: crapromer Date: Wed, 8 Jan 2025 09:40:04 +0800 Subject: [PATCH 06/15] complete rms norm f16 weight --- include/infinirt.h | 78 +++++++++++++++++++++++ include/tensor.h | 7 --- operatorspy/tests/mlp.py | 33 ++++++++++ operatorspy/tests/rms_norm.py | 14 +++-- src/devices/teco/common_teco.cc | 10 ++- src/devices/teco/common_teco.h | 4 ++ src/devices/teco/tensor_teco.h | 2 + src/ops/add/teco/add_tecodnn.cpp | 18 ++++++ src/ops/add/teco/add_tecodnn.h | 40 ++++++++++++ src/ops/matmul/teco/matmul_tecoblas.cc | 2 +- src/ops/rms_norm/operator.cc | 31 +++++++++- src/ops/rms_norm/teco/rms_norm_teco.cc | 86 ++++++++++++++++++++++++++ src/ops/rms_norm/teco/rms_norm_teco.h | 35 +++++++++++ xmake.lua | 1 + 14 files changed, 347 insertions(+), 14 deletions(-) create mode 100644 include/infinirt.h create mode 100644 src/ops/add/teco/add_tecodnn.cpp create mode 100644 src/ops/add/teco/add_tecodnn.h create mode 100644 src/ops/rms_norm/teco/rms_norm_teco.cc create mode 100644 src/ops/rms_norm/teco/rms_norm_teco.h diff --git a/include/infinirt.h b/include/infinirt.h new file mode 100644 index 00000000..ee6d4d69 --- /dev/null +++ b/include/infinirt.h @@ -0,0 +1,78 @@ +#ifndef 
INFINI_RUNTIME_H +#define INFINI_RUNTIME_H + +#if defined(_WIN32) +#define __export __declspec(dllexport) +#elif defined(__GNUC__) && ((__GNUC__ >= 4) || (__GNUC__ == 3 && __GNUC_MINOR__ >= 3)) +#define __export __attribute__((visibility("default"))) +#else +#define __export +#endif + +#ifdef __cplusplus +#define __C extern "C" +#else +#define __C +#endif +#include +#include + +typedef enum +{ + DEVICE_CPU, + DEVICE_NVIDIA, + DEVICE_CAMBRICON, + DEVICE_ASCEND, + DEVICE_TECO, +} DeviceType; + +typedef enum +{ + INFINIRT_STATUS_SUCCESS = 0, + INFINIRT_STATUS_EXECUTION_FAILED = 1, + INFINIRT_STATUS_BAD_DEVICE = 2, + INFINIRT_STATUS_DEVICE_NOT_SUPPORTED = 3, + INFINIRT_STATUS_DEVICE_MISMATCH = 4, + INFINIRT_STATUS_INVALID_ARGUMENT = 5, + INFINIRT_STATUS_ILLEGAL_MEMORY_ACCESS = 6, + INFINIRT_STATUS_NOT_READY = 7, +} infinirtStatus_t; + +__C __export infinirtStatus_t infinirtInit(DeviceType device); + +// Device +__C __export infinirtStatus_t infinirtDeviceSynchronize(DeviceType device, uint32_t deviceId); + +// Stream +struct infinirtStream; +typedef struct infinirtStream *infinirtStream_t; +#define INFINIRT_NULL_STREAM nullptr +__C __export infinirtStatus_t infinirtStreamCreate(infinirtStream_t *pStream, DeviceType device, uint32_t deviceId); +__C __export infinirtStatus_t infinirtStreamDestroy(infinirtStream_t stream); +__C __export infinirtStatus_t infinirtStreamSynchronize(infinirtStream_t stream); +__C __export infinirtStatus_t infinirtGetRawStream(void** ptr, infinirtStream_t stream); +__C __export infinirtStatus_t infinirtGetStreamDeviceInfo(DeviceType* deviceType, uint32_t *deviceId, infinirtStream_t stream); + +// Event +struct infinirtEvent; +typedef struct infinirtEvent *infinirtEvent_t; +__C __export infinirtStatus_t infinirtEventCreate(infinirtEvent_t *pEvent, DeviceType device, uint32_t deviceId); +__C __export infinirtStatus_t infinirtEventRecord(infinirtEvent_t event, infinirtStream_t stream); +__C __export infinirtStatus_t infinirtEventQuery(infinirtEvent_t event); +__C __export infinirtStatus_t infinirtEventSynchronize(infinirtEvent_t event); +__C __export infinirtStatus_t infinirtEventDestroy(infinirtEvent_t event); +__C __export infinirtStatus_t infinirtStreamWaitEvent(infinirtEvent_t event, infinirtStream_t stream); + +// Memory +__C __export infinirtStatus_t infinirtMalloc(void **pMemory, DeviceType device, uint32_t deviceId, size_t size); +__C __export infinirtStatus_t infinirtMallocAsync(void **pMemory, DeviceType device, uint32_t deviceId, size_t size, infinirtStream_t stream); +__C __export infinirtStatus_t infinirtMallocHost(void **pMemory, DeviceType device, uint32_t deviceId, size_t size); +__C __export infinirtStatus_t infinirtFree(void *ptr, DeviceType device, uint32_t deviceId); +__C __export infinirtStatus_t infinirtFreeAsync(void *ptr, DeviceType device, uint32_t deviceId, infinirtStream_t stream); +__C __export infinirtStatus_t infinirtFreeHost(void *ptr, DeviceType device, uint32_t deviceId); +__C __export infinirtStatus_t infinirtMemcpyH2D(void *dst, DeviceType device, uint32_t deviceId, const void *src, size_t size); +__C __export infinirtStatus_t infinirtMemcpyH2DAsync(void *dst, DeviceType device, uint32_t deviceId, const void *src, size_t size, infinirtStream_t stream); +__C __export infinirtStatus_t infinirtMemcpyD2H(void *dst, const void* src, DeviceType device, uint32_t deviceId, size_t size); +__C __export infinirtStatus_t infinirtMemcpy(void *dst, const void* src, DeviceType device, uint32_t deviceId, size_t size); +__C __export infinirtStatus_t 
infinirtMemcpyAsync(void *dst, const void* src, DeviceType device, uint32_t deviceId, size_t size, infinirtStream_t stream);
+#endif
diff --git a/include/tensor.h b/include/tensor.h
index bb9cfcd8..add3588d 100644
--- a/include/tensor.h
+++ b/include/tensor.h
@@ -17,12 +17,5 @@ struct TensorDescriptor {

 typedef struct TensorDescriptor *infiniopTensorDescriptor_t;

-// @depricated
-struct TensorTuple {
-    infiniopTensorDescriptor_t const layout;
-    void *data;
-};
-// @depricated
-typedef struct TensorTuple Tensor;

 #endif// __TENSOR_H__
diff --git a/operatorspy/tests/mlp.py b/operatorspy/tests/mlp.py
index a3cf6d57..a8411f2e 100644
--- a/operatorspy/tests/mlp.py
+++ b/operatorspy/tests/mlp.py
@@ -223,6 +223,37 @@ def test_bang(lib, test_cases):

     destroy_handle(lib, handle)

+def test_sdaa(lib, test_cases):
+    import torch_sdaa
+
+    device = DeviceEnum.DEVICE_TECO
+    handle = create_handle(lib, device)
+
+    for (
+        num_tokens,
+        hidden_size,
+        intermediate_size,
+        alpha,
+        residual,
+        dtype,
+        x_stride,
+        y_stride,
+    ) in test_cases:
+        test(
+            lib,
+            handle,
+            "sdaa",
+            num_tokens,
+            hidden_size,
+            intermediate_size,
+            alpha,
+            residual,
+            dtype,
+            x_stride,
+            y_stride,
+        )
+
+    destroy_handle(lib, handle)

 if __name__ == "__main__":
     test_cases = [
@@ -278,4 +309,6 @@ def test_bang(lib, test_cases):
         test_bang(lib, test_cases)
     if not (args.cpu or args.cuda or args.bang):
         test_cpu(lib, test_cases)
+    if args.teco:
+        test_sdaa(lib,test_cases)
     print("Test passed!")
diff --git a/operatorspy/tests/rms_norm.py b/operatorspy/tests/rms_norm.py
index d99dd95f..1c4eb412 100644
--- a/operatorspy/tests/rms_norm.py
+++ b/operatorspy/tests/rms_norm.py
@@ -77,10 +77,6 @@ def test(lib, handle, torch_device, y_shape, x_shape, w_shape, dtype=torch.float
         )
     )

-    # print(ans)
-    # print("=======================================================")
-    # print(y)
-
     assert torch.allclose(y.to(dtype), ans.to(dtype), atol=1e-3, rtol=1e-3)
     check_error(lib.infiniopDestroyRMSNormDescriptor(descriptor))
     print("Test passed!")
@@ -107,6 +103,14 @@ def test_bang(lib, test_cases):
         test(lib, handle, "mlu", y_shape, x_shape, w_shape, dtype, w_dtype)
     destroy_handle(lib, handle)

+def test_sdaa(lib, test_cases):
+    import torch_sdaa
+    device = DeviceEnum.DEVICE_TECO
+    handle = create_handle(lib, device)
+    for (y_shape, x_shape, w_shape, dtype, w_dtype) in test_cases:
+        test(lib, handle, "sdaa", y_shape, x_shape, w_shape, dtype, w_dtype)
+    destroy_handle(lib, handle)
+

 if __name__ == "__main__":
     test_cases = [
@@ -153,5 +157,7 @@ def test_bang(lib, test_cases):
         test_cuda(lib, test_cases)
     if args.bang:
         test_bang(lib, test_cases)
+    if args.teco:
+        test_sdaa(lib,test_cases)
     if not (args.cpu or args.cuda or args.bang):
         test_cpu(lib, test_cases)
diff --git a/src/devices/teco/common_teco.cc b/src/devices/teco/common_teco.cc
index 0e22c0df..da544887 100644
--- a/src/devices/teco/common_teco.cc
+++ b/src/devices/teco/common_teco.cc
@@ -11,4 +11,12 @@ void** convertToBatch(void* data, int batch, int m, int n, size_t typeSize){

     // Return the output array of pointers
     return output;
-}
\ No newline at end of file
+}
+
+infiniopStatus_t toTecodnnTensorDescriptor(infiniopTensorDescriptor_t src, tecodnnTensorDescriptor_t des) {
+    tecodnnDataType_t data_type;
+    if (src->dt == F16)
+        data_type = TECODNN_DATA_HALF;
+    else
+        return STATUS_BAD_TENSOR_DTYPE;  // don't pass an uninitialized data_type for unsupported dtypes
+    tecodnnSetTensor4dDescriptor(des,TECODNN_TENSOR_NCHW,data_type,src->shape[0],src->shape[1],1,1);
+    return STATUS_SUCCESS;
+}
diff --git a/src/devices/teco/common_teco.h b/src/devices/teco/common_teco.h
index 157969e5..75ec9a68 100644
--- 
a/src/devices/teco/common_teco.h
+++ b/src/devices/teco/common_teco.h
@@ -5,7 +5,9 @@
 #include
 #include
 #include
+#include
 #include "device.h"
+#include "operators.h"
 #include
 #define CHECK_TECOBLAS(expression) \
     { \
         tecoblasStatus_t status = (expression); \
@@ -17,4 +19,6 @@
     }
 void** convertToBatch(void* data, int batch, int m, int n, size_t typeSize);
+infiniopStatus_t toTecodnnTensorDescriptor(infiniopTensorDescriptor_t src,tecodnnTensorDescriptor_t des);
+
 #endif
\ No newline at end of file
diff --git a/src/devices/teco/tensor_teco.h b/src/devices/teco/tensor_teco.h
index afb88e6d..906edd7b 100644
--- a/src/devices/teco/tensor_teco.h
+++ b/src/devices/teco/tensor_teco.h
@@ -12,6 +12,8 @@ struct tecoTensorDescriptor {
     int64_t *shape;
     int64_t *strides;
     tecoblasDataType_t data_type;
+    tecodnnDataType_t dnn_data_type;    // tecodnn mirror of the blas data_type above
+    tecodnnTensorDescriptor_t dnn_desc; // cached tecodnn tensor descriptor
     infiniopStatus_t fromInfiniOpTensorDescriptor(infiniopTensorDescriptor_t y_desc);
     infiniopStatus_t createTensor();
     infiniopStatus_t destroyTensor();
diff --git a/src/ops/add/teco/add_tecodnn.cpp b/src/ops/add/teco/add_tecodnn.cpp
new file mode 100644
index 00000000..0ca25c3d
--- /dev/null
+++ b/src/ops/add/teco/add_tecodnn.cpp
@@ -0,0 +1,18 @@
+#include "add_tecodnn.h"
+
+infiniopStatus_t tecoCreateAddDescriptor(TecoHandle_t handle, AddTecoDescriptor_t *desc_ptr, infiniopTensorDescriptor_t c_desc, infiniopTensorDescriptor_t a_desc, infiniopTensorDescriptor_t b_desc) {
+    return STATUS_SUCCESS;
+}
+
+template <typename Tdata>
+infiniopStatus_t add_teco(AddTecoDescriptor_t desc, void *c, void const *a, void const *b){
+    return STATUS_SUCCESS;
+}
+
+infiniopStatus_t tecoAdd(AddTecoDescriptor_t desc, void *c, const void *a, const void *b, void *stream) {
+    return STATUS_SUCCESS;
+}
+
+infiniopStatus_t tecoDestroyAddDescriptor(AddTecoDescriptor_t desc) {
+    return STATUS_SUCCESS;
+}
diff --git a/src/ops/add/teco/add_tecodnn.h b/src/ops/add/teco/add_tecodnn.h
new file mode 100644
index 00000000..0b7a729c
--- /dev/null
+++ b/src/ops/add/teco/add_tecodnn.h
@@ -0,0 +1,40 @@
+#ifndef __TECO_ADD_H__
+#define __TECO_ADD_H__
+
+#include "operators.h"
+#include
+#include
+#include
+#include "../../../devices/teco/teco_handle.h"
+
+struct AddTecoDescriptor {
+    Device device;
+    int device_id;
+    tecodnnHandle_t handle;
+    sdaaStream_t stream;
+    tecoblasOperation_t transa,transb;
+    int m,n,k;
+    float alpha,beta;
+    int lda,ldb,ldc;
+    int batch;
+    long long int strideA,strideB,strideC;
+};
+
+typedef struct AddTecoDescriptor *AddTecoDescriptor_t;
+
+
+infiniopStatus_t tecoCreateAddDescriptor(TecoHandle_t handle,
+                                         AddTecoDescriptor_t *desc_ptr,
+                                         infiniopTensorDescriptor_t c_desc,
+                                         infiniopTensorDescriptor_t a_desc,
+                                         infiniopTensorDescriptor_t b_desc);
+
+infiniopStatus_t tecoAdd(AddTecoDescriptor_t desc,
+                         void *c,
+                         const void *a,
+                         const void *b,
+                         void *stream);
+
+infiniopStatus_t tecoDestroyAddDescriptor(AddTecoDescriptor_t desc);
+
+#endif
\ No newline at end of file
diff --git a/src/ops/matmul/teco/matmul_tecoblas.cc b/src/ops/matmul/teco/matmul_tecoblas.cc
index f448fd47..a2dad615 100644
--- a/src/ops/matmul/teco/matmul_tecoblas.cc
+++ b/src/ops/matmul/teco/matmul_tecoblas.cc
@@ -31,7 +31,7 @@ infiniopStatus_t tecoCreateMatmulDescriptor(TecoHandle_t handle, MatmulTecoDescr
         (*desc_ptr)->n = b_desc->shape[2];
         (*desc_ptr)->transa = TECOBLAS_OP_N;
         (*desc_ptr)->transb = TECOBLAS_OP_N;
-        (*desc_ptr)->lda = a_desc->strides[1];
+        (*desc_ptr)->lda = a_desc->strides[1];
         (*desc_ptr)->ldb = b_desc->strides[1];
         (*desc_ptr)->ldc = c_desc->strides[1];
         (*desc_ptr)->strideA = a_desc->strides[0];
diff --git a/src/ops/rms_norm/operator.cc
b/src/ops/rms_norm/operator.cc index 1af07fb2..1a07f2f7 100644 --- a/src/ops/rms_norm/operator.cc +++ b/src/ops/rms_norm/operator.cc @@ -15,6 +15,10 @@ #include "bang/rms_norm_bang.h" #include "bang/rms_norm_cnnl.h" #endif +#ifdef ENABLE_TECO_SDAA +#include "teco/rms_norm_teco.h" +#endif + __C infiniopStatus_t infiniopCreateRMSNormDescriptor( infiniopHandle_t handle, @@ -37,6 +41,11 @@ __C infiniopStatus_t infiniopCreateRMSNormDescriptor( case DevCambriconMlu: { //return bangCreateRMSNormDescriptor((BangHandle_t) handle, (RMSNormBangDescriptor_t *) desc_ptr, y_desc); } +#endif +#ifdef ENABLE_TECO_SDAA + case DevTecoSDAA: { + return tecoCreateRMSNormDescriptor((TecoHandle_t) handle, (RMSNormTecoDescriptor_t *) desc_ptr, y_desc, x_desc, w_desc, epsilon); + } #endif } return STATUS_BAD_DEVICE; @@ -58,7 +67,11 @@ __C infiniopStatus_t infiniopGetRMSNormWorkspaceSize(infiniopRMSNormDescriptor_t case DevCambriconMlu: { //return bangGetRMSNormWorkspaceSize((RMSNormBangDescriptor_t) desc, size); } - +#endif +#ifdef ENABLE_TECO_SDAA + case DevTecoSDAA: { + return tecoGetRMSNormWorkspaceSize((RMSNormTecoDescriptor_t) desc, size); + } #endif } return STATUS_BAD_DEVICE; @@ -82,6 +95,17 @@ __C infiniopStatus_t infiniopRMSNorm(infiniopRMSNormDescriptor_t desc, void *wor //return bangRMSNorm((RMSNormBangDescriptor_t) desc, workspace, workspace_size, data, stream); } +#endif +#ifdef ENABLE_TECO_SDAA + case DevTecoSDAA: { + return tecoRMSNorm((RMSNormTecoDescriptor_t) desc, + workspace, + workspace_size, + y, + x, + w, + stream); + } #endif } return STATUS_BAD_DEVICE; @@ -104,6 +128,11 @@ __C infiniopStatus_t infiniopDestroyRMSNormDescriptor(infiniopRMSNormDescriptor_ //return bangDestroyRMSNormDescriptor((RMSNormBangDescriptor_t) desc); } +#endif +#ifdef ENABLE_TECO_SDAA + case DevTecoSDAA: { + return tecoDestroyRMSNormDescriptor((RMSNormTecoDescriptor_t) desc); + } #endif } return STATUS_BAD_DEVICE; diff --git a/src/ops/rms_norm/teco/rms_norm_teco.cc b/src/ops/rms_norm/teco/rms_norm_teco.cc new file mode 100644 index 00000000..3e954d88 --- /dev/null +++ b/src/ops/rms_norm/teco/rms_norm_teco.cc @@ -0,0 +1,86 @@ +#include "rms_norm_teco.h" + + +infiniopStatus_t tecoCreateRMSNormDescriptor(TecoHandle_t handle, RMSNormTecoDescriptor_t *desc_ptr, infiniopTensorDescriptor_t y_desc, infiniopTensorDescriptor_t x_desc, infiniopTensorDescriptor_t w_desc, float epsilon) { + if (y_desc->ndim != 2 || x_desc->ndim != 2 || w_desc->ndim != 1) { + return STATUS_BAD_TENSOR_SHAPE; + } + + auto n = y_desc->shape[0], + c = y_desc->shape[1]; + unsigned long h = 1, + w = 1; + + if (x_desc->shape[0] != n || x_desc->shape[1] != c || w_desc->shape[0] != c) { + return STATUS_BAD_TENSOR_SHAPE; + } + + tecodnnHandle_t tecodnn_handle; + tecodnnCreate(&tecodnn_handle); + sdaaStream_t stream; + sdaaStreamCreate(&stream); + tecodnnTensorDescriptor_t x_desc_teco,y_desc_teco,w_desc_teco,rms_desc_teco; + tecodnnCreateTensorDescriptor(&x_desc_teco); + tecodnnCreateTensorDescriptor(&y_desc_teco); + tecodnnCreateTensorDescriptor(&w_desc_teco); + tecodnnCreateTensorDescriptor(&rms_desc_teco); + // toTecodnnTensorDescriptor(x_desc,x_desc_teco); + // toTecodnnTensorDescriptor(y_desc,y_desc_teco); + // toTecodnnTensorDescriptor(w_desc,w_desc_teco); + // tecodnnSetTensor4dDescriptor(x_desc_teco,TECODNN_TENSOR_NCHW,TECODNN_DATA_HALF,x_desc->shape[0],1,1,x_desc->shape[1]); + // tecodnnSetTensor4dDescriptor(y_desc_teco,TECODNN_TENSOR_NCHW,TECODNN_DATA_HALF,y_desc->shape[0],1,1,y_desc->shape[1]); + // if(w_desc->dt==F16) + // 
tecodnnSetTensor4dDescriptor(w_desc_teco,TECODNN_TENSOR_NCHW,TECODNN_DATA_HALF,1,1,1,w_desc->shape[0]); + // if(w_desc->dt==F32) + // tecodnnSetTensor4dDescriptor(w_desc_teco,TECODNN_TENSOR_NCHW,TECODNN_DATA_FLOAT,1,1,1,w_desc->shape[0]); + // tecodnnSetTensor4dDescriptor(rms_desc_teco,TECODNN_TENSOR_NCHW,TECODNN_DATA_FLOAT,n,1,1,1); + + if(w_desc->dt==F16) + tecodnnSetTensor4dDescriptor(x_desc_teco,TECODNN_TENSOR_NCHW,TECODNN_DATA_HALF,n,h,w,c); + tecodnnSetTensor4dDescriptor(y_desc_teco,TECODNN_TENSOR_NCHW,TECODNN_DATA_HALF,n,h,w,c); + tecodnnSetTensor4dDescriptor(w_desc_teco,TECODNN_TENSOR_NCHW,TECODNN_DATA_HALF,1,1,1,c); + tecodnnSetTensor4dDescriptor(rms_desc_teco,TECODNN_TENSOR_NCHW,TECODNN_DATA_FLOAT,n,h,w,1); + if(w_desc->dt==F32) + tecodnnSetTensor4dDescriptor(x_desc_teco,TECODNN_TENSOR_NCHW,TECODNN_DATA_HALF,n,h,w,c); + tecodnnSetTensor4dDescriptor(y_desc_teco,TECODNN_TENSOR_NCHW,TECODNN_DATA_HALF,n,h,w,c); + tecodnnSetTensor4dDescriptor(w_desc_teco,TECODNN_TENSOR_NCHW,TECODNN_DATA_FLOAT,1,1,1,c); + tecodnnSetTensor4dDescriptor(rms_desc_teco,TECODNN_TENSOR_NCHW,TECODNN_DATA_FLOAT,n,h,w,1); + *desc_ptr = new RMSNormTecoDescriptor{ + handle->device, + tecodnn_handle, + stream, + epsilon, + x_desc_teco, + y_desc_teco, + w_desc_teco, + rms_desc_teco, + n, + c, + }; + return STATUS_SUCCESS; +} + +infiniopStatus_t tecoGetRMSNormWorkspaceSize(RMSNormTecoDescriptor_t desc, uint64_t *size) { + *size = (desc->n)*(desc->c)*32; + return STATUS_SUCCESS; +} + +infiniopStatus_t tecoRMSNorm(RMSNormTecoDescriptor_t desc, void *workspace, uint64_t workspace_size, void *y, void *x, void *w, void *stream) { + tecodnnSetStream(desc->handle, desc->stream); + tecodnnStatus_t status; + + // void *rms = malloc(workspace_size * sizeof(uint16_t)); + status = tecodnnRMSNormForward(desc->handle, desc->eps, desc->xDesc,x,desc->wDesc,w,desc->yDesc,y,desc->rmsDesc,workspace); + sdaaStreamSynchronize(desc->stream); + if (status != TECODNN_STATUS_SUCCESS) { + printf("%s\n",tecodnnGetErrorString(status)); + return STATUS_EXECUTION_FAILED; + }else{ + return STATUS_SUCCESS; + } +} + +infiniopStatus_t tecoDestroyRMSNormDescriptor(RMSNormTecoDescriptor_t desc) { + delete desc; + return STATUS_SUCCESS; +} diff --git a/src/ops/rms_norm/teco/rms_norm_teco.h b/src/ops/rms_norm/teco/rms_norm_teco.h new file mode 100644 index 00000000..5969fb1a --- /dev/null +++ b/src/ops/rms_norm/teco/rms_norm_teco.h @@ -0,0 +1,35 @@ +#ifndef __TECO_RMS_NORM_H__ +#define __TECO_RMS_NORM_H__ + +#include "operators.h" +#include +#include +#include "../../../devices/teco/teco_handle.h" + +struct RMSNormTecoDescriptor { + Device device; + tecodnnHandle_t handle; + sdaaStream_t stream; + float eps; + tecodnnTensorDescriptor_t xDesc,yDesc,wDesc,rmsDesc; + unsigned long n,c; +}; + +typedef struct RMSNormTecoDescriptor *RMSNormTecoDescriptor_t; + +infiniopStatus_t tecoCreateRMSNormDescriptor(TecoHandle_t handle, RMSNormTecoDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y_desc, + infiniopTensorDescriptor_t x_desc, + infiniopTensorDescriptor_t w_desc, float epsilon); + +infiniopStatus_t tecoGetRMSNormWorkspaceSize(RMSNormTecoDescriptor_t desc, uint64_t *size); + +infiniopStatus_t tecoRMSNorm(RMSNormTecoDescriptor_t desc, + void *workspace, + uint64_t workspace_size, + void *y, void *x, void *w, + void *stream); + +infiniopStatus_t tecoDestroyRMSNormDescriptor(RMSNormTecoDescriptor_t desc); + +#endif \ No newline at end of file diff --git a/xmake.lua b/xmake.lua index fd579ae5..cbc9df59 100644 --- a/xmake.lua +++ b/xmake.lua @@ -173,6 +173,7 @@ 
if has_config("teco") then add_linkdirs("/opt/tecoai/lib64") add_links("libsdaart.so") add_links("libtecoblas.so") + add_links("libtecodnn.so") rule("scpp") set_extensions(".scpp") From a8025d1b25b64923452ba2f257e8c9a49886e161 Mon Sep 17 00:00:00 2001 From: crapromer Date: Wed, 8 Jan 2025 11:01:20 +0800 Subject: [PATCH 07/15] solve conflicts --- include/tensor.h | 4 ---- src/ops/utils.h | 3 --- xmake.lua | 4 ---- 3 files changed, 11 deletions(-) diff --git a/include/tensor.h b/include/tensor.h index fb9bd400..3cc28922 100644 --- a/include/tensor.h +++ b/include/tensor.h @@ -17,8 +17,4 @@ struct TensorDescriptor { typedef struct TensorDescriptor *infiniopTensorDescriptor_t; -<<<<<<< HEAD - -======= ->>>>>>> upstream/dev #endif// __TENSOR_H__ diff --git a/src/ops/utils.h b/src/ops/utils.h index 536198fb..f0e64fee 100644 --- a/src/ops/utils.h +++ b/src/ops/utils.h @@ -4,11 +4,8 @@ #include "data_type.h" #include "tensor.h" #include -<<<<<<< HEAD #include -======= #include ->>>>>>> upstream/dev #include #include #include diff --git a/xmake.lua b/xmake.lua index 25b1460c..a353d807 100644 --- a/xmake.lua +++ b/xmake.lua @@ -224,7 +224,6 @@ if has_config("ascend-npu") then target_end() end -<<<<<<< HEAD if has_config("teco") then @@ -273,10 +272,7 @@ if has_config("teco") then end -target("operators") -======= target("infiniop") ->>>>>>> upstream/dev set_kind("shared") if has_config("cpu") then From b143a6683f7ca97b8255dad8e7f54027d3524cd6 Mon Sep 17 00:00:00 2001 From: crapromer Date: Wed, 8 Jan 2025 11:02:52 +0800 Subject: [PATCH 08/15] solve conflicts too --- xmake.lua | 6 ------ 1 file changed, 6 deletions(-) diff --git a/xmake.lua b/xmake.lua index a353d807..942edb58 100644 --- a/xmake.lua +++ b/xmake.lua @@ -150,10 +150,6 @@ if has_config("cambricon-mlu") then end) rule_end() -<<<<<<< HEAD - -======= ->>>>>>> upstream/dev target("cambricon-mlu") set_kind("static") @@ -224,8 +220,6 @@ if has_config("ascend-npu") then target_end() end - - if has_config("teco") then add_defines("ENABLE_TECO_SDAA") From 5f611029e61eadbf759b0b4c7ad811289e957253 Mon Sep 17 00:00:00 2001 From: crapromer Date: Wed, 8 Jan 2025 23:58:15 +0800 Subject: [PATCH 09/15] update matmul --- operatorspy/liboperators.py | 4 - operatorspy/tests/matmul.py | 9 +- operatorspy/tests/rms_norm.py | 8 -- src/devices/handle.cc | 1 - src/devices/teco/teco_handle.cc | 11 +-- src/devices/teco/teco_handle.h | 1 - src/ops/matmul/teco/matmul_tecoblas.cc | 114 +++++++++++++++---------- src/ops/matmul/teco/matmul_tecoblas.h | 9 +- src/ops/rms_norm/operator.cc | 3 - src/ops/rms_norm/teco/rms_norm_teco.cc | 18 ++-- src/ops/rms_norm/teco/rms_norm_teco.h | 2 +- 11 files changed, 92 insertions(+), 88 deletions(-) diff --git a/operatorspy/liboperators.py b/operatorspy/liboperators.py index 057b92e6..838bec17 100644 --- a/operatorspy/liboperators.py +++ b/operatorspy/liboperators.py @@ -53,12 +53,8 @@ def find_library_in_ld_path(library_name): if system_name == "Windows": library_path = find_library_in_ld_path("infiniop.dll") elif system_name == "Linux": -<<<<<<< HEAD - library_path = find_library_in_ld_path("liboperators.so") -======= library_path = find_library_in_ld_path("libinfiniop.so") ->>>>>>> upstream/dev assert ( library_path is not None ), f"Cannot find infiniop.dll or libinfiniop.so. Check if INFINI_ROOT is set correctly." 
diff --git a/operatorspy/tests/matmul.py b/operatorspy/tests/matmul.py index 179823f8..aacb5b46 100644 --- a/operatorspy/tests/matmul.py +++ b/operatorspy/tests/matmul.py @@ -84,12 +84,8 @@ def test( b = rearrange_tensor(b, b_stride) if c_stride is not None: c = rearrange_tensor(c, c_stride) -<<<<<<< HEAD ans = matmul(c, beta, a, b, alpha) -======= - ->>>>>>> upstream/dev a_tensor = to_tensor(a, lib) b_tensor = to_tensor(b, lib) c_tensor = to_tensor(c, lib) @@ -162,6 +158,7 @@ def test( print(f" lib time: {elapsed :6f}") check_error(lib.infiniopDestroyMatmulDescriptor(descriptor)) + print("Test passed!") def test_cpu(lib, test_cases): @@ -391,10 +388,6 @@ def test_sdaa(lib, test_cases): test_ascend(lib, test_cases) if not (args.cpu or args.cuda or args.bang or args.ascend): test_cpu(lib, test_cases) -<<<<<<< HEAD if args.teco: test_sdaa(lib,test_cases) print("Test passed!") -======= - print("\033[92mTest passed!\033[0m") ->>>>>>> upstream/dev diff --git a/operatorspy/tests/rms_norm.py b/operatorspy/tests/rms_norm.py index faf633d5..53d774a1 100644 --- a/operatorspy/tests/rms_norm.py +++ b/operatorspy/tests/rms_norm.py @@ -77,7 +77,6 @@ def test(lib, handle, torch_device, y_shape, x_shape, w_shape, dtype=torch.float None, ) ) - assert torch.allclose(y.to(dtype), ans.to(dtype), atol=1e-3, rtol=1e-3) check_error(lib.infiniopDestroyRMSNormDescriptor(descriptor)) print("Test passed!") @@ -104,7 +103,6 @@ def test_bang(lib, test_cases): test(lib, handle, "mlu", y_shape, x_shape, w_shape, dtype, w_dtype) destroy_handle(lib, handle) -<<<<<<< HEAD def test_sdaa(lib, test_cases): import torch_sdaa device = DeviceEnum.DEVICE_TECO @@ -113,7 +111,6 @@ def test_sdaa(lib, test_cases): test(lib, handle, "sdaa", y_shape, x_shape, w_shape, dtype, w_dtype) destroy_handle(lib, handle) -======= def test_ascend(lib, test_cases): import torch_npu device = DeviceEnum.DEVICE_ASCEND @@ -122,7 +119,6 @@ def test_ascend(lib, test_cases): test(lib, handle, "npu", y_shape, x_shape, w_shape, dtype, w_dtype) destroy_handle(lib, handle) ->>>>>>> upstream/dev if __name__ == "__main__": test_cases = [ @@ -169,13 +165,9 @@ def test_ascend(lib, test_cases): test_cuda(lib, test_cases) if args.bang: test_bang(lib, test_cases) -<<<<<<< HEAD if args.teco: test_sdaa(lib,test_cases) - if not (args.cpu or args.cuda or args.bang): -======= if args.ascend: test_ascend(lib, test_cases) if not (args.cpu or args.cuda or args.bang or args.ascend): ->>>>>>> upstream/dev test_cpu(lib, test_cases) diff --git a/src/devices/handle.cc b/src/devices/handle.cc index 6f726bcf..ef56b2f8 100644 --- a/src/devices/handle.cc +++ b/src/devices/handle.cc @@ -15,7 +15,6 @@ #include "./teco/teco_handle.h" #endif - __C infiniopStatus_t infiniopCreateHandle(infiniopHandle_t *handle_ptr, Device device, int device_id) { if (handle_ptr == nullptr) { return STATUS_MEMORY_NOT_ALLOCATED; diff --git a/src/devices/teco/teco_handle.cc b/src/devices/teco/teco_handle.cc index ea3694f1..c816808d 100644 --- a/src/devices/teco/teco_handle.cc +++ b/src/devices/teco/teco_handle.cc @@ -8,18 +8,15 @@ infiniopStatus_t createTecoHandle(TecoHandle_t *handle_ptr, int device_id) { } sdaaSetDevice(device_id); - - *handle_ptr = new TecoContext{DevTecoSDAA, device_id}; - tecoblasCreate(&(*handle_ptr)->handle); - sdaaStreamCreate(&(*handle_ptr)->stream); - tecoblasSetStream((*handle_ptr)->handle,(*handle_ptr)->stream); - + sdaaStream_t stream; + sdaaStreamCreate(&stream); + *handle_ptr = new TecoContext{DevTecoSDAA, device_id,stream}; + return STATUS_SUCCESS; } infiniopStatus_t 
deleteTecoHandle(TecoHandle_t handle_ptr) { sdaaStreamDestroy(handle_ptr->stream); - tecoblasDestroy(handle_ptr->handle); delete handle_ptr; return STATUS_SUCCESS; } diff --git a/src/devices/teco/teco_handle.h b/src/devices/teco/teco_handle.h index 583534ec..43b0794e 100644 --- a/src/devices/teco/teco_handle.h +++ b/src/devices/teco/teco_handle.h @@ -6,7 +6,6 @@ struct TecoContext { Device device; int device_id; - tecoblasHandle_t handle; sdaaStream_t stream; }; typedef struct TecoContext *TecoHandle_t; diff --git a/src/ops/matmul/teco/matmul_tecoblas.cc b/src/ops/matmul/teco/matmul_tecoblas.cc index a2dad615..98000ff8 100644 --- a/src/ops/matmul/teco/matmul_tecoblas.cc +++ b/src/ops/matmul/teco/matmul_tecoblas.cc @@ -1,53 +1,72 @@ #include "matmul_tecoblas.h" infiniopStatus_t tecoCreateMatmulDescriptor(TecoHandle_t handle, MatmulTecoDescriptor_t *desc_ptr, infiniopTensorDescriptor_t c_desc, float alpha, infiniopTensorDescriptor_t a_desc, infiniopTensorDescriptor_t b_desc, float beta) { - if (a_desc->ndim == 2 && b_desc->ndim == 2){ - - *desc_ptr = new MatmulTecoDescriptor{handle->device}; - (*desc_ptr)->batch = -1; - (*desc_ptr)->handle = handle->handle; - (*desc_ptr)->device = handle->device; - (*desc_ptr)->stream = handle->stream; - (*desc_ptr)->m = a_desc->shape[0]; - (*desc_ptr)->k = a_desc->shape[1]; - (*desc_ptr)->n = b_desc->shape[1]; - (*desc_ptr)->transa = TECOBLAS_OP_N; - (*desc_ptr)->transb = TECOBLAS_OP_N; - (*desc_ptr)->lda = a_desc->strides[0]; - (*desc_ptr)->ldb = b_desc->strides[0]; - (*desc_ptr)->ldc = c_desc->strides[0]; - (*desc_ptr)->alpha = 1.0f; - (*desc_ptr)->beta = 0.0f; - return STATUS_SUCCESS; + long long int batch,batch_count; + if (a_desc->ndim == 2 && b_desc->ndim == 2) { + batch = 0; + batch_count = 1; + }else if(a_desc->ndim == 3 && b_desc->ndim == 3){ + batch = 1; + batch_count = a_desc->shape[0]; + }else{ + return STATUS_BAD_TENSOR_SHAPE; } - if (a_desc->ndim == 3 && b_desc->ndim == 3){ - *desc_ptr = new MatmulTecoDescriptor{handle->device}; - (*desc_ptr)->batch = a_desc->shape[0]; - (*desc_ptr)->handle = handle->handle; - (*desc_ptr)->device = handle->device; - (*desc_ptr)->stream = handle->stream; - (*desc_ptr)->m = a_desc->shape[1]; - (*desc_ptr)->k = a_desc->shape[2]; - (*desc_ptr)->n = b_desc->shape[2]; - (*desc_ptr)->transa = TECOBLAS_OP_N; - (*desc_ptr)->transb = TECOBLAS_OP_N; - (*desc_ptr)->lda = a_desc->strides[1]; - (*desc_ptr)->ldb = b_desc->strides[1]; - (*desc_ptr)->ldc = c_desc->strides[1]; - (*desc_ptr)->strideA = a_desc->strides[0]; - (*desc_ptr)->strideB = b_desc->strides[0]; - (*desc_ptr)->strideC = c_desc->strides[0]; - (*desc_ptr)->alpha = 1.0f; - (*desc_ptr)->beta = 0.0f; - return STATUS_SUCCESS; + + tecoblasDataType_t datatype; + if(a_desc->dt==F16 && b_desc->dt==F16){ + datatype = TECOBLAS_DATA_HALF; + }else if(a_desc->dt==F32 && b_desc->dt==F32){ + datatype = TECOBLAS_DATA_FLOAT; + }else{ + return STATUS_BAD_TENSOR_DTYPE; } - return STATUS_BAD_PARAM; + tecoblasHandle_t tecoblas_handle; + tecoblasCreate(&tecoblas_handle); + // sdaaStream_t stream; + + *desc_ptr = new MatmulTecoDescriptor{ + handle->device, + handle->device_id, + tecoblas_handle, + handle->stream, + datatype, + TECOBLAS_OP_N, + TECOBLAS_OP_N, + a_desc->shape[0+batch], + a_desc->shape[1+batch], + b_desc->shape[1+batch], + 1.0f, + 0.0f, + a_desc->strides[0+batch], + b_desc->strides[0+batch], + c_desc->strides[0+batch], + batch, + batch_count, + a_desc->strides[0], + b_desc->strides[0], + c_desc->strides[0], + }; + 
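+    // bind the newly created blas handle to the stream shared through the TecoContext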
tecoblasSetStream((*desc_ptr)->handle,(*desc_ptr)->stream); + + return STATUS_SUCCESS; } infiniopStatus_t tecoGetMatmulWorkspaceSize(MatmulTecoDescriptor_t desc, uint64_t *size) { - tecoblasStatus_t status = tecoblasGetWorkspaceSize(desc->handle, desc->transa, desc->transb, desc->m, desc->n, desc->k, desc->alpha, TECOBLAS_DATA_HALF,desc->lda, 1, TECOBLAS_DATA_HALF, desc->ldb, 1, desc->beta, TECOBLAS_DATA_HALF, desc->ldc, 1, 1, TECOBLAS_HGEMM,reinterpret_cast(size)); + tecoblasStatus_t status; + if(desc->batch==0){ + if(desc->datatype == TECOBLAS_DATA_HALF) + status = tecoblasGetWorkspaceSize(desc->handle, desc->transa, desc->transb, desc->m, desc->n, desc->k, desc->alpha, TECOBLAS_DATA_HALF,desc->lda, 1, TECOBLAS_DATA_HALF, desc->ldb, 1, desc->beta, TECOBLAS_DATA_HALF, desc->ldc, 1, desc->batch_count, TECOBLAS_HGEMM,reinterpret_cast(size)); + else + status = tecoblasGetWorkspaceSize(desc->handle, desc->transa, desc->transb, desc->m, desc->n, desc->k, desc->alpha, TECOBLAS_DATA_FLOAT,desc->lda, 1, TECOBLAS_DATA_FLOAT, desc->ldb, 1, desc->beta, TECOBLAS_DATA_FLOAT, desc->ldc, 1, desc->batch_count, TECOBLAS_SGEMM,reinterpret_cast(size)); + }else{ + if(desc->datatype == TECOBLAS_DATA_HALF) + status = tecoblasGetWorkspaceSize(desc->handle, desc->transa, desc->transb, desc->m, desc->n, desc->k, desc->alpha, TECOBLAS_DATA_HALF,desc->lda, desc->strideA, TECOBLAS_DATA_HALF, desc->ldb, desc->strideB, desc->beta, TECOBLAS_DATA_HALF, desc->ldc, desc->strideC, desc->batch_count, TECOBLAS_HGEMM_STRIDED_BATCHED,reinterpret_cast(size)); + else + status = tecoblasGetWorkspaceSize(desc->handle, desc->transa, desc->transb, desc->m, desc->n, desc->k, desc->alpha, TECOBLAS_DATA_FLOAT,desc->lda, desc->strideA, TECOBLAS_DATA_FLOAT, desc->ldb, desc->strideB, desc->beta, TECOBLAS_DATA_FLOAT, desc->ldc, desc->strideC, desc->batch_count, TECOBLAS_SGEMM_STRIDED_BATCHED,reinterpret_cast(size)); + } + if (status != TECOBLAS_STATUS_SUCCESS) { return STATUS_EXECUTION_FAILED; }else{ @@ -59,10 +78,17 @@ infiniopStatus_t tecoMatmul(MatmulTecoDescriptor_t desc, void *workspace, uint64 tecoblasSetStream(desc->handle, desc->stream); tecoblasSetWorkspace(desc->handle, workspace, workspace_size); tecoblasStatus_t status; - if(desc->batch<0) - status = tecoblasHgemm(desc->handle, desc->transa, desc->transb, desc->m, desc->n, desc->k, 1.0f, a, desc->lda, b, desc->ldb, 0.0f, c, desc->ldc); - else - status = tecoblasHgemmStridedBatched(desc->handle, desc->transa, desc->transb, desc->m, desc->n, desc->k, 1.0f, a, desc->lda,desc->strideA, b, desc->ldb,desc->strideB, 0.0f, c, desc->ldc,desc->strideC,desc->batch); + if(desc->batch==0){ + if(desc->datatype == TECOBLAS_DATA_HALF) + status = tecoblasHgemm(desc->handle, desc->transa, desc->transb, desc->m, desc->n, desc->k, 1.0f, a, desc->lda, b, desc->ldb, 0.0f, c, desc->ldc); + else + status = tecoblasSgemm(desc->handle, desc->transa, desc->transb, desc->m, desc->n, desc->k, 1.0f, a, desc->lda, b, desc->ldb, 0.0f, c, desc->ldc); + }else{ + if(desc->datatype == TECOBLAS_DATA_HALF) + status = tecoblasHgemmStridedBatched(desc->handle, desc->transa, desc->transb, desc->m, desc->n, desc->k, 1.0f, a, desc->lda,desc->strideA, b, desc->ldb,desc->strideB, 0.0f, c, desc->ldc,desc->strideC,desc->batch_count); + else + status = tecoblasSgemmStridedBatched(desc->handle, desc->transa, desc->transb, desc->m, desc->n, desc->k, 1.0f, a, desc->lda,desc->strideA, b, desc->ldb,desc->strideB, 0.0f, c, desc->ldc,desc->strideC,desc->batch_count); + } sdaaStreamSynchronize(desc->stream); if (status != 
TECOBLAS_STATUS_SUCCESS) { return STATUS_EXECUTION_FAILED; diff --git a/src/ops/matmul/teco/matmul_tecoblas.h b/src/ops/matmul/teco/matmul_tecoblas.h index 9aa0b665..b43cc3a9 100644 --- a/src/ops/matmul/teco/matmul_tecoblas.h +++ b/src/ops/matmul/teco/matmul_tecoblas.h @@ -9,12 +9,13 @@ struct MatmulTecoDescriptor { int device_id; tecoblasHandle_t handle; sdaaStream_t stream; + tecoblasDataType_t datatype; tecoblasOperation_t transa,transb; - int m,n,k; + uint64_t m,n,k; float alpha,beta; - int lda,ldb,ldc; - int batch; - long long int strideA,strideB,strideC; + long long int lda,ldb,ldc; + long long int batch,batch_count; + long int strideA,strideB,strideC; }; typedef struct MatmulTecoDescriptor *MatmulTecoDescriptor_t; diff --git a/src/ops/rms_norm/operator.cc b/src/ops/rms_norm/operator.cc index 6557bf61..4cdac453 100644 --- a/src/ops/rms_norm/operator.cc +++ b/src/ops/rms_norm/operator.cc @@ -85,14 +85,11 @@ __C infiniopStatus_t infiniopGetRMSNormWorkspaceSize(infiniopRMSNormDescriptor_t return aclnnGetRMSNormWorkspaceSize((RMSNormAclnnDescriptor_t) desc, size); } -<<<<<<< HEAD #endif #ifdef ENABLE_TECO_SDAA case DevTecoSDAA: { return tecoGetRMSNormWorkspaceSize((RMSNormTecoDescriptor_t) desc, size); } -======= ->>>>>>> upstream/dev #endif } return STATUS_BAD_DEVICE; diff --git a/src/ops/rms_norm/teco/rms_norm_teco.cc b/src/ops/rms_norm/teco/rms_norm_teco.cc index 3e954d88..b13b9fa2 100644 --- a/src/ops/rms_norm/teco/rms_norm_teco.cc +++ b/src/ops/rms_norm/teco/rms_norm_teco.cc @@ -17,8 +17,8 @@ infiniopStatus_t tecoCreateRMSNormDescriptor(TecoHandle_t handle, RMSNormTecoDes tecodnnHandle_t tecodnn_handle; tecodnnCreate(&tecodnn_handle); - sdaaStream_t stream; - sdaaStreamCreate(&stream); + // sdaaStream_t stream; + // sdaaStreamCreate(&stream); tecodnnTensorDescriptor_t x_desc_teco,y_desc_teco,w_desc_teco,rms_desc_teco; tecodnnCreateTensorDescriptor(&x_desc_teco); tecodnnCreateTensorDescriptor(&y_desc_teco); @@ -35,20 +35,23 @@ infiniopStatus_t tecoCreateRMSNormDescriptor(TecoHandle_t handle, RMSNormTecoDes // tecodnnSetTensor4dDescriptor(w_desc_teco,TECODNN_TENSOR_NCHW,TECODNN_DATA_FLOAT,1,1,1,w_desc->shape[0]); // tecodnnSetTensor4dDescriptor(rms_desc_teco,TECODNN_TENSOR_NCHW,TECODNN_DATA_FLOAT,n,1,1,1); - if(w_desc->dt==F16) + if(w_desc->dt==F16){ tecodnnSetTensor4dDescriptor(x_desc_teco,TECODNN_TENSOR_NCHW,TECODNN_DATA_HALF,n,h,w,c); tecodnnSetTensor4dDescriptor(y_desc_teco,TECODNN_TENSOR_NCHW,TECODNN_DATA_HALF,n,h,w,c); tecodnnSetTensor4dDescriptor(w_desc_teco,TECODNN_TENSOR_NCHW,TECODNN_DATA_HALF,1,1,1,c); tecodnnSetTensor4dDescriptor(rms_desc_teco,TECODNN_TENSOR_NCHW,TECODNN_DATA_FLOAT,n,h,w,1); - if(w_desc->dt==F32) + } + + if(w_desc->dt==F32){ tecodnnSetTensor4dDescriptor(x_desc_teco,TECODNN_TENSOR_NCHW,TECODNN_DATA_HALF,n,h,w,c); tecodnnSetTensor4dDescriptor(y_desc_teco,TECODNN_TENSOR_NCHW,TECODNN_DATA_HALF,n,h,w,c); tecodnnSetTensor4dDescriptor(w_desc_teco,TECODNN_TENSOR_NCHW,TECODNN_DATA_FLOAT,1,1,1,c); tecodnnSetTensor4dDescriptor(rms_desc_teco,TECODNN_TENSOR_NCHW,TECODNN_DATA_FLOAT,n,h,w,1); + } *desc_ptr = new RMSNormTecoDescriptor{ handle->device, tecodnn_handle, - stream, + handle->stream, epsilon, x_desc_teco, y_desc_teco, @@ -57,15 +60,16 @@ infiniopStatus_t tecoCreateRMSNormDescriptor(TecoHandle_t handle, RMSNormTecoDes n, c, }; + tecodnnSetStream((*desc_ptr)->handle,(*desc_ptr)->stream); return STATUS_SUCCESS; } infiniopStatus_t tecoGetRMSNormWorkspaceSize(RMSNormTecoDescriptor_t desc, uint64_t *size) { - *size = (desc->n)*(desc->c)*32; + *size = 
(desc->n)*(desc->c)*4; return STATUS_SUCCESS; } -infiniopStatus_t tecoRMSNorm(RMSNormTecoDescriptor_t desc, void *workspace, uint64_t workspace_size, void *y, void *x, void *w, void *stream) { +infiniopStatus_t tecoRMSNorm(RMSNormTecoDescriptor_t desc, void *workspace, uint64_t workspace_size, void *y, void const *x, void const *w, void *stream) { tecodnnSetStream(desc->handle, desc->stream); tecodnnStatus_t status; diff --git a/src/ops/rms_norm/teco/rms_norm_teco.h b/src/ops/rms_norm/teco/rms_norm_teco.h index 5969fb1a..28a2de32 100644 --- a/src/ops/rms_norm/teco/rms_norm_teco.h +++ b/src/ops/rms_norm/teco/rms_norm_teco.h @@ -27,7 +27,7 @@ infiniopStatus_t tecoGetRMSNormWorkspaceSize(RMSNormTecoDescriptor_t desc, uint6 infiniopStatus_t tecoRMSNorm(RMSNormTecoDescriptor_t desc, void *workspace, uint64_t workspace_size, - void *y, void *x, void *w, + void *y, void const *x, void const *w, void *stream); infiniopStatus_t tecoDestroyRMSNormDescriptor(RMSNormTecoDescriptor_t desc); From 9abc10fc3b6d99461644bc43a36701e111950bec Mon Sep 17 00:00:00 2001 From: crapromer Date: Fri, 10 Jan 2025 22:45:20 +0800 Subject: [PATCH 10/15] complete matmul on new dev --- operatorspy/tests/matmul.py | 2 +- src/devices/teco/common_teco.cc | 93 ++++++++++++++++++++++++-- src/devices/teco/common_teco.h | 5 +- src/ops/matmul/teco/matmul_tecoblas.cc | 88 ++++++++++++++---------- src/ops/matmul/teco/matmul_tecoblas.h | 4 +- src/ops/rms_norm/teco/rms_norm_teco.cc | 2 +- 6 files changed, 149 insertions(+), 45 deletions(-) diff --git a/operatorspy/tests/matmul.py b/operatorspy/tests/matmul.py index aacb5b46..51dc1d9c 100644 --- a/operatorspy/tests/matmul.py +++ b/operatorspy/tests/matmul.py @@ -101,7 +101,7 @@ def test( beta ) ) - + print(a.stride(),b.stride(),c.stride()) workspace_size = c_uint64(0) check_error( lib.infiniopGetMatmulWorkspaceSize(descriptor, ctypes.byref(workspace_size)) diff --git a/src/devices/teco/common_teco.cc b/src/devices/teco/common_teco.cc index da544887..f3a136ea 100644 --- a/src/devices/teco/common_teco.cc +++ b/src/devices/teco/common_teco.cc @@ -1,18 +1,95 @@ #include "common_teco.h" -void** convertToBatch(void* data, int batch, int m, int n, size_t typeSize){ - // Dynamically allocate memory for the output array of pointers - void** output = new void*[batch]; +void const** convertToBatch(void const* data, int batch, int stride, size_t typeSize){ + void const **output = (void const **)malloc(batch * sizeof(void const *)); + if (output == NULL) { + return NULL; + } + + const uint8_t *charData = (const uint8_t *)data; - // Treat the void* data as a pointer to raw memory and use pointer arithmetic for (int i = 0; i < batch; i++) { - // Output[i] will point to the i-th 2D slice (this is done in raw pointer arithmetic) - output[i] = static_cast(static_cast(data) + i * m * n * typeSize); + output[i] = (const void *)(charData + i * stride * typeSize); } - // Return the output array of pointers return output; } +bool is_contiguous(infiniopTensorDescriptor_t desc) { + uint64_t ndim = desc->ndim; + if (desc->strides[ndim-1] != 1) { + return false; + }else + return true; +} + +infiniopStatus_t restoreTensor(infiniopTensorDescriptor_t desc, void *data,tecodnnDataType_t datatype) { + tecodnnHandle_t tecodnn_handle; + tecodnnCreate(&tecodnn_handle); + tecodnnTensorDescriptor_t src,dst; + tecodnnCreateTensorDescriptor(&src); + tecodnnCreateTensorDescriptor(&dst); + int *strides = new int[desc->ndim]; + int *old_strides = new int[desc->ndim]; + int *shape = new int[desc->ndim]; + strides[desc->ndim - 
1] = 1; // stride of the last dimension is 1 + old_strides[desc->ndim - 1] = desc->strides[desc->ndim - 1]; + shape[desc->ndim - 1] = desc->shape[desc->ndim - 1]; + for (int i = desc->ndim - 2; i >= 0; --i) { + strides[i] = strides[i + 1] * desc->shape[i + 1]; // stride of the current dimension + shape[i] = desc->shape[i]; + old_strides[i] = desc->strides[i]; + } + size_t size = strides[0]*desc->shape[0]; + if(datatype==TECODNN_DATA_HALF) + size*=sizeof(uint16_t); + else + size*=sizeof(uint32_t); + void *temp; + sdaaMalloc(&temp,size); + tecodnnSetTensorNdDescriptor(src,datatype,desc->ndim,shape,strides); + tecodnnSetTensorNdDescriptor(dst,datatype,desc->ndim,shape,old_strides); + tecodnnCopyStride(tecodnn_handle,src,data,dst,temp); + sdaaMemcpy(data, temp, size, sdaaMemcpyDeviceToDevice); + sdaaFree(temp); + + return STATUS_SUCCESS; +} + +infiniopStatus_t toContiguous(infiniopTensorDescriptor_t desc, void *data,tecodnnDataType_t datatype) { + tecodnnHandle_t tecodnn_handle; + tecodnnCreate(&tecodnn_handle); + tecodnnTensorDescriptor_t src,dst; + tecodnnCreateTensorDescriptor(&src); + tecodnnCreateTensorDescriptor(&dst); + int *strides = new int[desc->ndim]; + int *old_strides = new int[desc->ndim]; + int *shape = new int[desc->ndim]; + strides[desc->ndim - 1] = 1; + old_strides[desc->ndim - 1] = desc->strides[desc->ndim - 1]; + shape[desc->ndim - 1] = desc->shape[desc->ndim - 1]; + for (int i = desc->ndim - 2; i >= 0; --i) { + strides[i] = strides[i + 1] * desc->shape[i + 1]; + shape[i] = desc->shape[i]; + old_strides[i] = desc->strides[i]; + } + size_t size = strides[0]*desc->shape[0]; + if(datatype==TECODNN_DATA_HALF){ + size*=sizeof(uint16_t); + } + else{ + size*=sizeof(uint32_t); + } + void *temp; + sdaaMalloc(&temp,size); + tecodnnSetTensorNdDescriptor(src,datatype,desc->ndim,shape,old_strides); + tecodnnSetTensorNdDescriptor(dst,datatype,desc->ndim,shape,strides); + tecodnnCopyStride(tecodnn_handle,src,data,dst,temp); + sdaaMemcpy(data, temp, size, sdaaMemcpyDeviceToDevice); + sdaaFree(temp); + + return STATUS_SUCCESS; +} + infiniopStatus_t toTecodnnTensorDescriptor(infiniopTensorDescriptor_t src, tecodnnTensorDescriptor_t des) { tecodnnDataType_t data_type; if(src->dt==F16) @@ -20,3 +97,5 @@ infiniopStatus_t toTecodnnTensorDescriptor(infiniopTensorDescriptor_t src, tecod tecodnnSetTensor4dDescriptor(des,TECODNN_TENSOR_NCHW,data_type,src->shape[0],src->shape[1],1,1); return STATUS_SUCCESS; } + + diff --git a/src/devices/teco/common_teco.h index 75ec9a68..84d7c7fa 100644 --- a/src/devices/teco/common_teco.h +++ b/src/devices/teco/common_teco.h @@ -17,7 +17,10 @@ exit(EXIT_FAILURE); \ } \ } -void** convertToBatch(void* data, int batch, int m, int n, size_t typeSize); +void const** convertToBatch(void const* data, int batch, int stride, size_t typeSize); +bool is_contiguous(infiniopTensorDescriptor_t desc); +infiniopStatus_t toContiguous(infiniopTensorDescriptor_t desc,void *data,tecodnnDataType_t datatype); +infiniopStatus_t restoreTensor(infiniopTensorDescriptor_t desc,void *data,tecodnnDataType_t datatype); infiniopStatus_t toTecodnnTensorDescriptor(infiniopTensorDescriptor_t src,tecodnnTensorDescriptor_t des); diff --git a/src/ops/matmul/teco/matmul_tecoblas.cc index 98000ff8..a1bfc5e3 100644 --- a/src/ops/matmul/teco/matmul_tecoblas.cc +++ b/src/ops/matmul/teco/matmul_tecoblas.cc @@ -23,7 +23,6 @@ infiniopStatus_t tecoCreateMatmulDescriptor(TecoHandle_t handle, MatmulTecoDescr tecoblasHandle_t tecoblas_handle; tecoblasCreate(&tecoblas_handle);
- // sdaaStream_t stream; *desc_ptr = new MatmulTecoDescriptor{ @@ -37,8 +36,8 @@ infiniopStatus_t tecoCreateMatmulDescriptor(TecoHandle_t handle, MatmulTecoDescr a_desc->shape[0+batch], a_desc->shape[1+batch], b_desc->shape[1+batch], - 1.0f, - 0.0f, + alpha, + beta, a_desc->strides[0+batch], b_desc->strides[0+batch], c_desc->strides[0+batch], @@ -47,6 +46,9 @@ infiniopStatus_t tecoCreateMatmulDescriptor(TecoHandle_t handle, MatmulTecoDescr a_desc->strides[0], b_desc->strides[0], c_desc->strides[0], + a_desc, + b_desc, + c_desc, }; tecoblasSetStream((*desc_ptr)->handle,(*desc_ptr)->stream); @@ -54,47 +56,65 @@ infiniopStatus_t tecoCreateMatmulDescriptor(TecoHandle_t handle, MatmulTecoDescr } infiniopStatus_t tecoGetMatmulWorkspaceSize(MatmulTecoDescriptor_t desc, uint64_t *size) { - tecoblasStatus_t status; - if(desc->batch==0){ - if(desc->datatype == TECOBLAS_DATA_HALF) - status = tecoblasGetWorkspaceSize(desc->handle, desc->transa, desc->transb, desc->m, desc->n, desc->k, desc->alpha, TECOBLAS_DATA_HALF,desc->lda, 1, TECOBLAS_DATA_HALF, desc->ldb, 1, desc->beta, TECOBLAS_DATA_HALF, desc->ldc, 1, desc->batch_count, TECOBLAS_HGEMM,reinterpret_cast(size)); - else - status = tecoblasGetWorkspaceSize(desc->handle, desc->transa, desc->transb, desc->m, desc->n, desc->k, desc->alpha, TECOBLAS_DATA_FLOAT,desc->lda, 1, TECOBLAS_DATA_FLOAT, desc->ldb, 1, desc->beta, TECOBLAS_DATA_FLOAT, desc->ldc, 1, desc->batch_count, TECOBLAS_SGEMM,reinterpret_cast(size)); + if(is_contiguous(desc->a_desc) && is_contiguous(desc->b_desc) && is_contiguous(desc->c_desc)){ + if(desc->batch==0){ + if(desc->datatype == TECOBLAS_DATA_HALF) + CHECK_TECOBLAS(tecoblasGetWorkspaceSize(desc->handle, desc->transa, desc->transb, desc->m, desc->n, desc->k, desc->alpha, TECOBLAS_DATA_HALF,desc->lda, 1, TECOBLAS_DATA_HALF, desc->ldb, 1, desc->beta, TECOBLAS_DATA_HALF, desc->ldc, 1, desc->batch_count, TECOBLAS_HGEMM,reinterpret_cast(size))) + else + CHECK_TECOBLAS(tecoblasGetWorkspaceSize(desc->handle, desc->transa, desc->transb, desc->m, desc->n, desc->k, desc->alpha, TECOBLAS_DATA_FLOAT,desc->lda, 1, TECOBLAS_DATA_FLOAT, desc->ldb, 1, desc->beta, TECOBLAS_DATA_FLOAT, desc->ldc, 1, desc->batch_count, TECOBLAS_SGEMM,reinterpret_cast(size))) + }else{ + if(desc->datatype == TECOBLAS_DATA_HALF) + CHECK_TECOBLAS(tecoblasGetWorkspaceSize(desc->handle, desc->transa, desc->transb, desc->m, desc->n, desc->k, desc->alpha, TECOBLAS_DATA_HALF,desc->lda, desc->strideA, TECOBLAS_DATA_HALF, desc->ldb, desc->strideB, desc->beta, TECOBLAS_DATA_HALF, desc->ldc, desc->strideC, desc->batch_count, TECOBLAS_HGEMM_STRIDED_BATCHED,reinterpret_cast(size))) + else + CHECK_TECOBLAS(tecoblasGetWorkspaceSize(desc->handle, desc->transa, desc->transb, desc->m, desc->n, desc->k, desc->alpha, TECOBLAS_DATA_FLOAT,desc->lda, desc->strideA, TECOBLAS_DATA_FLOAT, desc->ldb, desc->strideB, desc->beta, TECOBLAS_DATA_FLOAT, desc->ldc, desc->strideC, desc->batch_count, TECOBLAS_SGEMM_STRIDED_BATCHED,reinterpret_cast(size))) + } }else{ - if(desc->datatype == TECOBLAS_DATA_HALF) - status = tecoblasGetWorkspaceSize(desc->handle, desc->transa, desc->transb, desc->m, desc->n, desc->k, desc->alpha, TECOBLAS_DATA_HALF,desc->lda, desc->strideA, TECOBLAS_DATA_HALF, desc->ldb, desc->strideB, desc->beta, TECOBLAS_DATA_HALF, desc->ldc, desc->strideC, desc->batch_count, TECOBLAS_HGEMM_STRIDED_BATCHED,reinterpret_cast(size)); - else - status = tecoblasGetWorkspaceSize(desc->handle, desc->transa, desc->transb, desc->m, desc->n, desc->k, desc->alpha, TECOBLAS_DATA_FLOAT,desc->lda, 
desc->strideA, TECOBLAS_DATA_FLOAT, desc->ldb, desc->strideB, desc->beta, TECOBLAS_DATA_FLOAT, desc->ldc, desc->strideC, desc->batch_count, TECOBLAS_SGEMM_STRIDED_BATCHED,reinterpret_cast(size)); - } - - if (status != TECOBLAS_STATUS_SUCCESS) { - return STATUS_EXECUTION_FAILED; - }else{ - return STATUS_SUCCESS; + printf("some tensor is not contiguous\n"); + if(desc->batch==0){ + if(desc->datatype == TECOBLAS_DATA_HALF) + CHECK_TECOBLAS(tecoblasGetWorkspaceSize(desc->handle, desc->transa, desc->transb, desc->m, desc->n, desc->k, desc->alpha, TECOBLAS_DATA_HALF,desc->lda, 1, TECOBLAS_DATA_HALF, desc->n, 1, desc->beta, TECOBLAS_DATA_HALF, desc->ldc, 1, desc->batch_count, TECOBLAS_HGEMM,reinterpret_cast(size))) + else + CHECK_TECOBLAS(tecoblasGetWorkspaceSize(desc->handle, desc->transa, desc->transb, desc->m, desc->n, desc->k, desc->alpha, TECOBLAS_DATA_FLOAT,desc->lda, 1, TECOBLAS_DATA_FLOAT, desc->n, 1, desc->beta, TECOBLAS_DATA_FLOAT, desc->ldc, 1, desc->batch_count, TECOBLAS_SGEMM,reinterpret_cast(size))) + }else{ + if(desc->datatype == TECOBLAS_DATA_HALF) + CHECK_TECOBLAS(tecoblasGetWorkspaceSize(desc->handle, desc->transa, desc->transb, desc->m, desc->n, desc->k, desc->alpha, TECOBLAS_DATA_HALF,desc->lda, desc->strideA, TECOBLAS_DATA_HALF, desc->n, desc->strideB, desc->beta, TECOBLAS_DATA_HALF, desc->ldc, desc->strideC, desc->batch_count, TECOBLAS_HGEMM_STRIDED_BATCHED,reinterpret_cast(size))) + else + CHECK_TECOBLAS(tecoblasGetWorkspaceSize(desc->handle, desc->transa, desc->transb, desc->m, desc->n, desc->k, desc->alpha, TECOBLAS_DATA_FLOAT,desc->lda, desc->strideA, TECOBLAS_DATA_FLOAT, desc->n, desc->strideB, desc->beta, TECOBLAS_DATA_FLOAT, desc->ldc, desc->strideC, desc->batch_count, TECOBLAS_SGEMM_STRIDED_BATCHED,reinterpret_cast(size))) + } } + + return STATUS_SUCCESS; } infiniopStatus_t tecoMatmul(MatmulTecoDescriptor_t desc, void *workspace, uint64_t workspace_size, void *c, const void *a, const void *b, void *stream) { tecoblasSetStream(desc->handle, desc->stream); tecoblasSetWorkspace(desc->handle, workspace, workspace_size); - tecoblasStatus_t status; - if(desc->batch==0){ - if(desc->datatype == TECOBLAS_DATA_HALF) - status = tecoblasHgemm(desc->handle, desc->transa, desc->transb, desc->m, desc->n, desc->k, 1.0f, a, desc->lda, b, desc->ldb, 0.0f, c, desc->ldc); - else - status = tecoblasSgemm(desc->handle, desc->transa, desc->transb, desc->m, desc->n, desc->k, 1.0f, a, desc->lda, b, desc->ldb, 0.0f, c, desc->ldc); + if(is_contiguous(desc->a_desc) && is_contiguous(desc->b_desc) && is_contiguous(desc->c_desc)){ + if(desc->batch==0){ + if(desc->datatype == TECOBLAS_DATA_HALF) + CHECK_TECOBLAS(tecoblasHgemm(desc->handle, desc->transa, desc->transb, desc->m, desc->n, desc->k, desc->alpha, a, desc->lda, b, desc->ldb, desc->beta, c, desc->ldc)) + else + CHECK_TECOBLAS(tecoblasSgemm(desc->handle, desc->transa, desc->transb, desc->m, desc->n, desc->k, desc->alpha, a, desc->lda, b, desc->ldb, desc->beta, c, desc->ldc)) + }else{ + if(desc->datatype == TECOBLAS_DATA_HALF) + CHECK_TECOBLAS(tecoblasHgemmStridedBatched(desc->handle, desc->transa, desc->transb, desc->m, desc->n, desc->k, desc->alpha, a, desc->lda,desc->strideA, b, desc->ldb,desc->strideB, desc->beta, c, desc->ldc,desc->strideC,desc->batch_count)) + else + CHECK_TECOBLAS(tecoblasSgemmStridedBatched(desc->handle, desc->transa, desc->transb, desc->m, desc->n, desc->k, desc->alpha, a, desc->lda,desc->strideA, b, desc->ldb,desc->strideB, desc->beta, c, desc->ldc,desc->strideC,desc->batch_count)) + } }else{ - if(desc->datatype == 
TECOBLAS_DATA_HALF) - status = tecoblasHgemmStridedBatched(desc->handle, desc->transa, desc->transb, desc->m, desc->n, desc->k, 1.0f, a, desc->lda,desc->strideA, b, desc->ldb,desc->strideB, 0.0f, c, desc->ldc,desc->strideC,desc->batch_count); - else - status = tecoblasSgemmStridedBatched(desc->handle, desc->transa, desc->transb, desc->m, desc->n, desc->k, 1.0f, a, desc->lda,desc->strideA, b, desc->ldb,desc->strideB, 0.0f, c, desc->ldc,desc->strideC,desc->batch_count); + if(desc->datatype == TECOBLAS_DATA_HALF){ + toContiguous(desc->b_desc,const_cast(b),TECODNN_DATA_HALF); + CHECK_TECOBLAS(tecoblasHgemm(desc->handle, desc->transa, desc->transb, desc->m, desc->n, desc->k, desc->alpha, a, desc->lda, b, desc->n, desc->beta, c, desc->ldc)) + restoreTensor(desc->b_desc,const_cast(b),TECODNN_DATA_HALF); + }else{ + toContiguous(desc->b_desc,const_cast(b),TECODNN_DATA_FLOAT); + CHECK_TECOBLAS(tecoblasSgemm(desc->handle, desc->transa, desc->transb, desc->m, desc->n, desc->k, desc->alpha, a, desc->lda, b, desc->n, desc->beta, c, desc->ldc)) + restoreTensor(desc->b_desc,const_cast(b),TECODNN_DATA_FLOAT); + } + } sdaaStreamSynchronize(desc->stream); - if (status != TECOBLAS_STATUS_SUCCESS) { - return STATUS_EXECUTION_FAILED; - }else{ - return STATUS_SUCCESS; - } + return STATUS_SUCCESS; } infiniopStatus_t tecoDestroyMatmulDescriptor(MatmulTecoDescriptor_t desc) { diff --git a/src/ops/matmul/teco/matmul_tecoblas.h b/src/ops/matmul/teco/matmul_tecoblas.h index b43cc3a9..9392f27f 100644 --- a/src/ops/matmul/teco/matmul_tecoblas.h +++ b/src/ops/matmul/teco/matmul_tecoblas.h @@ -3,6 +3,7 @@ #include "operators.h" #include #include +#include #include "../../../devices/teco/teco_handle.h" struct MatmulTecoDescriptor { Device device; @@ -11,11 +12,12 @@ struct MatmulTecoDescriptor { sdaaStream_t stream; tecoblasDataType_t datatype; tecoblasOperation_t transa,transb; - uint64_t m,n,k; + uint64_t m,k,n; float alpha,beta; long long int lda,ldb,ldc; long long int batch,batch_count; long int strideA,strideB,strideC; + infiniopTensorDescriptor_t a_desc,b_desc,c_desc; }; typedef struct MatmulTecoDescriptor *MatmulTecoDescriptor_t; diff --git a/src/ops/rms_norm/teco/rms_norm_teco.cc b/src/ops/rms_norm/teco/rms_norm_teco.cc index b13b9fa2..feb2869f 100644 --- a/src/ops/rms_norm/teco/rms_norm_teco.cc +++ b/src/ops/rms_norm/teco/rms_norm_teco.cc @@ -45,7 +45,7 @@ infiniopStatus_t tecoCreateRMSNormDescriptor(TecoHandle_t handle, RMSNormTecoDes if(w_desc->dt==F32){ tecodnnSetTensor4dDescriptor(x_desc_teco,TECODNN_TENSOR_NCHW,TECODNN_DATA_HALF,n,h,w,c); tecodnnSetTensor4dDescriptor(y_desc_teco,TECODNN_TENSOR_NCHW,TECODNN_DATA_HALF,n,h,w,c); - tecodnnSetTensor4dDescriptor(w_desc_teco,TECODNN_TENSOR_NCHW,TECODNN_DATA_FLOAT,1,1,1,c); + tecodnnSetTensor4dDescriptor(w_desc_teco,TECODNN_TENSOR_NCHW,TECODNN_DATA_HALF,1,1,1,c); tecodnnSetTensor4dDescriptor(rms_desc_teco,TECODNN_TENSOR_NCHW,TECODNN_DATA_FLOAT,n,h,w,1); } *desc_ptr = new RMSNormTecoDescriptor{ From c09e3335dab7c64a0988a7480c559a7b2dacec7e Mon Sep 17 00:00:00 2001 From: crapromer Date: Mon, 13 Jan 2025 09:56:16 +0800 Subject: [PATCH 11/15] refactor matmul --- src/devices/teco/common_teco.cc | 61 ++++++++++++-------------- src/devices/teco/common_teco.h | 44 +++++++++++++++++-- src/ops/matmul/teco/matmul_tecoblas.cc | 9 ++-- src/ops/matmul/teco/matmul_tecoblas.h | 2 +- 4 files changed, 75 insertions(+), 41 deletions(-) diff --git a/src/devices/teco/common_teco.cc b/src/devices/teco/common_teco.cc index f3a136ea..2de5722d 100644 --- a/src/devices/teco/common_teco.cc +++ 
b/src/devices/teco/common_teco.cc @@ -14,40 +14,37 @@ void const** convertToBatch(void const* data, int batch, int stride, size_t type return output; } -bool is_contiguous(infiniopTensorDescriptor_t desc) { - uint64_t ndim = desc->ndim; - if (desc->strides[ndim-1] != 1) { +bool is_contiguous(MatrixInfo desc) { + if (desc.ei != 1) { return false; }else return true; } -infiniopStatus_t restoreTensor(infiniopTensorDescriptor_t desc, void *data,tecodnnDataType_t datatype) { +infiniopStatus_t restoreTensor(MatrixInfo desc, void *data,tecodnnDataType_t datatype) { tecodnnHandle_t tecodnn_handle; tecodnnCreate(&tecodnn_handle); tecodnnTensorDescriptor_t src,dst; tecodnnCreateTensorDescriptor(&src); tecodnnCreateTensorDescriptor(&dst); - int *strides = new int[desc->ndim]; - int *old_strides = new int[desc->ndim]; - int *shape = new int[desc->ndim]; - strides[desc->ndim - 1] = 1; // stride of the last dimension is 1 - old_strides[desc->ndim - 1] = desc->strides[desc->ndim - 1]; - shape[desc->ndim - 1] = desc->shape[desc->ndim - 1]; - for (int i = desc->ndim - 2; i >= 0; --i) { - strides[i] = strides[i + 1] * desc->shape[i + 1]; // stride of the current dimension - shape[i] = desc->shape[i]; - old_strides[i] = desc->strides[i]; - } - size_t size = strides[0]*desc->shape[0]; + int *dst_strides = new int[desc.ndim]; + int *src_strides = new int[desc.ndim]; + int *shape = new int[desc.ndim]; + dst_strides[0] = desc.cols; + dst_strides[1] = 1; + src_strides[0] = desc.ld; + src_strides[1] = desc.ei; + shape[0] = desc.rows; + shape[1] = desc.cols; + size_t size = shape[1]*shape[0]; if(datatype==TECODNN_DATA_HALF) size*=sizeof(uint16_t); else size*=sizeof(uint32_t); void *temp; sdaaMalloc(&temp,size); - tecodnnSetTensorNdDescriptor(src,datatype,desc->ndim,shape,strides); - tecodnnSetTensorNdDescriptor(dst,datatype,desc->ndim,shape,old_strides); + tecodnnSetTensorNdDescriptor(src,datatype,desc.ndim,shape,dst_strides); + tecodnnSetTensorNdDescriptor(dst,datatype,desc.ndim,shape,src_strides); tecodnnCopyStride(tecodnn_handle,src,data,dst,temp); sdaaMemcpy(data, temp, size, sdaaMemcpyDeviceToDevice); sdaaFree(temp); @@ -55,24 +52,22 @@ infiniopStatus_t restoreTensor(infiniopTensorDescriptor_t desc, void *data,tecod return STATUS_SUCCESS; } -infiniopStatus_t toContiguous(infiniopTensorDescriptor_t desc, void *data,tecodnnDataType_t datatype) { +infiniopStatus_t toContiguous(MatrixInfo desc, void *data,tecodnnDataType_t datatype) { tecodnnHandle_t tecodnn_handle; tecodnnCreate(&tecodnn_handle); tecodnnTensorDescriptor_t src,dst; tecodnnCreateTensorDescriptor(&src); tecodnnCreateTensorDescriptor(&dst); - int *strides = new int[desc->ndim]; - int *old_strides = new int[desc->ndim]; - int *shape = new int[desc->ndim]; - strides[desc->ndim - 1] = 1; - old_strides[desc->ndim - 1] = desc->strides[desc->ndim - 1]; - shape[desc->ndim - 1] = desc->shape[desc->ndim - 1]; - for (int i = desc->ndim - 2; i >= 0; --i) { - strides[i] = strides[i + 1] * desc->shape[i + 1]; - shape[i] = desc->shape[i]; - old_strides[i] = desc->strides[i]; - } - size_t size = strides[0]*desc->shape[0]; + int *dst_strides = new int[desc.ndim]; + int *src_strides = new int[desc.ndim]; + int *shape = new int[desc.ndim]; + dst_strides[0] = desc.cols; + dst_strides[1] = 1; + src_strides[0] = desc.ld; + src_strides[1] = desc.ei; + shape[0] = desc.rows; + shape[1] = desc.cols; + size_t size = shape[1]*shape[0]; if(datatype==TECODNN_DATA_HALF){ size*=sizeof(uint16_t); } @@ -81,8 +76,8 @@ infiniopStatus_t toContiguous(infiniopTensorDescriptor_t desc, void *data,tecodn } void *temp;
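+    // Stage the copy through a device scratch buffer: tecodnnCopyStride rewrites the data from the source strides into the destination strides, and the result is then copied back over the original buffer in place.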
sdaaMalloc(&temp,size); - tecodnnSetTensorNdDescriptor(src,datatype,desc->ndim,shape,old_strides); - tecodnnSetTensorNdDescriptor(dst,datatype,desc->ndim,shape,strides); + tecodnnSetTensorNdDescriptor(src,datatype,desc.ndim,shape,src_strides); + tecodnnSetTensorNdDescriptor(dst,datatype,desc.ndim,shape,dst_strides); tecodnnCopyStride(tecodnn_handle,src,data,dst,temp); sdaaMemcpy(data, temp, size, sdaaMemcpyDeviceToDevice); sdaaFree(temp); diff --git a/src/devices/teco/common_teco.h b/src/devices/teco/common_teco.h index 84d7c7fa..ba2d8ed2 100644 --- a/src/devices/teco/common_teco.h +++ b/src/devices/teco/common_teco.h @@ -17,10 +17,48 @@ exit(EXIT_FAILURE); \ } \ } + +typedef struct MatrixInfo { + int ndim; + int batch; + int64_t stride; + int rows; + int cols; + int ld; + int ei; + + MatrixInfo() {} + + MatrixInfo(infiniopTensorDescriptor_t layout, infiniopStatus_t *status) { + if (layout->ndim == 2) { + this->ndim = 2; + this->batch = 1; + this->stride = 0; + this->rows = layout->shape[0]; + this->cols = layout->shape[1]; + this->ld = layout->strides[0]; + this->ei = layout->strides[1]; + } else if (layout->ndim == 3) { + this->ndim = 3; + this->batch = layout->shape[0]; + this->stride = this->batch == 1 ? 0 : layout->strides[0]; + this->rows = layout->shape[1]; + this->cols = layout->shape[2]; + this->ld = layout->strides[1]; + this->ei = layout->strides[2]; + } else { + *status = STATUS_BAD_TENSOR_SHAPE; + return; + } + + *status = STATUS_SUCCESS; + } + +} MatrixInfo; void const** convertToBatch(void const* data, int batch, int stride, size_t typeSize); -bool is_contiguous(infiniopTensorDescriptor_t desc); -infiniopStatus_t toContiguous(infiniopTensorDescriptor_t desc,void *data,tecodnnDataType_t datatype); -infiniopStatus_t restoreTensor(infiniopTensorDescriptor_t desc,void *data,tecodnnDataType_t datatype); +bool is_contiguous(MatrixInfo desc); +infiniopStatus_t toContiguous(MatrixInfo desc,void *data,tecodnnDataType_t datatype); +infiniopStatus_t restoreTensor(MatrixInfo desc,void *data,tecodnnDataType_t datatype); infiniopStatus_t toTecodnnTensorDescriptor(infiniopTensorDescriptor_t src,tecodnnTensorDescriptor_t des); diff --git a/src/ops/matmul/teco/matmul_tecoblas.cc b/src/ops/matmul/teco/matmul_tecoblas.cc index a1bfc5e3..c5a7be91 100644 --- a/src/ops/matmul/teco/matmul_tecoblas.cc +++ b/src/ops/matmul/teco/matmul_tecoblas.cc @@ -2,6 +2,7 @@ infiniopStatus_t tecoCreateMatmulDescriptor(TecoHandle_t handle, MatmulTecoDescriptor_t *desc_ptr, infiniopTensorDescriptor_t c_desc, float alpha, infiniopTensorDescriptor_t a_desc, infiniopTensorDescriptor_t b_desc, float beta) { long long int batch,batch_count; + infiniopStatus_t status = STATUS_SUCCESS; if (a_desc->ndim == 2 && b_desc->ndim == 2) { batch = 0; batch_count = 1; @@ -46,13 +47,13 @@ infiniopStatus_t tecoCreateMatmulDescriptor(TecoHandle_t handle, MatmulTecoDescr a_desc->strides[0], b_desc->strides[0], c_desc->strides[0], - a_desc, - b_desc, - c_desc, + MatrixInfo(a_desc,&status), + MatrixInfo(b_desc,&status), + MatrixInfo(c_desc,&status), }; tecoblasSetStream((*desc_ptr)->handle,(*desc_ptr)->stream); - return STATUS_SUCCESS; + return status; } infiniopStatus_t tecoGetMatmulWorkspaceSize(MatmulTecoDescriptor_t desc, uint64_t *size) { diff --git a/src/ops/matmul/teco/matmul_tecoblas.h b/src/ops/matmul/teco/matmul_tecoblas.h index 9392f27f..471c7963 100644 --- a/src/ops/matmul/teco/matmul_tecoblas.h +++ b/src/ops/matmul/teco/matmul_tecoblas.h @@ -17,7 +17,7 @@ struct MatmulTecoDescriptor { long long int lda,ldb,ldc; long long 
int batch,batch_count; long int strideA,strideB,strideC; - infiniopTensorDescriptor_t a_desc,b_desc,c_desc; + MatrixInfo a_desc,b_desc,c_desc; }; typedef struct MatmulTecoDescriptor *MatmulTecoDescriptor_t; From 4e94ca842904a4fee99d86f660944d1110081f7c Mon Sep 17 00:00:00 2001 From: crapromer Date: Wed, 15 Jan 2025 09:06:38 +0800 Subject: [PATCH 12/15] repair matmul --- src/ops/matmul/teco/matmul_tecoblas.cc | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/src/ops/matmul/teco/matmul_tecoblas.cc b/src/ops/matmul/teco/matmul_tecoblas.cc index c5a7be91..0a0cd6b9 100644 --- a/src/ops/matmul/teco/matmul_tecoblas.cc +++ b/src/ops/matmul/teco/matmul_tecoblas.cc @@ -70,7 +70,6 @@ infiniopStatus_t tecoGetMatmulWorkspaceSize(MatmulTecoDescriptor_t desc, uint64_ CHECK_TECOBLAS(tecoblasGetWorkspaceSize(desc->handle, desc->transa, desc->transb, desc->m, desc->n, desc->k, desc->alpha, TECOBLAS_DATA_FLOAT,desc->lda, desc->strideA, TECOBLAS_DATA_FLOAT, desc->ldb, desc->strideB, desc->beta, TECOBLAS_DATA_FLOAT, desc->ldc, desc->strideC, desc->batch_count, TECOBLAS_SGEMM_STRIDED_BATCHED,reinterpret_cast(size))) } }else{ - printf("some tensor is not contiguous\n"); if(desc->batch==0){ if(desc->datatype == TECOBLAS_DATA_HALF) CHECK_TECOBLAS(tecoblasGetWorkspaceSize(desc->handle, desc->transa, desc->transb, desc->m, desc->n, desc->k, desc->alpha, TECOBLAS_DATA_HALF,desc->lda, 1, TECOBLAS_DATA_HALF, desc->n, 1, desc->beta, TECOBLAS_DATA_HALF, desc->ldc, 1, desc->batch_count, TECOBLAS_HGEMM,reinterpret_cast(size))) @@ -104,13 +103,9 @@ infiniopStatus_t tecoMatmul(MatmulTecoDescriptor_t desc, void *workspace, uint64 } }else{ if(desc->datatype == TECOBLAS_DATA_HALF){ - toContiguous(desc->b_desc,const_cast(b),TECODNN_DATA_HALF); - CHECK_TECOBLAS(tecoblasHgemm(desc->handle, desc->transa, desc->transb, desc->m, desc->n, desc->k, desc->alpha, a, desc->lda, b, desc->n, desc->beta, c, desc->ldc)) - restoreTensor(desc->b_desc,const_cast(b),TECODNN_DATA_HALF); + CHECK_TECOBLAS(tecoblasHgemm(desc->handle, desc->transa, TECOBLAS_OP_T, desc->m, desc->n, desc->k, desc->alpha, a, desc->lda, b, desc->k, desc->beta, c, desc->ldc)) }else{ - toContiguous(desc->b_desc,const_cast(b),TECODNN_DATA_FLOAT); - CHECK_TECOBLAS(tecoblasSgemm(desc->handle, desc->transa, desc->transb, desc->m, desc->n, desc->k, desc->alpha, a, desc->lda, b, desc->n, desc->beta, c, desc->ldc)) - restoreTensor(desc->b_desc,const_cast(b),TECODNN_DATA_FLOAT); + CHECK_TECOBLAS(tecoblasSgemm(desc->handle, desc->transa, TECOBLAS_OP_T, desc->m, desc->n, desc->k, desc->alpha, a, desc->lda, b, desc->k, desc->beta, c, desc->ldc)) } } From 35a38130eeafe456b4e3a9cf39edf78380c5d2df Mon Sep 17 00:00:00 2001 From: crapromer Date: Thu, 16 Jan 2025 10:36:27 +0800 Subject: [PATCH 13/15] fix with infiniLM pass --- operatorspy/tests/matmul.py | 2 +- src/ops/matmul/teco/matmul_tecoblas.cc | 166 ++++++++++++++++--------- src/ops/matmul/teco/matmul_tecoblas.h | 6 +- 3 files changed, 110 insertions(+), 64 deletions(-) diff --git a/operatorspy/tests/matmul.py b/operatorspy/tests/matmul.py index 51dc1d9c..064eead2 100644 --- a/operatorspy/tests/matmul.py +++ b/operatorspy/tests/matmul.py @@ -119,7 +119,7 @@ def test( None, ) ) - assert torch.allclose(c, ans, atol=0, rtol=1e-2) + assert torch.allclose(c, ans, atol=0, rtol=1e-3) if PROFILE: for i in range(NUM_PRERUN): diff --git a/src/ops/matmul/teco/matmul_tecoblas.cc b/src/ops/matmul/teco/matmul_tecoblas.cc index 0a0cd6b9..b1b1d9ad 100644 --- a/src/ops/matmul/teco/matmul_tecoblas.cc +++ 
b/src/ops/matmul/teco/matmul_tecoblas.cc @@ -1,19 +1,68 @@ #include "matmul_tecoblas.h" infiniopStatus_t tecoCreateMatmulDescriptor(TecoHandle_t handle, MatmulTecoDescriptor_t *desc_ptr, infiniopTensorDescriptor_t c_desc, float alpha, infiniopTensorDescriptor_t a_desc, infiniopTensorDescriptor_t b_desc, float beta) { - long long int batch,batch_count; infiniopStatus_t status = STATUS_SUCCESS; + tecoblasDataType_t datatype; + tecoblasOperation_t transA,transB,transC; + uint64_t m,k,n; + long long int lda,ldb,ldc; + long long int batch,batch_count; + long int strideA = 1,strideB = 1,strideC = 1; if (a_desc->ndim == 2 && b_desc->ndim == 2) { batch = 0; batch_count = 1; }else if(a_desc->ndim == 3 && b_desc->ndim == 3){ batch = 1; batch_count = a_desc->shape[0]; + strideA = a_desc->strides[0]; + strideB = b_desc->strides[0]; + strideC = c_desc->strides[0]; }else{ return STATUS_BAD_TENSOR_SHAPE; } - - tecoblasDataType_t datatype; + /*MatrixA*/ + if(a_desc->strides[1+batch] == 1 && (uint64_t)a_desc->strides[0+batch] >= a_desc->shape[1+batch]){ + transA = TECOBLAS_OP_N; + m = a_desc->shape[0+batch]; + k = a_desc->shape[1+batch]; + lda = a_desc->strides[0+batch]; + }else if(a_desc->strides[0+batch] == 1 && (uint64_t)a_desc->strides[1+batch] >= a_desc->shape[0+batch]){ + transA = TECOBLAS_OP_T; + m = a_desc->shape[0+batch]; + k = a_desc->shape[1+batch]; + lda = a_desc->strides[1+batch]; + }else{ + return STATUS_BAD_TENSOR_SHAPE; + } + /*MatrixB*/ + if(b_desc->strides[1+batch] == 1 && (uint64_t)b_desc->strides[0+batch] >= b_desc->shape[1+batch]){ + transB = TECOBLAS_OP_N; + k = b_desc->shape[0+batch]; + n = b_desc->shape[1+batch]; + ldb = b_desc->strides[0+batch]; + }else if(b_desc->strides[0+batch] == 1 && (uint64_t)b_desc->strides[1+batch] >= b_desc->shape[0+batch]){ + transB = TECOBLAS_OP_T; + k = b_desc->shape[0+batch]; + n = b_desc->shape[1+batch]; + ldb = b_desc->strides[1+batch]; + }else{ + return STATUS_BAD_TENSOR_SHAPE; + } + /*MatrixC*/ + if(c_desc->strides[1+batch] == 1 && (uint64_t)c_desc->strides[0+batch] >= c_desc->shape[1+batch]){ + transC = TECOBLAS_OP_N; + m = c_desc->shape[0+batch]; + n = c_desc->shape[1+batch]; + ldc = c_desc->strides[0+batch]; + }else if(c_desc->strides[0+batch] == 1 && (uint64_t)c_desc->strides[1+batch] >= c_desc->shape[0+batch]){ + transC = TECOBLAS_OP_T; + m = c_desc->shape[0+batch]; + n = c_desc->shape[1+batch]; + ldc = c_desc->strides[1+batch]; + }else{ + return STATUS_BAD_TENSOR_SHAPE; + } + if(a_desc->dt==F16 && b_desc->dt==F16){ datatype = TECOBLAS_DATA_HALF; }else if(a_desc->dt==F32 && b_desc->dt==F32){ @@ -24,7 +73,6 @@ infiniopStatus_t tecoCreateMatmulDescriptor(TecoHandle_t handle, MatmulTecoDescr tecoblasHandle_t tecoblas_handle; tecoblasCreate(&tecoblas_handle); - *desc_ptr = new MatmulTecoDescriptor{ handle->device, @@ -32,24 +80,22 @@ infiniopStatus_t tecoCreateMatmulDescriptor(TecoHandle_t handle, MatmulTecoDescr tecoblas_handle, handle->stream, datatype, - TECOBLAS_OP_N, - TECOBLAS_OP_N, - a_desc->shape[0+batch], - a_desc->shape[1+batch], - b_desc->shape[1+batch], + transA, + transB, + transC, + m, + k, + n, alpha, beta, - a_desc->strides[0+batch], - b_desc->strides[0+batch], - c_desc->strides[0+batch], + lda, + ldb, + ldc, batch, batch_count, - a_desc->strides[0], - b_desc->strides[0], - c_desc->strides[0], - MatrixInfo(a_desc,&status), - MatrixInfo(b_desc,&status), - MatrixInfo(c_desc,&status), + strideA, + strideB, + strideC, }; tecoblasSetStream((*desc_ptr)->handle,(*desc_ptr)->stream); @@ -57,31 +103,42 @@ infiniopStatus_t 
tecoCreateMatmulDescriptor(TecoHandle_t handle, MatmulTecoDescr } infiniopStatus_t tecoGetMatmulWorkspaceSize(MatmulTecoDescriptor_t desc, uint64_t *size) { - if(is_contiguous(desc->a_desc) && is_contiguous(desc->b_desc) && is_contiguous(desc->c_desc)){ - if(desc->batch==0){ - if(desc->datatype == TECOBLAS_DATA_HALF) - CHECK_TECOBLAS(tecoblasGetWorkspaceSize(desc->handle, desc->transa, desc->transb, desc->m, desc->n, desc->k, desc->alpha, TECOBLAS_DATA_HALF,desc->lda, 1, TECOBLAS_DATA_HALF, desc->ldb, 1, desc->beta, TECOBLAS_DATA_HALF, desc->ldc, 1, desc->batch_count, TECOBLAS_HGEMM,reinterpret_cast(size))) - else - CHECK_TECOBLAS(tecoblasGetWorkspaceSize(desc->handle, desc->transa, desc->transb, desc->m, desc->n, desc->k, desc->alpha, TECOBLAS_DATA_FLOAT,desc->lda, 1, TECOBLAS_DATA_FLOAT, desc->ldb, 1, desc->beta, TECOBLAS_DATA_FLOAT, desc->ldc, 1, desc->batch_count, TECOBLAS_SGEMM,reinterpret_cast(size))) - }else{ - if(desc->datatype == TECOBLAS_DATA_HALF) - CHECK_TECOBLAS(tecoblasGetWorkspaceSize(desc->handle, desc->transa, desc->transb, desc->m, desc->n, desc->k, desc->alpha, TECOBLAS_DATA_HALF,desc->lda, desc->strideA, TECOBLAS_DATA_HALF, desc->ldb, desc->strideB, desc->beta, TECOBLAS_DATA_HALF, desc->ldc, desc->strideC, desc->batch_count, TECOBLAS_HGEMM_STRIDED_BATCHED,reinterpret_cast(size))) - else - CHECK_TECOBLAS(tecoblasGetWorkspaceSize(desc->handle, desc->transa, desc->transb, desc->m, desc->n, desc->k, desc->alpha, TECOBLAS_DATA_FLOAT,desc->lda, desc->strideA, TECOBLAS_DATA_FLOAT, desc->ldb, desc->strideB, desc->beta, TECOBLAS_DATA_FLOAT, desc->ldc, desc->strideC, desc->batch_count, TECOBLAS_SGEMM_STRIDED_BATCHED,reinterpret_cast(size))) - } + tecoblasAPIName_t apiName; + if (desc->batch == 0) + { + if(desc->datatype == TECOBLAS_DATA_HALF) + apiName = TECOBLAS_HGEMM; + else + apiName = TECOBLAS_SGEMM; }else{ - if(desc->batch==0){ - if(desc->datatype == TECOBLAS_DATA_HALF) - CHECK_TECOBLAS(tecoblasGetWorkspaceSize(desc->handle, desc->transa, desc->transb, desc->m, desc->n, desc->k, desc->alpha, TECOBLAS_DATA_HALF,desc->lda, 1, TECOBLAS_DATA_HALF, desc->n, 1, desc->beta, TECOBLAS_DATA_HALF, desc->ldc, 1, desc->batch_count, TECOBLAS_HGEMM,reinterpret_cast(size))) - else - CHECK_TECOBLAS(tecoblasGetWorkspaceSize(desc->handle, desc->transa, desc->transb, desc->m, desc->n, desc->k, desc->alpha, TECOBLAS_DATA_FLOAT,desc->lda, 1, TECOBLAS_DATA_FLOAT, desc->n, 1, desc->beta, TECOBLAS_DATA_FLOAT, desc->ldc, 1, desc->batch_count, TECOBLAS_SGEMM,reinterpret_cast(size))) - }else{ - if(desc->datatype == TECOBLAS_DATA_HALF) - CHECK_TECOBLAS(tecoblasGetWorkspaceSize(desc->handle, desc->transa, desc->transb, desc->m, desc->n, desc->k, desc->alpha, TECOBLAS_DATA_HALF,desc->lda, desc->strideA, TECOBLAS_DATA_HALF, desc->n, desc->strideB, desc->beta, TECOBLAS_DATA_HALF, desc->ldc, desc->strideC, desc->batch_count, TECOBLAS_HGEMM_STRIDED_BATCHED,reinterpret_cast(size))) - else - CHECK_TECOBLAS(tecoblasGetWorkspaceSize(desc->handle, desc->transa, desc->transb, desc->m, desc->n, desc->k, desc->alpha, TECOBLAS_DATA_FLOAT,desc->lda, desc->strideA, TECOBLAS_DATA_FLOAT, desc->n, desc->strideB, desc->beta, TECOBLAS_DATA_FLOAT, desc->ldc, desc->strideC, desc->batch_count, TECOBLAS_SGEMM_STRIDED_BATCHED,reinterpret_cast(size))) - } + if(desc->datatype == TECOBLAS_DATA_HALF) + apiName = TECOBLAS_HGEMM_STRIDED_BATCHED; + else + apiName = TECOBLAS_SGEMM_STRIDED_BATCHED; } + CHECK_TECOBLAS(tecoblasGetWorkspaceSize( + desc->handle, + desc->transa, + desc->transb, + desc->m, + desc->n, + desc->k, + desc->alpha, + 
desc->datatype, + desc->lda, + desc->strideA, + desc->datatype, + desc->ldb, + desc->strideB, + desc->beta, + desc->datatype, + desc->ldc, + desc->strideC, + desc->batch_count, + apiName, + reinterpret_cast(size))) + + return STATUS_SUCCESS; } @@ -89,25 +146,16 @@ infiniopStatus_t tecoGetMatmulWorkspaceSize(MatmulTecoDescriptor_t desc, uint64_ infiniopStatus_t tecoMatmul(MatmulTecoDescriptor_t desc, void *workspace, uint64_t workspace_size, void *c, const void *a, const void *b, void *stream) { tecoblasSetStream(desc->handle, desc->stream); tecoblasSetWorkspace(desc->handle, workspace, workspace_size); - if(is_contiguous(desc->a_desc) && is_contiguous(desc->b_desc) && is_contiguous(desc->c_desc)){ - if(desc->batch==0){ - if(desc->datatype == TECOBLAS_DATA_HALF) - CHECK_TECOBLAS(tecoblasHgemm(desc->handle, desc->transa, desc->transb, desc->m, desc->n, desc->k, desc->alpha, a, desc->lda, b, desc->ldb, desc->beta, c, desc->ldc)) - else - CHECK_TECOBLAS(tecoblasSgemm(desc->handle, desc->transa, desc->transb, desc->m, desc->n, desc->k, desc->alpha, a, desc->lda, b, desc->ldb, desc->beta, c, desc->ldc)) - }else{ - if(desc->datatype == TECOBLAS_DATA_HALF) - CHECK_TECOBLAS(tecoblasHgemmStridedBatched(desc->handle, desc->transa, desc->transb, desc->m, desc->n, desc->k, desc->alpha, a, desc->lda,desc->strideA, b, desc->ldb,desc->strideB, desc->beta, c, desc->ldc,desc->strideC,desc->batch_count)) - else - CHECK_TECOBLAS(tecoblasSgemmStridedBatched(desc->handle, desc->transa, desc->transb, desc->m, desc->n, desc->k, desc->alpha, a, desc->lda,desc->strideA, b, desc->ldb,desc->strideB, desc->beta, c, desc->ldc,desc->strideC,desc->batch_count)) - } + if(desc->batch==0){ + if(desc->datatype == TECOBLAS_DATA_HALF) + CHECK_TECOBLAS(tecoblasHgemm(desc->handle, desc->transa, desc->transb, desc->m, desc->n, desc->k, desc->alpha, a, desc->lda, b, desc->ldb, desc->beta, c, desc->ldc)) + else + CHECK_TECOBLAS(tecoblasSgemm(desc->handle, desc->transa, desc->transb, desc->m, desc->n, desc->k, desc->alpha, a, desc->lda, b, desc->ldb, desc->beta, c, desc->ldc)) }else{ - if(desc->datatype == TECOBLAS_DATA_HALF){ - CHECK_TECOBLAS(tecoblasHgemm(desc->handle, desc->transa, TECOBLAS_OP_T, desc->m, desc->n, desc->k, desc->alpha, a, desc->lda, b, desc->k, desc->beta, c, desc->ldc)) - }else{ - CHECK_TECOBLAS(tecoblasSgemm(desc->handle, desc->transa, TECOBLAS_OP_T, desc->m, desc->n, desc->k, desc->alpha, a, desc->lda, b, desc->k, desc->beta, c, desc->ldc)) - } - + if(desc->datatype == TECOBLAS_DATA_HALF) + CHECK_TECOBLAS(tecoblasHgemmStridedBatched(desc->handle, desc->transa, desc->transb, desc->m, desc->n, desc->k, desc->alpha, a, desc->lda,desc->strideA, b, desc->ldb,desc->strideB, desc->beta, c, desc->ldc,desc->strideC,desc->batch_count)) + else + CHECK_TECOBLAS(tecoblasSgemmStridedBatched(desc->handle, desc->transa, desc->transb, desc->m, desc->n, desc->k, desc->alpha, a, desc->lda,desc->strideA, b, desc->ldb,desc->strideB, desc->beta, c, desc->ldc,desc->strideC,desc->batch_count)) } sdaaStreamSynchronize(desc->stream); return STATUS_SUCCESS; diff --git a/src/ops/matmul/teco/matmul_tecoblas.h b/src/ops/matmul/teco/matmul_tecoblas.h index 471c7963..98e90228 100644 --- a/src/ops/matmul/teco/matmul_tecoblas.h +++ b/src/ops/matmul/teco/matmul_tecoblas.h @@ -11,14 +11,12 @@ struct MatmulTecoDescriptor { tecoblasHandle_t handle; sdaaStream_t stream; tecoblasDataType_t datatype; - tecoblasOperation_t transa,transb; + tecoblasOperation_t transa,transb,transc; uint64_t m,k,n; float alpha,beta; long long int lda,ldb,ldc; long long 
int batch,batch_count; - long int strideA,strideB,strideC; - MatrixInfo a_desc,b_desc,c_desc; -}; + long int strideA,strideB,strideC;}; typedef struct MatmulTecoDescriptor *MatmulTecoDescriptor_t; From 817a2bae5a8c7f6d8ba4a8f504467c62002947d7 Mon Sep 17 00:00:00 2001 From: crapromer Date: Thu, 6 Feb 2025 14:26:24 +0800 Subject: [PATCH 14/15] complete rearrange operator --- operatorspy/tests/rearrange.py | 13 ++++++ operatorspy/tests/swiglu.py | 16 ++++++- src/ops/matmul/teco/matmul_tecoblas.h | 1 - src/ops/rearrange/operator.cc | 23 ++++++++++ src/ops/rearrange/teco/rearrange_tecodnn.cc | 51 +++++++++++++++++++++ src/ops/rearrange/teco/rearrange_tecodnn.h | 33 +++++++++++++ src/ops/swiglu/operator.cc | 19 ++++++++ src/ops/swiglu/teco/swiglu_tecodnn.cc | 44 ++++++++++++++++++ src/ops/swiglu/teco/swiglu_tecodnn.h | 33 +++++++++++++ 9 files changed, 231 insertions(+), 2 deletions(-) create mode 100644 src/ops/rearrange/teco/rearrange_tecodnn.cc create mode 100644 src/ops/rearrange/teco/rearrange_tecodnn.h create mode 100644 src/ops/swiglu/teco/swiglu_tecodnn.cc create mode 100644 src/ops/swiglu/teco/swiglu_tecodnn.h diff --git a/operatorspy/tests/rearrange.py b/operatorspy/tests/rearrange.py index 005b9d95..1e8cf504 100644 --- a/operatorspy/tests/rearrange.py +++ b/operatorspy/tests/rearrange.py @@ -104,6 +104,17 @@ def test_ascend(lib, test_cases): test(lib, handle, "npu", x_shape, x_stride, y_shape, y_stride) destroy_handle(lib, handle) +def test_teco(lib, test_cases): + import torch_sdaa + + device = DeviceEnum.DEVICE_TECO + handle = create_handle(lib, device) + for test_case in test_cases: + x_shape, x_stride = test_case[0] + y_shape, y_stride = test_case[1] + test(lib, handle, "sdaa", x_shape, x_stride, y_shape, y_stride) + destroy_handle(lib, handle) + if __name__ == "__main__": args = get_args() test_cases = [ @@ -140,3 +151,5 @@ def test_ascend(lib, test_cases): test_bang(lib, test_cases) if args.ascend: test_ascend(lib, test_cases) + if args.teco: + test_teco(lib, test_cases) diff --git a/operatorspy/tests/swiglu.py b/operatorspy/tests/swiglu.py index 57e4e3b9..b8a6ea6b 100644 --- a/operatorspy/tests/swiglu.py +++ b/operatorspy/tests/swiglu.py @@ -79,7 +79,6 @@ def test_out_of_place( descriptor, c_tensor.data, a_tensor.data, b_tensor.data, None ) ) - assert torch.allclose(c, ans, atol=1e-4, rtol=1e-2) print("out-of-place Test passed!") @@ -234,6 +233,19 @@ def test_ascend(lib, test_cases): test_in_place2(lib, handle, "npu", shape, a_stride, b_stride, dtype, torch.npu.synchronize) destroy_handle(lib, handle) +def test_teco(lib, test_cases): + import torch_sdaa + device = DeviceEnum.DEVICE_TECO + handle = create_handle(lib, device) + + for shape, a_stride, b_stride, c_stride, dtype in test_cases: + test_out_of_place( + lib, handle, "sdaa", shape, a_stride, b_stride, c_stride, dtype + ) + test_in_place1(lib, handle, "sdaa", shape, a_stride, b_stride, dtype) + test_in_place2(lib, handle, "sdaa", shape, a_stride, b_stride, dtype) + + destroy_handle(lib, handle) if __name__ == "__main__": @@ -278,3 +290,5 @@ def test_ascend(lib, test_cases): test_bang(lib, test_cases) if args.ascend: test_ascend(lib, test_cases) + if args.teco: + test_teco(lib, test_cases) diff --git a/src/ops/matmul/teco/matmul_tecoblas.h b/src/ops/matmul/teco/matmul_tecoblas.h index 98e90228..2b5ec78c 100644 --- a/src/ops/matmul/teco/matmul_tecoblas.h +++ b/src/ops/matmul/teco/matmul_tecoblas.h @@ -3,7 +3,6 @@ #include "operators.h" #include #include -#include #include "../../../devices/teco/teco_handle.h" struct 
MatmulTecoDescriptor { Device device; diff --git a/src/ops/rearrange/operator.cc b/src/ops/rearrange/operator.cc index a1084d48..f0aef0af 100644 --- a/src/ops/rearrange/operator.cc +++ b/src/ops/rearrange/operator.cc @@ -17,6 +17,9 @@ #ifdef ENABLE_ASCEND_NPU #include "ascend/rearrange_aclnn.h" #endif +#ifdef ENABLE_TECO_SDAA +#include "teco/rearrange_tecodnn.h" +#endif __C infiniopStatus_t infiniopCreateRearrangeDescriptor( infiniopHandle_t handle, @@ -46,6 +49,13 @@ __C infiniopStatus_t infiniopCreateRearrangeDescriptor( dst, src); } +#endif +#ifdef ENABLE_TECO_SDAA + case DevTecoSDAA: + return tecoCreateRearrangeDescriptor((TecoHandle_t) handle, + (RearrangeTecoDescriptor_t *) desc_ptr, + dst, + src); #endif } return STATUS_BAD_DEVICE; @@ -75,6 +85,14 @@ __C infiniopStatus_t infiniopRearrange(infiniopRearrangeDescriptor_t desc, void src, stream); } +#endif +#ifdef ENABLE_TECO_SDAA + case DevTecoSDAA: { + return tecoRearrange((RearrangeTecoDescriptor_t) desc, + dst, + src, + stream); + } #endif } return STATUS_BAD_DEVICE; @@ -101,6 +119,11 @@ __C infiniopStatus_t infiniopDestroyRearrangeDescriptor(infiniopRearrangeDescrip case DevAscendNpu: { return aclnnDestroyRearrangeDescriptor((RearrangeAclnnDescriptor_t) desc); } +#endif +#ifdef ENABLE_TECO_SDAA + case DevTecoSDAA: { + return tecoDestroyRearrangeDescriptor((RearrangeTecoDescriptor_t) desc); + } #endif } return STATUS_BAD_DEVICE; diff --git a/src/ops/rearrange/teco/rearrange_tecodnn.cc b/src/ops/rearrange/teco/rearrange_tecodnn.cc new file mode 100644 index 00000000..00abbc2a --- /dev/null +++ b/src/ops/rearrange/teco/rearrange_tecodnn.cc @@ -0,0 +1,51 @@ +#include "rearrange_tecodnn.h" + +infiniopStatus_t tecoCreateRearrangeDescriptor(TecoHandle_t handle, RearrangeTecoDescriptor_t *desc_ptr, infiniopTensorDescriptor_t dst, infiniopTensorDescriptor_t src) { + tecodnnHandle_t tecodnn_handle; + tecodnnCreate(&tecodnn_handle); + + tecodnnTensorDescriptor_t srcDesc,dstDesc; + tecodnnCreateTensorDescriptor(&srcDesc); + tecodnnCreateTensorDescriptor(&dstDesc); + + int nbDims = dst->ndim; + + int *shape = new int[nbDims]; + int *src_strides = new int[nbDims]; + int *dst_strides = new int[nbDims]; + for (size_t i = 0; i < (size_t)nbDims; i++) + { + shape[i] = dst->shape[i]; + src_strides[i] = src->strides[i]; + dst_strides[i] = dst->strides[i]; + } + + tecodnnSetTensorNdDescriptor(srcDesc, TECODNN_DATA_HALF, nbDims, shape, src_strides); + tecodnnSetTensorNdDescriptor(dstDesc, TECODNN_DATA_HALF, nbDims, shape, dst_strides); + + + *desc_ptr = new RearrangeTecoDescriptor{ + DevTecoSDAA, + handle->device_id, + handle->stream, + tecodnn_handle, + nbDims, + shape, + src_strides, + dst_strides, + srcDesc, + dstDesc, + }; + + + return STATUS_SUCCESS; +} + +infiniopStatus_t tecoRearrange(RearrangeTecoDescriptor_t desc, void *dst, void const *src, void *stream) { + tecodnnCopyStride(desc->handle, desc->srcDesc, src, desc->dstDesc, dst); + return STATUS_SUCCESS; +} + +infiniopStatus_t tecoDestroyRearrangeDescriptor(RearrangeTecoDescriptor_t desc) { + return STATUS_SUCCESS; +} diff --git a/src/ops/rearrange/teco/rearrange_tecodnn.h b/src/ops/rearrange/teco/rearrange_tecodnn.h new file mode 100644 index 00000000..2df2b073 --- /dev/null +++ b/src/ops/rearrange/teco/rearrange_tecodnn.h @@ -0,0 +1,33 @@ +#ifndef __TECO_REARRANGE_H__ +#define __TECO_REARRANGE_H__ + +#include "operators.h" +#include +#include +#include "../../../devices/teco/teco_handle.h" +struct RearrangeTecoDescriptor { + Device device; + int device_id; + sdaaStream_t stream; + 
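// per-descriptor state: the tecodnn handle plus the cached shape and stride arrays consumed by tecodnnCopyStride at execution time +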
tecodnnHandle_t handle; + int nbDims; + int *shape,*src_strides,*dst_strides; + tecodnnTensorDescriptor_t srcDesc,dstDesc; +}; + +typedef struct RearrangeTecoDescriptor *RearrangeTecoDescriptor_t; + +infiniopStatus_t tecoCreateRearrangeDescriptor(TecoHandle_t handle, + RearrangeTecoDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t dst, + infiniopTensorDescriptor_t src); + +infiniopStatus_t tecoRearrange(RearrangeTecoDescriptor_t desc, + void *dst, + void const *src, + void *stream); + +infiniopStatus_t tecoDestroyRearrangeDescriptor(RearrangeTecoDescriptor_t desc); + + +#endif \ No newline at end of file diff --git a/src/ops/swiglu/operator.cc index b0bcb35c..879918f7 100644 --- a/src/ops/swiglu/operator.cc +++ b/src/ops/swiglu/operator.cc @@ -14,6 +14,9 @@ #ifdef ENABLE_ASCEND_NPU #include "ascend/swiglu.h" #endif +#ifdef ENABLE_TECO_SDAA +#include "teco/swiglu_tecodnn.h" +#endif __C infiniopStatus_t infiniopCreateSwiGLUDescriptor(infiniopHandle_t handle, infiniopSwiGLUDescriptor_t *desc_ptr, @@ -45,6 +48,14 @@ __C infiniopStatus_t infiniopCreateSwiGLUDescriptor(infiniopHandle_t handle, c_desc, a_desc, b_desc); +#endif +#ifdef ENABLE_TECO_SDAA + case DevTecoSDAA: + return tecoCreateSwiGLUDescriptor((TecoHandle_t) handle, + (SwiGLUTecoDescriptor_t *) desc_ptr, + c_desc, + a_desc, + b_desc); #endif } return STATUS_BAD_DEVICE; @@ -72,6 +83,10 @@ __C infiniopStatus_t infiniopSwiGLU(infiniopSwiGLUDescriptor_t desc, #ifdef ENABLE_ASCEND_NPU case DevAscendNpu: return ascendSwiGLU((SwiGLUAscendDescriptor_t) desc, c, a, b, stream); +#endif +#ifdef ENABLE_TECO_SDAA + case DevTecoSDAA: + return tecoSwiGLU((SwiGLUTecoDescriptor_t) desc, c, a, b, stream); #endif } return STATUS_BAD_DEVICE; @@ -95,6 +110,10 @@ __C infiniopStatus_t infiniopDestroySwiGLUDescriptor(infiniopSwiGLUDescriptor_t #ifdef ENABLE_ASCEND_NPU case DevAscendNpu: return ascendDestroySwiGLUDescriptor((SwiGLUAscendDescriptor_t) desc); +#endif +#ifdef ENABLE_TECO_SDAA + case DevTecoSDAA: + return tecoDestroySwiGLUDescriptor((SwiGLUTecoDescriptor_t) desc); #endif } return STATUS_BAD_DEVICE; diff --git a/src/ops/swiglu/teco/swiglu_tecodnn.cc b/src/ops/swiglu/teco/swiglu_tecodnn.cc new file mode 100644 index 00000000..a2851639 --- /dev/null +++ b/src/ops/swiglu/teco/swiglu_tecodnn.cc @@ -0,0 +1,44 @@ +#include "swiglu_tecodnn.h" + +infiniopStatus_t tecoCreateSwiGLUDescriptor(TecoHandle_t handle, SwiGLUTecoDescriptor_t *desc_ptr, infiniopTensorDescriptor_t c_desc, infiniopTensorDescriptor_t a_desc,infiniopTensorDescriptor_t b_desc) { + // create the tecodnn handle + long int N = c_desc->shape[0],C = c_desc->shape[1]; + tecodnnHandle_t tecodnn_handle; + tecodnnCreate(&tecodnn_handle); + + tecodnnActivationDescriptor_t activationDesc; + tecodnnCreateActivationDescriptor(&activationDesc); + tecodnnSetActivationDescriptor(activationDesc, TECODNN_ACTIVATION_SILU, TECODNN_NOT_PROPAGATE_NAN, 0.0); + + tecodnnTensorDescriptor_t aDesc,bDesc,cDesc; + tecodnnCreateTensorDescriptor(&aDesc); + tecodnnCreateTensorDescriptor(&bDesc); + tecodnnCreateTensorDescriptor(&cDesc); + + tecodnnSetTensor4dDescriptor(aDesc,TECODNN_TENSOR_NCHW,TECODNN_DATA_HALF,N,1,1,C); + tecodnnSetTensor4dDescriptor(bDesc,TECODNN_TENSOR_NCHW,TECODNN_DATA_HALF,N,1,1,C); + tecodnnSetTensor4dDescriptor(cDesc,TECODNN_TENSOR_NCHW,TECODNN_DATA_HALF,N,1,1,C); + + *desc_ptr = new SwiGLUTecoDescriptor{ + handle->device, + handle->device_id, + handle->stream, + tecodnn_handle, + activationDesc, + aDesc, + bDesc, + cDesc, + }; + return STATUS_SUCCESS; +} + 
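+// tecoSwiGLU computes c = a * silu(b) with silu(x) = x / (1 + exp(-x)): the activation pass writes silu(b) into c, then the element-wise multiply scales it by a; this ordering assumes c does not alias a.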
+infiniopStatus_t tecoSwiGLU(SwiGLUTecoDescriptor_t desc, void *c, void const *a, void const *b, void *stream) { + float alpha = 1.0f,beta = 0.0f; + tecodnnActivationForward(desc->handle,desc->activationDesc,(void*)&alpha,desc->bDesc,b,(void*)&beta,desc->cDesc,c); + tecodnnMulTensorEx(desc->handle, desc->aDesc, a, desc->cDesc, c, desc->cDesc, c); + return STATUS_SUCCESS; +} + +infiniopStatus_t tecoDestroySwiGLUDescriptor(SwiGLUTecoDescriptor_t desc) { + return STATUS_SUCCESS; +} diff --git a/src/ops/swiglu/teco/swiglu_tecodnn.h b/src/ops/swiglu/teco/swiglu_tecodnn.h new file mode 100644 index 00000000..f3b1d9dc --- /dev/null +++ b/src/ops/swiglu/teco/swiglu_tecodnn.h @@ -0,0 +1,33 @@ +#ifndef __TECO_SWIGLU_H__ +#define __TECO_SWIGLU_H__ + +#include "operators.h" +#include +#include +#include "../../../devices/teco/teco_handle.h" +struct SwiGLUTecoDescriptor { + Device device; + int device_id; + sdaaStream_t stream; + tecodnnHandle_t handle; + tecodnnActivationDescriptor_t activationDesc; + tecodnnTensorDescriptor_t aDesc,bDesc,cDesc; +}; + +typedef struct SwiGLUTecoDescriptor *SwiGLUTecoDescriptor_t; + +infiniopStatus_t tecoCreateSwiGLUDescriptor(TecoHandle_t handle, + SwiGLUTecoDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t c_desc, + infiniopTensorDescriptor_t a_desc, + infiniopTensorDescriptor_t b_desc); + +infiniopStatus_t tecoSwiGLU(SwiGLUTecoDescriptor_t desc, + void *c, + void const *a, + void const *b, + void *stream); + +infiniopStatus_t tecoDestroySwiGLUDescriptor(SwiGLUTecoDescriptor_t desc); + +#endif \ No newline at end of file From 6f6019a1e19d33729f48182c5416d82e84871a2f Mon Sep 17 00:00:00 2001 From: crapromer Date: Mon, 24 Feb 2025 09:07:03 +0800 Subject: [PATCH 15/15] implement swiglu inplace and out of place operation --- operatorspy/tests/swiglu.py | 1 - src/devices/teco/tensor_teco.cpp | 23 ------ src/devices/teco/tensor_teco.h | 26 ------- src/ops/swiglu/operator.cc | 2 +- .../teco/{swiglu_tecodnn.h => swiglu_sdaa.h} | 15 ++-- src/ops/swiglu/teco/swiglu_sdaa.scpp | 70 +++++++++++++++++++ src/ops/swiglu/teco/swiglu_tecodnn.cc | 44 ------------ xmake.lua | 2 +- 8 files changed, 79 insertions(+), 104 deletions(-) delete mode 100644 src/devices/teco/tensor_teco.cpp delete mode 100644 src/devices/teco/tensor_teco.h rename src/ops/swiglu/teco/{swiglu_tecodnn.h => swiglu_sdaa.h} (78%) create mode 100644 src/ops/swiglu/teco/swiglu_sdaa.scpp delete mode 100644 src/ops/swiglu/teco/swiglu_tecodnn.cc diff --git a/operatorspy/tests/swiglu.py b/operatorspy/tests/swiglu.py index b8a6ea6b..6baf7358 100644 --- a/operatorspy/tests/swiglu.py +++ b/operatorspy/tests/swiglu.py @@ -124,7 +124,6 @@ def test_in_place1( descriptor, a_tensor.data, a_tensor.data, b_tensor.data, None ) ) - assert torch.allclose(a, ans, atol=1e-4, rtol=1e-2) print("in-place1 Test passed!") diff --git a/src/devices/teco/tensor_teco.cpp b/src/devices/teco/tensor_teco.cpp deleted file mode 100644 index 6b3b7575..00000000 --- a/src/devices/teco/tensor_teco.cpp +++ /dev/null @@ -1,23 +0,0 @@ -#include "tensor_teco.h" - - -infiniopStatus_t tecoTensorDescriptor::fromInfiniOpTensorDescriptor(infiniopTensorDescriptor_t y_desc) { - uint64_t ndim = y->ndim; - // Cast shape type - auto shape = new std::vector(ndim); - auto strides = new std::vector(ndim); - for (uint64_t i = 0; i < ndim; ++i) { - (*shape)[i] = static_cast(y->shape[i]); - (*strides)[i] = y->strides[i]; - } - tecoblasDataType_t dt; - if (dtype_eq(y->dt, F16)) { - dt = tecoblasDataType_t::TECOBLAS_DATA_FLOAT; - } else if (dtype_eq(y->dt, F32)) { - dt = 
+    tecodnnHandle_t handle;
+    tecodnnActivationDescriptor_t activationDesc;
+    tecodnnTensorDescriptor_t aDesc, bDesc, cDesc;
+};
+
+typedef struct SwiGLUTecoDescriptor *SwiGLUTecoDescriptor_t;
+
+infiniopStatus_t tecoCreateSwiGLUDescriptor(TecoHandle_t handle,
+                                            SwiGLUTecoDescriptor_t *desc_ptr,
+                                            infiniopTensorDescriptor_t c_desc, 
+                                            infiniopTensorDescriptor_t a_desc,
+                                            infiniopTensorDescriptor_t b_desc);
+
+infiniopStatus_t tecoSwiGLU(SwiGLUTecoDescriptor_t desc,
+                            void *c,
+                            void const *a,
+                            void const *b,
+                            void *stream);
+
+infiniopStatus_t tecoDestroySwiGLUDescriptor(SwiGLUTecoDescriptor_t desc);
+
+#endif
\ No newline at end of file

From 6f6019a1e19d33729f48182c5416d82e84871a2f Mon Sep 17 00:00:00 2001
From: crapromer
Date: Mon, 24 Feb 2025 09:07:03 +0800
Subject: [PATCH 15/15] implement swiglu in-place and out-of-place operation

---
 operatorspy/tests/swiglu.py                  |  1 -
 src/devices/teco/tensor_teco.cpp             | 23 ------
 src/devices/teco/tensor_teco.h               | 26 -------
 src/ops/swiglu/operator.cc                   |  2 +-
 .../teco/{swiglu_tecodnn.h => swiglu_sdaa.h} | 15 ++--
 src/ops/swiglu/teco/swiglu_sdaa.scpp         | 70 +++++++++++++++++++
 src/ops/swiglu/teco/swiglu_tecodnn.cc        | 44 ------------
 xmake.lua                                    |  2 +-
 8 files changed, 79 insertions(+), 104 deletions(-)
 delete mode 100644 src/devices/teco/tensor_teco.cpp
 delete mode 100644 src/devices/teco/tensor_teco.h
 rename src/ops/swiglu/teco/{swiglu_tecodnn.h => swiglu_sdaa.h} (78%)
 create mode 100644 src/ops/swiglu/teco/swiglu_sdaa.scpp
 delete mode 100644 src/ops/swiglu/teco/swiglu_tecodnn.cc

diff --git a/operatorspy/tests/swiglu.py b/operatorspy/tests/swiglu.py
index b8a6ea6b..6baf7358 100644
--- a/operatorspy/tests/swiglu.py
+++ b/operatorspy/tests/swiglu.py
@@ -124,7 +124,6 @@ def test_in_place1(
             descriptor, a_tensor.data, a_tensor.data, b_tensor.data, None
         )
     )
-
     assert torch.allclose(a, ans, atol=1e-4, rtol=1e-2)
     print("in-place1 Test passed!")
 
diff --git a/src/devices/teco/tensor_teco.cpp b/src/devices/teco/tensor_teco.cpp
deleted file mode 100644
index 6b3b7575..00000000
--- a/src/devices/teco/tensor_teco.cpp
+++ /dev/null
@@ -1,23 +0,0 @@
-#include "tensor_teco.h"
-
-
-infiniopStatus_t tecoTensorDescriptor::fromInfiniOpTensorDescriptor(infiniopTensorDescriptor_t y_desc) {
-    uint64_t ndim = y->ndim;
-    // Cast shape type
-    auto shape = new std::vector<int64_t>(ndim);
-    auto strides = new std::vector<int64_t>(ndim);
-    for (uint64_t i = 0; i < ndim; ++i) {
-        (*shape)[i] = static_cast<int64_t>(y->shape[i]);
-        (*strides)[i] = y->strides[i];
-    }
-    tecoblasDataType_t dt;
-    if (dtype_eq(y->dt, F16)) {
-        dt = tecoblasDataType_t::TECOBLAS_DATA_FLOAT;
-    } else if (dtype_eq(y->dt, F32)) {
-        dt = aclDataType::TECOBLAS_DATA_DOUBLE;
-    } else {
-        return STATUS_BAD_TENSOR_DTYPE;
-    }
-
-    return STATUS_SUCCESS;
-}
\ No newline at end of file
diff --git a/src/devices/teco/tensor_teco.h b/src/devices/teco/tensor_teco.h
deleted file mode 100644
index 906edd7b..00000000
--- a/src/devices/teco/tensor_teco.h
+++ /dev/null
@@ -1,26 +0,0 @@
-#ifndef __TECO_TENSOR__
-#define __TECO_TENSOR__
-
-#include "operators.h"
-#include "tensor.h"
-#include <tecoblas.h>
-#include <tecodnn.h>
-#include <vector>
-
-struct tecoTensorDescriptor {
-    uint64_t ndim;
-    int64_t *shape;
-    int64_t *strides;
-    tecoblasDataType_t data_type;
-    tecodnnDataType_t data_type;
-    tecodnnTensorDescriptor_t
-    infiniopStatus_t fromInfiniOpTensorDescriptor(infiniopTensorDescriptor_t y_desc);
-    infiniopStatus_t createTensor();
-    infiniopStatus_t destroyTensor();
-    ~tecoTensorDescriptor();
-
-};
-
-typedef tecoTensorDescriptor *tecoTensorDescriptor_t;
-
-#endif
\ No newline at end of file
diff --git a/src/ops/swiglu/operator.cc b/src/ops/swiglu/operator.cc
index 879918f7..3d2c32ee 100644
--- a/src/ops/swiglu/operator.cc
+++ b/src/ops/swiglu/operator.cc
@@ -15,7 +15,7 @@
 #include "ascend/swiglu.h"
 #endif
 #ifdef ENABLE_TECO_SDAA
-#include "teco/swiglu_tecodnn.h"
+#include "teco/swiglu_sdaa.h"
 #endif
 
 __C infiniopStatus_t infiniopCreateSwiGLUDescriptor(infiniopHandle_t handle,
diff --git a/src/ops/swiglu/teco/swiglu_tecodnn.h b/src/ops/swiglu/teco/swiglu_sdaa.h
similarity index 78%
rename from src/ops/swiglu/teco/swiglu_tecodnn.h
rename to src/ops/swiglu/teco/swiglu_sdaa.h
index f3b1d9dc..f41f325f 100644
--- a/src/ops/swiglu/teco/swiglu_tecodnn.h
+++ b/src/ops/swiglu/teco/swiglu_sdaa.h
@@ -1,24 +1,22 @@
-#ifndef __TECO_SWIGLU_H__
-#define __TECO_SWIGLU_H__
-
+#ifndef __SDAA_SWIGLU_H__
+#define __SDAA_SWIGLU_H__
 #include "operators.h"
 #include <sdaa_runtime.h>
-#include <tecodnn.h>
 #include "../../../devices/teco/teco_handle.h"
 struct SwiGLUTecoDescriptor {
     Device device;
     int device_id;
     sdaaStream_t stream;
-    // tecodnn state built once in tecoCreateSwiGLUDescriptor and reused on
-    // every launch; nothing releases it yet (tecoDestroySwiGLUDescriptor is a
-    // stub), so the handle and descriptors leak until matching tecodnn
-    // destroy calls (assumed to mirror cuDNN's) are added.
-    tecodnnHandle_t handle;
-    tecodnnActivationDescriptor_t activationDesc;
-    tecodnnTensorDescriptor_t aDesc, bDesc, cDesc;
+    uint64_t rows, cols;
+    int64_t lda, ldb, ldc;
 };
 
 typedef struct SwiGLUTecoDescriptor *SwiGLUTecoDescriptor_t;
 
+
 infiniopStatus_t tecoCreateSwiGLUDescriptor(TecoHandle_t handle,
                                             SwiGLUTecoDescriptor_t *desc_ptr,
-                                            infiniopTensorDescriptor_t c_desc, 
+                                            infiniopTensorDescriptor_t c_desc,
                                             infiniopTensorDescriptor_t a_desc,
                                             infiniopTensorDescriptor_t b_desc);
@@ -30,4 +28,5 @@ infiniopStatus_t tecoSwiGLU(SwiGLUTecoDescriptor_t desc,
 
 infiniopStatus_t tecoDestroySwiGLUDescriptor(SwiGLUTecoDescriptor_t desc);
 
+
 #endif
\ No newline at end of file
diff --git a/src/ops/swiglu/teco/swiglu_sdaa.scpp b/src/ops/swiglu/teco/swiglu_sdaa.scpp
new file mode 100644
index 00000000..c97c6410
--- /dev/null
+++ b/src/ops/swiglu/teco/swiglu_sdaa.scpp
@@ -0,0 +1,70 @@
+#include "swiglu_sdaa.h"
+__local__ halfv16 tempa, tempb, tempc;
+
+__device__ void silu_halfv16(halfv16 *c, halfv16 *a, halfv16 *b) {
+    floatv16 one_v = simd_stretch(1.0f);
+    floatv16 a_silu = simd_div(simd_cvt_h2f(*b), simd_add(one_v, simd_exp(0 - simd_cvt_h2f(*b))));
+    halfv16 out = simd_cvt_f2h(simd_mul(simd_cvt_h2f(*a), a_silu));
+    *c = out;
+}
+
+__device__ void silu_half(half *c, const half *a, const half *b) {
+    *c = (*b) * (*a) / (1.0 + expf(0 - *b));
+}
+
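+// Kernel layout: rows are striped across the threadDim threads (row index
+// threadIdx + i * threadDim), the inner loop handles 16 half elements per
+// step via the halfv16 SIMD path, and the scalar silu_half picks up the
+// cols % 16 tail. threadDim/threadIdx are SDAA's scalar analogues of CUDA's
+// block-thread indices, going by their usage in this file; lda/ldb/ldc are
+// the row strides captured in tecoCreateSwiGLUDescriptor below.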
+__global__ void swiglu(half *c, half const *a, half const *b, size_t rows, size_t cols, size_t lda, size_t ldb, size_t ldc) {
+    int vector_size = 16;
+    for (size_t i = 0; i < rows / threadDim + 1; i++) {
+        if (threadIdx < rows - i * threadDim) {
+            size_t j = 0;
+            for (; j < cols / vector_size; j++) {
+                simd_load(tempa, a + (threadIdx + i * threadDim) * lda + j * vector_size);
+                simd_load(tempb, b + (threadIdx + i * threadDim) * ldb + j * vector_size);
+                silu_halfv16(&tempc, &tempa, &tempb);
+                simd_store(tempc, c + (threadIdx + i * threadDim) * ldc + j * vector_size);
+            }
+            for (size_t k = 0; k < cols - j * vector_size; k++) {
+                silu_half(
+                    c + (threadIdx + i * threadDim) * ldc + j * vector_size + k,
+                    a + (threadIdx + i * threadDim) * lda + j * vector_size + k,
+                    b + (threadIdx + i * threadDim) * ldb + j * vector_size + k);
+            }
+        }
+    }
+}
+
+infiniopStatus_t tecoCreateSwiGLUDescriptor(TecoHandle_t handle,
+                                            SwiGLUTecoDescriptor_t *desc_ptr,
+                                            infiniopTensorDescriptor_t c_desc,
+                                            infiniopTensorDescriptor_t a_desc,
+                                            infiniopTensorDescriptor_t b_desc) {
+    *desc_ptr = new SwiGLUTecoDescriptor{
+        handle->device,
+        handle->device_id,
+        handle->stream,
+        a_desc->shape[0],
+        a_desc->shape[1],
+        a_desc->strides[0],
+        b_desc->strides[0],
+        c_desc->strides[0],
+    };
+    return STATUS_SUCCESS;
+}
+
+infiniopStatus_t tecoSwiGLU(SwiGLUTecoDescriptor_t desc,
+                            void *c,
+                            void const *a,
+                            void const *b,
+                            void *stream) {
+    auto a_ptr = reinterpret_cast<half const *>(a);
+    auto b_ptr = reinterpret_cast<half const *>(b);
+    auto c_ptr = reinterpret_cast<half *>(c);
+    swiglu<<<1>>>(c_ptr, a_ptr, b_ptr, desc->rows, desc->cols, desc->lda, desc->ldb, desc->ldc);
+    return STATUS_SUCCESS;
+}
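+
+// Launch note: swiglu<<<1>>> appears to request a single launch unit and
+// relies on the kernel's internal striping over threadDim threads. Neither
+// the stream argument nor desc->stream is forwarded to the launch, so the
+// kernel presumably runs on the default queue; worth revisiting if callers
+// depend on stream ordering.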
+243,7 @@ if has_config("teco") then local cc = "/opt/tecoai/bin/tecocc" local includedirs = table.concat(target:get("includedirs"), " ") - local args = {sourcefile, "-o", objectfile} + local args = {sourcefile, "-o", objectfile,"-O2", "-fPIC", "-Wall", "-Werror", "-std=c++17", "-pthread","-c"} for _, includedir in ipairs(target:get("includedirs")) do table.insert(args, "-I" .. includedir)