From 093436a24707ea048f6aac61d0e95c750e02e1a9 Mon Sep 17 00:00:00 2001
From: lviy <lviyptilopsis@gmail.com>
Date: Sun, 11 Jan 2026 23:52:06 +0800
Subject: [PATCH 1/7] =?UTF-8?q?[2025=E7=A7=8B=E5=AD=A3][T2-2-1]=20lviy?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .gitignore                                    |   4 +-
 ...47\350\241\214\351\252\214\350\257\201.md" |  22 +
 README.md                                     | 113 +--
 examples/llama.py                             |  82 +-
 include/infinicore_infer.h                    |   1 +
 include/infinicore_infer/models/Qwen3MoE.h    |  88 ++
 include/infinicore_infer/models/deepseek.h    |   1 +
 python/infinilm/__init__.py                   |   4 +-
 python/infinilm/cache_utils.py                |   8 +-
 python/infinilm/generation/utils.py           | 114 +--
 python/infinilm/modeling_utils.py             | 195 +---
 python/infinilm/models/llama/__init__.py      |  33 +-
 python/infinilm/models/llama/backends/cpp.py  |  38 +
 .../models/llama/configuration_llama.py       |  21 +-
 .../infinilm/models/llama/modeling_llama.py   |  95 +-
 scripts/jiuge.py                              |   6 +-
 scripts/jiuge_ppl.py                          |   2 -
 scripts/launch_server.py                      |   1 -
 scripts/libinfinicore_infer/__init__.py       |  14 +
 scripts/libinfinicore_infer/base.py           |   1 -
 scripts/libinfinicore_infer/qwen3_moe.py      | 115 +++
 src/models/Qwen3MoE/Qwen3MoE.cpp              | 606 +++++++++++++
 src/models/Qwen3MoE/Qwen3MoE_cache.cpp        |  50 ++
 src/models/Qwen3MoE/Qwen3MoE_impl.hpp         | 133 +++
 src/models/Qwen3MoE/Qwen3MoE_weight.cpp       | 268 ++++++
 src/models/deepseek_v3/deepseek_v3.cpp        |   1 -
 src/models/deepseek_v3/deepseek_v3_cache.cpp  |   1 +
 src/models/deepseek_v3/deepseek_v3_impl.hpp   | 201 ++---
 src/models/deepseek_v3/deepseek_v3_weight.cpp |   1 -
 src/models/inference_context.cpp              |  15 +
 test/models/qwen3_moe/attention_test.py       | 837 +++++++++---------
 xmake.lua                                     |  64 +-
 32 files changed, 2085 insertions(+), 1050 deletions(-)
 create mode 100644 ".trae/documents/\344\277\256\345\244\215 Softmax \347\262\276\345\272\246\345\271\262\346\211\260\345\271\266\346\211\247\350\241\214\351\252\214\350\257\201.md"
 create mode 100644 include/infinicore_infer/models/Qwen3MoE.h
 create mode 100644 python/infinilm/models/llama/backends/cpp.py
 create mode 100644 scripts/libinfinicore_infer/qwen3_moe.py
 create mode 100644 src/models/Qwen3MoE/Qwen3MoE.cpp
 create mode 100644 src/models/Qwen3MoE/Qwen3MoE_cache.cpp
 create mode 100644 src/models/Qwen3MoE/Qwen3MoE_impl.hpp
 create mode 100644 src/models/Qwen3MoE/Qwen3MoE_weight.cpp

diff --git a/.gitignore b/.gitignore
index 767db187..0c9ef52c 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,7 +1,6 @@
 # Xmake cache
 .xmake/
 build/
-python/infinilm/lib/*.so
 
 # MacOS Cache
 .DS_Store
@@ -11,13 +10,12 @@ python/infinilm/lib/*.so
 
 # Python
 __pycache__/
-*.egg-info/
 
 # Log
 *.log
 
 # Cache
-.cache/
+cache/
 
 # JSON
 *.json
diff --git "a/.trae/documents/\344\277\256\345\244\215 Softmax \347\262\276\345\272\246\345\271\262\346\211\260\345\271\266\346\211\247\350\241\214\351\252\214\350\257\201.md" "b/.trae/documents/\344\277\256\345\244\215 Softmax \347\262\276\345\272\246\345\271\262\346\211\260\345\271\266\346\211\247\350\241\214\351\252\214\350\257\201.md"
new file mode 100644
index 00000000..6680a970
--- /dev/null
+++ "b/.trae/documents/\344\277\256\345\244\215 Softmax \347\262\276\345\272\246\345\271\262\346\211\260\345\271\266\346\211\247\350\241\214\351\252\214\350\257\201.md"	
@@ -0,0 +1,22 @@
+根据您的指令和之前的分析，我制定了以下计划来彻底修复精度问题并进行验证。
+
+核心思路是：不仅在 `inject` 阶段，在 `forward` 阶段的 Gather Buffer 也必须初始化为 `-inf`，以防止 Padding 区域的 0 值干扰 Softmax 计算。
+
+### 1. 代码完善 (C++)
+在 `src/models/Qwen3MoE/Qwen3MoE.cpp` 中继续修改 `forwardQwen3MoEAttention` 函数：
+- 找到 `k_padded_gather` 和 `v_padded_gather` 的初始化代码。
+- 将原本的 `cudaMemsetAsync` (清零) 替换为我们新实现的 `launch_fill_val_bf16` (填充 `-inf`)。
+- **原因**：这是解决 `past=0` 场景下精度下降（0.99 -> 0.98/0.96）的关键，确保 Padding 不会参与 Attention 权重计算。
+
+### 2. 编译与运行
+使用您提供的完整环境命令进行编译和测试：
+```bash
+cd '/data/users/lviy/InfiniLM' ; source /data/apps/env.sh && source /data/apps/miniforge3/etc/profile.d/conda.sh && conda activate py313 && export INFINI_ROOT=$HOME/.infini && export LD_LIBRARY_PATH=$INFINI_ROOT/lib:$LD_LIBRARY_PATH && export PATH="/data/apps/xmake/bin:/usr/local/cuda/bin:$PATH" && xmake && srun --gres=gpu:nvidia:1 --cpus-per-task=8 --mem=16G python test/models/qwen3_moe/attention_test.py --model_path "/data/shared/models/Qwen3-30B-A3B-Instruct-2507-Layer-0" --nvidia
+```
+
+### 3. 结果验证
+观察输出日志：
+- **Debug Log**: 确认 `[Inject]` 的分配逻辑是否如预期（Batch 0 Alloc, Batch 3 Reuse）。
+- **Cosine Similarity**: 检查是否恢复到 > 0.99（预期 0.0000 问题应随之解决，因为构建环境已修复）。
+
+如果测试通过，我将删除 Debug Print 并交付最终代码。
\ No newline at end of file
diff --git a/README.md b/README.md
index 350d2d9e..791217cc 100644
--- a/README.md
+++ b/README.md
@@ -15,19 +15,19 @@ xmake && xmake install
 - 运行模型推理测试
 
 ```bash
-python scripts/jiuge.py [--cpu | --nvidia | --qy | --cambricon | --ascend | --metax | --moore | --iluvatar | --kunlun | --hygon] path/to/model_dir [n_device]
+python scripts/jiuge.py [--cpu | --nvidia | --cambricon | --ascend | --metax | --moore | --iluvatar | --kunlun | --hygon] path/to/model_dir [n_device]
 ```
 
 - 部署模型推理服务
 
 ```bash
-python scripts/launch_server.py --model-path MODEL_PATH [-h] [--dev {cpu,nvidia,qy, cambricon,ascend,metax,moore,iluvatar,kunlun,hygon}] [--ndev NDEV] [--max-batch MAX_BATCH] [--max-tokens MAX_TOKENS]
+python scripts/launch_server.py --model-path MODEL_PATH [-h] [--dev {cpu,nvidia,cambricon,ascend,metax,moore,iluvatar,kunlun,hygon}] [--ndev NDEV] [--max-batch MAX_BATCH] [--max-tokens MAX_TOKENS]
 ```
 
 - 测试模型推理服务性能
 
 ```bash
-python scripts/test_perf.py
+python scripts/test_perf.py 
 ```
 
 - 使用推理服务测试模型困惑度（Perplexity）
@@ -37,98 +37,21 @@ python scripts/test_ppl.py --model-path MODEL_PATH [--ndev NDEV] [--max-batch MA
 ```
 
 ## 使用方式(新版)
-#### 一、编译并安装 `InfiniCore`
-编译并安装 `InfiniCore`， 详情见 InfiniCore的 [`README`](https://github.com/InfiniTensor/InfiniCore) :
 
-- 注意根据提示设置好 `INFINI_ROOT` 环境变量（默认为 `$HOME/.infini`）
-- 根据硬件平台，选择 xmake 构建配置
-- 编译安装InfiniCore
-- 安装 C++ 库
-- 安装 Python 包
+- 编译并安装 `InfiniCore`， 详情见 InfiniCore的 [`README`](https://github.com/InfiniTensor/InfiniCore) :
+    
+    - 注意根据提示设置好 `INFINI_ROOT` 环境变量（默认为 `$HOME/.infini`）
+    - 根据硬件平台，选择 xmake 构建配置
+    - 编译安装InfiniCore
+    - 安装 C++ 库
+    - 安装 Python 包
 
-
-#### 二、编译并安装  `InfiniLM`
-  - 克隆项目
-
-    由于仓库中含有子模块，所以在克隆时请添加 `--recursive` 或 `--recurse-submodules`，如：
-
-    ```shell
-    git clone --recursive https://github.com/InfiniTensor/InfiniLM.git
-    ```
-
-    或者在普通克隆后进行更新：
-
-    ```shell
-    git submodule update --init --recursive
-    ```
-
-
-  - 安装 InfiniLM Python 包
-    ```bash
-      pip install -e .
-    ```
-
-  - 单次推理测试
+- 单次推理测试
     - llama示例
-    ```bash
-    python examples/llama.py [--cpu | --nvidia | --metax | --moore | --iluvatar] --model_path=<path/to/model_dir>
-    ```
-    - 例如：
-    ```bash
-    python examples/llama.py --nvidia --model_path=/models/TinyLlama-1.1B-Chat-v1.0
-    ```
-  - 分布式推理测试
-      - 9g示例
-      ```bash
-    python examples/jiuge.py [---nvidia] --model_path=<path/to/model_dir> --backend=cpp --tp=NDEV --batch_size=MAX_BATCH
-    ```
-
-    - 例如： 9G7B模型，cpp后端，batch_size为16，4卡分布式
-    ```bash
-    python examples/jiuge.py --nvidia --model_path=/models/9G7B_MHA/ --backend=cpp --tp=4 --batch_size=16
-    ```
-
-  - 运行推理基准测试（C-Eval/MMLU）
-
-    ```bash
-    python test/bench/test_benchmark.py [--cpu | --nvidia | --cambricon | --ascend | --metax | --moore | --iluvatar | --kunlun | --hygon] <path/to/model_dir> --bench {ceval|mmlu} [--backend cpp] [--ndev N] [--subject SUBJECT] [--num_samples N] [--max_new_tokens N] [--output_csv PATH] [--cache_dir PATH]
-    ```
-
-    - 参数说明：
-      - `--subject`: 指定科目，支持单个科目、多个科目（逗号分隔）或 `all`（默认值，加载全部科目）
-      - `--output_csv`: 可选，指定CSV输出文件路径。如未指定则不生成CSV文件。CSV包含每个科目的结果和总体结果
-      - `--cache_dir`: 可选，指定数据集缓存目录的父目录。应指向包含 `ceval___ceval-exam` 和 `cais___mmlu` 等数据集子目录的父目录（例如 `~/.cache/huggingface/datasets/`）。设置后脚本优先使用本地 CSV（`pandas.read_csv`）离线加载数据，避免 `load_dataset` 的网络请求
-
-    - C-Eval示例：
-      - 单个科目：
-        ```bash
-        python test/bench/test_benchmark.py --nvidia /models/9G7B_MHA --bench ceval --subject middle_school_mathematics --num_samples 100 --backend cpp --ndev 1
-        ```
-      - 多个科目（逗号分隔）：
-        ```bash
-        python test/bench/test_benchmark.py --nvidia /models/9G7B_MHA --bench ceval --subject middle_school_mathematics,high_school_physics --backend cpp --ndev 1 --output_csv results.csv
-        ```
-      - 全部科目并输出CSV：
-        ```bash
-        python test/bench/test_benchmark.py --nvidia /models/9G7B_MHA --bench ceval --subject all --backend cpp --ndev 1 --output_csv results.csv
-        ```
-      - 使用缓存目录加速加载：
-        ```bash
-        python test/bench/test_benchmark.py --nvidia /models/9G7B_MHA --bench ceval --subject middle_school_mathematics --backend cpp --ndev 1 --cache_dir ~/.cache/huggingface/datasets/
-        ```
-        > 注意：`--cache_dir` 应指向包含 `ceval___ceval-exam` 和 `cais___mmlu` 等数据集子目录的父目录，而不是直接指向这些子目录
-
-    - MMLU示例：
-      - 单个科目：
-        ```bash
-        python test/bench/test_benchmark.py --nvidia /models/9G7B_MHA --bench mmlu --subject abstract_algebra --backend cpp --ndev 1
-        ```
-      - 多个科目（逗号分隔）：
-        ```bash
-        python test/bench/test_benchmark.py --nvidia /models/9G7B_MHA --bench mmlu --subject abstract_algebra,anatomy,astronomy --backend cpp --ndev 1 --output_csv results.csv
-        ```
-      - 使用缓存目录加速加载：
-        ```bash
-        python test/bench/test_benchmark.py --nvidia /models/9G7B_MHA --bench mmlu --subject abstract_algebra --backend cpp --ndev 1 --cache_dir ~/.cache/huggingface/datasets/
-        ```
-        > 注意：`--cache_dir` 应指向包含 `ceval___ceval-exam` 和 `cais___mmlu` 等数据集子目录的父目录，而不是直接指向这些子目录
+```bash
+python examples/llama.py [--cpu | --nvidia | --metax | --moore | --iluvatar] --model_path=<path/to/model_dir>
+```
+例如：
+```bash
+python examples/llama.py --nvidia --model_path=~/TinyLlama-1.1B-Chat-v1.0
+```
\ No newline at end of file
diff --git a/examples/llama.py b/examples/llama.py
index aa890ca9..611a5866 100644
--- a/examples/llama.py
+++ b/examples/llama.py
@@ -1,15 +1,17 @@
-import infinicore
-from transformers import AutoTokenizer
-from tokenizers import decoders as _dec
-from infinilm.modeling_utils import get_model_state_dict
-import infinilm
-import argparse
 import sys
 import time
 import os
 
 sys.path.insert(0, os.path.join(os.path.dirname(__file__), "../python"))
 
+import argparse
+import infinilm
+from infinilm.modeling_utils import get_model_state_dict
+from tokenizers import decoders as _dec
+from transformers import AutoTokenizer
+
+import infinicore
+
 
 def get_args():
     parser = argparse.ArgumentParser(description="run Llama args")
@@ -57,35 +59,22 @@ def get_args():
         default="python",
         help="python or cpp model",
     )
-    parser.add_argument(
-        "--batch_size",
-        type=int,
-        default=1,
-        help="number of prompts in a batch",
-    )
-    parser.add_argument(
-        "--prompt",
-        type=str,
-        default="How are you",
-        help="input prompt",
-    )
-
     return parser.parse_args()
 
 
 def test(
-    prompts: str | list[str],
+    prompt,
     model_path,
     max_new_tokens=100,
+    infini_dtype=infinicore.bfloat16,
     infini_device=infinicore.device("cpu", 0),
+    backend="python",
 ):
-    model_path = os.path.expanduser(model_path)
     # ---------------------------------------------------------------------------- #
     #                        创建模型,
     # ---------------------------------------------------------------------------- #
     model = infinilm.AutoLlamaModel.from_pretrained(
-        model_path,
-        device=infini_device,
+        model_path, device=infini_device, dtype=infini_dtype, backend=backend
     )
 
     # ---------------------------------------------------------------------------- #
@@ -94,17 +83,19 @@ def test(
     model_param_infini = get_model_state_dict(
         model_path,
         device=infini_device,
-        dtype=model.config.dtype,
+        dtype=infini_dtype,
     )
 
-    model.load_state_dict(model_param_infini, strict=True)
+    model.load_state_dict(model_param_infini)
+
+    config = model.config
 
     # ---------------------------------------------------------------------------- #
     #                        创建 tokenizer
     # ---------------------------------------------------------------------------- #
-    tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
+    tokenizer = AutoTokenizer.from_pretrained(model_path)
 
-    if "llama" == model.config.model_type:
+    if "llama" == config.model_type:
         backend = getattr(tokenizer, "backend_tokenizer", None)
         target = getattr(backend, "_tokenizer", backend)
         norm = getattr(target, "normalizer", None)
@@ -121,39 +112,32 @@ def test(
                     _dec.Fuse(),
                 ]
             )
-    else:
-        raise ValueError(f"Unsupported model type: {model.config.model_type}")
 
     # ---------------------------------------------------------------------------- #
     #                        token编码
     # ---------------------------------------------------------------------------- #
     # prompt = "山东最高的山是？"
-    if isinstance(prompts, str):
-        prompts = [prompts]
-    input_contents = [
-        tokenizer.apply_chat_template(
-            conversation=[{"role": "user", "content": prompt}],
-            add_generation_prompt=True,
-            tokenize=False,
-        )
-        for prompt in prompts
-    ]
-    print(input_contents[0], end="", flush=True)
-    input_ids_list = tokenizer.batch_encode_plus(input_contents)[
-        "input_ids"
-    ]  # List: [[1, 1128, 526, 366, 29892]]
+    input_content = tokenizer.apply_chat_template(
+        conversation=[{"role": "user", "content": prompt}],
+        add_generation_prompt=True,
+        tokenize=False,
+    )
+    print(input_content, end="", flush=True)
+    input_ids = tokenizer.encode(input_content)
 
     # ---------------------------------------------------------------------------- #
     #                        自回归生成
     # ---------------------------------------------------------------------------- #
+    input_ids_list = [input_ids]  # List: [[1, 1128, 526, 366, 29892]]
     input_ids_infini = infinicore.from_list(input_ids_list)
 
     t1 = time.time()
-    print("=================== start generate ====================")
     model.generate(
         input_ids_infini,
         max_new_tokens=max_new_tokens,
+        device=infini_device,
         tokenizer=tokenizer,
+        config=config,
     )
     t2 = time.time()
 
@@ -184,20 +168,20 @@ def test(
             "such as, python examples/llama.py --nvidia --model_path=~/TinyLlama-1.1B-Chat-v1.0"
         )
         sys.exit(1)
-    prompts = [args.prompt for _ in range(args.batch_size)]
+    prompt = "山东最高的山是？"
 
     model_path = args.model_path
     max_new_tokens = args.max_new_tokens
     backend = args.backend
 
-    if backend != "python":
-        raise ValueError(f"Unsupported backend: {backend}.")
-
     infini_device = infinicore.device(device_str, 0)
+    infini_dtype = infinicore.bfloat16
 
     test(
-        prompts,
+        prompt,
         model_path,
         max_new_tokens,
         infini_device=infini_device,
+        infini_dtype=infini_dtype,
+        backend=backend,
     )
diff --git a/include/infinicore_infer.h b/include/infinicore_infer.h
index 0bed7bc7..82802acd 100644
--- a/include/infinicore_infer.h
+++ b/include/infinicore_infer.h
@@ -6,5 +6,6 @@
 
 #include "infinicore_infer/models/deepseek.h"
 #include "infinicore_infer/models/jiuge.h"
+#include "infinicore_infer/models/Qwen3MoE.h"
 
 #endif /* INFINICORE_INFER_H */
diff --git a/include/infinicore_infer/models/Qwen3MoE.h b/include/infinicore_infer/models/Qwen3MoE.h
new file mode 100644
index 00000000..0ea8047c
--- /dev/null
+++ b/include/infinicore_infer/models/Qwen3MoE.h
@@ -0,0 +1,88 @@
+#ifndef QWEN3MOE
+#define QWEN3MOE
+
+#include <infiniccl.h>
+#include <infiniop.h>
+#include <infinirt.h>
+
+struct Qwen3MoEWeights;
+
+/// @brief 函数指针
+typedef void (*load_global)(Qwen3MoEWeights *, void *cpu_ptr);
+typedef void (*load_layer)(Qwen3MoEWeights *, void *cpu_ptr, size_t layer_id);
+typedef void (*load_layer_linear)(Qwen3MoEWeights *, void *weight_ptr, size_t layer_id);
+/// @brief 权重加载器
+typedef struct {
+    // Pre-Norm
+    load_layer load_attn_norm;
+
+    // Attention
+    load_layer_linear load_attn_q_proj; 
+    load_layer_linear load_attn_k_proj; 
+    load_layer_linear load_attn_v_proj; 
+
+    // QKNorm(RMSNorm)
+    load_layer load_attn_q_norm;
+    load_layer load_attn_k_norm;
+
+    // output linear
+    load_layer_linear load_attn_o_proj;
+
+}Qwen3MoEWeightLoader;
+
+struct Qwen3MoEAttention;
+
+/// @brief 模型参数
+typedef struct {
+    //数据种类 BF16 / FP16
+    infiniDtype_t dtype;
+
+    // Linear args
+    size_t hidden_size;
+    size_t num_heads;
+    size_t num_kv_head; // k_v head GQA广播倍数
+    size_t head_dim;
+
+    // RoPE args
+    float rope_theta;
+    size_t max_seq_len; 
+
+    float rms_norm_eps; //防止除零
+}Qwen3MoEAttentionMeta;
+
+/// ==================== API ====================
+
+/// @brief 创建注意力模块
+__C __export struct Qwen3MoEAttention *
+createQwen3MoEAttention(const Qwen3MoEAttentionMeta *,
+                        const Qwen3MoEWeights *);
+/// @brief 创建权重矩阵
+__C Qwen3MoEWeights *
+createQwen3MoEWeights(const Qwen3MoEAttentionMeta *meta,
+                        infiniDevice_t device,
+                        int ndev,             
+                        const int *dev_ids);  
+/// @brief 创建weight加载器
+__C __export Qwen3MoEWeightLoader *
+createQwen3MoEWeightLoader();
+/// @brief 创建KVCache
+__C __export struct Qwen3Cache *
+createQwen3Cache(const Qwen3MoEAttentionMeta *meta,
+                    size_t batch_size, size_t seq_len);
+/// @brief 前向计算
+__C __export void forwardQwen3MoEAttention(
+    struct Qwen3MoEAttention* context,
+    struct Qwen3Cache* kv_cache,
+    const void* input_tensor,
+    void* output_tensor,
+    int batch_size,           // [新增]
+    const int* seq_lens_ptr,  // [新增]
+    const int* past_lens_ptr, // [新增]
+    const int* pos_ids_ptr    // [新增]
+);
+
+/// @brief 销毁模型
+__C __export void destroyQwen3MoEAttention(struct Qwen3MoEAttention* ctx);
+
+
+#endif 
\ No newline at end of file
diff --git a/include/infinicore_infer/models/deepseek.h b/include/infinicore_infer/models/deepseek.h
index 3924c5fe..051637b4 100644
--- a/include/infinicore_infer/models/deepseek.h
+++ b/include/infinicore_infer/models/deepseek.h
@@ -8,6 +8,7 @@
 #include <stddef.h>
 #include <stdint.h>
 
+    
 struct DeepSeekV3Weights;
 
 // Function pointer signatures
diff --git a/python/infinilm/__init__.py b/python/infinilm/__init__.py
index 0fbee2ca..262fd084 100644
--- a/python/infinilm/__init__.py
+++ b/python/infinilm/__init__.py
@@ -1,5 +1,3 @@
 from .models import AutoLlamaModel
-from . import distributed
-from . import cache
 
-__all__ = ["AutoLlamaModel", "distributed", "cache"]
+__all__ = ["AutoLlamaModel"]
diff --git a/python/infinilm/cache_utils.py b/python/infinilm/cache_utils.py
index 3587886b..fbd566e4 100644
--- a/python/infinilm/cache_utils.py
+++ b/python/infinilm/cache_utils.py
@@ -65,12 +65,12 @@ def lazy_initialization(self, key_states: infinicore.Tensor):
             self.max_seq_len = max(self.max_position_embeddings, seq_len)
 
             self.keys = infinicore.empty(
-                (batch_size, self.max_seq_len, num_heads, head_dim),
+                [batch_size, self.max_seq_len, num_heads, head_dim],
                 dtype=dtype,
                 device=device,
             )
             self.values = infinicore.empty(
-                (batch_size, self.max_seq_len, num_heads, head_dim),
+                [batch_size, self.max_seq_len, num_heads, head_dim],
                 dtype=dtype,
                 device=device,
             )
@@ -80,12 +80,12 @@ def lazy_initialization(self, key_states: infinicore.Tensor):
             self.max_seq_len = max(self.max_seq_len * 2, self.cache_position + seq_len)
 
             keys_new = infinicore.empty(
-                (batch_size, self.max_seq_len, num_heads, head_dim),
+                [batch_size, self.max_seq_len, num_heads, head_dim],
                 dtype=dtype,
                 device=device,
             )
             values_new = infinicore.empty(
-                (batch_size, self.max_seq_len, num_heads, head_dim),
+                [batch_size, self.max_seq_len, num_heads, head_dim],
                 dtype=dtype,
                 device=device,
             )
diff --git a/python/infinilm/generation/utils.py b/python/infinilm/generation/utils.py
index 00143231..4da145cd 100644
--- a/python/infinilm/generation/utils.py
+++ b/python/infinilm/generation/utils.py
@@ -47,14 +47,18 @@ def _get_initial_position_ids(
         self,
         bs: int,
         seq_length: int,
+        device: infinicore.device,
     ) -> infinicore.Tensor:
         """Calculates `position_ids` for the pre-fill stage"""
         position_ids_list = [list(range(0, seq_length)) for i in range(bs)]
 
-        return infinicore.from_list(position_ids_list, dtype=infinicore.int64)
+        return infinicore.from_list(
+            position_ids_list, dtype=infinicore.int64, device=device
+        )
 
     def prepare_inputs_for_generation(
         self,
+        device: infinicore.device,
         past_key_values: Optional[Cache] = None,
         **kwargs,
     ):
@@ -69,18 +73,18 @@ def prepare_inputs_for_generation(
             model_inputs["past_key_values"] = past_key_values
 
         # -------------------------------------------------------------------------- #
-        #                     计算所需的: position_ids
+        #                     计算所需的，position_ids
         # -------------------------------------------------------------------------- #
         current_position_ids = kwargs.get("position_ids", None)
         if current_position_ids is None:
             # prill阶段
             bs, seq_len = kwargs["input_ids"].shape[0:2]
-            model_inputs["position_ids"] = self._get_initial_position_ids(bs, seq_len)
-            model_inputs["cache_positions"] = infinicore.from_list(
-                [0], dtype=infinicore.int64
+            model_inputs["position_ids"] = self._get_initial_position_ids(
+                bs, seq_len, device
             )
+
         else:
-            # decode 阶段
+            # decoder 阶段
             bs, seq_len = current_position_ids.shape
             last_position = current_position_ids.narrow(1, seq_len - 1, 1)
 
@@ -92,21 +96,13 @@ def prepare_inputs_for_generation(
 
             next_position = one_value + last_position
             model_inputs["position_ids"] = next_position
-            model_inputs["cache_positions"] = kwargs[
-                "cache_positions"
-            ] + infinicore.from_list(
-                [seq_len],
-                dtype=last_position.dtype,
-                device=last_position.device,
-            )
+
         # -------------------------------------------------------------------- #
         #                 所需的: token的input_ids
         # -------------------------------------------------------------------- #
-        if kwargs.get("next_token_ids", None) is not None:
-            next_token_ids = kwargs["next_token_ids"]
-            model_inputs["input_ids"] = infinicore.from_list(
-                [[id_] for id_ in next_token_ids],
-            )
+        if kwargs.get("next_token_id", None) is not None:
+            next_token_id = kwargs["next_token_id"]
+            model_inputs["input_ids"] = infinicore.from_list([[next_token_id]])
 
         # -------------------------------------------------------------------- #
         #                 其他
@@ -121,20 +117,22 @@ def generate(
         self,
         input_ids: infinicore.Tensor,
         max_new_tokens: int,
+        device: infinicore.device,
         tokenizer,
-        stop_on_eos=True,
+        config,
         **kwargs,
     ):
         model_kwargs = kwargs
 
-        # Check if this is a cpp backend model (has _model attribute with reset_cache method)
-        if not (hasattr(self, "_model") and hasattr(self._model, "reset_cache")):
-            if self.use_cache:
-                model_kwargs["use_cache"] = True
-                model_kwargs["past_key_values"] = DynamicCache(config=self.config)
-            else:
-                model_kwargs["use_cache"] = False
-                model_kwargs["past_key_values"] = None
+        # -------------------------------------------------------------------- #
+        #                       创建 cache                                      #
+        # -------------------------------------------------------------------- #
+        if self.use_cache:
+            model_kwargs["use_cache"] = True
+            model_kwargs["past_key_values"] = DynamicCache(config=self.config)
+        else:
+            model_kwargs["use_cache"] = False
+            model_kwargs["past_key_values"] = None
 
         # -------------------------------------------------------------------- #
         #                       _sample函数                                     #
@@ -142,8 +140,9 @@ def generate(
         result = self._sample(
             input_ids,
             max_new_tokens=max_new_tokens,
+            device=device,
             tokenizer=tokenizer,
-            stop_on_eos=stop_on_eos,
+            config=config,
             **model_kwargs,
         )
         return result
@@ -152,8 +151,9 @@ def _sample(
         self,
         input_ids: infinicore.Tensor,
         max_new_tokens: int,
+        device: infinicore.device,
         tokenizer,
-        stop_on_eos=True,
+        config,
         **model_kwargs,
     ):
         r"""
@@ -162,22 +162,17 @@ def _sample(
         Parameters:
             input_ids (batch_size, seq_len): The sequence used as a prompt for the generation.
             max_new_tokens: Maximum number of new tokens.
+            device: infinicore.device.
             tokenizer: translating data into raw text.
         """
 
         batch_size, seq_len = input_ids.shape[:2]
 
-        eos_token_id = self.config.eos_token_id
+        eos_token_id = config.eos_token_id
         eos_token_id_list = (
             [eos_token_id] if isinstance(eos_token_id, int) else eos_token_id
         )
 
-        # Extract sampling parameters from kwargs with defaults
-        random_val = model_kwargs.get("random_val", 0.1)
-        topp = model_kwargs.get("topp", 0.8)
-        topk = model_kwargs.get("topk", 1)
-        temperature = model_kwargs.get("temperature", 1.0)
-
         # -------------------------------------------------------------------------- #
         #                     初始化 position_ids
         # -------------------------------------------------------------------------- #
@@ -193,17 +188,15 @@ def _sample(
             # -------------------------------------------------------------------------- #
             #                     prepare model inputs
             # -------------------------------------------------------------------------- #
-            start_time = time.time()
-            model_inputs = self.prepare_inputs_for_generation(**model_kwargs)
+            model_inputs = self.prepare_inputs_for_generation(device, **model_kwargs)
 
             model_kwargs["position_ids"] = model_inputs["position_ids"]
-            model_kwargs["cache_positions"] = model_inputs["cache_positions"]
 
             # -------------------------------------------------------------------------- #
             #                     计算一次
             # -------------------------------------------------------------------------- #
+            start_time = time.time()
             logits = self(**model_inputs)
-            infinicore.sync_device()
 
             # -------------------------------------------------------------------------- #
             #                     处理输出
@@ -220,56 +213,43 @@ def _sample(
                 dtype=infinicore.int32,
                 device=token_scores.device,
             )
-
             for i in range(0, batch_size):
-                score = token_scores.narrow(0, i, 1).view((vocab_size,))
+                score = token_scores.narrow(0, i, 1).view([vocab_size])
                 out = next_tokens.narrow(0, i, 1).view([])
                 infinicore.nn.functional.random_sample(
                     score,
-                    random_val,
-                    topp,
-                    topk,
-                    temperature,
+                    0.8,
+                    0.1,
+                    1,
+                    1.0,
                     out=out,
                 )
 
             infinicore.sync_stream()  # 计算结束前需要同步
+
+            end_time = time.time()
+            time_list.append((end_time - start_time) * 1000)
+
             # ----------------------------------------------------------------- #
             #                得到下一个token的id，并解码为字符
             # ----------------------------------------------------------------- #
             token_id = next_tokens.to_numpy()[0]
             output_str = tokenizer.decode([token_id], skip_special_tokens=True)
 
-            model_kwargs["next_token_ids"] = next_tokens.to_numpy().tolist()
+            model_kwargs["next_token_id"] = token_id
             output_tokens_list.append(token_id)
             output_content += output_str
 
-            end_time = time.time()
-            time_list.append((end_time - start_time))
-
             print(output_str, end="", flush=True)
-            if stop_on_eos and token_id in eos_token_id_list:
+            if token_id in eos_token_id_list:
                 break
+
         print("\n</s>")
-        print(f"\n\n\n Generation completed in {round(sum(time_list) * 1000, 2)} ms")
         print(
-            f" Batchsize={batch_size}  Per_Batch_Input_Len={seq_len}  Per_Batch_New_Tokens={len(time_list)}\n"
+            f"\n\n\n Time per step:  prefill {round(time_list[0], 2)} token/ms\n",
         )
         print(
-            f" Prefill TTFT: {round(time_list[0], 2)}ms  Throughput: {round((batch_size * seq_len) / time_list[0], 2)}tok/s\n",
+            f" Time per step:  decoder {round(sum(time_list[1:]) / (len(time_list) - 1), 2)} token/ms \n",
         )
-        if len(time_list) > 1:
-            print(
-                f" Decode  Avg ITL: {round(sum(time_list[1:]) * 1000 / (len(time_list) - 1), 2)}ms   Throughput: {round((batch_size * (len(time_list) - 1)) / sum(time_list[1:]), 2)}tok/s\n",
-            )
 
-        return {
-            "output_token_ids": output_tokens_list,
-            "output_content": output_content,
-            "total_latency": sum(time_list),
-            "prefill_latency": time_list[0],
-            "decode_latency": sum(time_list[1:]),
-            "total_input_tokens": batch_size * seq_len,
-            "total_output_tokens": len(time_list),
-        }
         return output_tokens_list, output_content
diff --git a/python/infinilm/modeling_utils.py b/python/infinilm/modeling_utils.py
index 792aa503..9b1c6c87 100644
--- a/python/infinilm/modeling_utils.py
+++ b/python/infinilm/modeling_utils.py
@@ -1,10 +1,10 @@
 import os
-from typing import Dict, Union
-import time
+from typing import Dict, Optional, Union
+
 import torch
 from safetensors import safe_open
 import glob
-from tqdm import tqdm
+
 import infinicore
 
 str_to_torch_dtype = {
@@ -23,41 +23,15 @@
 }
 
 
-def check_parameters(model_keys: list, already_loaded_keys: list):
-    model_keys = set(model_keys)
-    already_loaded_keys = set(already_loaded_keys)
-    intersection = model_keys & already_loaded_keys
-
-    missing_keys = model_keys - intersection
-    unexpected_keys = already_loaded_keys - intersection
-    error_msgs: list[str] = []
-
-    if len(unexpected_keys) > 0:
-        error_msgs.append(
-            "Unexpected key(s) in state_dict: {}. ".format(
-                ", ".join('"{}"'.format(k) for k in unexpected_keys)
-            )
-        )
-    if len(missing_keys) > 0:
-        error_msgs.append(
-            "Missing key(s) in state_dict: {}. ".format(
-                ", ".join('"{}"'.format(k) for k in missing_keys)
-            )
-        )
-
-    if len(error_msgs) > 0:
-        raise RuntimeError(
-            "Error(s) in loading state_dict\n\t{}".format("\n\t".join(error_msgs))
-        )
-
-
 def load_state_dict(
-    checkpoint_file: Union[str, os.PathLike], device="cpu", dtype=torch.bfloat16
+    checkpoint_file: Union[str, os.PathLike],
+    map_location: Optional[Union[str, torch.device]] = "cpu",
+    weights_only: bool = True,
 ) -> Dict[str, torch.Tensor]:
     """
     Reads a `safetensor` checkpoint file. We load the checkpoint on "cpu" by default.
     """
-
+    # Use safetensors if possible
     if not checkpoint_file.endswith(".safetensors"):
         return {}
 
@@ -75,7 +49,20 @@ def load_state_dict(
             )
 
         for k in f.keys():
-            state_dict[k] = f.get_tensor(k).to(device=device, dtype=dtype)
+            if map_location == "meta":
+                _slice = f.get_slice(k)
+                k_dtype = _slice.get_dtype()
+                if k_dtype in str_to_torch_dtype:
+                    dtype = str_to_torch_dtype[k_dtype]
+                else:
+                    raise ValueError(
+                        f"Cannot load safetensors of unknown dtype {k_dtype}"
+                    )
+                state_dict[k] = torch.empty(
+                    size=_slice.get_shape(), dtype=dtype, device="meta"
+                )
+            else:
+                state_dict[k] = f.get_tensor(k)
 
     return state_dict
 
@@ -88,144 +75,30 @@ def get_model_state_dict(
     """
     Load the model weights.
     """
-
-    print(" read weights ......")
-    t1 = time.time()
-
-    torch_device = device.type
-    torch_dtype = infinicore.utils.to_torch_dtype(dtype)
-
     # --------------------------------------------------------- #
-    #          Load weights from  all *.safetensors files
+    #          使用从 *.safetensors文件中加载权重
     # --------------------------------------------------------- #
     model_param = {}
     for file_path in glob.glob(os.path.join(model_path, "*.safetensors")):
-        model_param.update(
-            load_state_dict(file_path, device=torch_device, dtype=torch_dtype)
-        )
+        model_param.update(load_state_dict(file_path))
 
     if model_param.get("lm_head.weight", None) is None:
         model_param["lm_head.weight"] = model_param["model.embed_tokens.weight"]
 
     # --------------------------------------------------------- #
-    #         model_param_infini references torch.Tensor
+    #          调整权重的device和dtype
     # --------------------------------------------------------- #
-    model_param_infini = {}
-    for key in model_param.keys():
-        model_param_infini[key] = infinicore.from_torch(model_param[key])
-
-    t2 = time.time()
-    print(f" read weights over! {(t2 - t1) * 1000} ms \n")
-    return model_param_infini
-
-
-def load_model_state_dict_by_file(
-    model: infinicore.nn.Module,
-    model_path: str,
-    dtype=infinicore.dtype,
-) -> Dict[str, infinicore.Tensor]:
-    """
-    Load the model weights from file.
-    """
-    print(" load weights ......")
-    t1 = time.time()
-
-    torch_device = "cpu"
-    torch_dtype = infinicore.utils.to_torch_dtype(dtype)
-    model_keys = model.state_dict_keyname()
-
-    already_loaded_keys = []
-
-    file_list = glob.glob(os.path.join(model_path, "*.safetensors"))
-    if len(file_list) > 0:
-        for file_path in tqdm(file_list, desc="Processing files"):
-            tqdm.write(f"Processing: {os.path.basename(file_path)}")
-
-            # --------------------------------------------------------- #
-            #          Load weights from *.safetensors file
-            # --------------------------------------------------------- #
-            model_param = load_state_dict(
-                file_path, device=torch_device, dtype=torch_dtype
-            )
-            already_loaded_keys.extend(model_param.keys())
-
-            # --------------------------------------------------------- #
-            #         model_param_infini references torch.Tensor
-            # --------------------------------------------------------- #
-            model_param_infini = {}
-            for key in model_param.keys():
-                model_param_infini[key] = infinicore.from_torch(model_param[key])
-
-            model.load_state_dict(model_param_infini, strict=False)
-            infinicore.sync_device()
-
-    elif os.path.exists(os.path.join(model_path, "pytorch_model.bin")):
-        file_path = os.path.join(model_path, "pytorch_model.bin")
-        model_params = torch.load(file_path, weights_only=True, map_location="cpu")
-
-        model_param_infini = {}
-        for key in model_params.keys():
-            model_param_infini[key] = infinicore.from_torch(
-                model_params[key].to(dtype=torch_dtype)
-            )
-
-            already_loaded_keys.append(key)
-
-        model.load_state_dict(model_param_infini, strict=True)
-        infinicore.sync_device()
-    else:
-        raise KeyError("Weight file not found.")
-
-    check_parameters(model_keys, already_loaded_keys)
-
-    t2 = time.time()
-    print(f" load weights over! {(t2 - t1) * 1000} ms \n")
-
-
-def load_model_state_dict_by_tensor(
-    model: infinicore.nn.Module,
-    model_path: str,
-    dtype=infinicore.dtype,
-):
-    """
-    Load the model weights by tensor.
-    """
-
-    print(" load weights ......")
-    t1 = time.time()
-
+    torch_device = device.type
     torch_dtype = infinicore.utils.to_torch_dtype(dtype)
-    model_keys = model.state_dict_keyname()
-    already_loaded_keys = []
-
-    file_list = glob.glob(os.path.join(model_path, "*.safetensors"))
-    if len(file_list) > 0:
-        for file_path in tqdm(file_list, desc="Processing files"):
-            tqdm.write(f"Processing: {os.path.basename(file_path)}")
-
-            with safe_open(file_path, "pt", "cpu") as f:
-                for name in f.keys():
-                    weight_infini = infinicore.from_torch(
-                        f.get_tensor(name).to(dtype=torch_dtype)
-                    )
-                    model.load_param(name, weight_infini)
-                    already_loaded_keys.append(name)
-                    infinicore.sync_stream()
 
-    elif os.path.exists(os.path.join(model_path, "pytorch_model.bin")):
-        file_path = os.path.join(model_path, "pytorch_model.bin")
-        model_params = torch.load(file_path, weights_only=True, map_location="cpu")
-
-        for key in model_params.keys():
-            weight_infini = infinicore.from_torch(
-                model_params[key].to(dtype=torch_dtype)
-            )
-            model.load_param(key, weight_infini)
-            already_loaded_keys.append(key)
-    else:
-        raise KeyError("Weight file not found.")
+    model_param_infini = {}
+    for key, value in model_param.items():
+        model_param[key] = value.to(device=torch_device, dtype=torch_dtype)
 
-    check_parameters(model_keys, already_loaded_keys)
+    # --------------------------------------------------------- #
+    #           model_param_infini 引用torch.Tensor
+    # --------------------------------------------------------- #
+    for key, value in model_param.items():
+        model_param_infini[key] = infinicore.from_torch(model_param[key])
 
-    t2 = time.time()
-    print(f" load weights over! {(t2 - t1) * 1000} ms \n")
+    return model_param_infini
diff --git a/python/infinilm/models/llama/__init__.py b/python/infinilm/models/llama/__init__.py
index f3b4bbd4..872f657c 100644
--- a/python/infinilm/models/llama/__init__.py
+++ b/python/infinilm/models/llama/__init__.py
@@ -1,8 +1,6 @@
 import os
 from typing import Optional, Union
 import infinicore
-import time
-from . import modeling_llama
 
 __all__ = ["AutoLlamaModel"]
 
@@ -14,23 +12,24 @@ def from_pretrained(
         model_path: Optional[Union[str, os.PathLike]],
         device: infinicore.device,
         dtype=infinicore.dtype,
-        **kwargs,
+        backend="python",
     ):
-        t1 = time.time()
+        if backend == "python":
+            from . import modeling_llama
 
-        print("\n***************************************************************")
-        print("\t Loading Llama Model")
-        print(f"\t Device: {device}, DType: {dtype}")
-        print("***************************************************************\n")
-        print(" create model ......")
+            return modeling_llama.LlamaForCausalLM.from_pretrained(
+                model_path,
+                device=device,
+                dtype=dtype,
+            )
 
-        instance = modeling_llama.LlamaForCausalLM.from_pretrained(
-            model_path,
-            device=device,
-            **kwargs,
-        )
+        elif backend == "cpp":
+            from .backends import cpp
 
-        t2 = time.time()
-        print(f" create model over! {(t2 - t1) * 1000} ms \n")
+            return cpp.LlamaForCausalLM.from_pretrained(
+                model_path,
+                device=device,
+                dtype=dtype,
+            )
 
-        return instance
+        raise KeyError("invalid backend")
diff --git a/python/infinilm/models/llama/backends/cpp.py b/python/infinilm/models/llama/backends/cpp.py
new file mode 100644
index 00000000..30b56192
--- /dev/null
+++ b/python/infinilm/models/llama/backends/cpp.py
@@ -0,0 +1,38 @@
+from ....generation.utils import GenerationMixin
+import infinicore
+import os
+from typing import Optional, Union
+
+
+class LlamaForCausalLM(GenerationMixin):
+    def __init__(self):
+        super().__init__()
+        self.use_cache = False
+        self._model = None
+        raise NotImplementedError("NotImplementedError!!")
+
+    def forward(self, input_ids, position_ids, *args, **kwargs):
+        kv_caches = None
+        return infinicore.Tensor(
+            self._model.forward(input_ids, position_ids, kv_caches)
+        )
+
+    def __call__(self, input_ids, position_ids, *args, **kwargs):
+        return self.forward(input_ids=input_ids, position_ids=position_ids)
+
+    @classmethod
+    def from_pretrained(
+        cls,
+        model_path: Union[str, os.PathLike],
+        device: infinicore.device,
+        dtype=infinicore.dtype,
+    ):
+        """
+        Load a pretrained LlamaForCausalLM model from a directory.
+        Args:
+            model_path: Path to the model directory containing config.json
+            device: Device instance (defaults to CPU)
+        Returns:
+            LlamaForCausalLM instance
+        """
+        raise NotImplementedError("NotImplementedError!!")
diff --git a/python/infinilm/models/llama/configuration_llama.py b/python/infinilm/models/llama/configuration_llama.py
index abc349c7..12eec8dd 100644
--- a/python/infinilm/models/llama/configuration_llama.py
+++ b/python/infinilm/models/llama/configuration_llama.py
@@ -15,14 +15,10 @@
 
 """LLaMA model configuration"""
 
-import infinicore
-
-from infinilm.lib import _infinilm
-
 from ...configuration_utils import PretrainedConfig
 
 
-class LlamaConfig(PretrainedConfig, _infinilm.LlamaConfig):
+class LlamaConfig(PretrainedConfig):
     r"""
     This is the configuration class to store the configuration of a [`LlamaModel`]. It is used to instantiate an LLaMA
     model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
@@ -170,22 +166,19 @@ def __init__(
         initializer_range=0.02,
         rms_norm_eps=1e-6,
         use_cache=True,
-        pad_token_id=-1,
+        pad_token_id=None,
         bos_token_id=1,
         eos_token_id=2,
         pretraining_tp=1,
         tie_word_embeddings=False,
         rope_theta=10000.0,
         rope_scaling=None,
-        attention_bias=True,
+        attention_bias=False,
         attention_dropout=0.0,
         mlp_bias=False,
         head_dim=None,
-        torch_dtype=None,
         **kwargs,
     ):
-        _infinilm.LlamaConfig.__init__(self)
-
         # ---
         self.model_type = "llama"
         self.name_or_path = ""
@@ -228,13 +221,7 @@ def __init__(
             self.rope_scaling["rope_type"] = self.rope_scaling["type"]
         # rope_config_validation(self)
 
-        if torch_dtype in {"float32", "bfloat16", "float16"}:
-            self.dtype = getattr(infinicore, torch_dtype)
-            self._dtype = self.dtype._underlying
-        else:
-            raise ValueError(f"Unsupported dtype: {torch_dtype}")
-
-        PretrainedConfig.__init__(
+        super().__init__(
             pad_token_id=pad_token_id,
             bos_token_id=bos_token_id,
             eos_token_id=eos_token_id,
diff --git a/python/infinilm/models/llama/modeling_llama.py b/python/infinilm/models/llama/modeling_llama.py
index 5b6d9da7..8c91aa39 100644
--- a/python/infinilm/models/llama/modeling_llama.py
+++ b/python/infinilm/models/llama/modeling_llama.py
@@ -17,6 +17,7 @@
 import os
 from typing import Optional, Union
 
+from transformers.utils import logging
 
 import infinicore
 
@@ -24,6 +25,8 @@
 from ...generation.utils import GenerationMixin
 from .configuration_llama import LlamaConfig
 
+logger = logging.get_logger(__name__)
+
 
 def repeat_kv(keys: infinicore.Tensor, values: infinicore.Tensor, ngroup: int):
     total_seq_len, num_heads, head_dim = keys.shape
@@ -46,7 +49,7 @@ def repeat_kv(keys: infinicore.Tensor, values: infinicore.Tensor, ngroup: int):
 
 def multi_head_attention(
     querys: infinicore.Tensor,  # [seq_len,       num_heads, head_dim]
-    keys: infinicore.Tensor,  # [total_seq_len, num_heads, head_dim]
+    keys: infinicore.Tensor,  #   [total_seq_len, num_heads, head_dim]
     values: infinicore.Tensor,  # [total_seq_len, num_heads, head_dim]
     scaling: float,
 ):
@@ -59,8 +62,13 @@ def multi_head_attention(
 
     # [num_heads, seq_len, head_dim] @ [ num_heads, head_dim, total_seq_len]
     # => [ num_heads, seq_len, total_seq_len]
-    # Q @ K.T *scaling
-    attn_weight = infinicore.matmul(Q, K.permute((1, 2, 0)), alpha=scaling)
+    attn_weight = Q @ K.permute((1, 2, 0))
+
+    scaling = infinicore.from_list(
+        [scaling], dtype=attn_weight.dtype, device=attn_weight.device
+    ).as_strided(attn_weight.shape, [0, 0, 0])
+
+    attn_weight = attn_weight * scaling
 
     infinicore.nn.functional.causal_softmax(attn_weight, out=attn_weight)
 
@@ -73,11 +81,9 @@ def multi_head_attention(
 
 
 def grouped_query_attention(
-    # [seq_len,       num_attention_heads, head_dim]
-    querys: infinicore.Tensor,
-    keys: infinicore.Tensor,  # [total_seq_len, num_key_value_heads, head_dim]
-    # [total_seq_len, num_key_value_heads, head_dim]
-    values: infinicore.Tensor,
+    querys: infinicore.Tensor,  # [seq_len,       num_attention_heads, head_dim]
+    keys: infinicore.Tensor,  #   [total_seq_len, num_key_value_heads, head_dim]
+    values: infinicore.Tensor,  # [total_seq_len, num_key_value_heads, head_dim]
     scaling: float,
 ):
     num_attention_heads = querys.shape[1]
@@ -98,16 +104,15 @@ def __init__(self, config, **kwargs):
         hidden_size = config.hidden_size
         intermediate_size = config.intermediate_size
         mlp_bias = config.mlp_bias
-        dtype = config.dtype
 
         self.gate_proj = infinicore.nn.Linear(
-            hidden_size, intermediate_size, bias=mlp_bias, dtype=dtype, **kwargs
+            hidden_size, intermediate_size, bias=mlp_bias, **kwargs
         )
         self.up_proj = infinicore.nn.Linear(
-            hidden_size, intermediate_size, bias=mlp_bias, dtype=dtype, **kwargs
+            hidden_size, intermediate_size, bias=mlp_bias, **kwargs
         )
         self.down_proj = infinicore.nn.Linear(
-            intermediate_size, hidden_size, bias=mlp_bias, dtype=dtype, **kwargs
+            intermediate_size, hidden_size, bias=mlp_bias, **kwargs
         )
         self.act_fn = infinicore.nn.functional.silu
 
@@ -134,13 +139,10 @@ def __init__(self, config: LlamaConfig, layer_idx: int, **kwargs):
 
         self.scaling = self.head_dim**-0.5
 
-        dtype = config.dtype
-
         self.q_proj = infinicore.nn.Linear(
             self.hidden_size,
             self.num_attention_heads * self.head_dim,
             bias=attention_bias,
-            dtype=dtype,
             **kwargs,
         )
 
@@ -148,7 +150,6 @@ def __init__(self, config: LlamaConfig, layer_idx: int, **kwargs):
             self.hidden_size,
             self.num_key_value_heads * self.head_dim,
             bias=attention_bias,
-            dtype=dtype,
             **kwargs,
         )
 
@@ -156,20 +157,16 @@ def __init__(self, config: LlamaConfig, layer_idx: int, **kwargs):
             self.hidden_size,
             self.num_key_value_heads * self.head_dim,
             bias=attention_bias,
-            dtype=dtype,
             **kwargs,
         )
 
         self.o_proj = infinicore.nn.Linear(
             self.num_attention_heads * self.head_dim,
             self.hidden_size,
-            bias=False,
-            dtype=dtype,
+            bias=attention_bias,
             **kwargs,
         )
 
-        self.attn_output = None  # Variable reuse
-
     def forward(
         self,
         hidden_states: infinicore.Tensor,
@@ -178,14 +175,14 @@ def forward(
         **kwargs,
     ) -> infinicore.Tensor:
         hidden_states_shape = hidden_states.shape  # [bs, seq_len, hidden_size]
-        bs, seq_len = hidden_states_shape[:-1]  # [bs, seq_len]
+        bs, seq_len = hidden_states_shape[:-1]  #    [bs, seq_len]
 
         querys_shape = (bs, seq_len, self.num_attention_heads, self.head_dim)
         keys_shape = (bs, seq_len, self.num_key_value_heads, self.head_dim)
         values_shape = (bs, seq_len, self.num_key_value_heads, self.head_dim)
 
         # --------------------------------------------------------------------------------------- #
-        #                           对 Q,K,V进行 project
+        #                           对 Q,K，V进行 project
         # --------------------------------------------------------------------------------------- #
         # => [bs, seq_len,  num_attention_heads, head_dim]
         query_states = self.q_proj(hidden_states).view(querys_shape)
@@ -197,9 +194,13 @@ def forward(
         value_states = self.v_proj(hidden_states).view(values_shape)
 
         # --------------------------------------------------------------------------------------- #
-        #                           对 Q和K 加上 rope
+        #                           对 Q和K， 加上 rope
         # --------------------------------------------------------------------------------------- #
         position_ids = kwargs.pop("position_ids", None)
+        if position_ids is None:
+            raise KeyError("position_ids error")
+        if rope_instance is None:
+            raise KeyError("rope_instance error")
 
         query_states = rope_instance(query_states, position_ids)
         key_states = rope_instance(key_states, position_ids)
@@ -220,14 +221,7 @@ def forward(
         #                           注意力计算
         # --------------------------------------------------------------------------------------- #
         total_seq_len = key_states_total.shape[1]
-
-        if self.attn_output is None or self.attn_output.shape[1] != seq_len:
-            self.attn_output = infinicore.empty(
-                (bs, seq_len, self.num_attention_heads, self.head_dim),
-                dtype=query_states.dtype,
-                device=query_states.device,
-            )
-
+        attn_output = infinicore.empty_like(query_states)
         for i in range(0, bs):
             query_states_i = query_states.narrow(0, i, 1).view(
                 (seq_len, self.num_attention_heads, self.head_dim)
@@ -239,7 +233,7 @@ def forward(
                 (total_seq_len, self.num_key_value_heads, self.head_dim)
             )
 
-            attn_output_i = self.attn_output.narrow(0, i, 1).view(
+            attn_output_i = attn_output.narrow(0, i, 1).view(
                 (seq_len, self.num_attention_heads, self.head_dim)
             )
 
@@ -253,9 +247,8 @@ def forward(
         #                           out project
         # --------------------------------------------------------------------------------------- #
         # ([bs, seq_len, num_attention_heads, head_dim]) ==> [bs, seq_len, hidden_size ]
-        attn_output = self.attn_output.view(
-            (bs, seq_len, self.num_attention_heads * self.head_dim)
-        )
+        attn_output = attn_output.view(hidden_states_shape)
+
         # o_proj
         return self.o_proj(attn_output)
 
@@ -265,16 +258,13 @@ def __init__(self, config: LlamaConfig, layer_idx: int, **kwargs):
         super().__init__()
         hidden_size = config.hidden_size
         rms_norm_eps = config.rms_norm_eps
-        dtype = config.dtype
 
         self.self_attn = LlamaAttention(config=config, layer_idx=layer_idx, **kwargs)
         self.mlp = LlamaMLP(config=config, **kwargs)
 
-        self.input_layernorm = LlamaRMSNorm(
-            hidden_size, eps=rms_norm_eps, dtype=dtype, **kwargs
-        )
+        self.input_layernorm = LlamaRMSNorm(hidden_size, eps=rms_norm_eps, **kwargs)
         self.post_attention_layernorm = LlamaRMSNorm(
-            hidden_size, eps=rms_norm_eps, dtype=dtype, **kwargs
+            hidden_size, eps=rms_norm_eps, **kwargs
         )
 
     def forward(
@@ -300,7 +290,7 @@ def forward(
             **kwargs,
         )
 
-        hidden_states += residual
+        hidden_states = residual + hidden_states
 
         # ------------------------------------------------ #
         #           Fully Connected
@@ -311,7 +301,7 @@ def forward(
 
         hidden_states = self.mlp(hidden_states)
 
-        hidden_states += residual
+        hidden_states = residual + hidden_states
 
         return hidden_states
 
@@ -327,7 +317,7 @@ def __init__(self, config: LlamaConfig, **kwargs):
         )
 
         self.embed_tokens = infinicore.nn.Embedding(
-            config.vocab_size, config.hidden_size, dtype=config.dtype, **kwargs
+            config.vocab_size, config.hidden_size, **kwargs
         )
 
         self.layers = infinicore.nn.ModuleList(
@@ -336,15 +326,12 @@ def __init__(self, config: LlamaConfig, **kwargs):
                 for layer_idx in range(config.num_hidden_layers)
             ]
         )
-        self.norm = LlamaRMSNorm(
-            config.hidden_size, eps=config.rms_norm_eps, dtype=config.dtype, **kwargs
-        )
+        self.norm = LlamaRMSNorm(config.hidden_size, eps=config.rms_norm_eps, **kwargs)
 
         self.rope_instance = infinicore.nn.RoPE(
             max_position_embeddings=config.max_position_embeddings,
             rope_theta=config.rope_theta,
             head_dim=head_dim,
-            dtype=config.dtype,
             **kwargs,
         )
 
@@ -386,10 +373,7 @@ def forward(
         #                    norm
         # --------------------------------------------------------- #
         seq_len = hidden_states.shape[1]
-        if seq_len > 1:
-            last_token = hidden_states.narrow(1, seq_len - 1, 1)
-        else:
-            last_token = hidden_states
+        last_token = hidden_states.narrow(1, seq_len - 1, 1)
 
         return self.norm(last_token)
 
@@ -407,10 +391,8 @@ def __init__(self, config, **kwargs):
             config.hidden_size,
             config.vocab_size,
             bias=False,
-            dtype=config.dtype,
             **kwargs,
         )
-        self.device = kwargs.get("device", infinicore.device("cpu"))
 
     def forward(
         self,
@@ -422,7 +404,7 @@ def forward(
     ):
         last_token = self.model(
             input_ids,
-            position_ids.to(self.device),
+            position_ids,
             past_key_values=past_key_values,
             use_cache=use_cache,
             **kwargs,
@@ -434,6 +416,7 @@ def from_pretrained(
         cls,
         model_path: Optional[Union[str, os.PathLike]],
         device: infinicore.device,
+        dtype=infinicore.dtype,
     ):
         def load_config_json(dir_path_: str):
             with open(os.path.join(dir_path_, "config.json"), "r") as f:
@@ -443,4 +426,4 @@ def load_config_json(dir_path_: str):
         config_dict = load_config_json(os.path.join(model_path))
         config = LlamaConfig(**config_dict)
 
-        return LlamaForCausalLM(config, device=device)
+        return LlamaForCausalLM(config, device=device, dtype=dtype)
diff --git a/scripts/jiuge.py b/scripts/jiuge.py
index e50ea327..7c31baf8 100644
--- a/scripts/jiuge.py
+++ b/scripts/jiuge.py
@@ -825,7 +825,7 @@ def destroy_model_instance(self):
 def test():
     if len(sys.argv) < 3:
         print(
-            "Usage: python jiuge.py [--cpu | --nvidia| --qy| --cambricon | --ascend | --metax | --moore | --iluvatar | --kunlun | --hygon] <path/to/model_dir> [n_device] [--verbose]"
+            "Usage: python jiuge.py [--cpu | --nvidia| --cambricon | --ascend | --metax | --moore | --iluvatar | --kunlun | --hygon] <path/to/model_dir> [n_device] [--verbose]"
         )
         sys.exit(1)
 
@@ -844,8 +844,6 @@ def test():
         device_type = DeviceType.DEVICE_TYPE_CPU
     elif sys.argv[1] == "--nvidia":
         device_type = DeviceType.DEVICE_TYPE_NVIDIA
-    elif sys.argv[1] == "--qy":
-        device_type = DeviceType.DEVICE_TYPE_QY
     elif sys.argv[1] == "--cambricon":
         device_type = DeviceType.DEVICE_TYPE_CAMBRICON
     elif sys.argv[1] == "--ascend":
@@ -862,7 +860,7 @@ def test():
         device_type = DeviceType.DEVICE_TYPE_HYGON
     else:
         print(
-            "Usage: python jiuge.py [--cpu | --nvidia| --qy| --cambricon | --ascend | --metax | --moore | --iluvatar | --kunlun | --hygon] <path/to/model_dir> [n_device] [--verbose]"
+            "Usage: python jiuge.py [--cpu | --nvidia| --cambricon | --ascend | --metax | --moore | --iluvatar | --kunlun | --hygon] <path/to/model_dir> [n_device] [--verbose]"
         )
         sys.exit(1)
 
diff --git a/scripts/jiuge_ppl.py b/scripts/jiuge_ppl.py
index 923d209c..061ab303 100644
--- a/scripts/jiuge_ppl.py
+++ b/scripts/jiuge_ppl.py
@@ -7,7 +7,6 @@
 DEVICE_TYPE_MAP = {
     "cpu": DeviceType.DEVICE_TYPE_CPU,
     "nvidia": DeviceType.DEVICE_TYPE_NVIDIA,
-    "qy": DeviceType.DEVICE_TYPE_QY,
     "cambricon": DeviceType.DEVICE_TYPE_CAMBRICON,
     "ascend": DeviceType.DEVICE_TYPE_ASCEND,
     "metax": DeviceType.DEVICE_TYPE_METAX,
@@ -20,7 +19,6 @@
 TORCH_DEVICE_TYPE_MAP = {
     "cpu": "cpu",
     "nvidia": "cuda",
-    "qy": "cuda",
     "cambricon": "mlu",
     "ascend": "npu",
     "metax": "cuda",
diff --git a/scripts/launch_server.py b/scripts/launch_server.py
index 659163c6..2d231b49 100644
--- a/scripts/launch_server.py
+++ b/scripts/launch_server.py
@@ -20,7 +20,6 @@
 DEVICE_TYPE_MAP = {
     "cpu": DeviceType.DEVICE_TYPE_CPU,
     "nvidia": DeviceType.DEVICE_TYPE_NVIDIA,
-    "qy": DeviceType.DEVICE_TYPE_QY,
     "cambricon": DeviceType.DEVICE_TYPE_CAMBRICON,
     "ascend": DeviceType.DEVICE_TYPE_ASCEND,
     "metax": DeviceType.DEVICE_TYPE_METAX,
diff --git a/scripts/libinfinicore_infer/__init__.py b/scripts/libinfinicore_infer/__init__.py
index 8fc5f4db..66feee7f 100644
--- a/scripts/libinfinicore_infer/__init__.py
+++ b/scripts/libinfinicore_infer/__init__.py
@@ -8,6 +8,14 @@
     DeepSeekV3WeightLoaderCStruct,
     DeepSeekV3CacheCStruct,
 )
+from .qwen3_moe import (
+    Qwen3MoEModel,
+    Qwen3MoEAttentionMetaCStruct,
+    Qwen3MoEWeightsCStruct,
+    Qwen3MoEWeightLoaderCStruct,
+    Qwen3MoEAttentionCStruct,
+    Qwen3CacheCStruct,
+)
 
 __all__ = [
     "DataType",
@@ -23,5 +31,11 @@
     "DeepSeekV3MetaCStruct",
     "DeepSeekV3WeightsCStruct",
     "DeepSeekV3WeightLoaderCStruct",
+    "Qwen3MoEModel",
+    "Qwen3MoEAttentionMetaCStruct",
+    "Qwen3MoEWeightsCStruct",
+    "Qwen3MoEWeightLoaderCStruct",
+    "Qwen3MoEAttentionCStruct",
+    "Qwen3CacheCStruct",
     "ModelRegister",
 ]
diff --git a/scripts/libinfinicore_infer/base.py b/scripts/libinfinicore_infer/base.py
index 3305cdba..bed65b2e 100644
--- a/scripts/libinfinicore_infer/base.py
+++ b/scripts/libinfinicore_infer/base.py
@@ -36,7 +36,6 @@ class DeviceType(ctypes.c_int):
     DEVICE_TYPE_ILUVATAR = 6
     DEVICE_TYPE_KUNLUN = 7
     DEVICE_TYPE_HYGON = 8
-    DEVICE_TYPE_QY = 9
 
 
 class KVCacheCStruct(ctypes.Structure):
diff --git a/scripts/libinfinicore_infer/qwen3_moe.py b/scripts/libinfinicore_infer/qwen3_moe.py
new file mode 100644
index 00000000..2d7c6393
--- /dev/null
+++ b/scripts/libinfinicore_infer/qwen3_moe.py
@@ -0,0 +1,115 @@
+from .base import BaseModel, DataType, DeviceType, KVCacheCStruct, register_model
+from ctypes import (
+    c_size_t,
+    c_uint,
+    c_int,
+    c_float,
+    c_void_p,
+    POINTER,
+    Structure,
+    CFUNCTYPE,
+)
+
+
+class Qwen3MoEAttentionMetaCStruct(Structure):
+    _fields_ = [
+        ("dtype", DataType),
+        ("hidden_size", c_size_t),
+        ("num_heads", c_size_t),
+        ("num_kv_head", c_size_t),
+        ("head_dim", c_size_t),
+        ("rope_theta", c_float),
+        ("max_seq_len", c_size_t),
+        ("rms_norm_eps", c_float),
+    ]
+
+
+class Qwen3MoEWeightsCStruct(Structure):
+    pass
+
+
+class Qwen3MoEAttentionCStruct(Structure):
+    pass
+
+
+class Qwen3CacheCStruct(Structure):
+    pass
+
+
+load_layer_fn = CFUNCTYPE(None, POINTER(Qwen3MoEWeightsCStruct), c_void_p, c_size_t)
+load_layer_linear_fn = CFUNCTYPE(
+    None, POINTER(Qwen3MoEWeightsCStruct), c_void_p, c_void_p, c_void_p, c_size_t
+)
+
+
+class Qwen3MoEWeightLoaderCStruct(Structure):
+    _fields_ = [
+        ("load_attn_norm", load_layer_fn),
+        ("load_attn_q_proj", load_layer_linear_fn),
+        ("load_attn_k_proj", load_layer_linear_fn),
+        ("load_attn_v_proj", load_layer_linear_fn),
+        ("load_attn_q_norm", load_layer_fn),
+        ("load_attn_k_norm", load_layer_fn),
+        ("load_attn_o_proj", load_layer_linear_fn),
+    ]
+
+
+@register_model
+class Qwen3MoEModel(BaseModel):
+    @classmethod
+    def register_lib(cls, lib):
+        """Register Qwen3MoE model functions with the library"""
+        lib.createQwen3MoEWeightLoader.argtypes = []
+        lib.createQwen3MoEWeightLoader.restype = POINTER(
+            Qwen3MoEWeightLoaderCStruct
+        )
+
+        lib.createQwen3MoEWeights.argtypes = [
+            POINTER(Qwen3MoEAttentionMetaCStruct),
+            DeviceType,
+            c_int,
+            POINTER(c_int),
+        ]
+        lib.createQwen3MoEWeights.restype = POINTER(Qwen3MoEWeightsCStruct)
+
+        lib.createQwen3MoEAttention.argtypes = [
+            POINTER(Qwen3MoEAttentionMetaCStruct),
+            POINTER(Qwen3MoEWeightsCStruct),
+        ]
+        lib.createQwen3MoEAttention.restype = POINTER(Qwen3MoEAttentionCStruct)
+
+        lib.destroyQwen3MoEAttention.argtypes = [POINTER(Qwen3MoEAttentionCStruct)]
+
+        lib.createQwen3Cache.argtypes = [
+            POINTER(Qwen3MoEAttentionMetaCStruct),
+            c_size_t,
+            c_size_t,
+        ]
+        lib.createQwen3Cache.restype = POINTER(Qwen3CacheCStruct)
+
+        lib.forwardQwen3MoEAttention.argtypes = [
+            POINTER(Qwen3MoEAttentionCStruct),
+            POINTER(Qwen3CacheCStruct),
+            c_void_p,
+            c_void_p,
+        ]
+
+    def create_weight_loader(self):
+        return self.lib.createQwen3MoEWeightLoader()
+
+    def create_weights(self, meta, device_type, ndev, dev_ids):
+        return self.lib.createQwen3MoEWeights(meta, device_type, ndev, dev_ids)
+
+    def create_model(self, meta, weights):
+        return self.lib.createQwen3MoEAttention(meta, weights)
+
+    def destroy_model(self, model):
+        self.lib.destroyQwen3MoEAttention(model)
+
+    def create_cache(self, meta, batch_size, seq_len):
+        return self.lib.createQwen3Cache(meta, batch_size, seq_len)
+
+    def forward_attention(self, model, kv_cache, input_tensor, output_tensor):
+        self.lib.forwardQwen3MoEAttention(model, kv_cache, input_tensor, output_tensor)
+
+
diff --git a/src/models/Qwen3MoE/Qwen3MoE.cpp b/src/models/Qwen3MoE/Qwen3MoE.cpp
new file mode 100644
index 00000000..5aa9ed45
--- /dev/null
+++ b/src/models/Qwen3MoE/Qwen3MoE.cpp
@@ -0,0 +1,606 @@
+#include "Qwen3MoE_impl.hpp"
+#include "../../tensor.hpp"
+#include "../../utils.hpp"
+#include "../inference_context.hpp"
+#include "infinicore_infer.h"
+#include <random>
+#include <thread>
+#include <vector>
+#include <cmath>
+#include <cstring>
+#include <algorithm>
+#include <numeric>
+#include <iostream>
+#include <cuda_runtime.h>
+#include <cuda_bf16.h>
+
+extern "C" void launch_fill_zero(void* data, size_t n_bytes, void* stream);
+extern "C" void launch_prefill_softmax(
+    void* data, 
+    int total_rows,      // NumHeads * CurSeqLen
+    int padded_len,      // Stride
+    int total_seq_len,   // Past + Cur
+    int cur_seq_len,     // Cur
+    int head_num, 
+    void* stream
+);
+extern "C" void launch_decode_softmax(void* data, int rows, int cols, int stride, void* stream);
+
+// =============================================================================
+// Helper Declarations & Utils
+// =============================================================================
+
+void createDeviceResource(Qwen3MoEDeviceResource *rsrc, 
+    const Qwen3MoEAttentionMeta *meta,
+    std::shared_ptr<Qwen3DeviceWeights> weights, 
+    infiniDevice_t device, int idev,
+    int ndev, int dev_id,
+    infinicclComm_t comm) {
+
+    RUN_INFINI(infinirtSetDevice(device, dev_id));
+    RUN_INFINI(infinirtStreamSynchronize(weights->load_stream));
+
+    infiniopHandle_t handle;
+    infiniopCreateHandle(&handle);
+
+    infinirtStream_t stream;
+    infinirtStreamCreate(&stream); 
+
+    auto memory_pool = std::make_shared<MemoryPool>();
+
+    *rsrc = Qwen3MoEDeviceResource{
+    device,
+    dev_id,
+    handle,
+    weights,
+    stream,
+    comm,
+    memory_pool,
+    };
+
+    RUN_INFINI(infinirtDeviceSynchronize());
+}
+
+void releaseDeviceResource(Qwen3MoEDeviceResource &res) {
+    infinirtDeviceSynchronize();
+    res.weights.reset();
+    if (res.handle) { infiniopDestroyHandle(res.handle); res.handle = nullptr; }
+    if (res.stream) { infinirtStreamDestroy(res.stream); res.stream = nullptr; }
+    if (res.comm) { infinicclCommDestroy(res.comm); res.comm = nullptr; }
+}
+
+// =============================================================================
+// Inference Logic
+// =============================================================================
+
+// Qwen3MoE.cpp
+
+void inferBatchQwen3MoE(const Qwen3MoEAttentionMeta &meta,
+    Qwen3MoEDeviceResource &rsrc,
+    std::shared_ptr<Tensor> input_hidden_states,
+    std::shared_ptr<Tensor> pos_ids,
+    std::shared_ptr<Tensor> output_tensor,
+    Qwen3Cache *kv_cache,
+    size_t layer_id,
+    int batch_size,
+    const std::vector<int>& _seq_lens, 
+    const std::vector<int>& _past_lens
+) {
+    infiniopHandle_t handle = rsrc.handle;
+    infinirtStream_t stream = rsrc.stream;
+    auto memory_pool = rsrc.memory_pool;
+    auto dt_logits = meta.dtype;
+
+    const auto &layer_weight = rsrc.weights->w_layers[layer_id];
+    const auto &attn_weight = layer_weight.self_attn;
+
+    // [FINAL TRUTH] Based on weight shape [4096, 2048]
+    size_t num_heads = 32;     
+    size_t num_kv_head = 4;    
+    size_t head_dim = 128;     
+    size_t ngroup = num_heads / num_kv_head; // 8
+
+    auto input_shape = input_hidden_states->shape();
+    size_t ntok = input_shape[0];
+
+    std::vector<int> seq_lens = _seq_lens;
+    std::vector<int> past_lens = _past_lens;
+    std::vector<int> cpu_pos_ids(ntok);
+
+    RUN_INFINI(infinirtMemcpyAsync(cpu_pos_ids.data(), pos_ids->data(), ntok * sizeof(int), INFINIRT_MEMCPY_D2H, stream));
+    RUN_INFINI(infinirtStreamSynchronize(stream));
+
+    size_t pos_offset = 0;
+    for (int b = 0; b < batch_size; ++b) {
+    int current_pos = cpu_pos_ids[pos_offset];
+    if (past_lens[b] == 0 && current_pos > 0) {
+        past_lens[b] = current_pos;
+        }
+        pos_offset += seq_lens[b];
+        }
+
+    CacheManager cache_manager(100);
+    InferenceContext ctx(handle, memory_pool, &cache_manager, stream);
+    setInferenceContext(&ctx);
+
+    // Alloc Buffers (Full 128-dim size)
+    // Q: 32 * 128 = 4096
+    // K/V: 4 * 128 = 512
+    auto q_buf = Tensor::buffer(dt_logits, {ntok, num_heads * head_dim}, memory_pool);
+    auto k_buf = Tensor::buffer(dt_logits, {ntok, num_kv_head * head_dim}, memory_pool);
+    auto v_buf = Tensor::buffer(dt_logits, {ntok, num_kv_head * head_dim}, memory_pool);
+    auto o_buf = Tensor::buffer(dt_logits, {ntok, num_heads * head_dim}, memory_pool);
+
+    // Step 1: Projections
+    linear(q_buf, input_hidden_states, attn_weight->q_proj, 1.f, 0.f, nullptr, nullptr);
+    linear(k_buf, input_hidden_states, attn_weight->k_proj, 1.f, 0.f, nullptr, nullptr);
+    linear(v_buf, input_hidden_states, attn_weight->v_proj, 1.f, 0.f, nullptr, nullptr);
+
+    int check_pos_id = 64;
+    size_t half_dim = 64; // head_dim / 2
+    std::vector<unsigned short> h_cos_row(half_dim);
+    
+    // Offset = row * row_stride (half_dim elements)
+    size_t cos_offset = check_pos_id * half_dim;
+    
+    // Assuming cos_table is BF16
+    RUN_INFINI(infinirtMemcpyAsync(h_cos_row.data(), 
+                    (char*)rsrc.weights->cos_table->data() + cos_offset * sizeof(unsigned short), 
+                    half_dim * sizeof(unsigned short), 
+                    INFINIRT_MEMCPY_D2H, 
+                    stream));
+    RUN_INFINI(infinirtStreamSynchronize(stream));
+
+    // Step 2: QK Norm (128-dim)
+    {
+        auto q_norm_view = q_buf->view({ntok, num_heads, head_dim});
+        auto k_norm_view = k_buf->view({ntok, num_kv_head, head_dim});
+
+    if (rsrc.weights->w_layers[layer_id].self_attn->q_norm) {
+        rmsnorm(q_norm_view, q_norm_view, rsrc.weights->w_layers[layer_id].self_attn->q_norm, 1e-6);
+    }
+    if (rsrc.weights->w_layers[layer_id].self_attn->k_norm) {
+        rmsnorm(k_norm_view, k_norm_view, rsrc.weights->w_layers[layer_id].self_attn->k_norm, 1e-6);
+    }
+    }
+    
+    // Step 3: RoPE (128-dim)
+    {
+        auto q_rope = q_buf->view({ntok, num_heads, head_dim});
+        auto k_rope = k_buf->view({ntok, num_kv_head, head_dim});
+        
+        rope_v2(q_rope, q_rope, pos_ids, rsrc.weights->cos_table, rsrc.weights->sin_table);
+        rope_v2(k_rope, k_rope, pos_ids, rsrc.weights->cos_table, rsrc.weights->sin_table);
+    }
+
+    // =========================================================
+    // Step 4: KV Cache Setup & Batch Loop
+    // =========================================================
+        
+        // 1. KV Cache Initialization
+        if (kv_cache->layers.size() <= layer_id) {
+            kv_cache->layers.resize(layer_id + 1);
+        }
+        auto &kv_cache_layer = kv_cache->layers[layer_id];
+        size_t max_seq_len = meta.max_seq_len;
+        
+        // [RESTORED STANDARD LOGIC]
+        // 只有当指针为空，或者形状不匹配时，才重新分配！
+        // 这样才能保留 inject_cache 注入的数据
+        bool need_alloc = false;
+        if (!kv_cache_layer.first || !kv_cache_layer.second) {
+            need_alloc = true;
+        } else {
+            auto s = kv_cache_layer.first->shape();
+            if (s[0] < static_cast<size_t>(batch_size) || 
+                s[1] != num_kv_head || 
+                s[2] != max_seq_len || 
+                s[3] != head_dim) { 
+                need_alloc = true;
+            }
+        }
+        size_t unit_size = dsize(dt_logits);
+        if (need_alloc) {
+            // 只有第一次（或Batch变大时）才进来
+            kv_cache_layer.first = Tensor::buffer(dt_logits, {static_cast<size_t>(batch_size), num_kv_head, max_seq_len, head_dim}, memory_pool);
+            kv_cache_layer.second = Tensor::buffer(dt_logits, {static_cast<size_t>(batch_size), num_kv_head, max_seq_len, head_dim}, memory_pool);
+            
+            //  先 forward 一次 warmup 分配内存 -> 然后 inject -> 然后正式 run）
+            size_t unit_size = dsize(dt_logits);
+            size_t num_elements = static_cast<size_t>(batch_size) * num_kv_head * max_seq_len * head_dim;
+            cudaMemsetAsync(kv_cache_layer.first->data(), 0, num_elements * unit_size, (cudaStream_t)stream);
+            cudaMemsetAsync(kv_cache_layer.second->data(), 0, num_elements * unit_size, (cudaStream_t)stream);
+        } //TODO: 把cudaMemsetAsync 0 改为launch fill zero （但是会出现精度问题？）
+
+        auto k_cache_all = kv_cache_layer.first;
+        auto v_cache_all = kv_cache_layer.second;
+
+
+    char* k_cache_base = (char*)k_cache_all->data();
+    char* v_cache_base = (char*)v_cache_all->data();
+
+    size_t stride_seq_bytes   = head_dim * unit_size;
+    size_t stride_head_bytes  = max_seq_len * stride_seq_bytes;
+    size_t stride_batch_bytes = num_kv_head * stride_head_bytes;
+
+    size_t token_offset = 0; 
+
+    for (int b = 0; b < batch_size; ++b) {
+    size_t cur_seq_len = static_cast<size_t>(seq_lens[b]);
+    size_t cur_past_len = static_cast<size_t>(past_lens[b]);
+    size_t total_len = cur_past_len + cur_seq_len;
+
+    // --- Cache Update ---
+    char* k_src_batch_ptr = (char*)k_buf->data() + token_offset * num_kv_head * head_dim * unit_size;
+    char* v_src_batch_ptr = (char*)v_buf->data() + token_offset * num_kv_head * head_dim * unit_size;
+    char* k_dst_batch_base = k_cache_base + b * stride_batch_bytes;
+    char* v_dst_batch_base = v_cache_base + b * stride_batch_bytes;
+    size_t kv_token_bytes = head_dim * unit_size; 
+    size_t src_pitch = num_kv_head * head_dim * unit_size; 
+    size_t dst_pitch = head_dim * unit_size; 
+
+    for (size_t h = 0; h < num_kv_head; h++) {
+    char* k_s = k_src_batch_ptr + h * kv_token_bytes;
+    char* v_s = v_src_batch_ptr + h * kv_token_bytes;
+    char* k_d = k_dst_batch_base + h * stride_head_bytes + cur_past_len * stride_seq_bytes;
+    char* v_d = v_dst_batch_base + h * stride_head_bytes + cur_past_len * stride_seq_bytes;
+    cudaMemcpy2DAsync(k_d, dst_pitch, k_s, src_pitch, kv_token_bytes, cur_seq_len, cudaMemcpyDeviceToDevice, (cudaStream_t)stream);
+    cudaMemcpy2DAsync(v_d, dst_pitch, v_s, src_pitch, kv_token_bytes, cur_seq_len, cudaMemcpyDeviceToDevice, (cudaStream_t)stream);
+    }
+
+    // --- Attention Compute  ---
+
+    // 1. Prepare Q
+    auto q_transposed = Tensor::buffer(dt_logits, {num_heads, cur_seq_len, head_dim}, memory_pool);
+    auto q_src_view = q_buf->view({ntok, num_heads, head_dim})->slice(0, token_offset, cur_seq_len);
+    for (size_t h = 0; h < num_heads; h++) {
+    auto q_s = q_src_view->slice(1, h, 1)->view({cur_seq_len, head_dim});
+    auto q_d = q_transposed->slice(0, h, 1)->view({cur_seq_len, head_dim});
+    rearrange(q_d, q_s);
+    }
+    auto q_gemm = q_transposed->view({num_kv_head, ngroup * cur_seq_len, head_dim});
+
+    // 2. Prepare K
+    size_t padded_len = (total_len + 31) / 32 * 32; 
+    auto k_padded_gather = Tensor::buffer(dt_logits, {num_kv_head, padded_len, head_dim}, memory_pool);
+    size_t kv_gather_bytes = num_kv_head * padded_len * head_dim * unit_size;
+
+    //  Clear gather buffer
+    cudaMemsetAsync(k_padded_gather->data(), 0, kv_gather_bytes, (cudaStream_t)stream);
+
+    char* k_gather_src_base = k_cache_base + b * stride_batch_bytes;
+    size_t gather_bytes_per_head = total_len * head_dim * unit_size;
+    size_t dst_head_stride_bytes = padded_len * head_dim * unit_size;
+    for (size_t h = 0; h < num_kv_head; h++) {
+    char* k_src = k_gather_src_base + h * stride_head_bytes;
+    char* k_dst = (char*)k_padded_gather->data() + h * dst_head_stride_bytes;
+    cudaMemcpyAsync(k_dst, (void*)k_src, gather_bytes_per_head, cudaMemcpyDeviceToDevice, (cudaStream_t)stream);
+    }
+
+    auto k_gemm_in = Tensor::buffer(dt_logits, {num_kv_head, head_dim, padded_len}, memory_pool);
+    rearrange(k_gemm_in, k_padded_gather->permute({0, 2, 1}));
+
+    // 3. GEMM 1: Q * K
+    auto scores_padded = Tensor::buffer(dt_logits, {num_kv_head, ngroup * cur_seq_len, padded_len}, memory_pool);
+
+    float scale_factor = 1.0f / sqrt(128.0f); 
+    linear(scores_padded, q_gemm, k_gemm_in, scale_factor, 0.f, nullptr, nullptr);
+
+    // 4. Softmax+Scaling+Masking (fused_kernel for NVIDIA)
+    // if (cur_seq_len > 1) {
+    // launch_prefill_softmax(scores_padded->data(), num_heads * cur_seq_len, padded_len, total_len, cur_seq_len, num_heads, (void*)stream);
+    // } else {
+    // launch_decode_softmax(scores_padded->data(), num_heads * cur_seq_len, total_len, padded_len, (void*)stream);
+    // }
+    auto scores_view = scores_padded->view({num_heads, cur_seq_len, padded_len});
+    auto scores_in = scores_view->slice(2, 0, total_len);
+    causalSoftmax(scores_in, scores_in);
+
+    if (padded_len > total_len) {
+        size_t pitch = padded_len * unit_size;
+        size_t width = (padded_len - total_len) * unit_size;
+        char* dst_ptr = (char*)scores_padded->data() + total_len * unit_size;
+        cudaMemset2DAsync(dst_ptr, pitch, 0, width, num_heads * cur_seq_len, (cudaStream_t)stream);
+    }
+   
+
+    // 5. GEMM 2
+    auto v_padded_gather = Tensor::buffer(dt_logits, {num_kv_head, padded_len, head_dim}, memory_pool);
+    cudaMemsetAsync(v_padded_gather->data(), 0, kv_gather_bytes, (cudaStream_t)stream);
+    char* v_gather_src_base = v_cache_base + b * stride_batch_bytes;
+    for (size_t h = 0; h < num_kv_head; h++) {
+    char* v_src = v_gather_src_base + h * stride_head_bytes;
+    char* v_dst = (char*)v_padded_gather->data() + h * dst_head_stride_bytes;
+    cudaMemcpyAsync(v_dst, (void*)v_src, gather_bytes_per_head, cudaMemcpyDeviceToDevice, (cudaStream_t)stream);
+    }
+
+    auto attn_out_b = Tensor::buffer(dt_logits, {num_kv_head, ngroup * cur_seq_len, head_dim}, memory_pool);
+    linear(attn_out_b, scores_padded, v_padded_gather, 1.f, 0.f, nullptr, nullptr);
+
+    // Rearrange
+    auto attn_out_view_flat = attn_out_b->view({num_heads, cur_seq_len, head_dim});
+    auto o_dst_flat = o_buf->view({ntok, num_heads, head_dim})->slice(0, token_offset, cur_seq_len);
+    for (size_t h = 0; h < num_heads; h++) {
+    auto src_h = attn_out_view_flat->slice(0, h, 1)->view({cur_seq_len, head_dim});
+    auto dst_h = o_dst_flat->slice(1, h, 1)->view({cur_seq_len, head_dim});
+    rearrange(dst_h, src_h);
+    }
+
+    token_offset += cur_seq_len;
+    } // End of Batch Loop
+
+    // Step 6: Final Output Projection
+    if (output_tensor) {
+    size_t context_dim = num_heads * head_dim; 
+    auto ctx_flat = o_buf->view({ntok, context_dim});
+    auto w_o = attn_weight->o_proj; 
+    size_t hidden_dim = meta.hidden_size; 
+    auto out_flat = output_tensor->view({ntok, hidden_dim});
+    linear(out_flat, ctx_flat, w_o, 1.0f, 0.0f, nullptr, nullptr);
+    }
+}
+
+// =============================================================================
+// Interface Exports
+// =============================================================================
+
+Qwen3MoEAttention::Qwen3MoEAttention(const Qwen3MoEAttentionMeta *_meta, const Qwen3MoEWeights *weights) : meta(*_meta) {
+    auto device_weights = weights->device_weights;
+    int ndev = device_weights.size();
+    device = device_weights[0]->device;
+    dev_ids.resize(ndev);
+    for (int i = 0; i < ndev; i++) {
+        dev_ids[i] = device_weights[i]->dev_id;
+    }
+    dev_resources = std::vector<Qwen3MoEDeviceResource>(ndev);
+    RUN_INFINI(infinirtInit());
+    auto comms = std::vector<infinicclComm_t>(ndev, nullptr);
+    if (ndev > 1) {
+        RUN_INFINI(infinicclCommInitAll(device, comms.data(), ndev, dev_ids.data()));
+    }
+    for (int i = 0; i < ndev; i++) {
+        createDeviceResource(&dev_resources[i], &meta, device_weights[i], device, i, ndev, dev_ids[i], comms[i]);
+    }
+}
+
+__C __export struct Qwen3MoEAttention *createQwen3MoEAttention(const Qwen3MoEAttentionMeta *_meta,
+                        const Qwen3MoEWeights *weights) {
+    Qwen3MoEAttention *attention = new Qwen3MoEAttention(_meta, weights);
+    return attention;
+}
+
+__C __export void destroyQwen3MoEAttention(struct Qwen3MoEAttention *ctx) {
+    if (!ctx) return;
+    auto ndev = ctx->dev_resources.size();
+    for (size_t idev = 0; idev < ndev; idev++) {
+        releaseDeviceResource(ctx->dev_resources[idev]);
+    }
+    delete ctx;
+}
+
+__C __export void forwardQwen3MoEAttention(
+    struct Qwen3MoEAttention* context,
+    struct Qwen3Cache* kv_cache,
+    const void* input_tensor,
+    void* output_tensor,
+    int batch_size,
+    const int* seq_lens_ptr,
+    const int* past_lens_ptr,
+    const int* pos_ids_ptr
+) {
+    if (!context || !kv_cache || !input_tensor || !output_tensor) {
+        return;
+    }
+    
+    size_t layer_id = 0;
+    if (context->dev_resources.empty()) return;
+    auto &rsrc = context->dev_resources[0];
+    auto meta = &context->meta;
+    auto dt_logits = meta->dtype;
+    size_t hidden_size = meta->hidden_size;
+    
+    std::vector<int> seq_lens(batch_size);
+    std::vector<int> past_lens(batch_size);
+    std::memcpy(seq_lens.data(), seq_lens_ptr, batch_size * sizeof(int));
+    std::memcpy(past_lens.data(), past_lens_ptr, batch_size * sizeof(int));
+    
+    size_t ntok = 0;
+    for (int len : seq_lens) ntok += len;
+    
+    std::shared_ptr<Tensor> input_hidden_states;
+    if (rsrc.device == INFINI_DEVICE_CPU) {
+        input_hidden_states = Tensor::weight(const_cast<void*>(input_tensor), dt_logits, {ntok, hidden_size});
+    } else {
+        input_hidden_states = Tensor::buffer(dt_logits, {ntok, hidden_size}, rsrc.memory_pool);
+        RUN_INFINI(infinirtMemcpyAsync(input_hidden_states->data(), const_cast<void*>(input_tensor),
+                                     dsize(dt_logits) * ntok * hidden_size,
+                                     INFINIRT_MEMCPY_H2D, rsrc.stream));
+    }
+    
+    std::shared_ptr<Tensor> pos_ids;
+    if (rsrc.device == INFINI_DEVICE_CPU) {
+        pos_ids = Tensor::weight(const_cast<int*>(pos_ids_ptr), INFINI_DTYPE_I32, {ntok});
+    } else {
+        pos_ids = Tensor::buffer(INFINI_DTYPE_I32, {ntok}, rsrc.memory_pool);
+        RUN_INFINI(infinirtMemcpyAsync(pos_ids->data(), (void*)pos_ids_ptr,
+                                     sizeof(int) * ntok,
+                                     INFINIRT_MEMCPY_H2D, rsrc.stream));
+    }
+    
+    auto output_tensor_ptr = Tensor::buffer(dt_logits, {ntok, hidden_size}, rsrc.memory_pool);
+    Qwen3Cache *qwen3_cache = reinterpret_cast<Qwen3Cache*>(kv_cache);
+    inferBatchQwen3MoE(context->meta, rsrc, input_hidden_states, pos_ids,
+                       output_tensor_ptr, qwen3_cache, layer_id,
+                       batch_size, seq_lens, past_lens);
+
+    RUN_INFINI(infinirtStreamSynchronize(rsrc.stream));
+
+    if (rsrc.device != INFINI_DEVICE_CPU) {
+        RUN_INFINI(infinirtMemcpyAsync(output_tensor, output_tensor_ptr->data(),
+        dsize(dt_logits) * ntok * hidden_size,
+        INFINIRT_MEMCPY_D2H, rsrc.stream));
+    }
+}
+
+__C __export void injectQwen3CacheKV(
+    struct Qwen3MoEAttention* context,
+    struct Qwen3Cache* kv_cache,
+    int layer_id,
+    int batch_idx,
+    int past_len,
+    const void* k_host_ptr, 
+    const void* v_host_ptr
+) {
+    if (!context || !kv_cache || past_len <= 0) return;
+
+    auto &rsrc = context->dev_resources[0];
+    RUN_INFINI(infinirtSetDevice(rsrc.device, rsrc.device_id));
+    auto meta = &context->meta;
+    auto memory_pool = rsrc.memory_pool;
+    auto stream = rsrc.stream;
+    
+    if (kv_cache->layers.size() <= static_cast<size_t>(layer_id)) {
+        kv_cache->layers.resize(static_cast<size_t>(layer_id) + 1);
+    }
+    auto &layer = kv_cache->layers[layer_id];
+    
+    size_t required_batch = batch_idx + 1;
+    size_t H = meta->num_kv_head;
+    size_t S = meta->max_seq_len;
+    size_t D = meta->head_dim;
+    
+    bool need_alloc = false;
+    if (!layer.first || !layer.second) {
+        need_alloc = true;
+    } else {
+        if (layer.first->shape()[0] < required_batch) need_alloc = true;
+    }
+    
+    // [FIX] Force minimum allocation size to avoid mid-loop resizing/resetting
+    size_t current_capacity = 0;
+    if (layer.first) current_capacity = layer.first->shape()[0];
+    size_t target_capacity = std::max(required_batch, (size_t)16);
+    
+    if (current_capacity < target_capacity) {
+        need_alloc = true;
+    }
+    
+    if (need_alloc) {
+        layer.first = Tensor::buffer(meta->dtype, {target_capacity, H, S, D}, memory_pool);
+        layer.second = Tensor::buffer(meta->dtype, {target_capacity, H, S, D}, memory_pool);
+        RUN_INFINI(infinirtStreamSynchronize(stream));
+    }
+
+    auto k_tensor = layer.first;
+    auto v_tensor = layer.second;
+
+    
+    size_t dtype_size = dsize(meta->dtype);
+    size_t stride_seq_bytes   = D * dtype_size;
+    size_t stride_head_bytes  = S * stride_seq_bytes;
+    size_t stride_batch_bytes = H * stride_head_bytes;
+
+    char* k_base = (char*)k_tensor->data();
+    char* v_base = (char*)v_tensor->data();
+    
+    char* k_batch_base = k_base + batch_idx * stride_batch_bytes;
+    char* v_batch_base = v_base + batch_idx * stride_batch_bytes;
+
+    const char* k_src_base = (const char*)k_host_ptr;
+    const char* v_src_base = (const char*)v_host_ptr;
+    
+    size_t src_head_stride_bytes = past_len * D * dtype_size;
+    size_t bytes_to_copy_per_head = past_len * D * dtype_size;
+
+    for (size_t h = 0; h < H; ++h) {
+        char* k_dst_addr = k_batch_base + h * stride_head_bytes;
+        char* v_dst_addr = v_batch_base + h * stride_head_bytes;
+
+        const char* k_src_addr = k_src_base + h * src_head_stride_bytes;
+        const char* v_src_addr = v_src_base + h * src_head_stride_bytes;
+
+        if (rsrc.device == INFINI_DEVICE_CPU) {
+            std::memcpy(k_dst_addr, k_src_addr, bytes_to_copy_per_head);
+            std::memcpy(v_dst_addr, v_src_addr, bytes_to_copy_per_head);
+        } else {
+            // [CUDA] Raw API
+            RUN_INFINI(infinirtMemcpyAsync(k_dst_addr, (void*)k_src_addr,
+                            bytes_to_copy_per_head, INFINIRT_MEMCPY_H2D, stream));
+            RUN_INFINI(infinirtMemcpyAsync(v_dst_addr, (void*)v_src_addr,
+                            bytes_to_copy_per_head, INFINIRT_MEMCPY_H2D, stream));
+        }
+    }
+    RUN_INFINI(infinirtStreamSynchronize(stream));
+}
+
+extern "C" void customInjectCacheKV(
+    Qwen3Cache *kv_cache,
+    size_t layer_id,
+    int batch_idx,
+    int past_len,
+    void* k_src_ptr,
+    void* v_src_ptr,
+    cudaStream_t stream
+) {
+    int dev_id = 0;
+    cudaGetDevice(&dev_id);
+    RUN_INFINI(infinirtSetDevice(INFINI_DEVICE_NVIDIA, dev_id));
+
+    // 1. 安全检查
+    if (!kv_cache || kv_cache->layers.size() <= layer_id) {
+        std::cout<< "检查 unpass!" << std::endl;
+        return;
+
+    }
+    
+    auto &layer = kv_cache->layers[layer_id];
+    //std::cout<< layer_id << std::endl;
+    // 如果显存还没分配（Dummy Forward 没跑？），直接返回，Python侧会报错
+    if (!layer.first || !layer.second) {
+        printf(">>> [C++ Error] Cache not allocated yet! Run dummy forward first.\n");
+        return;
+    }
+
+    // 2. 获取 C++ 视角的形状信息
+    auto shape = layer.first->shape(); 
+    // shape: [Batch, NumKV, MaxSeq, HeadDim]
+    size_t batch_size = shape[0];
+    size_t num_kv = shape[1];
+    size_t max_seq = shape[2]; // 这里是关键！它是 8192
+    size_t head_dim = shape[3]; // 这里应该是 128
+
+    // 3. 计算 C++ 显存中的 Stride (稀疏布局)
+    size_t dtype_size = 2; // BF16 = 2 bytes
+    size_t stride_seq = head_dim * dtype_size;
+    size_t stride_head = max_seq * stride_seq;       // 跨越 8192 个 Token
+    size_t stride_batch = num_kv * stride_head;
+
+    // 4. 计算目标地址基址 (Base Address for this specific Batch)
+    char* k_dst_base = (char*)layer.first->data() + batch_idx * stride_batch;
+    char* v_dst_base = (char*)layer.second->data() + batch_idx * stride_batch;
+
+    // 5. 搬运循环
+    // Python 传来的数据是紧凑的: [NumKV, PastLen, HeadDim]
+    // 我们需要把每个 Head 的 [PastLen, HeadDim] 块搬运过去
+    
+    size_t copy_bytes_per_head = past_len * head_dim * dtype_size;
+    size_t src_stride_head = copy_bytes_per_head; // Python端是紧凑的
+
+    for (size_t h = 0; h < num_kv; ++h) {
+        // Source: Python (Compact)
+        char* k_src = (char*)k_src_ptr + h * src_stride_head;
+        char* v_src = (char*)v_src_ptr + h * src_stride_head;
+
+        // Dest: C++ (Sparse / Strided)
+        // 注意：我们从 sequence 的 index 0 开始写起
+        //int start_pos = past_len;
+        char* k_dst = k_dst_base + h * stride_head ;
+        char* v_dst = v_dst_base + h * stride_head ;
+
+        RUN_INFINI(infinirtMemcpyAsync(k_dst, k_src, copy_bytes_per_head, INFINIRT_MEMCPY_H2D, (infinirtStream_t)stream));
+        RUN_INFINI(infinirtMemcpyAsync(v_dst, v_src, copy_bytes_per_head, INFINIRT_MEMCPY_H2D, (infinirtStream_t)stream));
+    }
+    
+    // 简单同步确保写入完成
+    RUN_INFINI(infinirtStreamSynchronize((infinirtStream_t)stream));
+}
diff --git a/src/models/Qwen3MoE/Qwen3MoE_cache.cpp b/src/models/Qwen3MoE/Qwen3MoE_cache.cpp
new file mode 100644
index 00000000..ad1a3d92
--- /dev/null
+++ b/src/models/Qwen3MoE/Qwen3MoE_cache.cpp
@@ -0,0 +1,50 @@
+#include "Qwen3MoE_impl.hpp"
+#include "infinicore_infer.h"
+
+#include "../../tensor.hpp"
+#include "../../utils.hpp"
+// 注意：Qwen3MoECache 在头文件中声明，但实际使用 Qwen3Cache
+// 这里假设它们是同一个类型，或者Qwen3MoECache是Qwen3Cache的typedef
+
+/// @brief 创建KVCache
+__C __export struct Qwen3Cache *
+createQwen3Cache(const Qwen3MoEAttentionMeta *meta,
+                    size_t batch_size, size_t seq_len) {
+    Qwen3Cache *cache = new Qwen3Cache();
+    
+    // 假设只有1层attention（因为只实现attention模块）
+    size_t nlayer = 1;
+    size_t max_seq_len = meta->max_seq_len;
+    size_t num_kv_head = meta->num_kv_head;
+    size_t head_dim = meta->head_dim;
+    
+    // 为每一层创建K和V cache
+    // Cache shape: [num_kv_head, max_seq_len, head_dim]
+    cache->layers.resize(nlayer);
+    
+    for (size_t layer = 0; layer < nlayer; layer++) {
+        // 创建K cache: [num_kv_head, max_seq_len, head_dim]
+        auto k_cache = Tensor::buffer(meta->dtype, {num_kv_head, max_seq_len, head_dim});
+        
+        // 创建V cache: [num_kv_head, max_seq_len, head_dim]
+        auto v_cache = Tensor::buffer(meta->dtype, {num_kv_head, max_seq_len, head_dim});
+        
+        cache->layers[layer] = std::make_pair(k_cache, v_cache);
+    }
+    
+    return reinterpret_cast<struct Qwen3Cache *>(cache);
+}
+
+/// @brief 销毁KVCache（如果需要的话，可以添加这个函数）
+// 注意：头文件中没有声明这个函数，如果需要可以添加
+// __C void dropQwen3Cache(struct Qwen3Cache *cache) {
+//     if (cache) {
+//         Qwen3Cache *qwen3_cache = reinterpret_cast<Qwen3Cache *>(cache);
+//         for (auto &layer : qwen3_cache->layers) {
+//             layer.first.reset();
+//             layer.second.reset();
+//         }
+//         delete qwen3_cache;
+//     }
+// }
+
diff --git a/src/models/Qwen3MoE/Qwen3MoE_impl.hpp b/src/models/Qwen3MoE/Qwen3MoE_impl.hpp
new file mode 100644
index 00000000..909a0c0b
--- /dev/null
+++ b/src/models/Qwen3MoE/Qwen3MoE_impl.hpp
@@ -0,0 +1,133 @@
+#ifndef QWEN3MOE_IMPL_H
+#define QWEN3MOE_IMPL_H
+
+#include "infinicore_infer.h"
+
+#include "../../allocator.hpp"
+#include "../../tensor.hpp"
+
+#include <condition_variable>
+#include <memory>
+#include <mutex>
+#include <thread>
+#include <vector>
+
+struct QuantLinearWeight {
+    std::shared_ptr<Tensor> w;
+    std::shared_ptr<Tensor> s; // Scale QUANT
+    std::shared_ptr<Tensor> z; // Zero QUANT
+}; 
+
+struct Qwen3AttentionWeight {
+    // Pre-Norm
+    std::shared_ptr<Tensor> attn_norm;
+    
+    // GQA
+    std::shared_ptr<Tensor> q_proj;
+    std::shared_ptr<Tensor> k_proj;
+    std::shared_ptr<Tensor> v_proj;
+    std::shared_ptr<Tensor> o_proj;
+
+    // QK Norm
+    std::shared_ptr<Tensor> q_norm;
+    std::shared_ptr<Tensor> k_norm;
+
+};
+
+struct Qwen3LayerWeight {
+    std::shared_ptr<Qwen3AttentionWeight> self_attn;
+
+    // TODO: 实现MLP Experts等， 由于比赛只实现attention模块
+    // 所以只放一个self_attn
+};
+
+struct Qwen3DeviceWeights {
+    std::shared_ptr<Tensor> w_in_embd, w_out_norm, w_out_embd;
+    
+    // RoPE
+    std::shared_ptr<Tensor> sin_table;
+    std::shared_ptr<Tensor> cos_table;
+
+    // layer
+    std::vector<Qwen3LayerWeight> w_layers;
+    
+    infiniDevice_t device;
+    int dev_id;
+    infinirtStream_t load_stream;
+};
+
+struct Qwen3MoEWeights {
+    // 即使是单卡，通常也用 vector 存，方便统一逻辑
+    std::vector<std::shared_ptr<Qwen3DeviceWeights>> device_weights;
+
+    // 构造函数声明
+    Qwen3MoEWeights(const Qwen3MoEAttentionMeta *meta,
+                    infiniDevice_t device,
+                    int ndev,
+                    const int *dev_ids);
+};
+
+/*
+Qwen3 KVCache
+[Batch, KV_Heads, Max_Seq, Head_Dim]    
+*/
+struct Qwen3Cache {
+    std::vector<std::pair<std::shared_ptr<Tensor>, std::shared_ptr<Tensor>>> layers;
+};
+
+struct Qwen3MoEDeviceResource {
+    // Device
+    infiniDevice_t device;
+    int device_id;
+    infiniopHandle_t handle;
+    // Weights
+    std::shared_ptr<Qwen3DeviceWeights> weights;
+    // Streams
+    infinirtStream_t stream;
+    // Communicator
+    infinicclComm_t comm;
+
+    std::shared_ptr<MemoryPool> memory_pool;
+};
+
+
+struct InferState {
+    std::mutex mtx;
+    std::condition_variable cv_load, cv_start, cv_done;
+    bool loaded = false;
+    bool proceed = false;
+    bool exit_flag = false;
+};
+
+struct InferRequest {
+    const uint32_t *tokens;
+    uint32_t ntok;
+    const uint32_t *req_lens;
+    uint32_t nreq;
+    const uint32_t *req_pos;
+    struct Qwen3Cache **kv_caches;
+    const float *temperature;
+    const uint32_t *topk;
+    const float *topp;
+    uint32_t *output;
+    void *logits;
+};
+
+struct Qwen3MoEAttention { 
+    Qwen3MoEAttentionMeta meta;
+    infiniDevice_t device;
+    std::vector<int> dev_ids;
+    
+    std::vector<Qwen3MoEDeviceResource> dev_resources;
+    
+    // 线程控制
+    std::vector<InferState> states;
+    std::vector<std::thread> threads;
+    InferRequest req;
+
+    // 构造函数
+    Qwen3MoEAttention(const Qwen3MoEAttentionMeta *, const Qwen3MoEWeights *weights);
+};
+
+
+#endif
\ No newline at end of file
diff --git a/src/models/Qwen3MoE/Qwen3MoE_weight.cpp b/src/models/Qwen3MoE/Qwen3MoE_weight.cpp
new file mode 100644
index 00000000..c3ead95d
--- /dev/null
+++ b/src/models/Qwen3MoE/Qwen3MoE_weight.cpp
@@ -0,0 +1,268 @@
+#include "Qwen3MoE_impl.hpp"
+#include "infinicore_infer.h"
+
+#include "../../tensor.hpp"
+#include "../../utils.hpp"
+
+#include <cmath>
+#include <iostream>
+
+// ==================== 辅助函数 ====================
+
+// 辅助函数：创建普通线性权重 (BF16)
+// 形状通常为 [in_dim, out_dim]，这是 InfiniLM 计算库的标准格式
+inline std::shared_ptr<Tensor> getLinear(
+    const Qwen3MoEAttentionMeta *meta, size_t in_dim, size_t out_dim) {
+    // 创建 BF16 权重张量
+    auto shape = std::vector<size_t>({in_dim, out_dim});
+    // 使用 meta->dtype 也可以，通常 meta->dtype 已经是 BF16
+    return Tensor::weight(nullptr, INFINI_DTYPE_BF16, shape);
+}
+
+// 辅助函数：分布式加载线性权重 (Tensor Parallel)
+// 即使 ndev=1 也能正常工作
+inline void load_dist_linear(void *w_ptr, std::shared_ptr<Tensor> w, 
+                             size_t ndev, size_t dev, infinirtStream_t stream) {
+    // 简单假设按输出维度切分 (Column Parallel)
+    // 偏移量 = 总元素数 / ndev * dev * 元素大小
+    size_t offset = w->shape()[0] * w->shape()[1] * dev * dsize(w->dtype());
+    w->load(reinterpret_cast<char *>(w_ptr) + offset, stream);
+}
+
+// 获取Attention Norm权重
+inline std::shared_ptr<Tensor> getAttnNorm(const Qwen3MoEAttentionMeta *meta) {
+    auto shape = std::vector<size_t>({meta->hidden_size});
+    return Tensor::weight(nullptr, meta->dtype, shape);
+}
+
+// 1. 恢复标准 Sin/Cos 表 (适用于 64 dim -> 32 freqs)
+inline std::shared_ptr<Tensor> getSinTable(const Qwen3MoEAttentionMeta *meta) {
+    auto half_dh = meta->head_dim / 2; 
+    auto unit = dsize(meta->dtype);
+    void *table = std::malloc(meta->max_seq_len * half_dh * unit);
+    float theta = meta->rope_theta;
+
+    // 标准 Full RoPE 生成逻辑
+    for (size_t i = 0; i < meta->max_seq_len; i++) {
+        for (size_t j = 0; j < half_dh; j++) {
+            // j = 0..31
+            float freq_exponent = static_cast<float>(j) / static_cast<float>(half_dh);
+            float freq = std::pow(theta, freq_exponent);
+            float _sin = std::sin(static_cast<float>(i) / freq);
+            
+            size_t idx = i * half_dh + j;
+            if (meta->dtype == INFINI_DTYPE_BF16) {
+                ((uint16_t *)table)[idx] = f32_to_bf16(_sin);
+            } else if (meta->dtype == INFINI_DTYPE_F32) {
+                ((float *)table)[idx] = _sin;
+            }
+        }
+    }
+    // ... (Tensor 创建代码同上)
+    auto shape = std::vector<size_t>({meta->max_seq_len, half_dh});
+    auto tensor = Tensor::weight(table, meta->dtype, shape);
+    std::free(table);
+    return tensor;
+}
+
+// Cos 表同理，完全标准逻辑
+inline std::shared_ptr<Tensor> getCosTable(const Qwen3MoEAttentionMeta *meta) {
+    auto half_dh = meta->head_dim / 2;
+    auto unit = dsize(meta->dtype);
+    void *table = std::malloc(meta->max_seq_len * half_dh * unit);
+    float theta = meta->rope_theta;
+
+    // 标准 Full RoPE 生成逻辑
+    for (size_t i = 0; i < meta->max_seq_len; i++) {
+        for (size_t j = 0; j < half_dh; j++) {
+            // j = 0..31
+            float freq_exponent = static_cast<float>(j) / static_cast<float>(half_dh);
+            float freq = std::pow(theta, freq_exponent);
+            float _cos = std::cos(static_cast<float>(i) / freq);
+            
+            size_t idx = i * half_dh + j;
+            if (meta->dtype == INFINI_DTYPE_BF16) {
+                ((uint16_t *)table)[idx] = f32_to_bf16(_cos);
+            } else if (meta->dtype == INFINI_DTYPE_F32) {
+                ((float *)table)[idx] = _cos;
+            }
+        }
+    }
+    auto shape = std::vector<size_t>({meta->max_seq_len, half_dh});
+    auto tensor = Tensor::weight(table, meta->dtype, shape);
+    std::free(table);
+    return tensor;
+}
+
+// 恢复 Norm 权重形状
+inline std::shared_ptr<Tensor> getQNorm(const Qwen3MoEAttentionMeta *meta) {
+    auto shape = std::vector<size_t>({meta->head_dim}); // 128
+    return Tensor::weight(nullptr, meta->dtype, shape);
+}
+
+inline std::shared_ptr<Tensor> getKNorm(const Qwen3MoEAttentionMeta *meta) {
+    //std::cout<<"head dim"<<meta->head_dim<<std::endl;
+    auto shape = std::vector<size_t>({meta->head_dim}); // 128
+    return Tensor::weight(nullptr, meta->dtype, shape);
+}
+// ==================== 构造函数 ====================
+
+Qwen3MoEWeights::Qwen3MoEWeights(
+    const Qwen3MoEAttentionMeta *meta,
+    infiniDevice_t device,
+    int ndev,
+    const int *dev_ids) {
+    
+    device_weights = std::vector<std::shared_ptr<Qwen3DeviceWeights>>(ndev);
+    
+    // 假设只有1层attention
+    size_t nlayer = 1;
+    
+    // 计算本地头数 (Tensor Parallel)
+    size_t local_num_heads = meta->num_heads / ndev;
+    size_t local_num_kv_heads = meta->num_kv_head / ndev;
+    
+    for (int dev = 0; dev < ndev; dev++) {
+        int dev_id = dev_ids[dev];
+        RUN_INFINI(infinirtSetDevice(device, dev_id));
+        device_weights[dev] = std::make_shared<Qwen3DeviceWeights>();
+        device_weights[dev]->device = device;
+        device_weights[dev]->dev_id = dev_id;
+        RUN_INFINI(infinirtStreamCreate(&device_weights[dev]->load_stream));
+
+        // 初始化RoPE表
+        device_weights[dev]->sin_table = getSinTable(meta);
+        device_weights[dev]->cos_table = getCosTable(meta);
+
+        // 初始化layers
+        device_weights[dev]->w_layers = std::vector<Qwen3LayerWeight>(nlayer);
+        
+        for (size_t layer = 0; layer < nlayer; layer++) {
+            auto attn_weight = std::make_shared<Qwen3AttentionWeight>();
+            
+            // Pre-Norm
+            attn_weight->attn_norm = getAttnNorm(meta);
+            
+            // Q/K/V投影（GQA + Tensor Parallel）
+            // 注意：这里 out_dim 使用本地头数计算
+            size_t q_out_dim = local_num_heads * meta->head_dim;
+            size_t kv_out_dim = local_num_kv_heads * meta->head_dim;
+            
+            // 【修改点】改为使用 getLinear 初始化普通 BF16 Tensor
+            attn_weight->q_proj = getLinear(meta, meta->hidden_size, q_out_dim);
+            attn_weight->k_proj = getLinear(meta, meta->hidden_size, kv_out_dim);
+            attn_weight->v_proj = getLinear(meta, meta->hidden_size, kv_out_dim);
+            
+            // QK Norm
+            attn_weight->q_norm = getQNorm(meta); 
+            attn_weight->k_norm = getKNorm(meta);
+            
+            // Output投影
+            // 注意：Output Proj 输入维度切分，输出维度完整 (Row Parallel 归约)
+            // 这里为了简化加载逻辑，我们暂时假设它也是普通 Linear
+            attn_weight->o_proj = getLinear(meta, q_out_dim, meta->hidden_size);
+            
+            device_weights[dev]->w_layers[layer].self_attn = attn_weight;
+        }
+    }
+}
+
+// ==================== 权重加载函数 (移除 Scale/Zero 参数) ====================
+
+// 加载Attention Norm
+void load_attn_norm(Qwen3MoEWeights *weights, void *cpu_ptr, size_t layer_id) {
+    for (int dev = 0; dev < int(weights->device_weights.size()); dev++) {
+        auto weight = weights->device_weights[dev];
+        RUN_INFINI(infinirtSetDevice(weight->device, weight->dev_id));
+        weight->w_layers[layer_id].self_attn->attn_norm->load(cpu_ptr, weight->load_stream);
+    }
+}
+
+// 加载Q投影
+// 【修改点】去掉了 scale_ptr, zero_ptr
+void load_attn_q_proj(Qwen3MoEWeights *weights, void *weight_ptr, size_t layer_id) {
+    for (int dev = 0; dev < int(weights->device_weights.size()); dev++) {
+        auto weight = weights->device_weights[dev];
+        RUN_INFINI(infinirtSetDevice(weight->device, weight->dev_id));
+        
+        auto q_proj = weight->w_layers[layer_id].self_attn->q_proj;
+        load_dist_linear(weight_ptr, q_proj, weights->device_weights.size(), dev, weight->load_stream);
+    }
+}
+
+// 加载K投影
+void load_attn_k_proj(Qwen3MoEWeights *weights, void *weight_ptr, size_t layer_id) {
+    for (int dev = 0; dev < int(weights->device_weights.size()); dev++) {
+        auto weight = weights->device_weights[dev];
+        RUN_INFINI(infinirtSetDevice(weight->device, weight->dev_id));
+        
+        auto k_proj = weight->w_layers[layer_id].self_attn->k_proj;
+        load_dist_linear(weight_ptr, k_proj, weights->device_weights.size(), dev, weight->load_stream);
+    }
+}
+
+// 加载V投影
+void load_attn_v_proj(Qwen3MoEWeights *weights, void *weight_ptr, size_t layer_id) {
+    for (int dev = 0; dev < int(weights->device_weights.size()); dev++) {
+        auto weight = weights->device_weights[dev];
+        RUN_INFINI(infinirtSetDevice(weight->device, weight->dev_id));
+        
+        auto v_proj = weight->w_layers[layer_id].self_attn->v_proj;
+        load_dist_linear(weight_ptr, v_proj, weights->device_weights.size(), dev, weight->load_stream);
+    }
+}
+
+// 加载Q Norm
+void load_attn_q_norm(Qwen3MoEWeights *weights, void *cpu_ptr, size_t layer_id) {
+    for (int dev = 0; dev < int(weights->device_weights.size()); dev++) {
+        auto weight = weights->device_weights[dev];
+        RUN_INFINI(infinirtSetDevice(weight->device, weight->dev_id));
+        weight->w_layers[layer_id].self_attn->q_norm->load(cpu_ptr, weight->load_stream);
+    }
+}
+
+// 加载K Norm
+void load_attn_k_norm(Qwen3MoEWeights *weights, void *cpu_ptr, size_t layer_id) {
+    for (int dev = 0; dev < int(weights->device_weights.size()); dev++) {
+        auto weight = weights->device_weights[dev];
+        RUN_INFINI(infinirtSetDevice(weight->device, weight->dev_id));
+        weight->w_layers[layer_id].self_attn->k_norm->load(cpu_ptr, weight->load_stream);
+    }
+}
+
+// 加载Output投影
+void load_attn_o_proj(Qwen3MoEWeights *weights, void *weight_ptr, size_t layer_id) {
+    for (int dev = 0; dev < int(weights->device_weights.size()); dev++) {
+        auto weight = weights->device_weights[dev];
+        RUN_INFINI(infinirtSetDevice(weight->device, weight->dev_id));
+        
+        auto o_proj = weight->w_layers[layer_id].self_attn->o_proj;
+        load_dist_linear(weight_ptr, o_proj, weights->device_weights.size(), dev, weight->load_stream);
+    }
+}
+
+// 创建权重加载器
+// 【修改点】结构体定义需要去对应修改头文件，这里只填入函数指针
+static Qwen3MoEWeightLoader weight_loader = {
+    .load_attn_norm = load_attn_norm,
+    .load_attn_q_proj = load_attn_q_proj,
+    .load_attn_k_proj = load_attn_k_proj,
+    .load_attn_v_proj = load_attn_v_proj,
+    .load_attn_q_norm = load_attn_q_norm,
+    .load_attn_k_norm = load_attn_k_norm,
+    .load_attn_o_proj = load_attn_o_proj,
+};
+
+__C __export Qwen3MoEWeights *
+createQwen3MoEWeights(const Qwen3MoEAttentionMeta *meta,
+                      infiniDevice_t device,
+                      int ndev,
+                      const int *dev_ids) {
+    auto weights = new Qwen3MoEWeights(meta, device, ndev, dev_ids);
+    return weights;
+}
+
+__C __export Qwen3MoEWeightLoader *
+createQwen3MoEWeightLoader() {
+    return &weight_loader;
+}
\ No newline at end of file
diff --git a/src/models/deepseek_v3/deepseek_v3.cpp b/src/models/deepseek_v3/deepseek_v3.cpp
index 2c463035..8292d20b 100644
--- a/src/models/deepseek_v3/deepseek_v3.cpp
+++ b/src/models/deepseek_v3/deepseek_v3.cpp
@@ -8,7 +8,6 @@
 #include <random>
 #include <thread>
 #include <vector>
-
 void createDeviceResource(DeepSeekV3DeviceResource *rsrc, const DeepSeekV3Meta *meta,
                           std::shared_ptr<DeepSeekV3DeviceWeights> weights,
                           infiniDevice_t device, int idev,
diff --git a/src/models/deepseek_v3/deepseek_v3_cache.cpp b/src/models/deepseek_v3/deepseek_v3_cache.cpp
index a177fd8c..6750f19e 100644
--- a/src/models/deepseek_v3/deepseek_v3_cache.cpp
+++ b/src/models/deepseek_v3/deepseek_v3_cache.cpp
@@ -1,5 +1,6 @@
 #include "deepseek_v3_impl.hpp"
 
+
 __C struct DeepSeekV3Cache *
 createDeepSeekV3Cache(const struct DeepSeekV3Model *model) {
     DeepSeekV3Cache *cache = new DeepSeekV3Cache();
diff --git a/src/models/deepseek_v3/deepseek_v3_impl.hpp b/src/models/deepseek_v3/deepseek_v3_impl.hpp
index d4751074..aeadefae 100644
--- a/src/models/deepseek_v3/deepseek_v3_impl.hpp
+++ b/src/models/deepseek_v3/deepseek_v3_impl.hpp
@@ -12,105 +12,106 @@
 #include <thread>
 #include <vector>
 
-struct QuantLinearWeight {
-    std::shared_ptr<Tensor> w;
-    std::shared_ptr<Tensor> s;
-    std::shared_ptr<Tensor> z;
-};
-
-struct MLAWeight {
-    std::shared_ptr<Tensor> kv_a_norm, q_a_norm;
-    std::shared_ptr<QuantLinearWeight> kv_a_proj, kv_b_proj, o_proj, q_a_proj, q_b_proj;
-};
-
-struct GateWeight {
-    std::shared_ptr<Tensor> w;
-    std::shared_ptr<Tensor> b;
-};
-
-struct MLPWeight {
-    std::shared_ptr<QuantLinearWeight> gate, up, down;
-};
-
-struct LayerWeight {
-    std::shared_ptr<Tensor> mla_norm;
-    std::shared_ptr<MLAWeight> mla;
-    std::shared_ptr<Tensor> mlp_norm;
-    std::shared_ptr<MLPWeight> dense_mlp;
-    std::shared_ptr<GateWeight> route;
-    std::shared_ptr<MLPWeight> share_expert;
-    std::vector<std::shared_ptr<MLPWeight>> experts;
-};
-
-struct DeepSeekV3DeviceWeights {
-    std::shared_ptr<Tensor> w_in_embd, w_out_norm, w_out_embd, sin_table,
-        cos_table;
-    std::vector<LayerWeight> w_layers;
-    infiniDevice_t device;
-    int dev_id;
-    infinirtStream_t load_stream;
-};
-
-struct DeepSeekV3Weights {
-    std::vector<std::shared_ptr<DeepSeekV3DeviceWeights>> device_weights;
-
-    DeepSeekV3Weights(const DeepSeekV3Meta *meta,
-                      infiniDevice_t device,
-                      int ndev,
-                      const int *dev_ids);
-};
-
-struct DeepSeekV3DeviceResource {
-    // Device
-    infiniDevice_t device;
-    int device_id;
-    infiniopHandle_t handle;
-    // Weights
-    std::shared_ptr<DeepSeekV3DeviceWeights> weights;
-    // Streams
-    infinirtStream_t stream;
-    // Communicator
-    infinicclComm_t comm;
-
-    std::shared_ptr<MemoryPool> memory_pool;
-};
-
-struct InferState {
-    std::mutex mtx;
-    std::condition_variable cv_load, cv_start, cv_done;
-    bool loaded = false;
-    bool proceed = false;
-    bool exit_flag = false;
-};
-
-struct InferRequest {
-    const uint32_t *tokens;
-    uint32_t ntok;
-    const uint32_t *req_lens;
-    uint32_t nreq;
-    const uint32_t *req_pos;
-    struct DeepSeekV3Cache **kv_caches;
-    const float *temperature;
-    const uint32_t *topk;
-    const float *topp;
-    uint32_t *output;
-    void *logits;
-};
-
-struct DeepSeekV3Model {
-    DeepSeekV3Meta meta;
-    infiniDevice_t device;
-    std::vector<int> dev_ids;
-    std::vector<DeepSeekV3DeviceResource> dev_resources;
-    std::vector<InferState> states;
-    std::vector<std::thread> threads;
-    InferRequest req;
-
-    DeepSeekV3Model(const DeepSeekV3Meta *, const DeepSeekV3Weights *weights);
-};
-
-struct DeepSeekV3Cache {
-    std::vector<std::vector<std::shared_ptr<Tensor>>> kv_pass, k_rot;
-};
+    struct QuantLinearWeight {
+        std::shared_ptr<Tensor> w;
+        std::shared_ptr<Tensor> s;
+        std::shared_ptr<Tensor> z;
+    };
+    
+    struct MLAWeight {
+        std::shared_ptr<Tensor> kv_a_norm, q_a_norm;
+        std::shared_ptr<QuantLinearWeight> kv_a_proj, kv_b_proj, o_proj, q_a_proj, q_b_proj;
+    };
+    
+    struct GateWeight {
+        std::shared_ptr<Tensor> w;
+        std::shared_ptr<Tensor> b;
+    };
+    
+    struct MLPWeight {
+        std::shared_ptr<QuantLinearWeight> gate, up, down;
+    };
+    
+    struct LayerWeight {
+        std::shared_ptr<Tensor> mla_norm;
+        std::shared_ptr<MLAWeight> mla;
+        std::shared_ptr<Tensor> mlp_norm;
+        std::shared_ptr<MLPWeight> dense_mlp;
+        std::shared_ptr<GateWeight> route;
+        std::shared_ptr<MLPWeight> share_expert;
+        std::vector<std::shared_ptr<MLPWeight>> experts;
+    };
+    
+    struct DeepSeekV3DeviceWeights {
+        std::shared_ptr<Tensor> w_in_embd, w_out_norm, w_out_embd, sin_table,
+            cos_table;
+        std::vector<LayerWeight> w_layers;
+        infiniDevice_t device;
+        int dev_id;
+        infinirtStream_t load_stream;
+    };
+    
+    struct DeepSeekV3Weights {
+        std::vector<std::shared_ptr<DeepSeekV3DeviceWeights>> device_weights;
+    
+        DeepSeekV3Weights(const DeepSeekV3Meta *meta,
+                          infiniDevice_t device,
+                          int ndev,
+                          const int *dev_ids);
+    };
+    
+    struct DeepSeekV3DeviceResource {
+        // Device
+        infiniDevice_t device;
+        int device_id;
+        infiniopHandle_t handle;
+        // Weights
+        std::shared_ptr<DeepSeekV3DeviceWeights> weights;
+        // Streams
+        infinirtStream_t stream;
+        // Communicator
+        infinicclComm_t comm;
+    
+        std::shared_ptr<MemoryPool> memory_pool;
+    };
+    
+    struct InferState {
+        std::mutex mtx;
+        std::condition_variable cv_load, cv_start, cv_done;
+        bool loaded = false;
+        bool proceed = false;
+        bool exit_flag = false;
+    };
+    
+    struct InferRequest {
+        const uint32_t *tokens;
+        uint32_t ntok;
+        const uint32_t *req_lens;
+        uint32_t nreq;
+        const uint32_t *req_pos;
+        struct DeepSeekV3Cache **kv_caches;
+        const float *temperature;
+        const uint32_t *topk;
+        const float *topp;
+        uint32_t *output;
+        void *logits;
+    };
+    
+    struct DeepSeekV3Model {
+        DeepSeekV3Meta meta;
+        infiniDevice_t device;
+        std::vector<int> dev_ids;
+        std::vector<DeepSeekV3DeviceResource> dev_resources;
+        std::vector<InferState> states;
+        std::vector<std::thread> threads;
+        InferRequest req;
+    
+        DeepSeekV3Model(const DeepSeekV3Meta *, const DeepSeekV3Weights *weights);
+    };
+    
+    struct DeepSeekV3Cache {
+        std::vector<std::vector<std::shared_ptr<Tensor>>> kv_pass, k_rot;
+    };
+    
 
 #endif
diff --git a/src/models/deepseek_v3/deepseek_v3_weight.cpp b/src/models/deepseek_v3/deepseek_v3_weight.cpp
index 846af633..d55acc44 100644
--- a/src/models/deepseek_v3/deepseek_v3_weight.cpp
+++ b/src/models/deepseek_v3/deepseek_v3_weight.cpp
@@ -1,7 +1,6 @@
 #include "deepseek_v3_impl.hpp"
 
 #include <cmath>
-
 inline std::shared_ptr<Tensor> getInEmbd(
     const DeepSeekV3Meta *meta) {
     auto shape = std::vector<size_t>({meta->dvoc, meta->d});
diff --git a/src/models/inference_context.cpp b/src/models/inference_context.cpp
index db5fda11..2a936db0 100644
--- a/src/models/inference_context.cpp
+++ b/src/models/inference_context.cpp
@@ -64,6 +64,21 @@ void InferenceContext::gemm(std::shared_ptr<Tensor> c,
 
     infiniopGemmDescriptor_t desc;
     if (!cache_manager->getGemmDescriptor(key, desc)) {
+        // Debug: print tensor metadata to help diagnose descriptor creation errors
+        auto print_tensor = [&](const std::shared_ptr<Tensor>& t, const char* name) {
+
+            auto s = t->strides();
+            
+        };
+
+        try {
+            print_tensor(c, "C");
+            print_tensor(a, "A");
+            print_tensor(b, "B");
+        } catch (...) {
+            std::cout << "[InferenceContext::gemm] Failed to print tensor metadata" << std::endl;
+        }
+
         RUN_INFINI(infiniopCreateGemmDescriptor(op_handle, &desc, c->desc(), a->desc(), b->desc()));
         cache_manager->putGemmDescriptor(key, desc);
     }
diff --git a/test/models/qwen3_moe/attention_test.py b/test/models/qwen3_moe/attention_test.py
index 26f66e40..ef0d8d3c 100644
--- a/test/models/qwen3_moe/attention_test.py
+++ b/test/models/qwen3_moe/attention_test.py
@@ -1,484 +1,457 @@
 import os
 import time
 import sys
+import json
 import safetensors
 import torch
+import numpy as np
+import ctypes
+from ctypes import byref, POINTER, c_int, c_float, c_void_p, c_size_t, Structure
 from transformers import AutoConfig
 from transformers import DynamicCache
 from transformers.models import qwen3_moe
 
-WARMUPS = 10
-RUNS = 100
-PREFILL_TESTCASES = {"seqlens": [64, 128, 256, 256], "pastlens": [512, 0, 0, 256]}
+# ==============================================================================
+# 1. Ctypes Setup
+# ==============================================================================
+SO_PATH = "build/linux/x86_64/release/libinfinicore_infer.so"
+if not os.path.exists(SO_PATH):
+    SO_PATH = os.path.expanduser("~/.infini/lib/libinfinicore_infer.so")
+
+if not os.path.exists(SO_PATH):
+    print(f"Warning: Cannot find libinfinicore_infer.so at {SO_PATH}.")
+    LIB_INFINILM = None
+else:
+    LIB_INFINILM = ctypes.CDLL(SO_PATH)
+
+class DataType:
+    INFINI_DTYPE_BF16 = 19
+
+class DeviceType:
+    DEVICE_TYPE_NVIDIA = 1
+
+class Qwen3MoEAttentionMetaCStruct(Structure):
+    _fields_ = [
+        ("dtype", c_int),
+        ("hidden_size", c_size_t),
+        ("num_heads", c_size_t),
+        ("num_kv_head", c_size_t),
+        ("head_dim", c_size_t),
+        ("rope_theta", c_float),
+        ("max_seq_len", c_size_t),
+        ("rms_norm_eps", c_float),
+    ]
+
+class Qwen3MoEWeightLoader(Structure):
+    _fields_ = [
+        ("load_attn_norm", ctypes.CFUNCTYPE(None, c_void_p, c_void_p, c_size_t)),
+        ("load_attn_q_proj", ctypes.CFUNCTYPE(None, c_void_p, c_void_p, c_size_t)),
+        ("load_attn_k_proj", ctypes.CFUNCTYPE(None, c_void_p, c_void_p, c_size_t)),
+        ("load_attn_v_proj", ctypes.CFUNCTYPE(None, c_void_p, c_void_p, c_size_t)),
+        ("load_attn_q_norm", ctypes.CFUNCTYPE(None, c_void_p, c_void_p, c_size_t)),
+        ("load_attn_k_norm", ctypes.CFUNCTYPE(None, c_void_p, c_void_p, c_size_t)),
+        ("load_attn_o_proj", ctypes.CFUNCTYPE(None, c_void_p, c_void_p, c_size_t)),
+    ]
+
+class Qwen3MoEAttention(Structure): pass
+class Qwen3MoEWeights(Structure): pass
+class Qwen3Cache(Structure): pass
+
+if LIB_INFINILM:
+    LIB_INFINILM.createQwen3MoEWeights.restype = POINTER(Qwen3MoEWeights)
+    LIB_INFINILM.createQwen3MoEWeightLoader.restype = POINTER(Qwen3MoEWeightLoader)
+    LIB_INFINILM.createQwen3MoEAttention.restype = POINTER(Qwen3MoEAttention)
+    LIB_INFINILM.createQwen3Cache.restype = POINTER(Qwen3Cache)
+    LIB_INFINILM.createQwen3Cache.argtypes = [POINTER(Qwen3MoEAttentionMetaCStruct), c_size_t, c_size_t]
+
+    LIB_INFINILM.forwardQwen3MoEAttention.argtypes = [
+        POINTER(Qwen3MoEAttention), POINTER(Qwen3Cache),
+        c_void_p, c_void_p, c_int, POINTER(c_int), POINTER(c_int), POINTER(c_int)
+    ]
+    LIB_INFINILM.injectQwen3CacheKV.argtypes = [
+        POINTER(Qwen3MoEAttention), POINTER(Qwen3Cache),
+        c_int, c_int, c_int, c_void_p, c_void_p
+    ]
+
+global_tensor_keepalive = []
+
+def get_ptr(numpy_array):
+    if not numpy_array.flags['C_CONTIGUOUS']:
+        numpy_array = np.ascontiguousarray(numpy_array)
+    ptr = numpy_array.ctypes.data_as(c_void_p)
+    global_tensor_keepalive.append(numpy_array)
+    return ptr
+
+# ==============================================================================
+# 2. InfiniLM Wrapper
+# ==============================================================================
+class InfiniLMWrapper:
+    def __init__(self, config, torch_model, device_id=0):
+        if not LIB_INFINILM: raise RuntimeError("Library not loaded")
+        
+        # [TRUTH] 物理真值是 128
+        self.real_hidden = config.hidden_size 
+        real_head_dim = 128
+        
+        self.meta = Qwen3MoEAttentionMetaCStruct(
+            dtype=DataType.INFINI_DTYPE_BF16, 
+            hidden_size=config.hidden_size, 
+            num_heads=config.num_attention_heads,
+            num_kv_head=config.num_key_value_heads,
+            head_dim=real_head_dim,
+            rope_theta=config.rope_theta,
+            max_seq_len=8192,
+            rms_norm_eps=config.rms_norm_eps
+        )
+        self.weights_handle = LIB_INFINILM.createQwen3MoEWeights(byref(self.meta), DeviceType.DEVICE_TYPE_NVIDIA, 1, (c_int * 1)(device_id))
+        self.loader = LIB_INFINILM.createQwen3MoEWeightLoader()
+        self._load_weights(torch_model)
+        self.attn_ctx = LIB_INFINILM.createQwen3MoEAttention(byref(self.meta), self.weights_handle)
+        self.kv_cache = LIB_INFINILM.createQwen3Cache(byref(self.meta), 0, 0)
+    
+    def _load_weights(self, model):
+        def load(tensor, loader_func, transpose=False):
+            if tensor is None: return
+            w_pt = tensor.detach().to(torch.float32)
+            if transpose: w_pt = w_pt.t()
+            w_bf16 = w_pt.to(torch.bfloat16).view(torch.int16).cpu().numpy()
+            loader_func(self.weights_handle, get_ptr(w_bf16), 0)
+
+        load(model.q_proj.weight, self.loader.contents.load_attn_q_proj, transpose=True)
+        load(model.k_proj.weight, self.loader.contents.load_attn_k_proj, transpose=True)
+        load(model.v_proj.weight, self.loader.contents.load_attn_v_proj, transpose=True)
+        load(model.o_proj.weight, self.loader.contents.load_attn_o_proj, transpose=True)
+        
+        if hasattr(model, 'q_norm') and model.q_norm is not None:
+            load(model.q_norm.weight, self.loader.contents.load_attn_q_norm, transpose=False)
+        if hasattr(model, 'k_norm') and model.k_norm is not None:
+            load(model.k_norm.weight, self.loader.contents.load_attn_k_norm, transpose=False)
+
+    def inject_cache(self, layer_id, batch_idx, k_torch, v_torch):
+        """
+        将 PyTorch 的 KV Cache (BFloat16) 注入到 InfiniLM 的 Cache 中
+        k_torch, v_torch shape: [num_kv_heads, past_len, head_dim]
+        """
+        if k_torch is None or v_torch is None: return
+        
+        # 转换为 numpy int16 (模拟 bf16) 且保证 C 连续
+        k_np = k_torch.detach().cpu().view(torch.int16).numpy().copy(order='C')
+        v_np = v_torch.detach().cpu().view(torch.int16).numpy().copy(order='C')
+        past_len = k_np.shape[1] 
+        
+        LIB_INFINILM.injectQwen3CacheKV(
+            self.attn_ctx, self.kv_cache,
+            c_int(layer_id), c_int(batch_idx), c_int(past_len),
+            get_ptr(k_np), get_ptr(v_np)
+        )
 
-DECODE_TESTCASES = {
-    "seqlens": [1 for _ in range(16)],
-    "pastlens": [50 for _ in range(4)]
-    + [100 for _ in range(4)]
-    + [200 for _ in range(4)]
-    + [400 for _ in range(4)],
-}
+    def forward(self, input_bf16_np, batch_size, seq_lens, past_lens, pos_ids, return_raw=False):
+        q_out_dim = self.meta.num_heads * self.meta.head_dim 
+        out_dim = q_out_dim if return_raw else self.real_hidden
+        output = np.zeros((input_bf16_np.shape[0], out_dim), dtype=np.int16)
+        
+        LIB_INFINILM.forwardQwen3MoEAttention(
+            self.attn_ctx, self.kv_cache, 
+            get_ptr(input_bf16_np), get_ptr(output),
+            c_int(batch_size), (c_int*batch_size)(*seq_lens), 
+            (c_int*batch_size)(*past_lens), (c_int*len(pos_ids))(*pos_ids)
+        )
+        return output
 
+# ==============================================================================
+# 3. Utilities
+# ==============================================================================
+WARMUPS = 10
+RUNS = 100
+PREFILL_TESTCASES = {"seqlens": [64,128,256,256], "pastlens": [512,0,0,256]}
+DECODE_TESTCASES = {"seqlens": [1] * 16, "pastlens": [504]*4 + [1004]*4 + [2004]*4 + [4004]*4}
 
 def get_args():
     import argparse
-
-    parser = argparse.ArgumentParser(description="Test Operator")
-    parser.add_argument(
-        "--model_path",
-        action="store",
-        help="The directory of the model to be tested",
-    )
-
-    parser.add_argument(
-        "--cpu",
-        action="store_true",
-        help="Run cpu test",
-    )
-
-    parser.add_argument(
-        "--nvidia",
-        action="store_true",
-        help="Run nvidia test",
-    )
-
-    parser.add_argument(
-        "--metax",
-        action="store_true",
-        help="Run metax test",
-    )
-    parser.add_argument(
-        "--moore",
-        action="store_true",
-        help="Run moore test",
-    )
-    parser.add_argument(
-        "--iluvatar",
-        action="store_true",
-        help="Run iluvatar test",
-    )
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--model_path", required=True)
+    parser.add_argument("--nvidia", action="store_true")
     return parser.parse_args()
 
+def create_Qwen3attention_torch(dir_path, device, dtype=torch.bfloat16):
+    config = AutoConfig.from_pretrained(dir_path)
 
-def torch_synchronize(_device):
-    if _device == "cuda":
-        torch.cuda.synchronize()
-    elif _device == "musa":
-        torch.musa.synchronize()
-
-
-def torch_empty_cache(_device):
-    if _device == "cuda":
-        torch.cuda.empty_cache()
-    elif _device == "musa":
-        torch.musa.empty_cache()
-
+    real_head_dim = 128 
+    config.head_dim = real_head_dim 
 
-def create_Qwen3attention_torch(dir_path, *, device, dtype=torch.bfloat16):
-    config = AutoConfig.from_pretrained(dir_path)
     config.num_hidden_layers = 1
     config._attn_implementation = "sdpa"
-
-    # --------------------------------------------------------------------------------#
-    #                创建只包含 attention的模型
-    # --------------------------------------------------------------------------------#
-    model = qwen3_moe.modeling_qwen3_moe.Qwen3MoeAttention(config, layer_idx=0).to(
-        device=device, dtype=dtype
-    )
+    
+    model = qwen3_moe.modeling_qwen3_moe.Qwen3MoeAttention(config, layer_idx=0).to(device=device, dtype=dtype)
+    
     tensors = {}
     for fname in sorted(os.listdir(dir_path)):
-        if not fname.endswith(".safetensors"):
-            continue
-        fpath = os.path.join(dir_path, fname)
-        with safetensors.safe_open(fpath, framework="pt") as f:
+        if not fname.endswith(".safetensors"): continue
+        with safetensors.safe_open(os.path.join(dir_path, fname), framework="pt") as f:
             for key in f.keys():
                 if "model.layers.0.self_attn." in key:
                     tensors[key[len("model.layers.0.self_attn.") :]] = f.get_tensor(key)
         break
-    model.load_state_dict(tensors)
-
-    # --------------------------------------------------------------------------------#
-    #                创建 rotary_emb 类
-    # --------------------------------------------------------------------------------#
-    rotary_emb = qwen3_moe.modeling_qwen3_moe.Qwen3MoeRotaryEmbedding(
-        config, device=device
-    )
-    return model, rotary_emb
-
-
-def generate_attention_input_torch(
-    model, rotary_emb, testcase, device, dtype=torch.bfloat16
-):
+    
+    model.load_state_dict(tensors, strict=False)
+    
+    if model.q_proj.bias is not None: torch.nn.init.zeros_(model.q_proj.bias)
+    if model.k_proj.bias is not None: torch.nn.init.zeros_(model.k_proj.bias)
+    if model.v_proj.bias is not None: torch.nn.init.zeros_(model.v_proj.bias)
+    if model.o_proj.bias is not None: torch.nn.init.zeros_(model.o_proj.bias)
+
+    rotary_emb = qwen3_moe.modeling_qwen3_moe.Qwen3MoeRotaryEmbedding(config, device=device)
+    return model, rotary_emb, config
+
+def prepare_inputs(model, testcase, device, dtype):
     config = model.config
-    hidden_size = config.hidden_size  # 2048
-    head_dim = config.head_dim  # 128
-    num_key_value_heads = config.num_key_value_heads
     bs = 1
-
     req_list = []
+    
     for seq_lens, past_lens in zip(testcase["seqlens"], testcase["pastlens"]):
-        hidden_states = torch.rand(
-            (bs, seq_lens, hidden_size), device=device, dtype=dtype
-        )
-
-        attention_mask = None
-
+        hidden_states = torch.rand((bs, seq_lens, config.hidden_size), device=device, dtype=dtype)
         past_key_values = DynamicCache(config=config)
-        key_states = torch.rand(
-            (bs, num_key_value_heads, past_lens, head_dim), device=device, dtype=dtype
-        )
-        value_states = torch.rand(
-            (bs, num_key_value_heads, past_lens, head_dim), device=device, dtype=dtype
-        )
-        past_key_values.update(key_states, value_states, 0)
-
-        req = {
-            "hidden_states": hidden_states,
-            "attention_mask": attention_mask,
-            "past_key_values": past_key_values,
-        }
-        req_list.append(req)
-
-    return req_list
-
-
-def benchmark_Qwen3attention_prefill_torch(
-    model, rotary_emb, test_cases, device, dtype=torch.bfloat16
-):
-    """
-    Test Qwen3attention.
-
-    """
-    req_list = generate_attention_input_torch(
-        model, rotary_emb, test_cases, device, dtype=dtype
-    )
-    req_out_list = []
-    for req in req_list:
-        # ----------------------------------------- #
-        #         获得每个req的数据
-        # ----------------------------------------- #
-        hidden_states = req["hidden_states"]
-        attention_mask = req["attention_mask"]
-        past_key_values = req["past_key_values"]
-
-        # ----------------------------------------- #
-        #         计算当前所需的sin_table，sin_table
-        # ----------------------------------------- #
-        cache_lens = past_key_values.get_seq_length()  # kv cache 现在的长度
-        bs, seq_len, _ = hidden_states.shape
-
-        position_ids = torch.arange(
-            cache_lens, cache_lens + seq_len, dtype=torch.int64, device=device
-        ).reshape((bs, seq_len))
-
-        cos_table, sin_table = rotary_emb(hidden_states, position_ids)
-        position_embeddings = (sin_table, cos_table)
-
-        # ----------------------------------------- #
-        #            计算一次
-        # ----------------------------------------- #
-        output_device, _ = model(
-            hidden_states=hidden_states,
-            position_embeddings=position_embeddings,
-            attention_mask=attention_mask,
-            past_key_values=past_key_values,
-        )
-
-        # ----------------------------------------- #
-        #            得到结果，存储下来
-        # ----------------------------------------- #
-        output_host = output_device.to("cpu")
-        req_out_list.append(output_host)
-
-    torch_synchronize(device)
-
+        
+        # [CRITICAL] 恢复为 torch.rand！
+        # 现在我们通过 inject_cache 保证 C++ 拿到完全一样的随机数
+        if past_lens > 0:
+            k = torch.rand((bs, config.num_key_value_heads, past_lens, config.head_dim), device=device, dtype=dtype)
+            v = torch.rand((bs, config.num_key_value_heads, past_lens, config.head_dim), device=device, dtype=dtype)
+            past_key_values.update(k, v, 0)
+        req_list.append({"hidden_states": hidden_states, "attention_mask": None, "past_key_values": past_key_values})
+
+    all_hs = [req["hidden_states"].squeeze(0) for req in req_list]
+    flat_input = torch.cat(all_hs, dim=0) 
+    
+    input_np = flat_input.cpu().view(torch.int16).numpy().copy(order='C')
+    
+    seq_lens = testcase["seqlens"]
+    past_lens = testcase["pastlens"]
+    pos_ids = []
+    for s, p in zip(seq_lens, past_lens):
+        pos_ids.extend(range(p, p+s))
+        
+    return req_list, input_np, seq_lens, past_lens, pos_ids
+
+def check_correctness_prefill(torch_outs, infinilm_out_np, device):
+    if not torch_outs:
+        print("❌ Error: Torch Output is empty.")
+        return
+
+    torch_flat = torch.cat([out.float().view(-1, out.shape[-1]) for out in torch_outs], dim=0).to("cpu")
+    
+    infini_tensor_int16 = torch.from_numpy(infinilm_out_np)
+    infini_flat = infini_tensor_int16.view(torch.bfloat16).float().view(-1, torch_flat.shape[-1])
+
+    cos_sim = torch.nn.functional.cosine_similarity(torch_flat, infini_flat, dim=-1).mean().item()
+    print(f"Cosine Similarity: {cos_sim:.6f}")
+    
+    if cos_sim > 0.98: print("✅ Result Match")
+    else: print("❌ Result Mismatch")
+
+def check_correctness_decode(torch_outs, infinilm_out_np, device):
+    if not torch_outs:
+        print("❌ Error: Torch Output is empty.")
+        return
+
+    torch_flat = torch.cat([out.float().view(-1, out.shape[-1]) for out in torch_outs], dim=0).to("cpu")
+    
+    infini_tensor_int16 = torch.from_numpy(infinilm_out_np)
+    infini_flat = infini_tensor_int16.view(torch.bfloat16).float().view(-1, torch_flat.shape[-1])
+
+    cos_sim = torch.nn.functional.cosine_similarity(torch_flat, infini_flat, dim=-1).mean().item()
+    print(f"Cosine Similarity: {cos_sim:.6f}")
+    ## for decode, 0.95 enough 
+    if cos_sim > 0.95: print("✅ Result Match")
+    else: print("❌ Result Mismatch")
+
+
+def benchmark_prefill(model, rotary_emb, infinilm_model, test_cases, device, dtype):
+    print(f"\n{'='*40} PREFILL {'='*40}")
+    req_list, input_np, seq_lens, past_lens, pos_ids = prepare_inputs(model, test_cases, device, dtype)
+    batch_size = len(seq_lens)
+    
+    # =======================================================
+    # Torch Run
+    # =======================================================
     for _ in range(WARMUPS):
         for i, req in enumerate(req_list):
-            # ----------------------------------------- #
-            #          恢复 kv chche的长度
-            # ----------------------------------------- #
-            origin_len = test_cases["pastlens"][i]
-            req["past_key_values"].crop(origin_len)
-
-        for req in req_list:
-            # ----------------------------------------- #
-            #         获得每个req的数据
-            # ----------------------------------------- #
-
-            hidden_states = req["hidden_states"]
-            attention_mask = req["attention_mask"]
-            past_key_values = req["past_key_values"]
-
-            # ----------------------------------------- #
-            #         计算当前所需的sin_table，sin_table
-            # ----------------------------------------- #
-            cache_lens = past_key_values.get_seq_length()  # kv cache 现在的长度
-            bs, seq_len, _ = hidden_states.shape
-
-            position_ids = torch.arange(
-                cache_lens, cache_lens + seq_len, dtype=torch.int64, device=device
-            ).reshape((bs, seq_len))
-
-            cos_table, sin_table = rotary_emb(hidden_states, position_ids)
-            position_embeddings = (sin_table, cos_table)
-
-            # ----------------------------------------- #
-            #            计算一次
-            # ----------------------------------------- #
-            output_device, _ = model(
-                hidden_states,
-                position_embeddings=position_embeddings,
-                attention_mask=attention_mask,
-                past_key_values=past_key_values,
-            )
-
+            req["past_key_values"].crop(past_lens[i])
+            cache_len = req["past_key_values"].get_seq_length()
+            seq_len = req["hidden_states"].shape[1]
+            pids = torch.arange(cache_len, cache_len+seq_len, device=device).reshape(1, seq_len)
+            cos, sin = rotary_emb(req["hidden_states"], pids)
+            _ = model(req["hidden_states"], position_embeddings=(sin, cos), 
+                      attention_mask=req["attention_mask"],
+                      past_key_values=req["past_key_values"])
+    torch.cuda.synchronize()
+
+    
+    torch_out_list = []
     time_consuming = 0
-    for _ in range(RUNS):
+    for run_idx in range(RUNS):
         for i, req in enumerate(req_list):
-            # ----------------------------------------- #
-            #          恢复 kv chche的长度
-            # ----------------------------------------- #
-            origin_len = test_cases["pastlens"][i]
-            req["past_key_values"].crop(origin_len)
-
-        torch_synchronize(device)
+            # 1. Reset KV Cache to initial state
+            req["past_key_values"].crop(past_lens[i])
+            cache_len = req["past_key_values"].get_seq_length()
+            seq_len = req["hidden_states"].shape[1]
+            
+            q_len = seq_len
+            k_len = cache_len + seq_len
+            past_len = cache_len
+            
+            causal_mask = torch.zeros((q_len, k_len), device=device, dtype=dtype)
+            for j in range(q_len):
+                valid_limit = past_len + j + 1
+                if valid_limit < k_len:
+                    causal_mask[j, valid_limit:] = float("-inf")
+            req["attention_mask"] = causal_mask[None, None, :, :]
         # ----------------------------------------- #
         #       重要：每个req都按整个batch的起始时间计算
         # ----------------------------------------- #
-        start_time = time.time()
-
-        for i, req in enumerate(req_list):
-            # ----------------------------------------- #
-            #         获得每个req的数据
-            # ----------------------------------------- #
-            hidden_states = req["hidden_states"]
-            attention_mask = req["attention_mask"]
-            past_key_values = req["past_key_values"]
-
-            # ----------------------------------------- #
-            #         计算当前所需的sin_table，sin_table
-            # ----------------------------------------- #
-            cache_lens = past_key_values.get_seq_length()  # kv cache 现在的长度
-            bs, seq_len, _ = hidden_states.shape
-
-            position_ids = torch.arange(
-                cache_lens, cache_lens + seq_len, dtype=torch.int64, device=device
-            ).reshape((bs, seq_len))
-
-            cos_table, sin_table = rotary_emb(hidden_states, position_ids)
-            position_embeddings = (sin_table, cos_table)
-
-            # ----------------------------------------- #
-            #            计算一次
-            # ----------------------------------------- #
-            output_device, _ = model(
-                hidden_states,
-                position_embeddings=position_embeddings,
-                attention_mask=attention_mask,
-                past_key_values=past_key_values,
-            )
-
-            torch_synchronize(device)
+        torch.cuda.synchronize()
+        start = time.time()
+        for i, req in enumerate(req_list):     
+            req["past_key_values"].crop(past_lens[i])
+            cache_len = req["past_key_values"].get_seq_length()
+            seq_len = req["hidden_states"].shape[1]
+            # Position IDs
+            pids = torch.arange(cache_len, cache_len+seq_len, device=device).reshape(1, seq_len)
+            cos, sin = rotary_emb(req["hidden_states"], pids)
+            out, _ = model(req["hidden_states"], position_embeddings=(sin, cos), 
+                           attention_mask=req["attention_mask"],
+                           past_key_values=req["past_key_values"])
+            torch.cuda.synchronize()
             end_time = time.time()
-
-            # 记录每个req从进入所有req进入推理到自己结束的时间
-            time_consuming += end_time - start_time
-
+            time_consuming += end_time - start
+            if run_idx == RUNS - 1:
+                torch_out_list.append(out.detach().to("cpu"))
+    torch.cuda.synchronize()
     out_token_count = RUNS * len(req_list)
+    t_lat = time_consuming * 1000 / out_token_count
 
-    latency = time_consuming * 1000 / out_token_count
-
-    print(
-        f"\t WARMUPS={WARMUPS} RUNS={RUNS}, Attention Torch, average TTFT: {round(latency, 2)} ms\n"
-    )
-
-    return req_out_list
-
-
-def benchmark_Qwen3attention_decode_torch(
-    model, rotary_emb, test_cases, device, dtype=torch.bfloat16
-):
-    """
-    Test Qwen3attention_decode.
-    """
-    req_list = generate_attention_input_torch(
-        model, rotary_emb, test_cases, device, dtype=dtype
-    )
-    req_out_list = []
-    for req in req_list:
-        # ----------------------------------------- #
-        #         获得每个req的数据
-        # ----------------------------------------- #
-        hidden_states = req["hidden_states"]
-        attention_mask = req["attention_mask"]
-        past_key_values = req["past_key_values"]
-
-        # ----------------------------------------- #
-        #         计算当前所需的sin_table，sin_table
-        # ----------------------------------------- #
-        cache_lens = past_key_values.get_seq_length()  # kv cache 现在的长度
-        bs, seq_len, _ = hidden_states.shape
-
-        position_ids = torch.arange(
-            cache_lens, cache_lens + seq_len, dtype=torch.int64, device=device
-        ).reshape((bs, seq_len))
-
-        cos_table, sin_table = rotary_emb(hidden_states, position_ids)
-        position_embeddings = (sin_table, cos_table)
-
-        ##
-        output_device, _ = model(
-            hidden_states=hidden_states,
-            position_embeddings=position_embeddings,
-            attention_mask=attention_mask,
-            past_key_values=past_key_values,
-        )
-        output_host = output_device.to("cpu")
-
-        req_out_list.append(output_host)
-
-    torch_synchronize(device)
-
-    for req in req_list:
-        for _ in range(WARMUPS):
-            hidden_states = req["hidden_states"]
-            attention_mask = req["attention_mask"]
-            past_key_values = req["past_key_values"]
-
-            # ----------------------------------------- #
-            #         计算当前所需的sin_table，sin_table
-            # ----------------------------------------- #
-            cache_lens = past_key_values.get_seq_length()  # kv cache 现在的长度
-            bs, seq_len, _ = hidden_states.shape
-
-            position_ids = torch.arange(
-                cache_lens, cache_lens + seq_len, dtype=torch.int64, device=device
-            ).reshape((bs, seq_len))
-
-            cos_table, sin_table = rotary_emb(hidden_states, position_ids)
-            position_embeddings = (sin_table, cos_table)
-
-            # ----------------------------------------- #
-            #            计算一次
-            # ----------------------------------------- #
-
-            output_device, _ = model(
-                hidden_states,
-                position_embeddings=position_embeddings,
-                attention_mask=attention_mask,
-                past_key_values=past_key_values,
-            )
-
-    # ----------------------------------------- #
-    #          恢复 kv chche的长度
-    # ----------------------------------------- #
+    # =======================================================
+    # InfiniLM Run
+    # =======================================================
+    print(">>> Injecting Cache to InfiniLM...")
     for i, req in enumerate(req_list):
-        origin_len = test_cases["pastlens"][i]
-        req["past_key_values"].crop(origin_len)
+        if past_lens[i] > 0:
+            k_cache = req["past_key_values"][0][0].squeeze(0) 
+            v_cache = req["past_key_values"][0][1].squeeze(0)
+            infinilm_model.inject_cache(0, i, k_cache, v_cache)
 
-    torch_synchronize(device)
-    start_time = time.time()
+    for _ in range(WARMUPS):
+        _ = infinilm_model.forward(input_np, batch_size, seq_lens, past_lens, pos_ids, return_raw=False)
+    torch.cuda.synchronize()
 
-    for i, req in enumerate(req_list):
-        for _ in range(RUNS):
-            # ----------------------------------------- #
-            #         获得每个req的数据
-            # ----------------------------------------- #
-            hidden_states = req["hidden_states"]
-            attention_mask = req["attention_mask"]
-            past_key_values = req["past_key_values"]
-
-            # -------------------------------------------------------------- #
-            #         计算当前所需的sin_table，sin_table
-            # -------------------------------------------------------------- #
-            cache_lens = past_key_values.get_seq_length()  # kv cache 现在的长度
-            bs, seq_len, _ = hidden_states.shape
-
-            position_ids = torch.arange(
-                cache_lens, cache_lens + seq_len, dtype=torch.int64, device=device
-            ).reshape((bs, seq_len))
-
-            cos_table, sin_table = rotary_emb(hidden_states, position_ids)
-            position_embeddings = (sin_table, cos_table)
-
-            # -------------------------------------------------------------- #
-            #            计算一次
-            # -------------------------------------------------------------- #
-            output_device, _ = model(
-                hidden_states,
-                position_embeddings=position_embeddings,
-                attention_mask=attention_mask,
-                past_key_values=past_key_values,
-            )
-
-            # -------------------------------------------------------------- #
-            #            更新hidden_states, ( DynamicCache的类自动更新)
-            # -------------------------------------------------------------- #
-            req["hidden_states"] = output_device
-
-    torch_synchronize(device)
-    end_time = time.time()
-
-    time_consuming = end_time - start_time
+    start = time.time()
+    infini_out = None
+    for _ in range(RUNS):
+        # Repeatedly run with same inputs (simulating same-shape prefill)
+        # Note: We assume InfiniLM overwrites/resets based on past_lens parameter
+        infini_out = infinilm_model.forward(input_np, batch_size, seq_lens, past_lens, pos_ids, return_raw=False)
+    torch.cuda.synchronize()
     out_token_count = RUNS * len(req_list)
+    i_lat = (time.time() - start) * 1000 / out_token_count
 
-    throughput = out_token_count / time_consuming
+    print(f"Latency: Torch={t_lat:.3f}ms, Infini={i_lat:.3f}ms")
+    check_correctness_prefill(torch_out_list, infini_out, device)
 
-    print(
-        f"\t WARMUPS={WARMUPS} RUNS={RUNS}, Attention Torch, average throughput: {round(throughput, 2)} tok/s \n"
-    )
 
-    return req_out_list
+def benchmark_decode(model, rotary_emb, infinilm_model, test_cases, device, dtype):
+    print(f"\n{'='*40} DECODE {'='*40}")
+    req_list, input_np, seq_lens, past_lens, pos_ids = prepare_inputs(model, test_cases, device, dtype)
+    batch_size = len(seq_lens)
+    total_tokens_per_round = sum(seq_lens)
 
+    # Capture initial KV for InfiniLM injection (before Torch modifies them)
+    initial_kv = []
+    for req in req_list:
+        if req["past_key_values"].get_seq_length() > 0:
+            k = req["past_key_values"][0][0].detach().clone()
+            v = req["past_key_values"][0][1].detach().clone()
+            initial_kv.append((k, v))
+        else:
+            initial_kv.append(None)
+
+    # =======================================================
+    # Torch Run
+    # =======================================================
+    # Note: No Warmup mentioned in requirements for "Sequential inference 100 rounds", 
+    # but usually we might warm up. However, since state changes, warmup is part of the sequence.
+    # We will just run the 100 rounds as the benchmark.
+
+    torch_out_list = []
+    torch.cuda.synchronize()
+    start = time.time()
+    for run_idx in range(RUNS):
+        for i, req in enumerate(req_list):
+            # Do NOT crop cache - let it grow
+            cache_len = req["past_key_values"].get_seq_length()
+            seq_len = req["hidden_states"].shape[1] # Should be 1
+            
+            pids = torch.arange(cache_len, cache_len+seq_len, device=device).reshape(1, seq_len)
+            cos, sin = rotary_emb(req["hidden_states"], pids)
+            
+            # Decode: attention_mask is None (causal implied for len 1)
+            out, _ = model(req["hidden_states"], position_embeddings=(sin, cos), 
+                           attention_mask=None,
+                           past_key_values=req["past_key_values"])
+            
+            # Update input for next round
+            req["hidden_states"] = out
+            
+            if run_idx == RUNS - 1:
+                torch_out_list.append(out.detach().to("cpu"))
+                
+    torch.cuda.synchronize()
+    end = time.time()
+    t_throughput = (total_tokens_per_round * RUNS) / (end - start)
+
+    # =======================================================
+    # InfiniLM Run
+    # =======================================================
+    print(">>> Injecting Cache to InfiniLM...")
+    for i, kv in enumerate(initial_kv):
+        if kv is not None:
+            k_cache, v_cache = kv
+            k_cache = k_cache.squeeze(0)
+            v_cache = v_cache.squeeze(0)
+            infinilm_model.inject_cache(0, i, k_cache, v_cache)
+            
+    curr_input_np = input_np.copy()
+    curr_past_lens_np = np.array(past_lens, dtype=np.int32)
+    curr_pos_ids_np = np.array(pos_ids, dtype=np.int32)
+    
+    start = time.time()
+    infini_out = None
+    
+    for run_idx in range(RUNS):
+        out_np = infinilm_model.forward(curr_input_np, batch_size, seq_lens, curr_past_lens_np, curr_pos_ids_np, return_raw=False)
+        
+        if run_idx < RUNS - 1:
+            # Update inputs for next round
+            curr_input_np = out_np
+            
+            curr_past_lens_np = [x + 1 for x in curr_past_lens_np]
+            curr_pos_ids_np = [x + 1 for x in curr_pos_ids_np]
+                
+        infini_out = out_np
+        
+    torch.cuda.synchronize()
+    end = time.time()
+    i_throughput = (total_tokens_per_round * RUNS) / (end - start)
+
+    print(f"Throughput: Torch={t_throughput:.1f} tok/s, Infini={i_throughput:.1f} tok/s")
+    check_correctness_decode(torch_out_list, infini_out, device)
 
 if __name__ == "__main__":
+    
     args = get_args()
-    print(args)
-
-    model_path = args.model_path
-    dtype = torch.bfloat16
-
-    # Parse command line arguments
-    device = "cpu"
-    if args.cpu:
-        device = "cpu"
-    elif args.nvidia:
-        device = "cuda"
-    elif args.metax:
-        device = "cuda"
-    elif args.moore:
-        device = "musa"
-        import torch_musa
-    elif args.iluvatar:
-        device = "cuda"
-    else:
-        print(
-            "Usage:  python test/models/qwen3_moe/attention_test.py [--cpu | --nvidia | --metax | --moore | --iluvatar] --model_path=<path/to/model_path>"
-        )
-        sys.exit(1)
-
-    # -----------------------------------------------------------------------------
-    # -----------------------------------------------------------------------------
-    # -----------------------------------------------------------------------------
-    model, rotary_emb = create_Qwen3attention_torch(
-        model_path, device=device, dtype=dtype
-    )
-    print("\n")
-    print("*" * 130)
-    print("Test Qwen3attention ")
-    print("*" * 130)
-    print(f"Test Case PREFILL_TESTCASES : {PREFILL_TESTCASES}")
-    output_prefill = benchmark_Qwen3attention_prefill_torch(
-        model, rotary_emb, PREFILL_TESTCASES, device, dtype=dtype
-    )
-
-    print("\n")
-    print("-" * 130)
-    print(f"\nTest DECODE_TESTCASES: {DECODE_TESTCASES}")
-    output_decode = benchmark_Qwen3attention_decode_torch(
-        model, rotary_emb, DECODE_TESTCASES, device, dtype=dtype
-    )
-
-    # clean up device memory
-    del model
-    torch_empty_cache(device)
+    device = "cuda" if args.nvidia else "cpu"
+    
+    torch_model, rotary, cfg = create_Qwen3attention_torch(args.model_path, device)
+    infini_model = InfiniLMWrapper(cfg, torch_model)
+    
+    benchmark_prefill(torch_model, rotary, infini_model, PREFILL_TESTCASES, device, torch.bfloat16)
+    benchmark_decode(torch_model, rotary, infini_model, DECODE_TESTCASES, device, torch.bfloat16)
\ No newline at end of file
diff --git a/xmake.lua b/xmake.lua
index ad636197..4d786cf8 100644
--- a/xmake.lua
+++ b/xmake.lua
@@ -1,58 +1,50 @@
-add_requires("pybind11")
-
 local INFINI_ROOT = os.getenv("INFINI_ROOT") or (os.getenv(is_host("windows") and "HOMEPATH" or "HOME") .. "/.infini")
 
-set_toolchains("gcc")
-
--- Add spdlog from third_party directory
-add_includedirs("third_party/spdlog/include")
-
 target("infinicore_infer")
     set_kind("shared")
 
+    -- 【关键 1】启用 CUDA 构建规则
+    add_rules("cuda")
+
+    -- 【关键 2】设置 CUDA 架构和 BF16 支持
+    -- BF16 类型 (__nv_bfloat16) 需要 Compute Capability >= 8.0 (Ampere架构，如 A100, A800, 3090, 4090)
+    -- 如果你的显卡较旧（如 V100/T4），这里需要改为 sm_70 或 sm_75，但可能不支持 bf16 原生指令
+    add_cuflags("-arch=sm_80", "--expt-relaxed-constexpr")
+    
+    if is_mode("release") then
+        set_optimize("fastest")
+    end
+
     add_includedirs("include", { public = false })
-    add_includedirs(INFINI_ROOT.."/include", { public = true })
+    add_includedirs(INFINI_ROOT .. "/include", { public = true })
 
-    add_linkdirs(INFINI_ROOT.."/lib")
+    add_linkdirs(INFINI_ROOT .. "/lib")
     add_links("infiniop", "infinirt", "infiniccl")
 
+    -- 【关键 3】链接 CUDA Runtime 库
+    add_syslinks("cudart") 
+
     set_languages("cxx17")
-    set_warnings("all", "error")
+    
+    -- 【调整】移除 "error"，防止 NVCC 警告导致编译中断
+    set_warnings("all")
 
+    -- 源文件添加
     add_files("src/models/*.cpp")
     add_files("src/models/*/*.cpp")
+    
+    -- 确保包含 Qwen3MoE 下的 C++ 和 CUDA 文件
+    add_files("src/models/Qwen3MoE/*.cpp")
+    add_files("src/models/Qwen3MoE/*.cu") 
+    
     add_files("src/tensor/*.cpp")
     add_files("src/allocator/*.cpp")
     add_files("src/dataloader/*.cpp")
     add_files("src/cache_manager/*.cpp")
+    
     add_includedirs("include")
 
     set_installdir(INFINI_ROOT)
     add_installfiles("include/infinicore_infer.h", {prefixdir = "include"})
     add_installfiles("include/infinicore_infer/models/*.h", {prefixdir = "include/infinicore_infer/models"})
-target_end()
-
-target("_infinilm")
-    add_packages("pybind11")
-    set_default(false)
-    add_rules("python.module", {soabi = true})
-    set_languages("cxx17")
-    set_kind("shared")
-
-    local INFINI_ROOT = os.getenv("INFINI_ROOT") or (os.getenv(is_host("windows") and "HOMEPATH" or "HOME") .. "/.infini")
-
-    -- add_includedirs("csrc", { public = false })
-    -- add_includedirs("csrc/pybind11", { public = false })
-    add_includedirs(INFINI_ROOT.."/include", { public = true })
-    add_includedirs("include", { public = false })
-    -- spdlog is already included globally via add_includedirs at the top
-
-    add_linkdirs(INFINI_ROOT.."/lib")
-    add_links("infinicore_cpp_api", "infiniop", "infinirt", "infiniccl")
-
-    -- Add src files
-    add_files("csrc/**.cpp")
-    add_files("csrc/**.cc")
-
-    set_installdir("python/infinilm")
-target_end()
+target_end()
\ No newline at end of file

From 47f2a132bc78f5768e0b22e68bdb7fafc01b0e50 Mon Sep 17 00:00:00 2001
From: lviy <lviyptilopsis@gmail.com>
Date: Mon, 12 Jan 2026 00:28:42 +0800
Subject: [PATCH 2/7] =?UTF-8?q?[2025=E7=A7=8B=E5=AD=A3][T2-2-1]=20lviy?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .gitignore        |   4 +-
 README.md         | 113 ++++++++++++++++++++++++++++++++++++++--------
 examples/llama.py |  82 +++++++++++++++++++--------------
 3 files changed, 147 insertions(+), 52 deletions(-)

diff --git a/.gitignore b/.gitignore
index 0c9ef52c..767db187 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,6 +1,7 @@
 # Xmake cache
 .xmake/
 build/
+python/infinilm/lib/*.so
 
 # MacOS Cache
 .DS_Store
@@ -10,12 +11,13 @@ build/
 
 # Python
 __pycache__/
+*.egg-info/
 
 # Log
 *.log
 
 # Cache
-cache/
+.cache/
 
 # JSON
 *.json
diff --git a/README.md b/README.md
index 791217cc..350d2d9e 100644
--- a/README.md
+++ b/README.md
@@ -15,19 +15,19 @@ xmake && xmake install
 - 运行模型推理测试
 
 ```bash
-python scripts/jiuge.py [--cpu | --nvidia | --cambricon | --ascend | --metax | --moore | --iluvatar | --kunlun | --hygon] path/to/model_dir [n_device]
+python scripts/jiuge.py [--cpu | --nvidia | --qy | --cambricon | --ascend | --metax | --moore | --iluvatar | --kunlun | --hygon] path/to/model_dir [n_device]
 ```
 
 - 部署模型推理服务
 
 ```bash
-python scripts/launch_server.py --model-path MODEL_PATH [-h] [--dev {cpu,nvidia,cambricon,ascend,metax,moore,iluvatar,kunlun,hygon}] [--ndev NDEV] [--max-batch MAX_BATCH] [--max-tokens MAX_TOKENS]
+python scripts/launch_server.py --model-path MODEL_PATH [-h] [--dev {cpu,nvidia,qy, cambricon,ascend,metax,moore,iluvatar,kunlun,hygon}] [--ndev NDEV] [--max-batch MAX_BATCH] [--max-tokens MAX_TOKENS]
 ```
 
 - 测试模型推理服务性能
 
 ```bash
-python scripts/test_perf.py 
+python scripts/test_perf.py
 ```
 
 - 使用推理服务测试模型困惑度（Perplexity）
@@ -37,21 +37,98 @@ python scripts/test_ppl.py --model-path MODEL_PATH [--ndev NDEV] [--max-batch MA
 ```
 
 ## 使用方式(新版)
+#### 一、编译并安装 `InfiniCore`
+编译并安装 `InfiniCore`， 详情见 InfiniCore的 [`README`](https://github.com/InfiniTensor/InfiniCore) :
 
-- 编译并安装 `InfiniCore`， 详情见 InfiniCore的 [`README`](https://github.com/InfiniTensor/InfiniCore) :
-    
-    - 注意根据提示设置好 `INFINI_ROOT` 环境变量（默认为 `$HOME/.infini`）
-    - 根据硬件平台，选择 xmake 构建配置
-    - 编译安装InfiniCore
-    - 安装 C++ 库
-    - 安装 Python 包
+- 注意根据提示设置好 `INFINI_ROOT` 环境变量（默认为 `$HOME/.infini`）
+- 根据硬件平台，选择 xmake 构建配置
+- 编译安装InfiniCore
+- 安装 C++ 库
+- 安装 Python 包
 
-- 单次推理测试
+
+#### 二、编译并安装  `InfiniLM`
+  - 克隆项目
+
+    由于仓库中含有子模块，所以在克隆时请添加 `--recursive` 或 `--recurse-submodules`，如：
+
+    ```shell
+    git clone --recursive https://github.com/InfiniTensor/InfiniLM.git
+    ```
+
+    或者在普通克隆后进行更新：
+
+    ```shell
+    git submodule update --init --recursive
+    ```
+
+
+  - 安装 InfiniLM Python 包
+    ```bash
+      pip install -e .
+    ```
+
+  - 单次推理测试
     - llama示例
-```bash
-python examples/llama.py [--cpu | --nvidia | --metax | --moore | --iluvatar] --model_path=<path/to/model_dir>
-```
-例如：
-```bash
-python examples/llama.py --nvidia --model_path=~/TinyLlama-1.1B-Chat-v1.0
-```
\ No newline at end of file
+    ```bash
+    python examples/llama.py [--cpu | --nvidia | --metax | --moore | --iluvatar] --model_path=<path/to/model_dir>
+    ```
+    - 例如：
+    ```bash
+    python examples/llama.py --nvidia --model_path=/models/TinyLlama-1.1B-Chat-v1.0
+    ```
+  - 分布式推理测试
+      - 9g示例
+      ```bash
+    python examples/jiuge.py [---nvidia] --model_path=<path/to/model_dir> --backend=cpp --tp=NDEV --batch_size=MAX_BATCH
+    ```
+
+    - 例如： 9G7B模型，cpp后端，batch_size为16，4卡分布式
+    ```bash
+    python examples/jiuge.py --nvidia --model_path=/models/9G7B_MHA/ --backend=cpp --tp=4 --batch_size=16
+    ```
+
+  - 运行推理基准测试（C-Eval/MMLU）
+
+    ```bash
+    python test/bench/test_benchmark.py [--cpu | --nvidia | --cambricon | --ascend | --metax | --moore | --iluvatar | --kunlun | --hygon] <path/to/model_dir> --bench {ceval|mmlu} [--backend cpp] [--ndev N] [--subject SUBJECT] [--num_samples N] [--max_new_tokens N] [--output_csv PATH] [--cache_dir PATH]
+    ```
+
+    - 参数说明：
+      - `--subject`: 指定科目，支持单个科目、多个科目（逗号分隔）或 `all`（默认值，加载全部科目）
+      - `--output_csv`: 可选，指定CSV输出文件路径。如未指定则不生成CSV文件。CSV包含每个科目的结果和总体结果
+      - `--cache_dir`: 可选，指定数据集缓存目录的父目录。应指向包含 `ceval___ceval-exam` 和 `cais___mmlu` 等数据集子目录的父目录（例如 `~/.cache/huggingface/datasets/`）。设置后脚本优先使用本地 CSV（`pandas.read_csv`）离线加载数据，避免 `load_dataset` 的网络请求
+
+    - C-Eval示例：
+      - 单个科目：
+        ```bash
+        python test/bench/test_benchmark.py --nvidia /models/9G7B_MHA --bench ceval --subject middle_school_mathematics --num_samples 100 --backend cpp --ndev 1
+        ```
+      - 多个科目（逗号分隔）：
+        ```bash
+        python test/bench/test_benchmark.py --nvidia /models/9G7B_MHA --bench ceval --subject middle_school_mathematics,high_school_physics --backend cpp --ndev 1 --output_csv results.csv
+        ```
+      - 全部科目并输出CSV：
+        ```bash
+        python test/bench/test_benchmark.py --nvidia /models/9G7B_MHA --bench ceval --subject all --backend cpp --ndev 1 --output_csv results.csv
+        ```
+      - 使用缓存目录加速加载：
+        ```bash
+        python test/bench/test_benchmark.py --nvidia /models/9G7B_MHA --bench ceval --subject middle_school_mathematics --backend cpp --ndev 1 --cache_dir ~/.cache/huggingface/datasets/
+        ```
+        > 注意：`--cache_dir` 应指向包含 `ceval___ceval-exam` 和 `cais___mmlu` 等数据集子目录的父目录，而不是直接指向这些子目录
+
+    - MMLU示例：
+      - 单个科目：
+        ```bash
+        python test/bench/test_benchmark.py --nvidia /models/9G7B_MHA --bench mmlu --subject abstract_algebra --backend cpp --ndev 1
+        ```
+      - 多个科目（逗号分隔）：
+        ```bash
+        python test/bench/test_benchmark.py --nvidia /models/9G7B_MHA --bench mmlu --subject abstract_algebra,anatomy,astronomy --backend cpp --ndev 1 --output_csv results.csv
+        ```
+      - 使用缓存目录加速加载：
+        ```bash
+        python test/bench/test_benchmark.py --nvidia /models/9G7B_MHA --bench mmlu --subject abstract_algebra --backend cpp --ndev 1 --cache_dir ~/.cache/huggingface/datasets/
+        ```
+        > 注意：`--cache_dir` 应指向包含 `ceval___ceval-exam` 和 `cais___mmlu` 等数据集子目录的父目录，而不是直接指向这些子目录
diff --git a/examples/llama.py b/examples/llama.py
index 611a5866..aa890ca9 100644
--- a/examples/llama.py
+++ b/examples/llama.py
@@ -1,17 +1,15 @@
+import infinicore
+from transformers import AutoTokenizer
+from tokenizers import decoders as _dec
+from infinilm.modeling_utils import get_model_state_dict
+import infinilm
+import argparse
 import sys
 import time
 import os
 
 sys.path.insert(0, os.path.join(os.path.dirname(__file__), "../python"))
 
-import argparse
-import infinilm
-from infinilm.modeling_utils import get_model_state_dict
-from tokenizers import decoders as _dec
-from transformers import AutoTokenizer
-
-import infinicore
-
 
 def get_args():
     parser = argparse.ArgumentParser(description="run Llama args")
@@ -59,22 +57,35 @@ def get_args():
         default="python",
         help="python or cpp model",
     )
+    parser.add_argument(
+        "--batch_size",
+        type=int,
+        default=1,
+        help="number of prompts in a batch",
+    )
+    parser.add_argument(
+        "--prompt",
+        type=str,
+        default="How are you",
+        help="input prompt",
+    )
+
     return parser.parse_args()
 
 
 def test(
-    prompt,
+    prompts: str | list[str],
     model_path,
     max_new_tokens=100,
-    infini_dtype=infinicore.bfloat16,
     infini_device=infinicore.device("cpu", 0),
-    backend="python",
 ):
+    model_path = os.path.expanduser(model_path)
     # ---------------------------------------------------------------------------- #
     #                        创建模型,
     # ---------------------------------------------------------------------------- #
     model = infinilm.AutoLlamaModel.from_pretrained(
-        model_path, device=infini_device, dtype=infini_dtype, backend=backend
+        model_path,
+        device=infini_device,
     )
 
     # ---------------------------------------------------------------------------- #
@@ -83,19 +94,17 @@ def test(
     model_param_infini = get_model_state_dict(
         model_path,
         device=infini_device,
-        dtype=infini_dtype,
+        dtype=model.config.dtype,
     )
 
-    model.load_state_dict(model_param_infini)
-
-    config = model.config
+    model.load_state_dict(model_param_infini, strict=True)
 
     # ---------------------------------------------------------------------------- #
     #                        创建 tokenizer
     # ---------------------------------------------------------------------------- #
-    tokenizer = AutoTokenizer.from_pretrained(model_path)
+    tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
 
-    if "llama" == config.model_type:
+    if "llama" == model.config.model_type:
         backend = getattr(tokenizer, "backend_tokenizer", None)
         target = getattr(backend, "_tokenizer", backend)
         norm = getattr(target, "normalizer", None)
@@ -112,32 +121,39 @@ def test(
                     _dec.Fuse(),
                 ]
             )
+    else:
+        raise ValueError(f"Unsupported model type: {model.config.model_type}")
 
     # ---------------------------------------------------------------------------- #
     #                        token编码
     # ---------------------------------------------------------------------------- #
     # prompt = "山东最高的山是？"
-    input_content = tokenizer.apply_chat_template(
-        conversation=[{"role": "user", "content": prompt}],
-        add_generation_prompt=True,
-        tokenize=False,
-    )
-    print(input_content, end="", flush=True)
-    input_ids = tokenizer.encode(input_content)
+    if isinstance(prompts, str):
+        prompts = [prompts]
+    input_contents = [
+        tokenizer.apply_chat_template(
+            conversation=[{"role": "user", "content": prompt}],
+            add_generation_prompt=True,
+            tokenize=False,
+        )
+        for prompt in prompts
+    ]
+    print(input_contents[0], end="", flush=True)
+    input_ids_list = tokenizer.batch_encode_plus(input_contents)[
+        "input_ids"
+    ]  # List: [[1, 1128, 526, 366, 29892]]
 
     # ---------------------------------------------------------------------------- #
     #                        自回归生成
     # ---------------------------------------------------------------------------- #
-    input_ids_list = [input_ids]  # List: [[1, 1128, 526, 366, 29892]]
     input_ids_infini = infinicore.from_list(input_ids_list)
 
     t1 = time.time()
+    print("=================== start generate ====================")
     model.generate(
         input_ids_infini,
         max_new_tokens=max_new_tokens,
-        device=infini_device,
         tokenizer=tokenizer,
-        config=config,
     )
     t2 = time.time()
 
@@ -168,20 +184,20 @@ def test(
             "such as, python examples/llama.py --nvidia --model_path=~/TinyLlama-1.1B-Chat-v1.0"
         )
         sys.exit(1)
-    prompt = "山东最高的山是？"
+    prompts = [args.prompt for _ in range(args.batch_size)]
 
     model_path = args.model_path
     max_new_tokens = args.max_new_tokens
     backend = args.backend
 
+    if backend != "python":
+        raise ValueError(f"Unsupported backend: {backend}.")
+
     infini_device = infinicore.device(device_str, 0)
-    infini_dtype = infinicore.bfloat16
 
     test(
-        prompt,
+        prompts,
         model_path,
         max_new_tokens,
         infini_device=infini_device,
-        infini_dtype=infini_dtype,
-        backend=backend,
     )

From b98b8e6610064aaa13f1938ab7bb1d485e2cfba9 Mon Sep 17 00:00:00 2001
From: lviy <lviyptilopsis@gmail.com>
Date: Mon, 12 Jan 2026 00:34:21 +0800
Subject: [PATCH 3/7] =?UTF-8?q?[2025=E7=A7=8B=E5=AD=A3][T2-2-1]=20lviy?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 include/infinicore_infer/models/deepseek.h    |   1 -
 python/infinilm/__init__.py                   |   4 +-
 python/infinilm/cache_utils.py                |   8 +-
 python/infinilm/generation/utils.py           | 114 +++++-----
 python/infinilm/modeling_utils.py             | 195 +++++++++++++++---
 python/infinilm/models/llama/__init__.py      |  33 +--
 .../models/llama/configuration_llama.py       |  21 +-
 .../infinilm/models/llama/modeling_llama.py   |  95 +++++----
 scripts/jiuge.py                              |   6 +-
 scripts/jiuge_ppl.py                          |   2 +
 scripts/launch_server.py                      |   1 +
 scripts/libinfinicore_infer/__init__.py       |  14 --
 scripts/libinfinicore_infer/base.py           |   1 +
 13 files changed, 333 insertions(+), 162 deletions(-)

diff --git a/include/infinicore_infer/models/deepseek.h b/include/infinicore_infer/models/deepseek.h
index 051637b4..3924c5fe 100644
--- a/include/infinicore_infer/models/deepseek.h
+++ b/include/infinicore_infer/models/deepseek.h
@@ -8,7 +8,6 @@
 #include <stddef.h>
 #include <stdint.h>
 
-    
 struct DeepSeekV3Weights;
 
 // Function pointer signatures
diff --git a/python/infinilm/__init__.py b/python/infinilm/__init__.py
index 262fd084..0fbee2ca 100644
--- a/python/infinilm/__init__.py
+++ b/python/infinilm/__init__.py
@@ -1,3 +1,5 @@
 from .models import AutoLlamaModel
+from . import distributed
+from . import cache
 
-__all__ = ["AutoLlamaModel"]
+__all__ = ["AutoLlamaModel", "distributed", "cache"]
diff --git a/python/infinilm/cache_utils.py b/python/infinilm/cache_utils.py
index fbd566e4..3587886b 100644
--- a/python/infinilm/cache_utils.py
+++ b/python/infinilm/cache_utils.py
@@ -65,12 +65,12 @@ def lazy_initialization(self, key_states: infinicore.Tensor):
             self.max_seq_len = max(self.max_position_embeddings, seq_len)
 
             self.keys = infinicore.empty(
-                [batch_size, self.max_seq_len, num_heads, head_dim],
+                (batch_size, self.max_seq_len, num_heads, head_dim),
                 dtype=dtype,
                 device=device,
             )
             self.values = infinicore.empty(
-                [batch_size, self.max_seq_len, num_heads, head_dim],
+                (batch_size, self.max_seq_len, num_heads, head_dim),
                 dtype=dtype,
                 device=device,
             )
@@ -80,12 +80,12 @@ def lazy_initialization(self, key_states: infinicore.Tensor):
             self.max_seq_len = max(self.max_seq_len * 2, self.cache_position + seq_len)
 
             keys_new = infinicore.empty(
-                [batch_size, self.max_seq_len, num_heads, head_dim],
+                (batch_size, self.max_seq_len, num_heads, head_dim),
                 dtype=dtype,
                 device=device,
             )
             values_new = infinicore.empty(
-                [batch_size, self.max_seq_len, num_heads, head_dim],
+                (batch_size, self.max_seq_len, num_heads, head_dim),
                 dtype=dtype,
                 device=device,
             )
diff --git a/python/infinilm/generation/utils.py b/python/infinilm/generation/utils.py
index 4da145cd..00143231 100644
--- a/python/infinilm/generation/utils.py
+++ b/python/infinilm/generation/utils.py
@@ -47,18 +47,14 @@ def _get_initial_position_ids(
         self,
         bs: int,
         seq_length: int,
-        device: infinicore.device,
     ) -> infinicore.Tensor:
         """Calculates `position_ids` for the pre-fill stage"""
         position_ids_list = [list(range(0, seq_length)) for i in range(bs)]
 
-        return infinicore.from_list(
-            position_ids_list, dtype=infinicore.int64, device=device
-        )
+        return infinicore.from_list(position_ids_list, dtype=infinicore.int64)
 
     def prepare_inputs_for_generation(
         self,
-        device: infinicore.device,
         past_key_values: Optional[Cache] = None,
         **kwargs,
     ):
@@ -73,18 +69,18 @@ def prepare_inputs_for_generation(
             model_inputs["past_key_values"] = past_key_values
 
         # -------------------------------------------------------------------------- #
-        #                     计算所需的，position_ids
+        #                     计算所需的: position_ids
         # -------------------------------------------------------------------------- #
         current_position_ids = kwargs.get("position_ids", None)
         if current_position_ids is None:
             # prill阶段
             bs, seq_len = kwargs["input_ids"].shape[0:2]
-            model_inputs["position_ids"] = self._get_initial_position_ids(
-                bs, seq_len, device
+            model_inputs["position_ids"] = self._get_initial_position_ids(bs, seq_len)
+            model_inputs["cache_positions"] = infinicore.from_list(
+                [0], dtype=infinicore.int64
             )
-
         else:
-            # decoder 阶段
+            # decode 阶段
             bs, seq_len = current_position_ids.shape
             last_position = current_position_ids.narrow(1, seq_len - 1, 1)
 
@@ -96,13 +92,21 @@ def prepare_inputs_for_generation(
 
             next_position = one_value + last_position
             model_inputs["position_ids"] = next_position
-
+            model_inputs["cache_positions"] = kwargs[
+                "cache_positions"
+            ] + infinicore.from_list(
+                [seq_len],
+                dtype=last_position.dtype,
+                device=last_position.device,
+            )
         # -------------------------------------------------------------------- #
         #                 所需的: token的input_ids
         # -------------------------------------------------------------------- #
-        if kwargs.get("next_token_id", None) is not None:
-            next_token_id = kwargs["next_token_id"]
-            model_inputs["input_ids"] = infinicore.from_list([[next_token_id]])
+        if kwargs.get("next_token_ids", None) is not None:
+            next_token_ids = kwargs["next_token_ids"]
+            model_inputs["input_ids"] = infinicore.from_list(
+                [[id_] for id_ in next_token_ids],
+            )
 
         # -------------------------------------------------------------------- #
         #                 其他
@@ -117,22 +121,20 @@ def generate(
         self,
         input_ids: infinicore.Tensor,
         max_new_tokens: int,
-        device: infinicore.device,
         tokenizer,
-        config,
+        stop_on_eos=True,
         **kwargs,
     ):
         model_kwargs = kwargs
 
-        # -------------------------------------------------------------------- #
-        #                       创建 cache                                      #
-        # -------------------------------------------------------------------- #
-        if self.use_cache:
-            model_kwargs["use_cache"] = True
-            model_kwargs["past_key_values"] = DynamicCache(config=self.config)
-        else:
-            model_kwargs["use_cache"] = False
-            model_kwargs["past_key_values"] = None
+        # Check if this is a cpp backend model (has _model attribute with reset_cache method)
+        if not (hasattr(self, "_model") and hasattr(self._model, "reset_cache")):
+            if self.use_cache:
+                model_kwargs["use_cache"] = True
+                model_kwargs["past_key_values"] = DynamicCache(config=self.config)
+            else:
+                model_kwargs["use_cache"] = False
+                model_kwargs["past_key_values"] = None
 
         # -------------------------------------------------------------------- #
         #                       _sample函数                                     #
@@ -140,9 +142,8 @@ def generate(
         result = self._sample(
             input_ids,
             max_new_tokens=max_new_tokens,
-            device=device,
             tokenizer=tokenizer,
-            config=config,
+            stop_on_eos=stop_on_eos,
             **model_kwargs,
         )
         return result
@@ -151,9 +152,8 @@ def _sample(
         self,
         input_ids: infinicore.Tensor,
         max_new_tokens: int,
-        device: infinicore.device,
         tokenizer,
-        config,
+        stop_on_eos=True,
         **model_kwargs,
     ):
         r"""
@@ -162,17 +162,22 @@ def _sample(
         Parameters:
             input_ids (batch_size, seq_len): The sequence used as a prompt for the generation.
             max_new_tokens: Maximum number of new tokens.
-            device: infinicore.device.
             tokenizer: translating data into raw text.
         """
 
         batch_size, seq_len = input_ids.shape[:2]
 
-        eos_token_id = config.eos_token_id
+        eos_token_id = self.config.eos_token_id
         eos_token_id_list = (
             [eos_token_id] if isinstance(eos_token_id, int) else eos_token_id
         )
 
+        # Extract sampling parameters from kwargs with defaults
+        random_val = model_kwargs.get("random_val", 0.1)
+        topp = model_kwargs.get("topp", 0.8)
+        topk = model_kwargs.get("topk", 1)
+        temperature = model_kwargs.get("temperature", 1.0)
+
         # -------------------------------------------------------------------------- #
         #                     初始化 position_ids
         # -------------------------------------------------------------------------- #
@@ -188,15 +193,17 @@ def _sample(
             # -------------------------------------------------------------------------- #
             #                     prepare model inputs
             # -------------------------------------------------------------------------- #
-            model_inputs = self.prepare_inputs_for_generation(device, **model_kwargs)
+            start_time = time.time()
+            model_inputs = self.prepare_inputs_for_generation(**model_kwargs)
 
             model_kwargs["position_ids"] = model_inputs["position_ids"]
+            model_kwargs["cache_positions"] = model_inputs["cache_positions"]
 
             # -------------------------------------------------------------------------- #
             #                     计算一次
             # -------------------------------------------------------------------------- #
-            start_time = time.time()
             logits = self(**model_inputs)
+            infinicore.sync_device()
 
             # -------------------------------------------------------------------------- #
             #                     处理输出
@@ -213,43 +220,56 @@ def _sample(
                 dtype=infinicore.int32,
                 device=token_scores.device,
             )
+
             for i in range(0, batch_size):
-                score = token_scores.narrow(0, i, 1).view([vocab_size])
+                score = token_scores.narrow(0, i, 1).view((vocab_size,))
                 out = next_tokens.narrow(0, i, 1).view([])
                 infinicore.nn.functional.random_sample(
                     score,
-                    0.8,
-                    0.1,
-                    1,
-                    1.0,
+                    random_val,
+                    topp,
+                    topk,
+                    temperature,
                     out=out,
                 )
 
             infinicore.sync_stream()  # 计算结束前需要同步
-
-            end_time = time.time()
-            time_list.append((end_time - start_time) * 1000)
-
             # ----------------------------------------------------------------- #
             #                得到下一个token的id，并解码为字符
             # ----------------------------------------------------------------- #
             token_id = next_tokens.to_numpy()[0]
             output_str = tokenizer.decode([token_id], skip_special_tokens=True)
 
-            model_kwargs["next_token_id"] = token_id
+            model_kwargs["next_token_ids"] = next_tokens.to_numpy().tolist()
             output_tokens_list.append(token_id)
             output_content += output_str
 
+            end_time = time.time()
+            time_list.append((end_time - start_time))
+
             print(output_str, end="", flush=True)
-            if token_id in eos_token_id_list:
+            if stop_on_eos and token_id in eos_token_id_list:
                 break
-
         print("\n</s>")
+        print(f"\n\n\n Generation completed in {round(sum(time_list) * 1000, 2)} ms")
         print(
-            f"\n\n\n Time per step:  prefill {round(time_list[0], 2)} token/ms\n",
+            f" Batchsize={batch_size}  Per_Batch_Input_Len={seq_len}  Per_Batch_New_Tokens={len(time_list)}\n"
         )
         print(
-            f" Time per step:  decoder {round(sum(time_list[1:]) / (len(time_list) - 1), 2)} token/ms \n",
+            f" Prefill TTFT: {round(time_list[0], 2)}ms  Throughput: {round((batch_size * seq_len) / time_list[0], 2)}tok/s\n",
         )
+        if len(time_list) > 1:
+            print(
+                f" Decode  Avg ITL: {round(sum(time_list[1:]) * 1000 / (len(time_list) - 1), 2)}ms   Throughput: {round((batch_size * (len(time_list) - 1)) / sum(time_list[1:]), 2)}tok/s\n",
+            )
 
+        return {
+            "output_token_ids": output_tokens_list,
+            "output_content": output_content,
+            "total_latency": sum(time_list),
+            "prefill_latency": time_list[0],
+            "decode_latency": sum(time_list[1:]),
+            "total_input_tokens": batch_size * seq_len,
+            "total_output_tokens": len(time_list),
+        }
         return output_tokens_list, output_content
diff --git a/python/infinilm/modeling_utils.py b/python/infinilm/modeling_utils.py
index 9b1c6c87..792aa503 100644
--- a/python/infinilm/modeling_utils.py
+++ b/python/infinilm/modeling_utils.py
@@ -1,10 +1,10 @@
 import os
-from typing import Dict, Optional, Union
-
+from typing import Dict, Union
+import time
 import torch
 from safetensors import safe_open
 import glob
-
+from tqdm import tqdm
 import infinicore
 
 str_to_torch_dtype = {
@@ -23,15 +23,41 @@
 }
 
 
+def check_parameters(model_keys: list, already_loaded_keys: list):
+    model_keys = set(model_keys)
+    already_loaded_keys = set(already_loaded_keys)
+    intersection = model_keys & already_loaded_keys
+
+    missing_keys = model_keys - intersection
+    unexpected_keys = already_loaded_keys - intersection
+    error_msgs: list[str] = []
+
+    if len(unexpected_keys) > 0:
+        error_msgs.append(
+            "Unexpected key(s) in state_dict: {}. ".format(
+                ", ".join('"{}"'.format(k) for k in unexpected_keys)
+            )
+        )
+    if len(missing_keys) > 0:
+        error_msgs.append(
+            "Missing key(s) in state_dict: {}. ".format(
+                ", ".join('"{}"'.format(k) for k in missing_keys)
+            )
+        )
+
+    if len(error_msgs) > 0:
+        raise RuntimeError(
+            "Error(s) in loading state_dict\n\t{}".format("\n\t".join(error_msgs))
+        )
+
+
 def load_state_dict(
-    checkpoint_file: Union[str, os.PathLike],
-    map_location: Optional[Union[str, torch.device]] = "cpu",
-    weights_only: bool = True,
+    checkpoint_file: Union[str, os.PathLike], device="cpu", dtype=torch.bfloat16
 ) -> Dict[str, torch.Tensor]:
     """
     Reads a `safetensor` checkpoint file. We load the checkpoint on "cpu" by default.
     """
-    # Use safetensors if possible
+
     if not checkpoint_file.endswith(".safetensors"):
         return {}
 
@@ -49,20 +75,7 @@ def load_state_dict(
             )
 
         for k in f.keys():
-            if map_location == "meta":
-                _slice = f.get_slice(k)
-                k_dtype = _slice.get_dtype()
-                if k_dtype in str_to_torch_dtype:
-                    dtype = str_to_torch_dtype[k_dtype]
-                else:
-                    raise ValueError(
-                        f"Cannot load safetensors of unknown dtype {k_dtype}"
-                    )
-                state_dict[k] = torch.empty(
-                    size=_slice.get_shape(), dtype=dtype, device="meta"
-                )
-            else:
-                state_dict[k] = f.get_tensor(k)
+            state_dict[k] = f.get_tensor(k).to(device=device, dtype=dtype)
 
     return state_dict
 
@@ -75,30 +88,144 @@ def get_model_state_dict(
     """
     Load the model weights.
     """
+
+    print(" read weights ......")
+    t1 = time.time()
+
+    torch_device = device.type
+    torch_dtype = infinicore.utils.to_torch_dtype(dtype)
+
     # --------------------------------------------------------- #
-    #          使用从 *.safetensors文件中加载权重
+    #          Load weights from  all *.safetensors files
     # --------------------------------------------------------- #
     model_param = {}
     for file_path in glob.glob(os.path.join(model_path, "*.safetensors")):
-        model_param.update(load_state_dict(file_path))
+        model_param.update(
+            load_state_dict(file_path, device=torch_device, dtype=torch_dtype)
+        )
 
     if model_param.get("lm_head.weight", None) is None:
         model_param["lm_head.weight"] = model_param["model.embed_tokens.weight"]
 
     # --------------------------------------------------------- #
-    #          调整权重的device和dtype
+    #         model_param_infini references torch.Tensor
     # --------------------------------------------------------- #
-    torch_device = device.type
-    torch_dtype = infinicore.utils.to_torch_dtype(dtype)
-
     model_param_infini = {}
-    for key, value in model_param.items():
-        model_param[key] = value.to(device=torch_device, dtype=torch_dtype)
-
-    # --------------------------------------------------------- #
-    #           model_param_infini 引用torch.Tensor
-    # --------------------------------------------------------- #
-    for key, value in model_param.items():
+    for key in model_param.keys():
         model_param_infini[key] = infinicore.from_torch(model_param[key])
 
+    t2 = time.time()
+    print(f" read weights over! {(t2 - t1) * 1000} ms \n")
     return model_param_infini
+
+
+def load_model_state_dict_by_file(
+    model: infinicore.nn.Module,
+    model_path: str,
+    dtype=infinicore.dtype,
+) -> Dict[str, infinicore.Tensor]:
+    """
+    Load the model weights from file.
+    """
+    print(" load weights ......")
+    t1 = time.time()
+
+    torch_device = "cpu"
+    torch_dtype = infinicore.utils.to_torch_dtype(dtype)
+    model_keys = model.state_dict_keyname()
+
+    already_loaded_keys = []
+
+    file_list = glob.glob(os.path.join(model_path, "*.safetensors"))
+    if len(file_list) > 0:
+        for file_path in tqdm(file_list, desc="Processing files"):
+            tqdm.write(f"Processing: {os.path.basename(file_path)}")
+
+            # --------------------------------------------------------- #
+            #          Load weights from *.safetensors file
+            # --------------------------------------------------------- #
+            model_param = load_state_dict(
+                file_path, device=torch_device, dtype=torch_dtype
+            )
+            already_loaded_keys.extend(model_param.keys())
+
+            # --------------------------------------------------------- #
+            #         model_param_infini references torch.Tensor
+            # --------------------------------------------------------- #
+            model_param_infini = {}
+            for key in model_param.keys():
+                model_param_infini[key] = infinicore.from_torch(model_param[key])
+
+            model.load_state_dict(model_param_infini, strict=False)
+            infinicore.sync_device()
+
+    elif os.path.exists(os.path.join(model_path, "pytorch_model.bin")):
+        file_path = os.path.join(model_path, "pytorch_model.bin")
+        model_params = torch.load(file_path, weights_only=True, map_location="cpu")
+
+        model_param_infini = {}
+        for key in model_params.keys():
+            model_param_infini[key] = infinicore.from_torch(
+                model_params[key].to(dtype=torch_dtype)
+            )
+
+            already_loaded_keys.append(key)
+
+        model.load_state_dict(model_param_infini, strict=True)
+        infinicore.sync_device()
+    else:
+        raise KeyError("Weight file not found.")
+
+    check_parameters(model_keys, already_loaded_keys)
+
+    t2 = time.time()
+    print(f" load weights over! {(t2 - t1) * 1000} ms \n")
+
+
+def load_model_state_dict_by_tensor(
+    model: infinicore.nn.Module,
+    model_path: str,
+    dtype=infinicore.dtype,
+):
+    """
+    Load the model weights by tensor.
+    """
+
+    print(" load weights ......")
+    t1 = time.time()
+
+    torch_dtype = infinicore.utils.to_torch_dtype(dtype)
+    model_keys = model.state_dict_keyname()
+    already_loaded_keys = []
+
+    file_list = glob.glob(os.path.join(model_path, "*.safetensors"))
+    if len(file_list) > 0:
+        for file_path in tqdm(file_list, desc="Processing files"):
+            tqdm.write(f"Processing: {os.path.basename(file_path)}")
+
+            with safe_open(file_path, "pt", "cpu") as f:
+                for name in f.keys():
+                    weight_infini = infinicore.from_torch(
+                        f.get_tensor(name).to(dtype=torch_dtype)
+                    )
+                    model.load_param(name, weight_infini)
+                    already_loaded_keys.append(name)
+                    infinicore.sync_stream()
+
+    elif os.path.exists(os.path.join(model_path, "pytorch_model.bin")):
+        file_path = os.path.join(model_path, "pytorch_model.bin")
+        model_params = torch.load(file_path, weights_only=True, map_location="cpu")
+
+        for key in model_params.keys():
+            weight_infini = infinicore.from_torch(
+                model_params[key].to(dtype=torch_dtype)
+            )
+            model.load_param(key, weight_infini)
+            already_loaded_keys.append(key)
+    else:
+        raise KeyError("Weight file not found.")
+
+    check_parameters(model_keys, already_loaded_keys)
+
+    t2 = time.time()
+    print(f" load weights over! {(t2 - t1) * 1000} ms \n")
diff --git a/python/infinilm/models/llama/__init__.py b/python/infinilm/models/llama/__init__.py
index 872f657c..f3b4bbd4 100644
--- a/python/infinilm/models/llama/__init__.py
+++ b/python/infinilm/models/llama/__init__.py
@@ -1,6 +1,8 @@
 import os
 from typing import Optional, Union
 import infinicore
+import time
+from . import modeling_llama
 
 __all__ = ["AutoLlamaModel"]
 
@@ -12,24 +14,23 @@ def from_pretrained(
         model_path: Optional[Union[str, os.PathLike]],
         device: infinicore.device,
         dtype=infinicore.dtype,
-        backend="python",
+        **kwargs,
     ):
-        if backend == "python":
-            from . import modeling_llama
+        t1 = time.time()
 
-            return modeling_llama.LlamaForCausalLM.from_pretrained(
-                model_path,
-                device=device,
-                dtype=dtype,
-            )
+        print("\n***************************************************************")
+        print("\t Loading Llama Model")
+        print(f"\t Device: {device}, DType: {dtype}")
+        print("***************************************************************\n")
+        print(" create model ......")
 
-        elif backend == "cpp":
-            from .backends import cpp
+        instance = modeling_llama.LlamaForCausalLM.from_pretrained(
+            model_path,
+            device=device,
+            **kwargs,
+        )
 
-            return cpp.LlamaForCausalLM.from_pretrained(
-                model_path,
-                device=device,
-                dtype=dtype,
-            )
+        t2 = time.time()
+        print(f" create model over! {(t2 - t1) * 1000} ms \n")
 
-        raise KeyError("invalid backend")
+        return instance
diff --git a/python/infinilm/models/llama/configuration_llama.py b/python/infinilm/models/llama/configuration_llama.py
index 12eec8dd..abc349c7 100644
--- a/python/infinilm/models/llama/configuration_llama.py
+++ b/python/infinilm/models/llama/configuration_llama.py
@@ -15,10 +15,14 @@
 
 """LLaMA model configuration"""
 
+import infinicore
+
+from infinilm.lib import _infinilm
+
 from ...configuration_utils import PretrainedConfig
 
 
-class LlamaConfig(PretrainedConfig):
+class LlamaConfig(PretrainedConfig, _infinilm.LlamaConfig):
     r"""
     This is the configuration class to store the configuration of a [`LlamaModel`]. It is used to instantiate an LLaMA
     model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
@@ -166,19 +170,22 @@ def __init__(
         initializer_range=0.02,
         rms_norm_eps=1e-6,
         use_cache=True,
-        pad_token_id=None,
+        pad_token_id=-1,
         bos_token_id=1,
         eos_token_id=2,
         pretraining_tp=1,
         tie_word_embeddings=False,
         rope_theta=10000.0,
         rope_scaling=None,
-        attention_bias=False,
+        attention_bias=True,
         attention_dropout=0.0,
         mlp_bias=False,
         head_dim=None,
+        torch_dtype=None,
         **kwargs,
     ):
+        _infinilm.LlamaConfig.__init__(self)
+
         # ---
         self.model_type = "llama"
         self.name_or_path = ""
@@ -221,7 +228,13 @@ def __init__(
             self.rope_scaling["rope_type"] = self.rope_scaling["type"]
         # rope_config_validation(self)
 
-        super().__init__(
+        if torch_dtype in {"float32", "bfloat16", "float16"}:
+            self.dtype = getattr(infinicore, torch_dtype)
+            self._dtype = self.dtype._underlying
+        else:
+            raise ValueError(f"Unsupported dtype: {torch_dtype}")
+
+        PretrainedConfig.__init__(
             pad_token_id=pad_token_id,
             bos_token_id=bos_token_id,
             eos_token_id=eos_token_id,
diff --git a/python/infinilm/models/llama/modeling_llama.py b/python/infinilm/models/llama/modeling_llama.py
index 8c91aa39..5b6d9da7 100644
--- a/python/infinilm/models/llama/modeling_llama.py
+++ b/python/infinilm/models/llama/modeling_llama.py
@@ -17,7 +17,6 @@
 import os
 from typing import Optional, Union
 
-from transformers.utils import logging
 
 import infinicore
 
@@ -25,8 +24,6 @@
 from ...generation.utils import GenerationMixin
 from .configuration_llama import LlamaConfig
 
-logger = logging.get_logger(__name__)
-
 
 def repeat_kv(keys: infinicore.Tensor, values: infinicore.Tensor, ngroup: int):
     total_seq_len, num_heads, head_dim = keys.shape
@@ -49,7 +46,7 @@ def repeat_kv(keys: infinicore.Tensor, values: infinicore.Tensor, ngroup: int):
 
 def multi_head_attention(
     querys: infinicore.Tensor,  # [seq_len,       num_heads, head_dim]
-    keys: infinicore.Tensor,  #   [total_seq_len, num_heads, head_dim]
+    keys: infinicore.Tensor,  # [total_seq_len, num_heads, head_dim]
     values: infinicore.Tensor,  # [total_seq_len, num_heads, head_dim]
     scaling: float,
 ):
@@ -62,13 +59,8 @@ def multi_head_attention(
 
     # [num_heads, seq_len, head_dim] @ [ num_heads, head_dim, total_seq_len]
     # => [ num_heads, seq_len, total_seq_len]
-    attn_weight = Q @ K.permute((1, 2, 0))
-
-    scaling = infinicore.from_list(
-        [scaling], dtype=attn_weight.dtype, device=attn_weight.device
-    ).as_strided(attn_weight.shape, [0, 0, 0])
-
-    attn_weight = attn_weight * scaling
+    # Q @ K.T *scaling
+    attn_weight = infinicore.matmul(Q, K.permute((1, 2, 0)), alpha=scaling)
 
     infinicore.nn.functional.causal_softmax(attn_weight, out=attn_weight)
 
@@ -81,9 +73,11 @@ def multi_head_attention(
 
 
 def grouped_query_attention(
-    querys: infinicore.Tensor,  # [seq_len,       num_attention_heads, head_dim]
-    keys: infinicore.Tensor,  #   [total_seq_len, num_key_value_heads, head_dim]
-    values: infinicore.Tensor,  # [total_seq_len, num_key_value_heads, head_dim]
+    # [seq_len,       num_attention_heads, head_dim]
+    querys: infinicore.Tensor,
+    keys: infinicore.Tensor,  # [total_seq_len, num_key_value_heads, head_dim]
+    # [total_seq_len, num_key_value_heads, head_dim]
+    values: infinicore.Tensor,
     scaling: float,
 ):
     num_attention_heads = querys.shape[1]
@@ -104,15 +98,16 @@ def __init__(self, config, **kwargs):
         hidden_size = config.hidden_size
         intermediate_size = config.intermediate_size
         mlp_bias = config.mlp_bias
+        dtype = config.dtype
 
         self.gate_proj = infinicore.nn.Linear(
-            hidden_size, intermediate_size, bias=mlp_bias, **kwargs
+            hidden_size, intermediate_size, bias=mlp_bias, dtype=dtype, **kwargs
         )
         self.up_proj = infinicore.nn.Linear(
-            hidden_size, intermediate_size, bias=mlp_bias, **kwargs
+            hidden_size, intermediate_size, bias=mlp_bias, dtype=dtype, **kwargs
         )
         self.down_proj = infinicore.nn.Linear(
-            intermediate_size, hidden_size, bias=mlp_bias, **kwargs
+            intermediate_size, hidden_size, bias=mlp_bias, dtype=dtype, **kwargs
         )
         self.act_fn = infinicore.nn.functional.silu
 
@@ -139,10 +134,13 @@ def __init__(self, config: LlamaConfig, layer_idx: int, **kwargs):
 
         self.scaling = self.head_dim**-0.5
 
+        dtype = config.dtype
+
         self.q_proj = infinicore.nn.Linear(
             self.hidden_size,
             self.num_attention_heads * self.head_dim,
             bias=attention_bias,
+            dtype=dtype,
             **kwargs,
         )
 
@@ -150,6 +148,7 @@ def __init__(self, config: LlamaConfig, layer_idx: int, **kwargs):
             self.hidden_size,
             self.num_key_value_heads * self.head_dim,
             bias=attention_bias,
+            dtype=dtype,
             **kwargs,
         )
 
@@ -157,16 +156,20 @@ def __init__(self, config: LlamaConfig, layer_idx: int, **kwargs):
             self.hidden_size,
             self.num_key_value_heads * self.head_dim,
             bias=attention_bias,
+            dtype=dtype,
             **kwargs,
         )
 
         self.o_proj = infinicore.nn.Linear(
             self.num_attention_heads * self.head_dim,
             self.hidden_size,
-            bias=attention_bias,
+            bias=False,
+            dtype=dtype,
             **kwargs,
         )
 
+        self.attn_output = None  # Variable reuse
+
     def forward(
         self,
         hidden_states: infinicore.Tensor,
@@ -175,14 +178,14 @@ def forward(
         **kwargs,
     ) -> infinicore.Tensor:
         hidden_states_shape = hidden_states.shape  # [bs, seq_len, hidden_size]
-        bs, seq_len = hidden_states_shape[:-1]  #    [bs, seq_len]
+        bs, seq_len = hidden_states_shape[:-1]  # [bs, seq_len]
 
         querys_shape = (bs, seq_len, self.num_attention_heads, self.head_dim)
         keys_shape = (bs, seq_len, self.num_key_value_heads, self.head_dim)
         values_shape = (bs, seq_len, self.num_key_value_heads, self.head_dim)
 
         # --------------------------------------------------------------------------------------- #
-        #                           对 Q,K，V进行 project
+        #                           对 Q,K,V进行 project
         # --------------------------------------------------------------------------------------- #
         # => [bs, seq_len,  num_attention_heads, head_dim]
         query_states = self.q_proj(hidden_states).view(querys_shape)
@@ -194,13 +197,9 @@ def forward(
         value_states = self.v_proj(hidden_states).view(values_shape)
 
         # --------------------------------------------------------------------------------------- #
-        #                           对 Q和K， 加上 rope
+        #                           对 Q和K 加上 rope
         # --------------------------------------------------------------------------------------- #
         position_ids = kwargs.pop("position_ids", None)
-        if position_ids is None:
-            raise KeyError("position_ids error")
-        if rope_instance is None:
-            raise KeyError("rope_instance error")
 
         query_states = rope_instance(query_states, position_ids)
         key_states = rope_instance(key_states, position_ids)
@@ -221,7 +220,14 @@ def forward(
         #                           注意力计算
         # --------------------------------------------------------------------------------------- #
         total_seq_len = key_states_total.shape[1]
-        attn_output = infinicore.empty_like(query_states)
+
+        if self.attn_output is None or self.attn_output.shape[1] != seq_len:
+            self.attn_output = infinicore.empty(
+                (bs, seq_len, self.num_attention_heads, self.head_dim),
+                dtype=query_states.dtype,
+                device=query_states.device,
+            )
+
         for i in range(0, bs):
             query_states_i = query_states.narrow(0, i, 1).view(
                 (seq_len, self.num_attention_heads, self.head_dim)
@@ -233,7 +239,7 @@ def forward(
                 (total_seq_len, self.num_key_value_heads, self.head_dim)
             )
 
-            attn_output_i = attn_output.narrow(0, i, 1).view(
+            attn_output_i = self.attn_output.narrow(0, i, 1).view(
                 (seq_len, self.num_attention_heads, self.head_dim)
             )
 
@@ -247,8 +253,9 @@ def forward(
         #                           out project
         # --------------------------------------------------------------------------------------- #
         # ([bs, seq_len, num_attention_heads, head_dim]) ==> [bs, seq_len, hidden_size ]
-        attn_output = attn_output.view(hidden_states_shape)
-
+        attn_output = self.attn_output.view(
+            (bs, seq_len, self.num_attention_heads * self.head_dim)
+        )
         # o_proj
         return self.o_proj(attn_output)
 
@@ -258,13 +265,16 @@ def __init__(self, config: LlamaConfig, layer_idx: int, **kwargs):
         super().__init__()
         hidden_size = config.hidden_size
         rms_norm_eps = config.rms_norm_eps
+        dtype = config.dtype
 
         self.self_attn = LlamaAttention(config=config, layer_idx=layer_idx, **kwargs)
         self.mlp = LlamaMLP(config=config, **kwargs)
 
-        self.input_layernorm = LlamaRMSNorm(hidden_size, eps=rms_norm_eps, **kwargs)
+        self.input_layernorm = LlamaRMSNorm(
+            hidden_size, eps=rms_norm_eps, dtype=dtype, **kwargs
+        )
         self.post_attention_layernorm = LlamaRMSNorm(
-            hidden_size, eps=rms_norm_eps, **kwargs
+            hidden_size, eps=rms_norm_eps, dtype=dtype, **kwargs
         )
 
     def forward(
@@ -290,7 +300,7 @@ def forward(
             **kwargs,
         )
 
-        hidden_states = residual + hidden_states
+        hidden_states += residual
 
         # ------------------------------------------------ #
         #           Fully Connected
@@ -301,7 +311,7 @@ def forward(
 
         hidden_states = self.mlp(hidden_states)
 
-        hidden_states = residual + hidden_states
+        hidden_states += residual
 
         return hidden_states
 
@@ -317,7 +327,7 @@ def __init__(self, config: LlamaConfig, **kwargs):
         )
 
         self.embed_tokens = infinicore.nn.Embedding(
-            config.vocab_size, config.hidden_size, **kwargs
+            config.vocab_size, config.hidden_size, dtype=config.dtype, **kwargs
         )
 
         self.layers = infinicore.nn.ModuleList(
@@ -326,12 +336,15 @@ def __init__(self, config: LlamaConfig, **kwargs):
                 for layer_idx in range(config.num_hidden_layers)
             ]
         )
-        self.norm = LlamaRMSNorm(config.hidden_size, eps=config.rms_norm_eps, **kwargs)
+        self.norm = LlamaRMSNorm(
+            config.hidden_size, eps=config.rms_norm_eps, dtype=config.dtype, **kwargs
+        )
 
         self.rope_instance = infinicore.nn.RoPE(
             max_position_embeddings=config.max_position_embeddings,
             rope_theta=config.rope_theta,
             head_dim=head_dim,
+            dtype=config.dtype,
             **kwargs,
         )
 
@@ -373,7 +386,10 @@ def forward(
         #                    norm
         # --------------------------------------------------------- #
         seq_len = hidden_states.shape[1]
-        last_token = hidden_states.narrow(1, seq_len - 1, 1)
+        if seq_len > 1:
+            last_token = hidden_states.narrow(1, seq_len - 1, 1)
+        else:
+            last_token = hidden_states
 
         return self.norm(last_token)
 
@@ -391,8 +407,10 @@ def __init__(self, config, **kwargs):
             config.hidden_size,
             config.vocab_size,
             bias=False,
+            dtype=config.dtype,
             **kwargs,
         )
+        self.device = kwargs.get("device", infinicore.device("cpu"))
 
     def forward(
         self,
@@ -404,7 +422,7 @@ def forward(
     ):
         last_token = self.model(
             input_ids,
-            position_ids,
+            position_ids.to(self.device),
             past_key_values=past_key_values,
             use_cache=use_cache,
             **kwargs,
@@ -416,7 +434,6 @@ def from_pretrained(
         cls,
         model_path: Optional[Union[str, os.PathLike]],
         device: infinicore.device,
-        dtype=infinicore.dtype,
     ):
         def load_config_json(dir_path_: str):
             with open(os.path.join(dir_path_, "config.json"), "r") as f:
@@ -426,4 +443,4 @@ def load_config_json(dir_path_: str):
         config_dict = load_config_json(os.path.join(model_path))
         config = LlamaConfig(**config_dict)
 
-        return LlamaForCausalLM(config, device=device, dtype=dtype)
+        return LlamaForCausalLM(config, device=device)
diff --git a/scripts/jiuge.py b/scripts/jiuge.py
index 7c31baf8..e50ea327 100644
--- a/scripts/jiuge.py
+++ b/scripts/jiuge.py
@@ -825,7 +825,7 @@ def destroy_model_instance(self):
 def test():
     if len(sys.argv) < 3:
         print(
-            "Usage: python jiuge.py [--cpu | --nvidia| --cambricon | --ascend | --metax | --moore | --iluvatar | --kunlun | --hygon] <path/to/model_dir> [n_device] [--verbose]"
+            "Usage: python jiuge.py [--cpu | --nvidia| --qy| --cambricon | --ascend | --metax | --moore | --iluvatar | --kunlun | --hygon] <path/to/model_dir> [n_device] [--verbose]"
         )
         sys.exit(1)
 
@@ -844,6 +844,8 @@ def test():
         device_type = DeviceType.DEVICE_TYPE_CPU
     elif sys.argv[1] == "--nvidia":
         device_type = DeviceType.DEVICE_TYPE_NVIDIA
+    elif sys.argv[1] == "--qy":
+        device_type = DeviceType.DEVICE_TYPE_QY
     elif sys.argv[1] == "--cambricon":
         device_type = DeviceType.DEVICE_TYPE_CAMBRICON
     elif sys.argv[1] == "--ascend":
@@ -860,7 +862,7 @@ def test():
         device_type = DeviceType.DEVICE_TYPE_HYGON
     else:
         print(
-            "Usage: python jiuge.py [--cpu | --nvidia| --cambricon | --ascend | --metax | --moore | --iluvatar | --kunlun | --hygon] <path/to/model_dir> [n_device] [--verbose]"
+            "Usage: python jiuge.py [--cpu | --nvidia| --qy| --cambricon | --ascend | --metax | --moore | --iluvatar | --kunlun | --hygon] <path/to/model_dir> [n_device] [--verbose]"
         )
         sys.exit(1)
 
diff --git a/scripts/jiuge_ppl.py b/scripts/jiuge_ppl.py
index 061ab303..923d209c 100644
--- a/scripts/jiuge_ppl.py
+++ b/scripts/jiuge_ppl.py
@@ -7,6 +7,7 @@
 DEVICE_TYPE_MAP = {
     "cpu": DeviceType.DEVICE_TYPE_CPU,
     "nvidia": DeviceType.DEVICE_TYPE_NVIDIA,
+    "qy": DeviceType.DEVICE_TYPE_QY,
     "cambricon": DeviceType.DEVICE_TYPE_CAMBRICON,
     "ascend": DeviceType.DEVICE_TYPE_ASCEND,
     "metax": DeviceType.DEVICE_TYPE_METAX,
@@ -19,6 +20,7 @@
 TORCH_DEVICE_TYPE_MAP = {
     "cpu": "cpu",
     "nvidia": "cuda",
+    "qy": "cuda",
     "cambricon": "mlu",
     "ascend": "npu",
     "metax": "cuda",
diff --git a/scripts/launch_server.py b/scripts/launch_server.py
index 2d231b49..659163c6 100644
--- a/scripts/launch_server.py
+++ b/scripts/launch_server.py
@@ -20,6 +20,7 @@
 DEVICE_TYPE_MAP = {
     "cpu": DeviceType.DEVICE_TYPE_CPU,
     "nvidia": DeviceType.DEVICE_TYPE_NVIDIA,
+    "qy": DeviceType.DEVICE_TYPE_QY,
     "cambricon": DeviceType.DEVICE_TYPE_CAMBRICON,
     "ascend": DeviceType.DEVICE_TYPE_ASCEND,
     "metax": DeviceType.DEVICE_TYPE_METAX,
diff --git a/scripts/libinfinicore_infer/__init__.py b/scripts/libinfinicore_infer/__init__.py
index 66feee7f..8fc5f4db 100644
--- a/scripts/libinfinicore_infer/__init__.py
+++ b/scripts/libinfinicore_infer/__init__.py
@@ -8,14 +8,6 @@
     DeepSeekV3WeightLoaderCStruct,
     DeepSeekV3CacheCStruct,
 )
-from .qwen3_moe import (
-    Qwen3MoEModel,
-    Qwen3MoEAttentionMetaCStruct,
-    Qwen3MoEWeightsCStruct,
-    Qwen3MoEWeightLoaderCStruct,
-    Qwen3MoEAttentionCStruct,
-    Qwen3CacheCStruct,
-)
 
 __all__ = [
     "DataType",
@@ -31,11 +23,5 @@
     "DeepSeekV3MetaCStruct",
     "DeepSeekV3WeightsCStruct",
     "DeepSeekV3WeightLoaderCStruct",
-    "Qwen3MoEModel",
-    "Qwen3MoEAttentionMetaCStruct",
-    "Qwen3MoEWeightsCStruct",
-    "Qwen3MoEWeightLoaderCStruct",
-    "Qwen3MoEAttentionCStruct",
-    "Qwen3CacheCStruct",
     "ModelRegister",
 ]
diff --git a/scripts/libinfinicore_infer/base.py b/scripts/libinfinicore_infer/base.py
index bed65b2e..3305cdba 100644
--- a/scripts/libinfinicore_infer/base.py
+++ b/scripts/libinfinicore_infer/base.py
@@ -36,6 +36,7 @@ class DeviceType(ctypes.c_int):
     DEVICE_TYPE_ILUVATAR = 6
     DEVICE_TYPE_KUNLUN = 7
     DEVICE_TYPE_HYGON = 8
+    DEVICE_TYPE_QY = 9
 
 
 class KVCacheCStruct(ctypes.Structure):

From 655667f9bc79e467352c7af11b22847956695865 Mon Sep 17 00:00:00 2001
From: lviy <lviyptilopsis@gmail.com>
Date: Mon, 12 Jan 2026 00:36:50 +0800
Subject: [PATCH 4/7] =?UTF-8?q?[2025=E7=A7=8B=E5=AD=A3][T2-2-1]=20lviy?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 src/models/deepseek_v3/deepseek_v3.cpp        |   1 +
 src/models/deepseek_v3/deepseek_v3_cache.cpp  |   1 -
 src/models/deepseek_v3/deepseek_v3_impl.hpp   | 201 +++++++++---------
 src/models/deepseek_v3/deepseek_v3_weight.cpp |   1 +
 src/models/inference_context.cpp              |  15 --
 5 files changed, 102 insertions(+), 117 deletions(-)

diff --git a/src/models/deepseek_v3/deepseek_v3.cpp b/src/models/deepseek_v3/deepseek_v3.cpp
index 8292d20b..2c463035 100644
--- a/src/models/deepseek_v3/deepseek_v3.cpp
+++ b/src/models/deepseek_v3/deepseek_v3.cpp
@@ -8,6 +8,7 @@
 #include <random>
 #include <thread>
 #include <vector>
+
 void createDeviceResource(DeepSeekV3DeviceResource *rsrc, const DeepSeekV3Meta *meta,
                           std::shared_ptr<DeepSeekV3DeviceWeights> weights,
                           infiniDevice_t device, int idev,
diff --git a/src/models/deepseek_v3/deepseek_v3_cache.cpp b/src/models/deepseek_v3/deepseek_v3_cache.cpp
index 6750f19e..a177fd8c 100644
--- a/src/models/deepseek_v3/deepseek_v3_cache.cpp
+++ b/src/models/deepseek_v3/deepseek_v3_cache.cpp
@@ -1,6 +1,5 @@
 #include "deepseek_v3_impl.hpp"
 
-
 __C struct DeepSeekV3Cache *
 createDeepSeekV3Cache(const struct DeepSeekV3Model *model) {
     DeepSeekV3Cache *cache = new DeepSeekV3Cache();
diff --git a/src/models/deepseek_v3/deepseek_v3_impl.hpp b/src/models/deepseek_v3/deepseek_v3_impl.hpp
index aeadefae..d4751074 100644
--- a/src/models/deepseek_v3/deepseek_v3_impl.hpp
+++ b/src/models/deepseek_v3/deepseek_v3_impl.hpp
@@ -12,106 +12,105 @@
 #include <thread>
 #include <vector>
 
-    struct QuantLinearWeight {
-        std::shared_ptr<Tensor> w;
-        std::shared_ptr<Tensor> s;
-        std::shared_ptr<Tensor> z;
-    };
-    
-    struct MLAWeight {
-        std::shared_ptr<Tensor> kv_a_norm, q_a_norm;
-        std::shared_ptr<QuantLinearWeight> kv_a_proj, kv_b_proj, o_proj, q_a_proj, q_b_proj;
-    };
-    
-    struct GateWeight {
-        std::shared_ptr<Tensor> w;
-        std::shared_ptr<Tensor> b;
-    };
-    
-    struct MLPWeight {
-        std::shared_ptr<QuantLinearWeight> gate, up, down;
-    };
-    
-    struct LayerWeight {
-        std::shared_ptr<Tensor> mla_norm;
-        std::shared_ptr<MLAWeight> mla;
-        std::shared_ptr<Tensor> mlp_norm;
-        std::shared_ptr<MLPWeight> dense_mlp;
-        std::shared_ptr<GateWeight> route;
-        std::shared_ptr<MLPWeight> share_expert;
-        std::vector<std::shared_ptr<MLPWeight>> experts;
-    };
-    
-    struct DeepSeekV3DeviceWeights {
-        std::shared_ptr<Tensor> w_in_embd, w_out_norm, w_out_embd, sin_table,
-            cos_table;
-        std::vector<LayerWeight> w_layers;
-        infiniDevice_t device;
-        int dev_id;
-        infinirtStream_t load_stream;
-    };
-    
-    struct DeepSeekV3Weights {
-        std::vector<std::shared_ptr<DeepSeekV3DeviceWeights>> device_weights;
-    
-        DeepSeekV3Weights(const DeepSeekV3Meta *meta,
-                          infiniDevice_t device,
-                          int ndev,
-                          const int *dev_ids);
-    };
-    
-    struct DeepSeekV3DeviceResource {
-        // Device
-        infiniDevice_t device;
-        int device_id;
-        infiniopHandle_t handle;
-        // Weights
-        std::shared_ptr<DeepSeekV3DeviceWeights> weights;
-        // Streams
-        infinirtStream_t stream;
-        // Communicator
-        infinicclComm_t comm;
-    
-        std::shared_ptr<MemoryPool> memory_pool;
-    };
-    
-    struct InferState {
-        std::mutex mtx;
-        std::condition_variable cv_load, cv_start, cv_done;
-        bool loaded = false;
-        bool proceed = false;
-        bool exit_flag = false;
-    };
-    
-    struct InferRequest {
-        const uint32_t *tokens;
-        uint32_t ntok;
-        const uint32_t *req_lens;
-        uint32_t nreq;
-        const uint32_t *req_pos;
-        struct DeepSeekV3Cache **kv_caches;
-        const float *temperature;
-        const uint32_t *topk;
-        const float *topp;
-        uint32_t *output;
-        void *logits;
-    };
-    
-    struct DeepSeekV3Model {
-        DeepSeekV3Meta meta;
-        infiniDevice_t device;
-        std::vector<int> dev_ids;
-        std::vector<DeepSeekV3DeviceResource> dev_resources;
-        std::vector<InferState> states;
-        std::vector<std::thread> threads;
-        InferRequest req;
-    
-        DeepSeekV3Model(const DeepSeekV3Meta *, const DeepSeekV3Weights *weights);
-    };
-    
-    struct DeepSeekV3Cache {
-        std::vector<std::vector<std::shared_ptr<Tensor>>> kv_pass, k_rot;
-    };
-    
+struct QuantLinearWeight {
+    std::shared_ptr<Tensor> w;
+    std::shared_ptr<Tensor> s;
+    std::shared_ptr<Tensor> z;
+};
+
+struct MLAWeight {
+    std::shared_ptr<Tensor> kv_a_norm, q_a_norm;
+    std::shared_ptr<QuantLinearWeight> kv_a_proj, kv_b_proj, o_proj, q_a_proj, q_b_proj;
+};
+
+struct GateWeight {
+    std::shared_ptr<Tensor> w;
+    std::shared_ptr<Tensor> b;
+};
+
+struct MLPWeight {
+    std::shared_ptr<QuantLinearWeight> gate, up, down;
+};
+
+struct LayerWeight {
+    std::shared_ptr<Tensor> mla_norm;
+    std::shared_ptr<MLAWeight> mla;
+    std::shared_ptr<Tensor> mlp_norm;
+    std::shared_ptr<MLPWeight> dense_mlp;
+    std::shared_ptr<GateWeight> route;
+    std::shared_ptr<MLPWeight> share_expert;
+    std::vector<std::shared_ptr<MLPWeight>> experts;
+};
+
+struct DeepSeekV3DeviceWeights {
+    std::shared_ptr<Tensor> w_in_embd, w_out_norm, w_out_embd, sin_table,
+        cos_table;
+    std::vector<LayerWeight> w_layers;
+    infiniDevice_t device;
+    int dev_id;
+    infinirtStream_t load_stream;
+};
+
+struct DeepSeekV3Weights {
+    std::vector<std::shared_ptr<DeepSeekV3DeviceWeights>> device_weights;
+
+    DeepSeekV3Weights(const DeepSeekV3Meta *meta,
+                      infiniDevice_t device,
+                      int ndev,
+                      const int *dev_ids);
+};
+
+struct DeepSeekV3DeviceResource {
+    // Device
+    infiniDevice_t device;
+    int device_id;
+    infiniopHandle_t handle;
+    // Weights
+    std::shared_ptr<DeepSeekV3DeviceWeights> weights;
+    // Streams
+    infinirtStream_t stream;
+    // Communicator
+    infinicclComm_t comm;
+
+    std::shared_ptr<MemoryPool> memory_pool;
+};
+
+struct InferState {
+    std::mutex mtx;
+    std::condition_variable cv_load, cv_start, cv_done;
+    bool loaded = false;
+    bool proceed = false;
+    bool exit_flag = false;
+};
+
+struct InferRequest {
+    const uint32_t *tokens;
+    uint32_t ntok;
+    const uint32_t *req_lens;
+    uint32_t nreq;
+    const uint32_t *req_pos;
+    struct DeepSeekV3Cache **kv_caches;
+    const float *temperature;
+    const uint32_t *topk;
+    const float *topp;
+    uint32_t *output;
+    void *logits;
+};
+
+struct DeepSeekV3Model {
+    DeepSeekV3Meta meta;
+    infiniDevice_t device;
+    std::vector<int> dev_ids;
+    std::vector<DeepSeekV3DeviceResource> dev_resources;
+    std::vector<InferState> states;
+    std::vector<std::thread> threads;
+    InferRequest req;
+
+    DeepSeekV3Model(const DeepSeekV3Meta *, const DeepSeekV3Weights *weights);
+};
+
+struct DeepSeekV3Cache {
+    std::vector<std::vector<std::shared_ptr<Tensor>>> kv_pass, k_rot;
+};
 
 #endif
diff --git a/src/models/deepseek_v3/deepseek_v3_weight.cpp b/src/models/deepseek_v3/deepseek_v3_weight.cpp
index d55acc44..846af633 100644
--- a/src/models/deepseek_v3/deepseek_v3_weight.cpp
+++ b/src/models/deepseek_v3/deepseek_v3_weight.cpp
@@ -1,6 +1,7 @@
 #include "deepseek_v3_impl.hpp"
 
 #include <cmath>
+
 inline std::shared_ptr<Tensor> getInEmbd(
     const DeepSeekV3Meta *meta) {
     auto shape = std::vector<size_t>({meta->dvoc, meta->d});
diff --git a/src/models/inference_context.cpp b/src/models/inference_context.cpp
index 2a936db0..db5fda11 100644
--- a/src/models/inference_context.cpp
+++ b/src/models/inference_context.cpp
@@ -64,21 +64,6 @@ void InferenceContext::gemm(std::shared_ptr<Tensor> c,
 
     infiniopGemmDescriptor_t desc;
     if (!cache_manager->getGemmDescriptor(key, desc)) {
-        // Debug: print tensor metadata to help diagnose descriptor creation errors
-        auto print_tensor = [&](const std::shared_ptr<Tensor>& t, const char* name) {
-
-            auto s = t->strides();
-            
-        };
-
-        try {
-            print_tensor(c, "C");
-            print_tensor(a, "A");
-            print_tensor(b, "B");
-        } catch (...) {
-            std::cout << "[InferenceContext::gemm] Failed to print tensor metadata" << std::endl;
-        }
-
         RUN_INFINI(infiniopCreateGemmDescriptor(op_handle, &desc, c->desc(), a->desc(), b->desc()));
         cache_manager->putGemmDescriptor(key, desc);
     }

From 8984d02cbc940726c91580887dd2fc53a0dff331 Mon Sep 17 00:00:00 2001
From: lviy <lviyptilopsis@gmail.com>
Date: Mon, 12 Jan 2026 00:39:22 +0800
Subject: [PATCH 5/7] =?UTF-8?q?[2025=E7=A7=8B=E5=AD=A3][T2-2-1]=20lviy?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 ...47\350\241\214\351\252\214\350\257\201.md" | 22 -----------
 python/infinilm/models/llama/backends/cpp.py  | 38 -------------------
 2 files changed, 60 deletions(-)
 delete mode 100644 ".trae/documents/\344\277\256\345\244\215 Softmax \347\262\276\345\272\246\345\271\262\346\211\260\345\271\266\346\211\247\350\241\214\351\252\214\350\257\201.md"
 delete mode 100644 python/infinilm/models/llama/backends/cpp.py

diff --git "a/.trae/documents/\344\277\256\345\244\215 Softmax \347\262\276\345\272\246\345\271\262\346\211\260\345\271\266\346\211\247\350\241\214\351\252\214\350\257\201.md" "b/.trae/documents/\344\277\256\345\244\215 Softmax \347\262\276\345\272\246\345\271\262\346\211\260\345\271\266\346\211\247\350\241\214\351\252\214\350\257\201.md"
deleted file mode 100644
index 6680a970..00000000
--- "a/.trae/documents/\344\277\256\345\244\215 Softmax \347\262\276\345\272\246\345\271\262\346\211\260\345\271\266\346\211\247\350\241\214\351\252\214\350\257\201.md"	
+++ /dev/null
@@ -1,22 +0,0 @@
-根据您的指令和之前的分析，我制定了以下计划来彻底修复精度问题并进行验证。
-
-核心思路是：不仅在 `inject` 阶段，在 `forward` 阶段的 Gather Buffer 也必须初始化为 `-inf`，以防止 Padding 区域的 0 值干扰 Softmax 计算。
-
-### 1. 代码完善 (C++)
-在 `src/models/Qwen3MoE/Qwen3MoE.cpp` 中继续修改 `forwardQwen3MoEAttention` 函数：
-- 找到 `k_padded_gather` 和 `v_padded_gather` 的初始化代码。
-- 将原本的 `cudaMemsetAsync` (清零) 替换为我们新实现的 `launch_fill_val_bf16` (填充 `-inf`)。
-- **原因**：这是解决 `past=0` 场景下精度下降（0.99 -> 0.98/0.96）的关键，确保 Padding 不会参与 Attention 权重计算。
-
-### 2. 编译与运行
-使用您提供的完整环境命令进行编译和测试：
-```bash
-cd '/data/users/lviy/InfiniLM' ; source /data/apps/env.sh && source /data/apps/miniforge3/etc/profile.d/conda.sh && conda activate py313 && export INFINI_ROOT=$HOME/.infini && export LD_LIBRARY_PATH=$INFINI_ROOT/lib:$LD_LIBRARY_PATH && export PATH="/data/apps/xmake/bin:/usr/local/cuda/bin:$PATH" && xmake && srun --gres=gpu:nvidia:1 --cpus-per-task=8 --mem=16G python test/models/qwen3_moe/attention_test.py --model_path "/data/shared/models/Qwen3-30B-A3B-Instruct-2507-Layer-0" --nvidia
-```
-
-### 3. 结果验证
-观察输出日志：
-- **Debug Log**: 确认 `[Inject]` 的分配逻辑是否如预期（Batch 0 Alloc, Batch 3 Reuse）。
-- **Cosine Similarity**: 检查是否恢复到 > 0.99（预期 0.0000 问题应随之解决，因为构建环境已修复）。
-
-如果测试通过，我将删除 Debug Print 并交付最终代码。
\ No newline at end of file
diff --git a/python/infinilm/models/llama/backends/cpp.py b/python/infinilm/models/llama/backends/cpp.py
deleted file mode 100644
index 30b56192..00000000
--- a/python/infinilm/models/llama/backends/cpp.py
+++ /dev/null
@@ -1,38 +0,0 @@
-from ....generation.utils import GenerationMixin
-import infinicore
-import os
-from typing import Optional, Union
-
-
-class LlamaForCausalLM(GenerationMixin):
-    def __init__(self):
-        super().__init__()
-        self.use_cache = False
-        self._model = None
-        raise NotImplementedError("NotImplementedError!!")
-
-    def forward(self, input_ids, position_ids, *args, **kwargs):
-        kv_caches = None
-        return infinicore.Tensor(
-            self._model.forward(input_ids, position_ids, kv_caches)
-        )
-
-    def __call__(self, input_ids, position_ids, *args, **kwargs):
-        return self.forward(input_ids=input_ids, position_ids=position_ids)
-
-    @classmethod
-    def from_pretrained(
-        cls,
-        model_path: Union[str, os.PathLike],
-        device: infinicore.device,
-        dtype=infinicore.dtype,
-    ):
-        """
-        Load a pretrained LlamaForCausalLM model from a directory.
-        Args:
-            model_path: Path to the model directory containing config.json
-            device: Device instance (defaults to CPU)
-        Returns:
-            LlamaForCausalLM instance
-        """
-        raise NotImplementedError("NotImplementedError!!")

From 1f3412e402a0da1a30cebeabb67e5425cb055364 Mon Sep 17 00:00:00 2001
From: lviy <lviyptilopsis@gmail.com>
Date: Mon, 12 Jan 2026 00:41:19 +0800
Subject: [PATCH 6/7] =?UTF-8?q?[2025=E7=A7=8B=E5=AD=A3][T2-2-1]=20lviy?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 xmake.lua | 64 +++++++++++++++++++++++++++++++------------------------
 1 file changed, 36 insertions(+), 28 deletions(-)

diff --git a/xmake.lua b/xmake.lua
index 4d786cf8..ad636197 100644
--- a/xmake.lua
+++ b/xmake.lua
@@ -1,50 +1,58 @@
+add_requires("pybind11")
+
 local INFINI_ROOT = os.getenv("INFINI_ROOT") or (os.getenv(is_host("windows") and "HOMEPATH" or "HOME") .. "/.infini")
 
-target("infinicore_infer")
-    set_kind("shared")
+set_toolchains("gcc")
 
-    -- 【关键 1】启用 CUDA 构建规则
-    add_rules("cuda")
+-- Add spdlog from third_party directory
+add_includedirs("third_party/spdlog/include")
 
-    -- 【关键 2】设置 CUDA 架构和 BF16 支持
-    -- BF16 类型 (__nv_bfloat16) 需要 Compute Capability >= 8.0 (Ampere架构，如 A100, A800, 3090, 4090)
-    -- 如果你的显卡较旧（如 V100/T4），这里需要改为 sm_70 或 sm_75，但可能不支持 bf16 原生指令
-    add_cuflags("-arch=sm_80", "--expt-relaxed-constexpr")
-    
-    if is_mode("release") then
-        set_optimize("fastest")
-    end
+target("infinicore_infer")
+    set_kind("shared")
 
     add_includedirs("include", { public = false })
-    add_includedirs(INFINI_ROOT .. "/include", { public = true })
+    add_includedirs(INFINI_ROOT.."/include", { public = true })
 
-    add_linkdirs(INFINI_ROOT .. "/lib")
+    add_linkdirs(INFINI_ROOT.."/lib")
     add_links("infiniop", "infinirt", "infiniccl")
 
-    -- 【关键 3】链接 CUDA Runtime 库
-    add_syslinks("cudart") 
-
     set_languages("cxx17")
-    
-    -- 【调整】移除 "error"，防止 NVCC 警告导致编译中断
-    set_warnings("all")
+    set_warnings("all", "error")
 
-    -- 源文件添加
     add_files("src/models/*.cpp")
     add_files("src/models/*/*.cpp")
-    
-    -- 确保包含 Qwen3MoE 下的 C++ 和 CUDA 文件
-    add_files("src/models/Qwen3MoE/*.cpp")
-    add_files("src/models/Qwen3MoE/*.cu") 
-    
     add_files("src/tensor/*.cpp")
     add_files("src/allocator/*.cpp")
     add_files("src/dataloader/*.cpp")
     add_files("src/cache_manager/*.cpp")
-    
     add_includedirs("include")
 
     set_installdir(INFINI_ROOT)
     add_installfiles("include/infinicore_infer.h", {prefixdir = "include"})
     add_installfiles("include/infinicore_infer/models/*.h", {prefixdir = "include/infinicore_infer/models"})
-target_end()
\ No newline at end of file
+target_end()
+
+target("_infinilm")
+    add_packages("pybind11")
+    set_default(false)
+    add_rules("python.module", {soabi = true})
+    set_languages("cxx17")
+    set_kind("shared")
+
+    local INFINI_ROOT = os.getenv("INFINI_ROOT") or (os.getenv(is_host("windows") and "HOMEPATH" or "HOME") .. "/.infini")
+
+    -- add_includedirs("csrc", { public = false })
+    -- add_includedirs("csrc/pybind11", { public = false })
+    add_includedirs(INFINI_ROOT.."/include", { public = true })
+    add_includedirs("include", { public = false })
+    -- spdlog is already included globally via add_includedirs at the top
+
+    add_linkdirs(INFINI_ROOT.."/lib")
+    add_links("infinicore_cpp_api", "infiniop", "infinirt", "infiniccl")
+
+    -- Add src files
+    add_files("csrc/**.cpp")
+    add_files("csrc/**.cc")
+
+    set_installdir("python/infinilm")
+target_end()

From 1c0df22cd51ab68f19c199f8a77bff390cdbd811 Mon Sep 17 00:00:00 2001
From: lviy <lviyptilopsis@gmail.com>
Date: Mon, 12 Jan 2026 03:13:33 +0800
Subject: [PATCH 7/7] =?UTF-8?q?=E5=85=88=E9=93=BE=E6=8E=A5=E4=B8=80?=
 =?UTF-8?q?=E4=B8=8Bcuda=20=E6=8A=8AQwen3Moe.cpp=E9=87=8C=E7=9A=84cuda?=
 =?UTF-8?q?=E5=AE=9E=E7=8E=B0=E6=94=B9=E4=B8=BAinfini=E5=8F=AF=E5=8E=BB?=
 =?UTF-8?q?=E6=8E=89?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 src/models/Qwen3MoE/Qwen3MoE.cpp | 90 ++++++++++++++++++--------------
 xmake.lua                        |  5 +-
 2 files changed, 53 insertions(+), 42 deletions(-)

diff --git a/src/models/Qwen3MoE/Qwen3MoE.cpp b/src/models/Qwen3MoE/Qwen3MoE.cpp
index 5aa9ed45..7d044e4b 100644
--- a/src/models/Qwen3MoE/Qwen3MoE.cpp
+++ b/src/models/Qwen3MoE/Qwen3MoE.cpp
@@ -13,19 +13,6 @@
 #include <iostream>
 #include <cuda_runtime.h>
 #include <cuda_bf16.h>
-
-extern "C" void launch_fill_zero(void* data, size_t n_bytes, void* stream);
-extern "C" void launch_prefill_softmax(
-    void* data, 
-    int total_rows,      // NumHeads * CurSeqLen
-    int padded_len,      // Stride
-    int total_seq_len,   // Past + Cur
-    int cur_seq_len,     // Cur
-    int head_num, 
-    void* stream
-);
-extern "C" void launch_decode_softmax(void* data, int rows, int cols, int stride, void* stream);
-
 // =============================================================================
 // Helper Declarations & Utils
 // =============================================================================
@@ -186,7 +173,6 @@ void inferBatchQwen3MoE(const Qwen3MoEAttentionMeta &meta,
         
         // [RESTORED STANDARD LOGIC]
         // 只有当指针为空，或者形状不匹配时，才重新分配！
-        // 这样才能保留 inject_cache 注入的数据
         bool need_alloc = false;
         if (!kv_cache_layer.first || !kv_cache_layer.second) {
             need_alloc = true;
@@ -201,16 +187,19 @@ void inferBatchQwen3MoE(const Qwen3MoEAttentionMeta &meta,
         }
         size_t unit_size = dsize(dt_logits);
         if (need_alloc) {
-            // 只有第一次（或Batch变大时）才进来
             kv_cache_layer.first = Tensor::buffer(dt_logits, {static_cast<size_t>(batch_size), num_kv_head, max_seq_len, head_dim}, memory_pool);
             kv_cache_layer.second = Tensor::buffer(dt_logits, {static_cast<size_t>(batch_size), num_kv_head, max_seq_len, head_dim}, memory_pool);
             
-            //  先 forward 一次 warmup 分配内存 -> 然后 inject -> 然后正式 run）
-            size_t unit_size = dsize(dt_logits);
+            // [REVERTED] Use cudaMemsetAsync (Stable)
             size_t num_elements = static_cast<size_t>(batch_size) * num_kv_head * max_seq_len * head_dim;
-            cudaMemsetAsync(kv_cache_layer.first->data(), 0, num_elements * unit_size, (cudaStream_t)stream);
-            cudaMemsetAsync(kv_cache_layer.second->data(), 0, num_elements * unit_size, (cudaStream_t)stream);
-        } //TODO: 把cudaMemsetAsync 0 改为launch fill zero （但是会出现精度问题？）
+            size_t total_bytes = num_elements * unit_size;
+            
+            // [SAFEGUARD] Check size > 0
+            if (total_bytes > 0) {
+                cudaMemsetAsync(kv_cache_layer.first->data(), 0, total_bytes, (cudaStream_t)stream);
+                cudaMemsetAsync(kv_cache_layer.second->data(), 0, total_bytes, (cudaStream_t)stream);
+            }
+        } 
 
         auto k_cache_all = kv_cache_layer.first;
         auto v_cache_all = kv_cache_layer.second;
@@ -265,8 +254,10 @@ void inferBatchQwen3MoE(const Qwen3MoEAttentionMeta &meta,
     auto k_padded_gather = Tensor::buffer(dt_logits, {num_kv_head, padded_len, head_dim}, memory_pool);
     size_t kv_gather_bytes = num_kv_head * padded_len * head_dim * unit_size;
 
-    //  Clear gather buffer
-    cudaMemsetAsync(k_padded_gather->data(), 0, kv_gather_bytes, (cudaStream_t)stream);
+    // [REVERTED] Use cudaMemsetAsync
+    if (kv_gather_bytes > 0) {
+        cudaMemsetAsync(k_padded_gather->data(), 0, kv_gather_bytes, (cudaStream_t)stream);
+    }
 
     char* k_gather_src_base = k_cache_base + b * stride_batch_bytes;
     size_t gather_bytes_per_head = total_len * head_dim * unit_size;
@@ -274,7 +265,10 @@ void inferBatchQwen3MoE(const Qwen3MoEAttentionMeta &meta,
     for (size_t h = 0; h < num_kv_head; h++) {
     char* k_src = k_gather_src_base + h * stride_head_bytes;
     char* k_dst = (char*)k_padded_gather->data() + h * dst_head_stride_bytes;
-    cudaMemcpyAsync(k_dst, (void*)k_src, gather_bytes_per_head, cudaMemcpyDeviceToDevice, (cudaStream_t)stream);
+    // Keep size check for memcpy
+    if (gather_bytes_per_head > 0) {
+        cudaMemcpyAsync(k_dst, (void*)k_src, gather_bytes_per_head, cudaMemcpyDeviceToDevice, (cudaStream_t)stream);
+    }
     }
 
     auto k_gemm_in = Tensor::buffer(dt_logits, {num_kv_head, head_dim, padded_len}, memory_pool);
@@ -283,15 +277,14 @@ void inferBatchQwen3MoE(const Qwen3MoEAttentionMeta &meta,
     // 3. GEMM 1: Q * K
     auto scores_padded = Tensor::buffer(dt_logits, {num_kv_head, ngroup * cur_seq_len, padded_len}, memory_pool);
 
+    // [Scheme A] Zero out the buffer safely
+    size_t scores_bytes = num_kv_head * ngroup * cur_seq_len * padded_len * unit_size;
+    cudaMemsetAsync(scores_padded->data(), 0, scores_bytes, (cudaStream_t)stream);
+
     float scale_factor = 1.0f / sqrt(128.0f); 
     linear(scores_padded, q_gemm, k_gemm_in, scale_factor, 0.f, nullptr, nullptr);
 
-    // 4. Softmax+Scaling+Masking (fused_kernel for NVIDIA)
-    // if (cur_seq_len > 1) {
-    // launch_prefill_softmax(scores_padded->data(), num_heads * cur_seq_len, padded_len, total_len, cur_seq_len, num_heads, (void*)stream);
-    // } else {
-    // launch_decode_softmax(scores_padded->data(), num_heads * cur_seq_len, total_len, padded_len, (void*)stream);
-    // }
+    // 4. Softmax+Scaling+Masking
     auto scores_view = scores_padded->view({num_heads, cur_seq_len, padded_len});
     auto scores_in = scores_view->slice(2, 0, total_len);
     causalSoftmax(scores_in, scores_in);
@@ -300,18 +293,28 @@ void inferBatchQwen3MoE(const Qwen3MoEAttentionMeta &meta,
         size_t pitch = padded_len * unit_size;
         size_t width = (padded_len - total_len) * unit_size;
         char* dst_ptr = (char*)scores_padded->data() + total_len * unit_size;
-        cudaMemset2DAsync(dst_ptr, pitch, 0, width, num_heads * cur_seq_len, (cudaStream_t)stream);
+        // Keep size check for 2D Memset
+        if (width > 0) {
+            cudaMemset2DAsync(dst_ptr, pitch, 0, width, num_heads * cur_seq_len, (cudaStream_t)stream);
+        }
     }
-   
+    
 
     // 5. GEMM 2
     auto v_padded_gather = Tensor::buffer(dt_logits, {num_kv_head, padded_len, head_dim}, memory_pool);
-    cudaMemsetAsync(v_padded_gather->data(), 0, kv_gather_bytes, (cudaStream_t)stream);
+    // [REVERTED] Use cudaMemsetAsync
+    if (kv_gather_bytes > 0) {
+        cudaMemsetAsync(v_padded_gather->data(), 0, kv_gather_bytes, (cudaStream_t)stream);
+    }
+
     char* v_gather_src_base = v_cache_base + b * stride_batch_bytes;
     for (size_t h = 0; h < num_kv_head; h++) {
     char* v_src = v_gather_src_base + h * stride_head_bytes;
     char* v_dst = (char*)v_padded_gather->data() + h * dst_head_stride_bytes;
-    cudaMemcpyAsync(v_dst, (void*)v_src, gather_bytes_per_head, cudaMemcpyDeviceToDevice, (cudaStream_t)stream);
+    // Keep size check for memcpy
+    if (gather_bytes_per_head > 0) {
+        cudaMemcpyAsync(v_dst, (void*)v_src, gather_bytes_per_head, cudaMemcpyDeviceToDevice, (cudaStream_t)stream);
+    }
     }
 
     auto attn_out_b = Tensor::buffer(dt_logits, {num_kv_head, ngroup * cur_seq_len, head_dim}, memory_pool);
@@ -523,11 +526,12 @@ __C __export void injectQwen3CacheKV(
             std::memcpy(k_dst_addr, k_src_addr, bytes_to_copy_per_head);
             std::memcpy(v_dst_addr, v_src_addr, bytes_to_copy_per_head);
         } else {
-            // [CUDA] Raw API
-            RUN_INFINI(infinirtMemcpyAsync(k_dst_addr, (void*)k_src_addr,
+            if (bytes_to_copy_per_head > 0) {
+                RUN_INFINI(infinirtMemcpyAsync(k_dst_addr, (void*)k_src_addr,
                             bytes_to_copy_per_head, INFINIRT_MEMCPY_H2D, stream));
-            RUN_INFINI(infinirtMemcpyAsync(v_dst_addr, (void*)v_src_addr,
+                RUN_INFINI(infinirtMemcpyAsync(v_dst_addr, (void*)v_src_addr,
                             bytes_to_copy_per_head, INFINIRT_MEMCPY_H2D, stream));
+            }
         }
     }
     RUN_INFINI(infinirtStreamSynchronize(stream));
@@ -564,7 +568,6 @@ extern "C" void customInjectCacheKV(
     // 2. 获取 C++ 视角的形状信息
     auto shape = layer.first->shape(); 
     // shape: [Batch, NumKV, MaxSeq, HeadDim]
-    size_t batch_size = shape[0];
     size_t num_kv = shape[1];
     size_t max_seq = shape[2]; // 这里是关键！它是 8192
     size_t head_dim = shape[3]; // 这里应该是 128
@@ -597,10 +600,17 @@ extern "C" void customInjectCacheKV(
         char* k_dst = k_dst_base + h * stride_head ;
         char* v_dst = v_dst_base + h * stride_head ;
 
-        RUN_INFINI(infinirtMemcpyAsync(k_dst, k_src, copy_bytes_per_head, INFINIRT_MEMCPY_H2D, (infinirtStream_t)stream));
-        RUN_INFINI(infinirtMemcpyAsync(v_dst, v_src, copy_bytes_per_head, INFINIRT_MEMCPY_H2D, (infinirtStream_t)stream));
+        // 检查指针是否对齐和越界（简单保护）
+        if (past_len > 0) {
+             RUN_INFINI(infinirtMemcpyAsync(k_dst, k_src, copy_bytes_per_head, INFINIRT_MEMCPY_H2D, (infinirtStream_t)stream));
+             RUN_INFINI(infinirtMemcpyAsync(v_dst, v_src, copy_bytes_per_head, INFINIRT_MEMCPY_H2D, (infinirtStream_t)stream));
+        }
     }
     
     // 简单同步确保写入完成
     RUN_INFINI(infinirtStreamSynchronize((infinirtStream_t)stream));
-}
+    auto err = cudaGetLastError();
+    if (err != cudaSuccess) {
+        printf("DEBUG: Error at customInjectCacheKV end: %s\n", cudaGetErrorString(err));
+    }
+}
\ No newline at end of file
diff --git a/xmake.lua b/xmake.lua
index ad636197..e403c741 100644
--- a/xmake.lua
+++ b/xmake.lua
@@ -15,7 +15,8 @@ target("infinicore_infer")
 
     add_linkdirs(INFINI_ROOT.."/lib")
     add_links("infiniop", "infinirt", "infiniccl")
-
+    add_syslinks("cudart")  -- 用了cuda runtime 暂时link一下 后续fix
+    add_rules("cuda") 
     set_languages("cxx17")
     set_warnings("all", "error")
 
@@ -55,4 +56,4 @@ target("_infinilm")
     add_files("csrc/**.cc")
 
     set_installdir("python/infinilm")
-target_end()
+target_end()
\ No newline at end of file