diff --git a/python/infinicore/__init__.py b/python/infinicore/__init__.py
index c6b01d5aa..1b5d734fd 100644
--- a/python/infinicore/__init__.py
+++ b/python/infinicore/__init__.py
@@ -52,6 +52,7 @@
 from infinicore.ops.paged_attention_prefill import paged_attention_prefill
 from infinicore.ops.paged_caching import paged_caching
 from infinicore.ops.rearrange import rearrange
+from infinicore.ops.softmax import softmax
 from infinicore.ops.squeeze import squeeze
 from infinicore.ops.unsqueeze import unsqueeze
 from infinicore.tensor import (
@@ -121,6 +122,7 @@
     "squeeze",
     "unsqueeze",
     "rearrange",
+    "softmax",
     "empty",
     "empty_like",
     "from_blob",
diff --git a/python/infinicore/ops/softmax.py b/python/infinicore/ops/softmax.py
new file mode 100644
index 000000000..50333043d
--- /dev/null
+++ b/python/infinicore/ops/softmax.py
@@ -0,0 +1,38 @@
+import infinicore
+from infinicore.tensor import Tensor
+
+
+def softmax(input: Tensor, dim: int, dtype=None, *, out=None) -> Tensor:
+    r"""Apply the softmax function over a given dimension."""
+
+    if dim is None:
+        raise TypeError("softmax() missing required argument: 'dim'")
+
+    if not infinicore.use_ntops or input.device.type not in ("cuda", "musa"):
+        raise RuntimeError("softmax is currently only available with ntops on CUDA/MUSA devices")
+
+    if out is None:
+        target_dtype = dtype if dtype is not None else input.dtype
+        return infinicore.ntops.torch.softmax(input, dim, dtype=target_dtype)
+
+    if not isinstance(out, Tensor):
+        raise TypeError(f"out must be a Tensor, got {type(out).__name__}")
+
+    if out.shape != input.shape:
+        raise ValueError("out tensor must have the same shape as input")
+
+    if out.device != input.device:
+        raise ValueError("out tensor must be on the same device as input")
+
+    target_dtype = dtype if dtype is not None else out.dtype
+
+    if dtype is not None and out.dtype != target_dtype:
+        raise TypeError("out tensor dtype must match the dtype argument")
+
+    # Reuse the cached ntops kernel to write directly into the provided output tensor.
+    from infinicore.ntops.torch.utils import _cached_make
+
+    kernel = _cached_make(infinicore.ntops.kernels.softmax.premake, input.ndim, dim)
+    kernel(input, out)
+
+    return out
diff --git a/test/infinicore/ops/softmax.py b/test/infinicore/ops/softmax.py
new file mode 100644
index 000000000..38bbae1e7
--- /dev/null
+++ b/test/infinicore/ops/softmax.py
@@ -0,0 +1,86 @@
+import sys
+import os
+
+sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))
+
+import torch
+import infinicore
+from framework import (
+    BaseOperatorTest,
+    TensorSpec,
+    TestCase,
+    GenericTestRunner,
+    is_broadcast,
+)
+
+# Test cases format: (in_shape, in_strides_or_None, dim_or_None)
+
+_TEST_CASES_DATA = [
+    ((4, 10), None, -1),
+    ((2, 5, 8), (40, 8, 1), 1),
+    ((8, 20), None, 1),
+]
+
+_TOLERANCE_MAP = {
+    infinicore.float16: {"atol": 1e-3, "rtol": 1e-2},
+    infinicore.float32: {"atol": 1e-5, "rtol": 1e-4},
+    infinicore.bfloat16: {"atol": 1e-2, "rtol": 5e-2},
+}
+
+_TENSOR_DTYPES = [infinicore.float16, infinicore.bfloat16, infinicore.float32]
+
+
+def parse_test_cases():
+    """Build test cases for softmax(input, dim, dtype=None)."""
+    test_cases = []
+
+    for data in _TEST_CASES_DATA:
+        shape = data[0]
+        in_strides = data[1] if len(data) > 1 else None
+        dim = data[2] if len(data) > 2 else -1
+
+        for dtype in _TENSOR_DTYPES:
+            tolerance = _TOLERANCE_MAP.get(dtype, {"atol": 1e-5, "rtol": 1e-4})
+            input_spec = TensorSpec.from_tensor(shape, in_strides, dtype)
+
+            kwargs = {"dim": dim}
+
+            test_cases.append(
+                TestCase(
+                    inputs=[input_spec],
+                    kwargs=kwargs,
+                    output_spec=None,
+                    comparison_target=None,
+                    tolerance=tolerance,
+                    description="softmax - OUT_OF_PLACE",
+                )
+            )
+
+    return test_cases
+
+
+class OpTest(BaseOperatorTest):
+    """Softmax operator test with simplified implementation."""
+
+    def __init__(self):
+        super().__init__("softmax")
+
+    def get_test_cases(self):
+        return parse_test_cases()
+
+    def torch_operator(self, *args, **kwargs):
+        return torch.nn.functional.softmax(*args, **kwargs)
+
+    # def infinicore_operator(self, *args, **kwargs):
+    #     """InfiniCore implementation (operator not yet available)."""
+    #     return infinicore.nn.functional.softmax(*args, **kwargs)
+
+
+def main():
+    """Main entry point."""
+    runner = GenericTestRunner(OpTest)
+    runner.run_and_exit()
+
+
+if __name__ == "__main__":
+    main()
diff --git a/testop.py b/testop.py
new file mode 100644
index 000000000..e46d4fa61
--- /dev/null
+++ b/testop.py
@@ -0,0 +1,11 @@
+import infinicore as ic
+
+device = ic.device("cuda:0")
+
+q = ic.empty((1, 1, 4), dtype=ic.float16, device=device)
+
+print(q.info)
+
+q = ic.softmax(q, dim=-1)
+
+print(q)
diff --git a/third_party/spdlog b/third_party/spdlog
index f1d748e5e..3f03542d2 160000
--- a/third_party/spdlog
+++ b/third_party/spdlog
@@ -1 +1 @@
-Subproject commit f1d748e5e3edfa4b1778edea003bac94781bc7b7
+Subproject commit 3f03542d2eb4952e3b279d9cad9098d370b7be57
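Usage sketch for the softmax API added above. This assumes an ntops-enabled build running on a CUDA (or MUSA) device; on other configurations the function raises RuntimeError. Only constructors that appear elsewhere in this patch (empty, empty_like, device, float16) are used.

    import infinicore as ic

    device = ic.device("cuda:0")
    x = ic.empty((2, 8), dtype=ic.float16, device=device)

    y = ic.softmax(x, dim=-1)          # out-of-place; result dtype defaults to x.dtype
    out = ic.empty_like(x)
    ic.softmax(x, dim=-1, out=out)     # writes into the preallocated tensor via the cached ntops kernel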