From 2ae3b59c97464cfccdaf2d69d3385c9fcd7aad49 Mon Sep 17 00:00:00 2001
From: pppoint <1024879159@qq.com>
Date: Wed, 7 Jan 2026 16:58:58 +0800
Subject: [PATCH] Finish T1-1-4: sum, topk, var, var_mean, all
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 include/infinicore/ops/all.hpp              |  28 ++
 include/infinicore/ops/sum.hpp              |  28 ++
 include/infinicore/ops/topk.hpp             |  18 ++
 include/infinicore/ops/var.hpp              |  28 ++
 include/infinicore/ops/var_mean.hpp         |  29 ++
 python/infinicore/__init__.py               |  10 +
 python/infinicore/ops/all.py                |  57 ++++
 python/infinicore/ops/sum.py                |  38 +++
 python/infinicore/ops/topk.py               |  39 +++
 python/infinicore/ops/var.py                |  58 ++++
 python/infinicore/ops/var_mean.py           |  75 +++++
 src/infinicore/ops/all/all.cc               |  66 +++++
 src/infinicore/ops/all/all_cpu.cc           | 245 ++++++++++++++++
 src/infinicore/ops/sum/sum.cc               |  67 +++++
 src/infinicore/ops/sum/sum_cpu.cc           | 272 ++++++++++++++++++
 src/infinicore/ops/topk/topk.cc             |  46 +++
 src/infinicore/ops/topk/topk_cpu.cc         | 148 ++++++++++
 src/infinicore/ops/var/var.cc               |  66 +++++
 src/infinicore/ops/var/var_cpu.cc           | 224 +++++++++++++++
 src/infinicore/ops/var_mean/var_mean.cc     |  69 +++++
 src/infinicore/ops/var_mean/var_mean_cpu.cc | 296 ++++++++++++++++++++
 src/infinicore/pybind11/ops.hpp             |  10 +
 src/infinicore/pybind11/ops/all.hpp         |  37 +++
 src/infinicore/pybind11/ops/sum.hpp         |  38 +++
 src/infinicore/pybind11/ops/topk.hpp        |  32 +++
 src/infinicore/pybind11/ops/var.hpp         |  41 +++
 src/infinicore/pybind11/ops/var_mean.hpp    |  43 +++
 test/infinicore/ops/all.py                  |   6 +-
 test/infinicore/ops/sum.py                  |   6 +-
 test/infinicore/ops/topk.py                 |   6 +-
 test/infinicore/ops/var.py                  |   6 +-
 test/infinicore/ops/var_mean.py             |   6 +-
 32 files changed, 2123 insertions(+), 15 deletions(-)
 create mode 100644 include/infinicore/ops/all.hpp
 create mode 100644 include/infinicore/ops/sum.hpp
 create mode 100644 include/infinicore/ops/topk.hpp
 create mode 100644 include/infinicore/ops/var.hpp
 create mode 100644 include/infinicore/ops/var_mean.hpp
 create mode 100644 python/infinicore/ops/all.py
 create mode 100644 python/infinicore/ops/sum.py
 create mode 100644 python/infinicore/ops/topk.py
 create mode 100644 python/infinicore/ops/var.py
 create mode 100644 python/infinicore/ops/var_mean.py
 create mode 100644 src/infinicore/ops/all/all.cc
 create mode 100644 src/infinicore/ops/all/all_cpu.cc
 create mode 100644 src/infinicore/ops/sum/sum.cc
 create mode 100644 src/infinicore/ops/sum/sum_cpu.cc
 create mode 100644 src/infinicore/ops/topk/topk.cc
 create mode 100644 src/infinicore/ops/topk/topk_cpu.cc
 create mode 100644 src/infinicore/ops/var/var.cc
 create mode 100644 src/infinicore/ops/var/var_cpu.cc
 create mode 100644 src/infinicore/ops/var_mean/var_mean.cc
 create mode 100644 src/infinicore/ops/var_mean/var_mean_cpu.cc
 create mode 100644 src/infinicore/pybind11/ops/all.hpp
 create mode 100644 src/infinicore/pybind11/ops/sum.hpp
 create mode 100644 src/infinicore/pybind11/ops/topk.hpp
 create mode 100644 src/infinicore/pybind11/ops/var.hpp
 create mode 100644 src/infinicore/pybind11/ops/var_mean.hpp

diff --git a/include/infinicore/ops/all.hpp b/include/infinicore/ops/all.hpp
new file mode 100644
index 000000000..be2a37eb0
--- /dev/null
+++ b/include/infinicore/ops/all.hpp
@@ -0,0 +1,28 @@
+#pragma once
+
+#include "../device.hpp"
+#include "common/op.hpp"
+
+namespace infinicore::op {
+
+class AllGlobal {
+public:
+    using schema = void (*)(Tensor, Tensor);
+    static void execute(Tensor input, Tensor output);
+    static common::OpDispatcher &dispatcher();
+};
+
+Tensor all_global(Tensor input);
+void all_global_(Tensor input, Tensor output);
+
+class AllReduce {
+public:
+    using schema = void (*)(Tensor, Tensor, int, bool);
+    static void execute(Tensor input, Tensor output, int dim, bool keepdim);
+    static common::OpDispatcher &dispatcher();
+};
+
+Tensor all_reduce(Tensor input, int dim, bool keepdim);
+void all_reduce_(Tensor input, Tensor output, int dim, bool keepdim);
+
+} // namespace infinicore::op
\ No newline at end of file
diff --git a/include/infinicore/ops/sum.hpp b/include/infinicore/ops/sum.hpp
new file mode 100644
index 000000000..cfb081b2a
--- /dev/null
+++ b/include/infinicore/ops/sum.hpp
@@ -0,0 +1,28 @@
+#pragma once
+
+#include "../device.hpp"
+#include "common/op.hpp"
+
+namespace infinicore::op {
+
+class SumGlobal {
+public:
+    using schema = void (*)(Tensor, Tensor);
+    static void execute(Tensor input, Tensor output);
+    static common::OpDispatcher &dispatcher();
+};
+
+Tensor sum_global(Tensor input);
+void sum_global_(Tensor input, Tensor output);
+
+class SumReduce {
+public:
+    using schema = void (*)(Tensor, Tensor, int, bool);
+    static void execute(Tensor input, Tensor output, int dim, bool keepdim);
+    static common::OpDispatcher &dispatcher();
+};
+
+Tensor sum_reduce(Tensor input, int dim, bool keepdim);
+void sum_reduce_(Tensor input, Tensor output, int dim, bool keepdim);
+
+} // namespace infinicore::op
\ No newline at end of file
diff --git a/include/infinicore/ops/topk.hpp b/include/infinicore/ops/topk.hpp
new file mode 100644
index 000000000..dea1426b0
--- /dev/null
+++ b/include/infinicore/ops/topk.hpp
@@ -0,0 +1,18 @@
+#pragma once
+#include "../device.hpp"
+#include "common/op.hpp"
+#include <tuple>
+
+namespace infinicore::op {
+
+class TopK {
+public:
+    using schema = void (*)(Tensor, Tensor, Tensor, int, int, bool, bool);
+    static void execute(Tensor input, Tensor values, Tensor indices, int k, int dim, bool largest, bool sorted);
+    static common::OpDispatcher &dispatcher();
+};
+
+std::tuple<Tensor, Tensor> topk(Tensor input, int k, int dim, bool largest, bool sorted);
+void topk_(Tensor input, Tensor values, Tensor indices, int k, int dim, bool largest, bool sorted);
+
+} // namespace infinicore::op
\ No newline at end of file
diff --git a/include/infinicore/ops/var.hpp b/include/infinicore/ops/var.hpp
new file mode 100644
index 000000000..f431c873a
--- /dev/null
+++ b/include/infinicore/ops/var.hpp
@@ -0,0 +1,28 @@
+#pragma once
+
+#include "../device.hpp"
+#include "common/op.hpp"
+
+namespace infinicore::op {
+
+class VarGlobal {
+public:
+    using schema = void (*)(Tensor, Tensor, int);
+    static void execute(Tensor input, Tensor output, int correction);
+    static common::OpDispatcher &dispatcher();
+};
+
+Tensor var_global(Tensor input, int correction);
+void var_global_(Tensor input, Tensor output, int correction);
+
+class VarReduce {
+public:
+    using schema = void (*)(Tensor, Tensor, int, int, bool);
+    static void execute(Tensor input, Tensor output, int dim, int correction, bool keepdim);
+    static common::OpDispatcher &dispatcher();
+};
+
+Tensor var_reduce(Tensor input, int dim, int correction, bool keepdim);
+void var_reduce_(Tensor input, Tensor output, int dim, int correction, bool keepdim);
+
+} // namespace infinicore::op
\ No newline at end of file
diff --git a/include/infinicore/ops/var_mean.hpp b/include/infinicore/ops/var_mean.hpp
new file mode 100644
index 000000000..ab02457d6
--- /dev/null
+++ b/include/infinicore/ops/var_mean.hpp
@@ -0,0 +1,29 @@
+#pragma once
+
+#include "../device.hpp" +#include "common/op.hpp" +#include + +namespace infinicore::op { + +class VarMeanGlobal { +public: + using schema = void (*)(Tensor, Tensor, Tensor, int); + static void execute(Tensor input, Tensor out_var, Tensor out_mean, int correction); + static common::OpDispatcher &dispatcher(); +}; + +std::tuple var_mean_global(Tensor input, int correction); +void var_mean_global_(Tensor input, Tensor out_var, Tensor out_mean, int correction); + +class VarMeanReduce { +public: + using schema = void (*)(Tensor, Tensor, Tensor, int, int, bool); + static void execute(Tensor input, Tensor out_var, Tensor out_mean, int dim, int correction, bool keepdim); + static common::OpDispatcher &dispatcher(); +}; + +std::tuple var_mean_reduce(Tensor input, int dim, int correction, bool keepdim); +void var_mean_reduce_(Tensor input, Tensor out_var, Tensor out_mean, int dim, int correction, bool keepdim); + +} // namespace infinicore::op \ No newline at end of file diff --git a/python/infinicore/__init__.py b/python/infinicore/__init__.py index 7ca962449..bd86093e8 100644 --- a/python/infinicore/__init__.py +++ b/python/infinicore/__init__.py @@ -40,13 +40,18 @@ uint8, ) from infinicore.ops.add import add +from infinicore.ops.all import all from infinicore.ops.attention import attention from infinicore.ops.matmul import matmul from infinicore.ops.mul import mul from infinicore.ops.narrow import narrow from infinicore.ops.rearrange import rearrange from infinicore.ops.squeeze import squeeze +from infinicore.ops.sum import sum +from infinicore.ops.topk import topk from infinicore.ops.unsqueeze import unsqueeze +from infinicore.ops.var import var +from infinicore.ops.var_mean import var_mean from infinicore.tensor import ( Tensor, empty, @@ -119,6 +124,11 @@ "strided_empty", "strided_from_blob", "zeros", + "sum", + "topk", + "var", + "var_mean", + "all", ] use_ntops = False diff --git a/python/infinicore/ops/all.py b/python/infinicore/ops/all.py new file mode 100644 index 000000000..338b6eeba --- /dev/null +++ b/python/infinicore/ops/all.py @@ -0,0 +1,57 @@ +import infinicore +from infinicore.lib import _infinicore +from infinicore.tensor import Tensor + + +def all( + input: Tensor, + dim: int | tuple[int] | list[int] | None = None, + keepdim: bool = False, + *, + out=None, +) -> Tensor: + r"""Computes the logical AND of all elements.""" + + if infinicore.use_ntops and input.device.type in ("cuda", "musa"): + return infinicore.ntops.torch.all(input, dim=dim, keepdim=keepdim, out=out) + + if dim is None: + if out is None: + return Tensor(_infinicore.all_global(input._underlying)) + _infinicore.all_global_(input._underlying, out._underlying) + return out + + else: + if isinstance(dim, int): + dims = [dim] + else: + dims = list(dim) + + ndim = input.ndim + normalized_dims = sorted( + [d if d >= 0 else d + ndim for d in dims], reverse=True + ) + + current_input = input + + if len(normalized_dims) == 1 and out is not None: + _infinicore.all_reduce_( + current_input._underlying, out._underlying, normalized_dims[0], keepdim + ) + return out + + for i, target_dim in enumerate(normalized_dims): + is_last_step = i == len(normalized_dims) - 1 + + if is_last_step and out is not None: + _infinicore.all_reduce_( + current_input._underlying, out._underlying, target_dim, keepdim + ) + return out + else: + res_ptr = _infinicore.all_reduce( + current_input._underlying, target_dim, keepdim + ) + current_input = Tensor(res_ptr) + + return current_input diff --git a/python/infinicore/ops/sum.py 
b/python/infinicore/ops/sum.py new file mode 100644 index 000000000..241683b09 --- /dev/null +++ b/python/infinicore/ops/sum.py @@ -0,0 +1,38 @@ +import infinicore +from infinicore.lib import _infinicore +from infinicore.tensor import Tensor + + +def sum( + input: Tensor, + dim: int | tuple[int] | list[int] | None = None, + keepdim=False, + *, + dtype=None, + out=None, +) -> Tensor: + r"""Apply the sum function.""" + + if infinicore.use_ntops and input.device.type in ("cuda", "musa"): + return infinicore.ntops.torch.sum( + input, dim, keepdim=keepdim, dtype=dtype, out=out + ) + + if dim is None: + if out is None: + return Tensor(_infinicore.sum_global(input._underlying)) + _infinicore.sum_global_(input._underlying, out._underlying) + return out + + else: + target_dim = dim + if isinstance(target_dim, (tuple, list)): + if len(target_dim) == 1: + target_dim = target_dim[0] + if out is None: + return Tensor( + _infinicore.sum_reduce(input._underlying, target_dim, keepdim) + ) + + _infinicore.sum_reduce_(input._underlying, out._underlying, target_dim, keepdim) + return out diff --git a/python/infinicore/ops/topk.py b/python/infinicore/ops/topk.py new file mode 100644 index 000000000..8f2f0ba99 --- /dev/null +++ b/python/infinicore/ops/topk.py @@ -0,0 +1,39 @@ +import infinicore +from infinicore.lib import _infinicore +from infinicore.tensor import Tensor + + +def topk( + input: Tensor, + k: int, + dim: int = -1, + largest: bool = True, + sorted: bool = True, + *, + out=None, +): + r"""Returns the k largest elements of the given input tensor along a given dimension.""" + + if infinicore.use_ntops and input.device.type in ("cuda", "musa"): + return infinicore.ntops.torch.topk(input, k, dim, largest, sorted, out=out) + + if out is None: + res_values, res_indices = _infinicore.topk( + input._underlying, k, dim, largest, sorted + ) + return Tensor(res_values), Tensor(res_indices) + else: + if not isinstance(out, (tuple, list)) or len(out) != 2: + raise ValueError("out argument must be a tuple of (values, indices)") + + out_values, out_indices = out + _infinicore.topk_( + input._underlying, + out_values._underlying, + out_indices._underlying, + k, + dim, + largest, + sorted, + ) + return out_values, out_indices diff --git a/python/infinicore/ops/var.py b/python/infinicore/ops/var.py new file mode 100644 index 000000000..413b47dd5 --- /dev/null +++ b/python/infinicore/ops/var.py @@ -0,0 +1,58 @@ +import infinicore +from infinicore.lib import _infinicore +from infinicore.tensor import Tensor + + +def var( + input: Tensor, + dim: int | tuple[int] | list[int] | None = None, + unbiased: bool | None = None, + correction: int | None = None, + keepdim: bool = False, + *, + dtype=None, + out=None, +) -> Tensor: + r"""Returns the variance of the input tensor.""" + + if unbiased is not None: + if correction is not None and correction != (1 if unbiased else 0): + raise ValueError( + "Cannot specify both 'unbiased' and 'correction' with conflicting values." 
+ ) + final_correction = 1 if unbiased else 0 + else: + final_correction = correction if correction is not None else 1 + + if infinicore.use_ntops and input.device.type in ("cuda", "musa"): + return infinicore.ntops.torch.var( + input, + dim=dim, + correction=final_correction, + keepdim=keepdim, + dtype=dtype, + out=out, + ) + + if dim is None: + if out is None: + return Tensor(_infinicore.var_global(input._underlying, final_correction)) + _infinicore.var_global_(input._underlying, out._underlying, final_correction) + return out + else: + target_dim = dim + if isinstance(target_dim, (tuple, list)): + if len(target_dim) == 1: + target_dim = target_dim[0] + + if out is None: + return Tensor( + _infinicore.var_reduce( + input._underlying, target_dim, final_correction, keepdim + ) + ) + + _infinicore.var_reduce_( + input._underlying, out._underlying, target_dim, final_correction, keepdim + ) + return out diff --git a/python/infinicore/ops/var_mean.py b/python/infinicore/ops/var_mean.py new file mode 100644 index 000000000..da7ede9bf --- /dev/null +++ b/python/infinicore/ops/var_mean.py @@ -0,0 +1,75 @@ +import infinicore +from infinicore.lib import _infinicore +from infinicore.tensor import Tensor + + +def var_mean( + input: Tensor, + dim: int | tuple[int] | list[int] | None = None, + unbiased: bool | None = None, + correction: int | None = None, + keepdim: bool = False, + *, + dtype=None, + out=None, +) -> tuple[Tensor, Tensor]: + r"""Calculates the variance and mean of input tensor.""" + + if unbiased is not None: + if correction is not None and correction != (1 if unbiased else 0): + raise ValueError( + "Cannot specify both 'unbiased' and 'correction' with conflicting values." + ) + final_correction = 1 if unbiased else 0 + else: + final_correction = correction if correction is not None else 1 + + if infinicore.use_ntops and input.device.type in ("cuda", "musa"): + return infinicore.ntops.torch.var_mean( + input, + dim=dim, + correction=final_correction, + keepdim=keepdim, + dtype=dtype, + out=out, + ) + + if dim is None: + if out is None: + v_tensor, m_tensor = _infinicore.var_mean_global( + input._underlying, final_correction + ) + return Tensor(v_tensor), Tensor(m_tensor) + + if not isinstance(out, (list, tuple)) or len(out) < 2: + raise ValueError("out must be a tuple/list of two Tensors for var_mean") + + _infinicore.var_mean_global_( + input._underlying, out[0]._underlying, out[1]._underlying, final_correction + ) + return out[0], out[1] + + else: + target_dim = dim + if isinstance(target_dim, (tuple, list)): + if len(target_dim) == 1: + target_dim = target_dim[0] + + if out is None: + v_tensor, m_tensor = _infinicore.var_mean_reduce( + input._underlying, target_dim, final_correction, keepdim + ) + return Tensor(v_tensor), Tensor(m_tensor) + + if not isinstance(out, (list, tuple)) or len(out) < 2: + raise ValueError("out must be a tuple/list of two Tensors for var_mean") + + _infinicore.var_mean_reduce_( + input._underlying, + out[0]._underlying, + out[1]._underlying, + target_dim, + final_correction, + keepdim, + ) + return out[0], out[1] diff --git a/src/infinicore/ops/all/all.cc b/src/infinicore/ops/all/all.cc new file mode 100644 index 000000000..a3d359c06 --- /dev/null +++ b/src/infinicore/ops/all/all.cc @@ -0,0 +1,66 @@ +#include "infinicore/ops/all.hpp" +#include "../../utils.hpp" + +namespace infinicore::op { + +common::OpDispatcher &AllGlobal::dispatcher() { + static common::OpDispatcher dispatcher_; + return dispatcher_; +}; + +void AllGlobal::execute(Tensor input, Tensor 
output) { + infinicore::context::setDevice(input->device()); + dispatcher().lookup(input->device().getType())(input, output); +} + +Tensor all_global(Tensor input) { + Shape shape = Shape(); + auto output = Tensor::empty(shape, DataType::BOOL, input->device()); + all_global_(input, output); + return output; +} + +void all_global_(Tensor input, Tensor output) { + AllGlobal::execute(input, output); +} + +common::OpDispatcher &AllReduce::dispatcher() { + static common::OpDispatcher dispatcher_; + return dispatcher_; +}; + +void AllReduce::execute(Tensor input, Tensor output, int dim, bool keepdim) { + infinicore::context::setDevice(input->device()); + dispatcher().lookup(input->device().getType())(input, output, dim, keepdim); +} + +Tensor all_reduce(Tensor input, int dim, bool keepdim) { + int normalized_dim = dim; + if (normalized_dim < 0) { + normalized_dim = input->ndim() + normalized_dim; + } + + Shape output_shape; + const auto &input_shape = input->shape(); + + if (keepdim) { + output_shape = input_shape; + output_shape[normalized_dim] = 1; + } else { + for (int i = 0; i < static_cast(input_shape.size()); ++i) { + if (i != normalized_dim) { + output_shape.push_back(input_shape[i]); + } + } + } + + auto output = Tensor::empty(output_shape, DataType::BOOL, input->device()); + all_reduce_(input, output, dim, keepdim); + return output; +} + +void all_reduce_(Tensor input, Tensor output, int dim, bool keepdim) { + AllReduce::execute(input, output, dim, keepdim); +} + +} // namespace infinicore::op \ No newline at end of file diff --git a/src/infinicore/ops/all/all_cpu.cc b/src/infinicore/ops/all/all_cpu.cc new file mode 100644 index 000000000..cdc723eb7 --- /dev/null +++ b/src/infinicore/ops/all/all_cpu.cc @@ -0,0 +1,245 @@ +#include "../../../utils.h" +#include "infinicore/device.hpp" +#include "infinicore/ops/all.hpp" +#include +#include +#include +#include +#include + +namespace infinicore::op::all_impl::cpu { + +template +inline bool is_false_val(T val) { + if constexpr (std::is_same_v || std::is_same_v) { + return utils::cast(val) == 0.0f; + } else { + return !static_cast(val); + } +} + +template +void all_global_kernel(const T *input_base, uint8_t *output_ptr, + const std::vector &shape, + const std::vector &strides, + size_t numel) { + size_t ndim = shape.size(); + int global_result = 1; + +#pragma omp parallel for reduction(min : global_result) + for (size_t i = 0; i < numel; ++i) { + if (global_result == 0) { + continue; + } + + size_t temp = i; + int64_t current_offset = 0; + for (int d = ndim - 1; d >= 0; --d) { + size_t dim_sz = shape[d]; + size_t coord = temp % dim_sz; + temp /= dim_sz; + current_offset += coord * strides[d]; + } + + if (is_false_val(input_base[current_offset])) { + global_result = 0; + } + } + *output_ptr = static_cast(global_result); +} + +template +void all_global_kernel_contiguous_fast(const T *input_base, uint8_t *output_ptr, size_t numel) { + int global_result = 1; +#pragma omp parallel for reduction(min : global_result) + for (size_t i = 0; i < numel; ++i) { + if (global_result == 0) { + continue; + } + if (is_false_val(input_base[i])) { + global_result = 0; + } + } + *output_ptr = static_cast(global_result); +} + +void calculate_global(Tensor input, Tensor output) { + bool is_contiguous = input->is_contiguous(); + if (!is_contiguous) { + auto strides = input->strides(); + auto shape = input->shape(); + int ndim = input->ndim(); + size_t expected = 1; + is_contiguous = true; + for (int i = ndim - 1; i >= 0; --i) { + if (strides[i] != expected && shape[i] 
> 1) { + is_contiguous = false; + break; + } + expected *= shape[i]; + } + } + + size_t numel = input->numel(); + auto input_base = input->data(); + auto output_base = reinterpret_cast(output->data()); + auto dtype = input->dtype(); + + auto dispatch = [&](auto dummy) { + using T = decltype(dummy); + if (is_contiguous) { + all_global_kernel_contiguous_fast( + reinterpret_cast(input_base), output_base, numel); + } else { + all_global_kernel( + reinterpret_cast(input_base), output_base, input->shape(), input->strides(), numel); + } + }; + + if (dtype == DataType::F32) { + dispatch(float{}); + } else if (dtype == DataType::F64) { + dispatch(double{}); + } else if (dtype == DataType::F16) { + dispatch(fp16_t{}); + } else if (dtype == DataType::BF16) { + dispatch(bf16_t{}); + } else if (dtype == DataType::BOOL || dtype == DataType::U8) { + dispatch(uint8_t{}); + } else if (dtype == DataType::I32) { + dispatch(int32_t{}); + } else if (dtype == DataType::I64) { + dispatch(int64_t{}); + } else { + throw std::runtime_error("Unsupported dtype for CPU all (Global)."); + } +} + +template +void all_reduce_kernel(const T *input_base, uint8_t *output_base, + const std::vector &input_shape, + const std::vector &input_strides, + const std::vector &output_strides, + int dim, + bool keepdim) { + + size_t ndim = input_shape.size(); + size_t dim_size = input_shape[dim]; + int64_t dim_stride = input_strides[dim]; + + std::vector logical_out_shape; + std::vector out_to_in_strides; + std::vector out_to_out_strides; + + size_t output_numel = 1; + for (size_t i = 0; i < ndim; ++i) { + if (static_cast(i) != dim) { + logical_out_shape.push_back(input_shape[i]); + output_numel *= input_shape[i]; + out_to_in_strides.push_back(input_strides[i]); + if (keepdim) { + out_to_out_strides.push_back(output_strides[i]); + } + } + } + if (!keepdim) { + out_to_out_strides = output_strides; + } + + std::vector temp_results(output_numel); + +#pragma omp parallel for + for (size_t i = 0; i < output_numel; ++i) { + size_t temp = i; + int64_t input_offset_base = 0; + + for (int d = static_cast(logical_out_shape.size()) - 1; d >= 0; --d) { + size_t size = logical_out_shape[d]; + size_t coord = temp % size; + temp /= size; + input_offset_base += coord * out_to_in_strides[d]; + } + + int row_result = 1; + for (size_t j = 0; j < dim_size; ++j) { + int64_t offset = input_offset_base + j * dim_stride; + if (is_false_val(input_base[offset])) { + row_result = 0; + break; + } + } + temp_results[i] = static_cast(row_result); + } + + for (size_t i = 0; i < output_numel; ++i) { + size_t temp = i; + int64_t output_offset = 0; + + for (int d = static_cast(logical_out_shape.size()) - 1; d >= 0; --d) { + size_t size = logical_out_shape[d]; + size_t coord = temp % size; + temp /= size; + output_offset += coord * out_to_out_strides[d]; + } + + output_base[output_offset] = temp_results[i]; + } +} + +void calculate_reduce(Tensor input, Tensor output, int dim, bool keepdim) { + auto ndim = input->ndim(); + if (dim < 0) { + dim = ndim + dim; + } + + auto input_shape = input->shape(); + auto input_strides = input->strides(); + auto output_strides = output->strides(); + + auto input_base = input->data(); + auto output_base = reinterpret_cast(output->data()); + auto dtype = input->dtype(); + + auto dispatch = [&](auto dummy) { + using T = decltype(dummy); + + all_reduce_kernel( + reinterpret_cast(input_base), + output_base, + input_shape, + input_strides, + output_strides, + dim, + keepdim); + }; + + if (dtype == DataType::F32) { + dispatch(float{}); + } else 
if (dtype == DataType::F64) { + dispatch(double{}); + } else if (dtype == DataType::F16) { + dispatch(fp16_t{}); + } else if (dtype == DataType::BF16) { + dispatch(bf16_t{}); + } else if (dtype == DataType::BOOL || dtype == DataType::U8) { + dispatch(uint8_t{}); + } else if (dtype == DataType::I32) { + dispatch(int32_t{}); + } else if (dtype == DataType::I64) { + dispatch(int64_t{}); + } else { + throw std::runtime_error("Unsupported dtype for CPU all (Reduce)."); + } +} + +static bool registered_global = []() { + AllGlobal::dispatcher().registerDevice(Device::Type::CPU, &calculate_global); + return true; +}(); + +static bool registered_reduce = []() { + AllReduce::dispatcher().registerDevice(Device::Type::CPU, &calculate_reduce); + return true; +}(); + +} // namespace infinicore::op::all_impl::cpu \ No newline at end of file diff --git a/src/infinicore/ops/sum/sum.cc b/src/infinicore/ops/sum/sum.cc new file mode 100644 index 000000000..1ee789c14 --- /dev/null +++ b/src/infinicore/ops/sum/sum.cc @@ -0,0 +1,67 @@ +#include "infinicore/ops/sum.hpp" + +#include "../../utils.hpp" + +namespace infinicore::op { + +common::OpDispatcher &SumGlobal::dispatcher() { + static common::OpDispatcher dispatcher_; + return dispatcher_; +}; + +void SumGlobal::execute(Tensor input, Tensor output) { + infinicore::context::setDevice(input->device()); + dispatcher().lookup(input->device().getType())(input, output); +} + +Tensor sum_global(Tensor input) { + Shape shape = Shape(); + auto output = Tensor::empty(shape, input->dtype(), input->device()); + sum_global_(input, output); + return output; +} + +void sum_global_(Tensor input, Tensor output) { + SumGlobal::execute(input, output); +} + +common::OpDispatcher &SumReduce::dispatcher() { + static common::OpDispatcher dispatcher_; + return dispatcher_; +}; + +void SumReduce::execute(Tensor input, Tensor output, int dim, bool keepdim) { + infinicore::context::setDevice(input->device()); + dispatcher().lookup(input->device().getType())(input, output, dim, keepdim); +} + +Tensor sum_reduce(Tensor input, int dim, bool keepdim) { + int normalized_dim = dim; + if (normalized_dim < 0) { + normalized_dim = input->ndim() + normalized_dim; + } + + Shape output_shape; + const auto &input_shape = input->shape(); + + if (keepdim) { + output_shape = input_shape; + output_shape[normalized_dim] = 1; + } else { + for (int i = 0; i < static_cast(input_shape.size()); ++i) { + if (i != normalized_dim) { + output_shape.push_back(input_shape[i]); + } + } + } + + auto output = Tensor::empty(output_shape, input->dtype(), input->device()); + sum_reduce_(input, output, dim, keepdim); + return output; +} + +void sum_reduce_(Tensor input, Tensor output, int dim, bool keepdim) { + SumReduce::execute(input, output, dim, keepdim); +} + +} // namespace infinicore::op \ No newline at end of file diff --git a/src/infinicore/ops/sum/sum_cpu.cc b/src/infinicore/ops/sum/sum_cpu.cc new file mode 100644 index 000000000..79cbf0edb --- /dev/null +++ b/src/infinicore/ops/sum/sum_cpu.cc @@ -0,0 +1,272 @@ +#include "../../../utils.h" +#include "infinicore/device.hpp" +#include "infinicore/ops/sum.hpp" +#include +#include + +namespace infinicore::op::sum_impl::cpu { + +template +void sum_global_kernel(const T *input_ptr, T *output_ptr, size_t numel) { + AccT total_sum = 0; + +#pragma omp parallel for reduction(+ : total_sum) + for (size_t i = 0; i < numel; ++i) { + total_sum += utils::cast(input_ptr[i]); + } + + *output_ptr = utils::cast(total_sum); +} + +void calculate_global(Tensor input, Tensor output) { 
+ bool is_contiguous = true; + auto strides = input->strides(); + auto shape = input->shape(); + auto ndim = input->ndim(); + size_t expected_stride = 1; + for (int i = ndim - 1; i >= 0; --i) { + if (strides[i] != expected_stride && shape[i] > 1) { + is_contiguous = false; + break; + } + expected_stride *= shape[i]; + } + + auto dtype = input->dtype(); + size_t numel = input->numel(); + auto output_base = output->data(); + auto input_base = input->data(); + + if (is_contiguous) { + if (dtype == DataType::F32) { + sum_global_kernel( + reinterpret_cast(input_base), + reinterpret_cast(output_base), numel); + } else if (dtype == DataType::F64) { + sum_global_kernel( + reinterpret_cast(input_base), + reinterpret_cast(output_base), numel); + } else if (dtype == DataType::F16) { + sum_global_kernel( + reinterpret_cast(input_base), + reinterpret_cast(output_base), numel); + } else { + throw std::runtime_error("Unsupported dtype."); + } + } else { + if (dtype == DataType::F16) { + float total_sum = 0; + std::vector indices(ndim, 0); + auto *ptr_base = reinterpret_cast(input_base); + + for (size_t i = 0; i < numel; ++i) { + size_t offset = 0; + for (int d = 0; d < ndim; ++d) { + offset += indices[d] * strides[d]; + } + total_sum += utils::cast(ptr_base[offset]); + + for (int d = ndim - 1; d >= 0; --d) { + indices[d]++; + if (indices[d] < shape[d]) { + break; + } + indices[d] = 0; + } + } + *reinterpret_cast(output_base) = utils::cast(total_sum); + } else { + float total_sum = 0; + std::vector indices(ndim, 0); + + for (size_t i = 0; i < numel; ++i) { + size_t offset = 0; + for (int d = 0; d < ndim; ++d) { + offset += indices[d] * strides[d]; + } + + if (dtype == DataType::F32) { + total_sum += reinterpret_cast(input_base)[offset]; + } else if (dtype == DataType::F64) { + total_sum += reinterpret_cast(input_base)[offset]; + } + + for (int d = ndim - 1; d >= 0; --d) { + indices[d]++; + if (indices[d] < shape[d]) { + break; + } + indices[d] = 0; + } + } + if (dtype == DataType::F32) { + *reinterpret_cast(output_base) = total_sum; + } else if (dtype == DataType::F64) { + *reinterpret_cast(output_base) = total_sum; + } + } + } +} + +template +void sum_reduce_contiguous(const T *input_data, T *output_data, + const std::vector &shape, + int dim, size_t numel) { + int ndim = shape.size(); + + size_t dim_size = shape[dim]; + size_t outer_size = 1; + size_t inner_size = 1; + + for (int i = 0; i < dim; ++i) { + outer_size *= shape[i]; + } + for (int i = dim + 1; i < ndim; ++i) { + inner_size *= shape[i]; + } + + if (inner_size == 1) { +#pragma omp parallel for + for (size_t i = 0; i < outer_size; ++i) { + const T *row_ptr = input_data + i * dim_size; + AccT sum = 0; + for (size_t j = 0; j < dim_size; ++j) { + sum += utils::cast(row_ptr[j]); + } + output_data[i] = utils::cast(sum); + } + return; + } + + size_t output_numel = outer_size * inner_size; + +#pragma omp parallel for + for (size_t o = 0; o < outer_size; ++o) { + size_t input_base_offset = o * dim_size * inner_size; + size_t output_base_offset = o * inner_size; + + for (size_t i = 0; i < inner_size; ++i) { + AccT sum = 0; + size_t col_offset = input_base_offset + i; + + for (size_t d = 0; d < dim_size; ++d) { + sum += utils::cast(input_data[col_offset + d * inner_size]); + } + output_data[output_base_offset + i] = utils::cast(sum); + } + } +} + +template +void sum_reduce_strided(const T *input_base, T *output_base, + const std::vector &input_shape, + const std::vector &input_strides, + const std::vector &output_shape, + int dim) { + + size_t 
output_numel = 1; + for (auto s : output_shape) { + output_numel *= s; + } + + size_t dim_size = input_shape[dim]; + int64_t dim_stride = input_strides[dim]; + int ndim = input_shape.size(); + +#pragma omp parallel for + for (size_t out_idx = 0; out_idx < output_numel; ++out_idx) { + size_t temp_idx = out_idx; + size_t input_offset_base = 0; + + for (int i = ndim - 1; i >= 0; --i) { + if (i == dim) { + continue; + } + size_t coord = temp_idx % input_shape[i]; + temp_idx /= input_shape[i]; + input_offset_base += coord * input_strides[i]; + } + + AccT sum = 0; + for (size_t d = 0; d < dim_size; ++d) { + const T *ptr = input_base + (input_offset_base + d * dim_stride); + sum += utils::cast(*ptr); + } + output_base[out_idx] = utils::cast(sum); + } +} + +void calculate_reduce(Tensor input, Tensor output, int dim, bool keepdim) { + auto ndim = input->ndim(); + if (dim < 0) { + dim = ndim + dim; + } + + auto dtype = input->dtype(); + + bool is_contiguous = true; + auto strides = input->strides(); + auto shape = input->shape(); + size_t expected_stride = 1; + for (int i = ndim - 1; i >= 0; --i) { + if (strides[i] != expected_stride) { + is_contiguous = false; + if (shape[i] > 1) { + break; + } + } + expected_stride *= shape[i]; + } + + if (dtype == DataType::F32) { + if (is_contiguous) { + sum_reduce_contiguous( + reinterpret_cast(input->data()), + reinterpret_cast(output->data()), + shape, dim, input->numel()); + } else { + sum_reduce_strided( + reinterpret_cast(input->data()), + reinterpret_cast(output->data()), + shape, strides, output->shape(), dim); + } + } else if (dtype == DataType::F64) { + if (is_contiguous) { + sum_reduce_contiguous( + reinterpret_cast(input->data()), + reinterpret_cast(output->data()), + shape, dim, input->numel()); + } else { + sum_reduce_strided( + reinterpret_cast(input->data()), + reinterpret_cast(output->data()), + shape, strides, output->shape(), dim); + } + } else if (dtype == DataType::F16) { + if (is_contiguous) { + sum_reduce_contiguous( + reinterpret_cast(input->data()), + reinterpret_cast(output->data()), + shape, dim, input->numel()); + } else { + sum_reduce_strided( + reinterpret_cast(input->data()), + reinterpret_cast(output->data()), + shape, strides, output->shape(), dim); + } + } else { + throw std::runtime_error("Unsupported data type for sum reduce."); + } +} + +static bool registered_global = []() { + SumGlobal::dispatcher().registerDevice(Device::Type::CPU, &calculate_global); + return true; +}(); + +static bool registered_reduce = []() { + SumReduce::dispatcher().registerDevice(Device::Type::CPU, &calculate_reduce); + return true; +}(); + +} // namespace infinicore::op::sum_impl::cpu \ No newline at end of file diff --git a/src/infinicore/ops/topk/topk.cc b/src/infinicore/ops/topk/topk.cc new file mode 100644 index 000000000..133ae8c20 --- /dev/null +++ b/src/infinicore/ops/topk/topk.cc @@ -0,0 +1,46 @@ +#include "infinicore/ops/topk.hpp" +#include "../../utils.hpp" + +namespace infinicore::op { + +common::OpDispatcher &TopK::dispatcher() { + static common::OpDispatcher dispatcher_; + return dispatcher_; +}; + +void TopK::execute(Tensor input, Tensor values, Tensor indices, int k, int dim, bool largest, bool sorted) { + infinicore::context::setDevice(input->device()); + dispatcher().lookup(input->device().getType())(input, values, indices, k, dim, largest, sorted); +} + +std::tuple topk(Tensor input, int k, int dim, bool largest, bool sorted) { + + int ndim = input->ndim(); + int normalized_dim = dim; + if (normalized_dim < 0) { + normalized_dim = 
ndim + normalized_dim; + } + + Shape output_shape = input->shape(); + + if (k > output_shape[normalized_dim]) { + throw std::runtime_error("k cannot be larger than the size of the dimension."); + } + output_shape[normalized_dim] = k; + + auto values = Tensor::empty(output_shape, input->dtype(), input->device()); + auto indices = Tensor::empty(output_shape, DataType::I64, input->device()); + + topk_(input, values, indices, k, normalized_dim, largest, sorted); + return {values, indices}; +} + +void topk_(Tensor input, Tensor values, Tensor indices, int k, int dim, bool largest, bool sorted) { + int ndim = input->ndim(); + if (dim < 0) { + dim = ndim + dim; + } + TopK::execute(input, values, indices, k, dim, largest, sorted); +} + +} // namespace infinicore::op \ No newline at end of file diff --git a/src/infinicore/ops/topk/topk_cpu.cc b/src/infinicore/ops/topk/topk_cpu.cc new file mode 100644 index 000000000..b8b0d6e62 --- /dev/null +++ b/src/infinicore/ops/topk/topk_cpu.cc @@ -0,0 +1,148 @@ +#include "../../../utils.h" +#include "infinicore/device.hpp" +#include "infinicore/ops/topk.hpp" +#include +#include +#include +#include + +namespace infinicore::op::topk_impl::cpu { + +template +struct Element { + T value; + int64_t index; +}; + +template +void topk_kernel(const T *input_base, T *values_base, int64_t *indices_base, + const std::vector &input_shape, + const std::vector &input_strides, + const std::vector &output_shape, + const std::vector &values_strides, + const std::vector &indices_strides, + int k, int dim, bool largest, bool sorted) { + + size_t ndim = input_shape.size(); + size_t dim_size = input_shape[dim]; + int64_t dim_stride = input_strides[dim]; + + size_t num_rows = 1; + for (size_t i = 0; i < ndim; ++i) { + if (i != dim) { + num_rows *= input_shape[i]; + } + } + +#pragma omp parallel for + for (size_t row_idx = 0; row_idx < num_rows; ++row_idx) { + size_t t = row_idx; + size_t current_inp_offset = 0; + size_t current_val_offset = 0; + size_t current_idx_offset = 0; + + for (int i = ndim - 1; i >= 0; --i) { + if (i == dim) { + continue; + } + + size_t size = input_shape[i]; + size_t coord = t % size; + t /= size; + + current_inp_offset += coord * input_strides[i]; + current_val_offset += coord * values_strides[i]; + current_idx_offset += coord * indices_strides[i]; + } + + std::vector> row_data; + row_data.reserve(dim_size); + for (size_t i = 0; i < dim_size; ++i) { + ValT val = utils::cast(input_base[current_inp_offset + i * dim_stride]); + row_data.push_back({val, static_cast(i)}); + } + + auto cmp = [largest](const Element &a, const Element &b) { + bool isnan_a = std::isnan(a.value); + bool isnan_b = std::isnan(b.value); + + if (isnan_a || isnan_b) { + if (isnan_a && isnan_b) { + return a.index < b.index; + } + + return largest ? isnan_a : !isnan_a; + } + + if (a.value != b.value) { + return largest ? 
(a.value > b.value) : (a.value < b.value); + } + + return a.index < b.index; + }; + + if (k < dim_size) { + + std::partial_sort(row_data.begin(), row_data.begin() + k, row_data.end(), cmp); + } else { + + std::sort(row_data.begin(), row_data.end(), cmp); + } + + int64_t out_val_dim_stride = values_strides[dim]; + int64_t out_idx_dim_stride = indices_strides[dim]; + + for (int i = 0; i < k; ++i) { + values_base[current_val_offset + i * out_val_dim_stride] = utils::cast(row_data[i].value); + indices_base[current_idx_offset + i * out_idx_dim_stride] = row_data[i].index; + } + } +} + +void calculate(Tensor input, Tensor values, Tensor indices, int k, int dim, bool largest, bool sorted) { + auto input_shape = input->shape(); + auto input_strides = input->strides(); + auto values_strides = values->strides(); + auto indices_strides = indices->strides(); + auto output_shape = values->shape(); + auto dtype = input->dtype(); + + if (dtype == DataType::F32) { + topk_kernel( + reinterpret_cast(input->data()), + reinterpret_cast(values->data()), + reinterpret_cast(indices->data()), + input_shape, input_strides, output_shape, values_strides, indices_strides, + k, dim, largest, sorted); + } else if (dtype == DataType::F64) { + topk_kernel( + reinterpret_cast(input->data()), + reinterpret_cast(values->data()), + reinterpret_cast(indices->data()), + input_shape, input_strides, output_shape, values_strides, indices_strides, + k, dim, largest, sorted); + } else if (dtype == DataType::F16) { + topk_kernel( + reinterpret_cast(input->data()), + reinterpret_cast(values->data()), + reinterpret_cast(indices->data()), + input_shape, input_strides, output_shape, values_strides, indices_strides, + k, dim, largest, sorted); + } else if (dtype == DataType::BF16) { + topk_kernel( + reinterpret_cast(input->data()), + reinterpret_cast(values->data()), + reinterpret_cast(indices->data()), + input_shape, input_strides, output_shape, values_strides, indices_strides, + k, dim, largest, sorted); + } else { + throw std::runtime_error("Unsupported data type for topk."); + } +} + +static bool registered = []() { + TopK::dispatcher().registerDevice(Device::Type::CPU, &calculate); + return true; +}(); + +} // namespace infinicore::op::topk_impl::cpu \ No newline at end of file diff --git a/src/infinicore/ops/var/var.cc b/src/infinicore/ops/var/var.cc new file mode 100644 index 000000000..a94c272aa --- /dev/null +++ b/src/infinicore/ops/var/var.cc @@ -0,0 +1,66 @@ +#include "infinicore/ops/var.hpp" +#include "../../utils.hpp" + +namespace infinicore::op { + +common::OpDispatcher &VarGlobal::dispatcher() { + static common::OpDispatcher dispatcher_; + return dispatcher_; +}; + +void VarGlobal::execute(Tensor input, Tensor output, int correction) { + infinicore::context::setDevice(input->device()); + dispatcher().lookup(input->device().getType())(input, output, correction); +} + +Tensor var_global(Tensor input, int correction) { + Shape shape = Shape(); + auto output = Tensor::empty(shape, input->dtype(), input->device()); + var_global_(input, output, correction); + return output; +} + +void var_global_(Tensor input, Tensor output, int correction) { + VarGlobal::execute(input, output, correction); +} + +common::OpDispatcher &VarReduce::dispatcher() { + static common::OpDispatcher dispatcher_; + return dispatcher_; +}; + +void VarReduce::execute(Tensor input, Tensor output, int dim, int correction, bool keepdim) { + infinicore::context::setDevice(input->device()); + dispatcher().lookup(input->device().getType())(input, output, dim, 
correction, keepdim); +} + +Tensor var_reduce(Tensor input, int dim, int correction, bool keepdim) { + int normalized_dim = dim; + if (normalized_dim < 0) { + normalized_dim = input->ndim() + normalized_dim; + } + + Shape output_shape; + const auto &input_shape = input->shape(); + + if (keepdim) { + output_shape = input_shape; + output_shape[normalized_dim] = 1; + } else { + for (int i = 0; i < static_cast(input_shape.size()); ++i) { + if (i != normalized_dim) { + output_shape.push_back(input_shape[i]); + } + } + } + + auto output = Tensor::empty(output_shape, input->dtype(), input->device()); + var_reduce_(input, output, dim, correction, keepdim); + return output; +} + +void var_reduce_(Tensor input, Tensor output, int dim, int correction, bool keepdim) { + VarReduce::execute(input, output, dim, correction, keepdim); +} + +} // namespace infinicore::op \ No newline at end of file diff --git a/src/infinicore/ops/var/var_cpu.cc b/src/infinicore/ops/var/var_cpu.cc new file mode 100644 index 000000000..516c651cf --- /dev/null +++ b/src/infinicore/ops/var/var_cpu.cc @@ -0,0 +1,224 @@ +#include "../../../utils.h" +#include "infinicore/device.hpp" +#include "infinicore/ops/var.hpp" +#include +#include +#include + +namespace infinicore::op::var_impl::cpu { + +template +void var_global_kernel(const T *input_ptr, T *output_ptr, size_t numel, int correction) { + + AccT sum = 0; +#pragma omp parallel for reduction(+ : sum) + for (size_t i = 0; i < numel; ++i) { + sum += utils::cast(input_ptr[i]); + } + AccT mean = sum / static_cast(numel); + + AccT sum_sq_diff = 0; +#pragma omp parallel for reduction(+ : sum_sq_diff) + for (size_t i = 0; i < numel; ++i) { + AccT val = utils::cast(input_ptr[i]); + AccT diff = val - mean; + sum_sq_diff += diff * diff; + } + + AccT divisor = static_cast(numel) - static_cast(correction); + if (divisor <= 0) { + *output_ptr = utils::cast(NAN); + } else { + *output_ptr = utils::cast(sum_sq_diff / divisor); + } +} + +template +void var_global_strided(const T *input_base, T *output_ptr, + const std::vector &shape, + const std::vector &strides, + size_t numel, double correction) { + int ndim = shape.size(); + + auto get_val = [&](size_t linear_idx) -> AccT { + size_t temp = linear_idx; + int64_t offset = 0; + for (int d = ndim - 1; d >= 0; --d) { + size_t coord = temp % shape[d]; + temp /= shape[d]; + offset += static_cast(coord) * strides[d]; + } + return utils::cast(input_base[offset]); + }; + + AccT sum = 0; +#pragma omp parallel for reduction(+ : sum) + for (size_t i = 0; i < numel; ++i) { + sum += get_val(i); + } + AccT mean = sum / static_cast(numel); + + AccT sum_sq_diff = 0; +#pragma omp parallel for reduction(+ : sum_sq_diff) + for (size_t i = 0; i < numel; ++i) { + AccT val = get_val(i); + AccT diff = val - mean; + sum_sq_diff += diff * diff; + } + + AccT divisor = static_cast(numel) - static_cast(correction); + if (divisor <= 0) { + *output_ptr = utils::cast(NAN); + } else { + *output_ptr = utils::cast(sum_sq_diff / divisor); + } +} + +void calculate_global(Tensor input, Tensor output, int correction) { + bool is_contiguous = input->is_contiguous(); + + if (!is_contiguous) { + auto strides = input->strides(); + auto shape = input->shape(); + int ndim = input->ndim(); + size_t expected = 1; + is_contiguous = true; + for (int i = ndim - 1; i >= 0; --i) { + if (strides[i] != expected && shape[i] > 1) { + is_contiguous = false; + break; + } + expected *= shape[i]; + } + } + + auto dtype = input->dtype(); + if (dtype == DataType::F32) { + if (is_contiguous) { + 
var_global_kernel((float *)input->data(), (float *)output->data(), input->numel(), correction); + } else { + var_global_strided((float *)input->data(), (float *)output->data(), input->shape(), input->strides(), input->numel(), correction); + } + } else if (dtype == DataType::F16) { + if (is_contiguous) { + var_global_kernel((fp16_t *)input->data(), (fp16_t *)output->data(), input->numel(), correction); + } else { + var_global_strided((fp16_t *)input->data(), (fp16_t *)output->data(), input->shape(), input->strides(), input->numel(), correction); + } + } else if (dtype == DataType::BF16) { + if (is_contiguous) { + var_global_kernel((bf16_t *)input->data(), (bf16_t *)output->data(), input->numel(), correction); + } else { + var_global_strided((bf16_t *)input->data(), (bf16_t *)output->data(), input->shape(), input->strides(), input->numel(), correction); + } + } else { + throw std::runtime_error("Unsupported dtype"); + } +} + +template +void var_reduce_impl(const T *input_base, T *output_base, + const std::vector &input_shape, + const std::vector &input_strides, + const std::vector &output_shape, + int dim, int correction) { + + size_t output_numel = 1; + for (auto s : output_shape) { + output_numel *= s; + } + + size_t dim_size = input_shape[dim]; + int64_t dim_stride = input_strides[dim]; + int ndim = input_shape.size(); + + std::vector out_to_in_strides; + std::vector out_dims; + for (int i = 0; i < ndim; ++i) { + if (i != dim) { + out_dims.push_back(input_shape[i]); + out_to_in_strides.push_back(input_strides[i]); + } + } + +#pragma omp parallel for + for (size_t out_idx = 0; out_idx < output_numel; ++out_idx) { + size_t temp = out_idx; + int64_t input_offset_base = 0; + + for (int d = (int)out_dims.size() - 1; d >= 0; --d) { + size_t coord = temp % out_dims[d]; + temp /= out_dims[d]; + input_offset_base += static_cast(coord) * out_to_in_strides[d]; + } + + AccT sum = 0; + for (size_t k = 0; k < dim_size; ++k) { + const T *ptr = input_base + (input_offset_base + k * dim_stride); + sum += utils::cast(*ptr); + } + AccT mean = sum / static_cast(dim_size); + + AccT sum_sq_diff = 0; + for (size_t k = 0; k < dim_size; ++k) { + const T *ptr = input_base + (input_offset_base + k * dim_stride); + AccT val = utils::cast(*ptr); + AccT diff = val - mean; + sum_sq_diff += diff * diff; + } + + AccT divisor = static_cast(dim_size) - static_cast(correction); + if (divisor <= 0) { + output_base[out_idx] = utils::cast(NAN); + } else { + output_base[out_idx] = utils::cast(sum_sq_diff / divisor); + } + } +} + +void calculate_reduce(Tensor input, Tensor output, int dim, int correction, bool keepdim) { + auto ndim = input->ndim(); + if (dim < 0) { + dim = ndim + dim; + } + + std::vector logical_out_shape; + for (int i = 0; i < ndim; ++i) { + if (i != dim) { + logical_out_shape.push_back(input->shape()[i]); + } + } + + if (logical_out_shape.empty()) { + logical_out_shape.push_back(1); + } + + auto dtype = input->dtype(); + if (dtype == DataType::F32) { + var_reduce_impl( + (float *)input->data(), (float *)output->data(), + input->shape(), input->strides(), logical_out_shape, dim, correction); + } else if (dtype == DataType::F16) { + var_reduce_impl( + (fp16_t *)input->data(), (fp16_t *)output->data(), + input->shape(), input->strides(), logical_out_shape, dim, correction); + } else if (dtype == DataType::BF16) { + var_reduce_impl( + (bf16_t *)input->data(), (bf16_t *)output->data(), + input->shape(), input->strides(), logical_out_shape, dim, correction); + } else { + throw std::runtime_error("Unsupported 
dtype"); + } +} + +static bool registered_global = []() { + VarGlobal::dispatcher().registerDevice(Device::Type::CPU, &calculate_global); + return true; +}(); + +static bool registered_reduce = []() { + VarReduce::dispatcher().registerDevice(Device::Type::CPU, &calculate_reduce); + return true; +}(); + +} // namespace infinicore::op::var_impl::cpu \ No newline at end of file diff --git a/src/infinicore/ops/var_mean/var_mean.cc b/src/infinicore/ops/var_mean/var_mean.cc new file mode 100644 index 000000000..d51ed83af --- /dev/null +++ b/src/infinicore/ops/var_mean/var_mean.cc @@ -0,0 +1,69 @@ +#include "infinicore/ops/var_mean.hpp" +#include "../../utils.hpp" + +namespace infinicore::op { + +common::OpDispatcher &VarMeanGlobal::dispatcher() { + static common::OpDispatcher dispatcher_; + return dispatcher_; +}; + +void VarMeanGlobal::execute(Tensor input, Tensor out_var, Tensor out_mean, int correction) { + infinicore::context::setDevice(input->device()); + dispatcher().lookup(input->device().getType())(input, out_var, out_mean, correction); +} + +std::tuple var_mean_global(Tensor input, int correction) { + Shape shape = Shape(); + auto out_var = Tensor::empty(shape, input->dtype(), input->device()); + auto out_mean = Tensor::empty(shape, input->dtype(), input->device()); + var_mean_global_(input, out_var, out_mean, correction); + return {out_var, out_mean}; +} + +void var_mean_global_(Tensor input, Tensor out_var, Tensor out_mean, int correction) { + VarMeanGlobal::execute(input, out_var, out_mean, correction); +} + +common::OpDispatcher &VarMeanReduce::dispatcher() { + static common::OpDispatcher dispatcher_; + return dispatcher_; +}; + +void VarMeanReduce::execute(Tensor input, Tensor out_var, Tensor out_mean, int dim, int correction, bool keepdim) { + infinicore::context::setDevice(input->device()); + dispatcher().lookup(input->device().getType())(input, out_var, out_mean, dim, correction, keepdim); +} + +std::tuple var_mean_reduce(Tensor input, int dim, int correction, bool keepdim) { + int normalized_dim = dim; + if (normalized_dim < 0) { + normalized_dim = input->ndim() + normalized_dim; + } + + Shape output_shape; + const auto &input_shape = input->shape(); + + if (keepdim) { + output_shape = input_shape; + output_shape[normalized_dim] = 1; + } else { + for (int i = 0; i < static_cast(input_shape.size()); ++i) { + if (i != normalized_dim) { + output_shape.push_back(input_shape[i]); + } + } + } + + auto out_var = Tensor::empty(output_shape, input->dtype(), input->device()); + auto out_mean = Tensor::empty(output_shape, input->dtype(), input->device()); + + var_mean_reduce_(input, out_var, out_mean, dim, correction, keepdim); + return {out_var, out_mean}; +} + +void var_mean_reduce_(Tensor input, Tensor out_var, Tensor out_mean, int dim, int correction, bool keepdim) { + VarMeanReduce::execute(input, out_var, out_mean, dim, correction, keepdim); +} + +} // namespace infinicore::op \ No newline at end of file diff --git a/src/infinicore/ops/var_mean/var_mean_cpu.cc b/src/infinicore/ops/var_mean/var_mean_cpu.cc new file mode 100644 index 000000000..dfb210c1a --- /dev/null +++ b/src/infinicore/ops/var_mean/var_mean_cpu.cc @@ -0,0 +1,296 @@ +#include "../../../utils.h" +#include "infinicore/device.hpp" +#include "infinicore/ops/var_mean.hpp" +#include +#include +#include + +namespace infinicore::op::var_mean_impl::cpu { + +template +void var_mean_global_kernel(const T *input_ptr, T *out_var, T *out_mean, size_t numel, int correction) { + AccT sum = 0; +#pragma omp parallel for reduction(+ : 
sum) + for (size_t i = 0; i < numel; ++i) { + sum += utils::cast(input_ptr[i]); + } + + AccT mean = sum / numel; + AccT sum_sq_diff = 0; + +#pragma omp parallel for reduction(+ : sum_sq_diff) + for (size_t i = 0; i < numel; ++i) { + AccT val = utils::cast(input_ptr[i]); + AccT diff = val - mean; + sum_sq_diff += diff * diff; + } + + AccT divisor = (numel > (size_t)correction) ? (AccT)(numel - correction) : 0; + AccT var = (divisor > 0) ? (sum_sq_diff / divisor) : NAN; + + *out_mean = utils::cast(mean); + *out_var = utils::cast(var); +} + +template +void var_mean_global_strided(const T *input_base, T *out_var, T *out_mean, + const std::vector &shape, + const std::vector &strides, + size_t numel, int correction) { + int ndim = shape.size(); + + AccT sum = 0; + + std::vector indices(ndim, 0); + for (size_t i = 0; i < numel; ++i) { + size_t offset = 0; + for (int d = 0; d < ndim; ++d) { + offset += indices[d] * strides[d]; + } + sum += utils::cast(input_base[offset]); + + for (int d = ndim - 1; d >= 0; --d) { + indices[d]++; + if (indices[d] < shape[d]) { + break; + } + indices[d] = 0; + } + } + AccT mean = sum / numel; + + AccT sum_sq_diff = 0; + std::fill(indices.begin(), indices.end(), 0); + for (size_t i = 0; i < numel; ++i) { + size_t offset = 0; + for (int d = 0; d < ndim; ++d) { + offset += indices[d] * strides[d]; + } + AccT val = utils::cast(input_base[offset]); + AccT diff = val - mean; + sum_sq_diff += diff * diff; + + for (int d = ndim - 1; d >= 0; --d) { + indices[d]++; + if (indices[d] < shape[d]) { + break; + } + indices[d] = 0; + } + } + + AccT divisor = (numel > (size_t)correction) ? (AccT)(numel - correction) : 0; + AccT var = (divisor > 0) ? (sum_sq_diff / divisor) : NAN; + + *out_mean = utils::cast(mean); + *out_var = utils::cast(var); +} + +void calculate_global(Tensor input, Tensor out_var, Tensor out_mean, int correction) { + bool is_contiguous = true; + auto strides = input->strides(); + auto shape = input->shape(); + auto ndim = input->ndim(); + size_t expected = 1; + for (int i = ndim - 1; i >= 0; --i) { + if (strides[i] != expected && shape[i] > 1) { + is_contiguous = false; + break; + } + expected *= shape[i]; + } + + auto dtype = input->dtype(); + size_t numel = input->numel(); + auto input_base = input->data(); + +#define DISPATCH_GLOBAL(T_TYPE, ACC_TYPE) \ + if (is_contiguous) { \ + var_mean_global_kernel( \ + reinterpret_cast(input_base), \ + reinterpret_cast(out_var->data()), \ + reinterpret_cast(out_mean->data()), \ + numel, correction); \ + } else { \ + var_mean_global_strided( \ + reinterpret_cast(input_base), \ + reinterpret_cast(out_var->data()), \ + reinterpret_cast(out_mean->data()), \ + shape, strides, numel, correction); \ + } + + if (dtype == DataType::F32) { + DISPATCH_GLOBAL(float, float); + } else if (dtype == DataType::F64) { + DISPATCH_GLOBAL(double, double); + } else if (dtype == DataType::F16) { + DISPATCH_GLOBAL(fp16_t, float); + } else { + throw std::runtime_error("Unsupported dtype for CPU var_mean (Global)."); + } +#undef DISPATCH_GLOBAL +} + +template +void var_mean_reduce_contiguous(const T *input_data, T *out_var, T *out_mean, + const std::vector &shape, int dim, size_t numel, int correction) { + size_t dim_size = shape[dim]; + size_t outer_size = 1; + size_t inner_size = 1; + for (int i = 0; i < dim; ++i) { + outer_size *= shape[i]; + } + for (int i = dim + 1; i < (int)shape.size(); ++i) { + inner_size *= shape[i]; + } + + if (inner_size == 1) { +#pragma omp parallel for + for (size_t i = 0; i < outer_size; ++i) { + const T *row_ptr = 
input_data + i * dim_size; + AccT sum = 0; + for (size_t j = 0; j < dim_size; ++j) { + sum += utils::cast(row_ptr[j]); + } + AccT mean = sum / dim_size; + AccT sum_sq_diff = 0; + for (size_t j = 0; j < dim_size; ++j) { + AccT diff = utils::cast(row_ptr[j]) - mean; + sum_sq_diff += diff * diff; + } + AccT divisor = (dim_size > (size_t)correction) ? (AccT)(dim_size - correction) : 0; + out_mean[i] = utils::cast(mean); + out_var[i] = utils::cast((divisor > 0) ? sum_sq_diff / divisor : NAN); + } + } else { +#pragma omp parallel for + for (size_t o = 0; o < outer_size; ++o) { + size_t input_base = o * dim_size * inner_size; + size_t output_base = o * inner_size; + for (size_t i = 0; i < inner_size; ++i) { + AccT sum = 0; + for (size_t d = 0; d < dim_size; ++d) { + sum += utils::cast(input_data[input_base + i + d * inner_size]); + } + AccT mean = sum / dim_size; + AccT sum_sq_diff = 0; + for (size_t d = 0; d < dim_size; ++d) { + AccT diff = utils::cast(input_data[input_base + i + d * inner_size]) - mean; + sum_sq_diff += diff * diff; + } + AccT divisor = (dim_size > (size_t)correction) ? (AccT)(dim_size - correction) : 0; + out_mean[output_base + i] = utils::cast(mean); + out_var[output_base + i] = utils::cast((divisor > 0) ? sum_sq_diff / divisor : NAN); + } + } + } +} + +template +void var_mean_reduce_strided(const T *input_base, T *output_var, T *output_mean, + const std::vector &input_shape, + const std::vector &input_strides, + const std::vector &output_shape, + int dim, int correction) { + size_t output_numel = 1; + for (auto s : output_shape) { + output_numel *= s; + } + + size_t dim_size = input_shape[dim]; + int64_t dim_stride = input_strides[dim]; + int ndim = input_shape.size(); + +#pragma omp parallel for + for (size_t out_idx = 0; out_idx < output_numel; ++out_idx) { + size_t temp_idx = out_idx; + size_t input_offset_base = 0; + + for (int i = ndim - 1; i >= 0; --i) { + if (i == dim) { + continue; + } + size_t coord = temp_idx % input_shape[i]; + temp_idx /= input_shape[i]; + input_offset_base += coord * input_strides[i]; + } + + AccT sum = 0; + for (size_t d = 0; d < dim_size; ++d) { + const T *ptr = input_base + (input_offset_base + d * dim_stride); + sum += utils::cast(*ptr); + } + AccT mean = sum / dim_size; + + AccT sum_sq_diff = 0; + for (size_t d = 0; d < dim_size; ++d) { + const T *ptr = input_base + (input_offset_base + d * dim_stride); + AccT val = utils::cast(*ptr); + AccT diff = val - mean; + sum_sq_diff += diff * diff; + } + + AccT divisor = (dim_size > (size_t)correction) ? (AccT)(dim_size - correction) : 0; + AccT var = (divisor > 0) ? 
+
+        output_mean[out_idx] = utils::cast<T>(mean);
+        output_var[out_idx] = utils::cast<T>(var);
+    }
+}
+
+void calculate_reduce(Tensor input, Tensor out_var, Tensor out_mean, int dim, int correction, bool keepdim) {
+    auto ndim = input->ndim();
+    if (dim < 0) {
+        dim = ndim + dim;
+    }
+
+    bool is_contiguous = true;
+    auto strides = input->strides();
+    auto shape = input->shape();
+    size_t expected = 1;
+    for (int i = ndim - 1; i >= 0; --i) {
+        if (strides[i] != expected && shape[i] > 1) {
+            is_contiguous = false;
+            break;
+        }
+        expected *= shape[i];
+    }
+
+#define DISPATCH_REDUCE(T_TYPE, ACC_TYPE)                       \
+    if (is_contiguous) {                                        \
+        var_mean_reduce_contiguous<T_TYPE, ACC_TYPE>(           \
+            reinterpret_cast<const T_TYPE *>(input->data()),    \
+            reinterpret_cast<T_TYPE *>(out_var->data()),        \
+            reinterpret_cast<T_TYPE *>(out_mean->data()),       \
+            shape, dim, input->numel(), correction);            \
+    } else {                                                    \
+        var_mean_reduce_strided<T_TYPE, ACC_TYPE>(              \
+            reinterpret_cast<const T_TYPE *>(input->data()),    \
+            reinterpret_cast<T_TYPE *>(out_var->data()),        \
+            reinterpret_cast<T_TYPE *>(out_mean->data()),       \
+            shape, strides, out_var->shape(), dim, correction); \
+    }
+
+    if (input->dtype() == DataType::F32) {
+        DISPATCH_REDUCE(float, float);
+    } else if (input->dtype() == DataType::F64) {
+        DISPATCH_REDUCE(double, double);
+    } else if (input->dtype() == DataType::F16) {
+        DISPATCH_REDUCE(fp16_t, float);
+    } else {
+        throw std::runtime_error("Unsupported dtype for CPU var_mean (Reduce).");
+    }
+#undef DISPATCH_REDUCE
+}
+
+static bool registered_global = []() {
+    VarMeanGlobal::dispatcher().registerDevice(Device::Type::CPU, &calculate_global);
+    return true;
+}();
+
+static bool registered_reduce = []() {
+    VarMeanReduce::dispatcher().registerDevice(Device::Type::CPU, &calculate_reduce);
+    return true;
+}();
+
+} // namespace infinicore::op::var_mean_impl::cpu
\ No newline at end of file
diff --git a/src/infinicore/pybind11/ops.hpp b/src/infinicore/pybind11/ops.hpp
index 978defa17..55948efa2 100644
--- a/src/infinicore/pybind11/ops.hpp
+++ b/src/infinicore/pybind11/ops.hpp
@@ -3,6 +3,7 @@
 #include <pybind11/pybind11.h>
 
 #include "ops/add.hpp"
+#include "ops/all.hpp"
 #include "ops/attention.hpp"
 #include "ops/causal_softmax.hpp"
 #include "ops/embedding.hpp"
@@ -14,7 +15,11 @@
 #include "ops/rms_norm.hpp"
 #include "ops/rope.hpp"
 #include "ops/silu.hpp"
+#include "ops/sum.hpp"
 #include "ops/swiglu.hpp"
+#include "ops/topk.hpp"
+#include "ops/var.hpp"
+#include "ops/var_mean.hpp"
 
 namespace py = pybind11;
 
@@ -34,6 +39,11 @@ inline void bind(py::module &m) {
     bind_swiglu(m);
     bind_rope(m);
     bind_embedding(m);
+    bind_sum(m);
+    bind_topk(m);
+    bind_var(m);
+    bind_var_mean(m);
+    bind_all(m);
 }
 
 } // namespace infinicore::ops
diff --git a/src/infinicore/pybind11/ops/all.hpp b/src/infinicore/pybind11/ops/all.hpp
new file mode 100644
index 000000000..3bcd34c85
--- /dev/null
+++ b/src/infinicore/pybind11/ops/all.hpp
@@ -0,0 +1,37 @@
+#pragma once
+
+#include "infinicore/ops/all.hpp"
+#include <pybind11/pybind11.h>
+
+namespace py = pybind11;
+namespace infinicore::ops {
+
+inline void bind_all(py::module &m) {
+    m.def("all_reduce",
+          &op::all_reduce,
+          py::arg("input"),
+          py::arg("dim"),
+          py::arg("keepdim") = false,
+          R"doc(Returns true if all elements in each row of the input tensor in the given dimension are true.)doc");
+
+    m.def("all_reduce_",
+          &op::all_reduce_,
+          py::arg("input"),
+          py::arg("output"),
+          py::arg("dim"),
+          py::arg("keepdim") = false,
+          R"doc(Variant of all_reduce that writes the result into the provided output tensor.)doc");
+
+    m.def("all_global",
+          &op::all_global,
+          py::arg("input"),
+          R"doc(Returns true if all elements in the tensor are true.)doc");
+
+    m.def("all_global_",
+          &op::all_global_,
+          py::arg("input"),
+          py::arg("output"),
+          R"doc(Variant of all_global that writes the result into the provided output tensor.)doc");
+}
+
+} // namespace infinicore::ops
\ No newline at end of file
diff --git a/src/infinicore/pybind11/ops/sum.hpp b/src/infinicore/pybind11/ops/sum.hpp
new file mode 100644
index 000000000..8862e3ff1
--- /dev/null
+++ b/src/infinicore/pybind11/ops/sum.hpp
@@ -0,0 +1,38 @@
+#pragma once
+
+#include "infinicore/ops/sum.hpp"
+#include <pybind11/pybind11.h>
+
+namespace py = pybind11;
+
+namespace infinicore::ops {
+
+inline void bind_sum(py::module &m) {
+    m.def("sum_reduce",
+          &op::sum_reduce,
+          py::arg("input"),
+          py::arg("dim"),
+          py::arg("keepdim") = false,
+          R"doc(Reduces the input tensor along the specified dimension by taking the sum.)doc");
+
+    m.def("sum_reduce_",
+          &op::sum_reduce_,
+          py::arg("input"),
+          py::arg("output"),
+          py::arg("dim"),
+          py::arg("keepdim") = false,
+          R"doc(Sum reduction along the specified dimension, writing into the provided output tensor.)doc");
+
+    m.def("sum_global",
+          &op::sum_global,
+          py::arg("input"),
+          R"doc(Reduces the input tensor globally by taking the sum across all elements.)doc");
+
+    m.def("sum_global_",
+          &op::sum_global_,
+          py::arg("input"),
+          py::arg("output"),
+          R"doc(Global sum reduction, writing into the provided output tensor.)doc");
+}
+
+} // namespace infinicore::ops
\ No newline at end of file
diff --git a/src/infinicore/pybind11/ops/topk.hpp b/src/infinicore/pybind11/ops/topk.hpp
new file mode 100644
index 000000000..d1e6987cf
--- /dev/null
+++ b/src/infinicore/pybind11/ops/topk.hpp
@@ -0,0 +1,32 @@
+#pragma once
+
+#include "infinicore/ops/topk.hpp"
+#include <pybind11/pybind11.h>
+
+namespace py = pybind11;
+
+namespace infinicore::ops {
+
+inline void bind_topk(py::module &m) {
+    m.def("topk",
+          &op::topk,
+          py::arg("input"),
+          py::arg("k"),
+          py::arg("dim"),
+          py::arg("largest") = true,
+          py::arg("sorted") = true,
+          R"doc(Returns the k largest elements of the given input tensor along a given dimension.)doc");
+
+    m.def("topk_",
+          &op::topk_,
+          py::arg("input"),
+          py::arg("values"),
+          py::arg("indices"),
+          py::arg("k"),
+          py::arg("dim"),
+          py::arg("largest") = true,
+          py::arg("sorted") = true,
+          R"doc(Top-k that writes the results into the provided values and indices tensors.)doc");
+}
+
+} // namespace infinicore::ops
\ No newline at end of file
diff --git a/src/infinicore/pybind11/ops/var.hpp b/src/infinicore/pybind11/ops/var.hpp
new file mode 100644
index 000000000..7bb7a8285
--- /dev/null
+++ b/src/infinicore/pybind11/ops/var.hpp
@@ -0,0 +1,41 @@
+#pragma once
+
+#include "infinicore/ops/var.hpp"
+#include <pybind11/pybind11.h>
+
+namespace py = pybind11;
+namespace infinicore::ops {
+
+inline void bind_var(py::module &m) {
+    m.def("var_global",
+          &op::var_global,
+          py::arg("input"),
+          py::arg("correction") = 1,
+          R"doc(Variance over all elements of the input tensor.)doc");
+
+    m.def("var_global_",
+          &op::var_global_,
+          py::arg("input"),
+          py::arg("output"),
+          py::arg("correction") = 1,
+          R"doc(Global variance, writing into the provided output tensor.)doc");
+
+    m.def("var_reduce",
+          &op::var_reduce,
+          py::arg("input"),
+          py::arg("dim"),
+          py::arg("correction") = 1,
+          py::arg("keepdim") = false,
+          R"doc(Variance of the input tensor along the given dimension.)doc");
+
+    m.def("var_reduce_",
+          &op::var_reduce_,
+          py::arg("input"),
+          py::arg("output"),
+          py::arg("dim"),
+          py::arg("correction") = 1,
+          py::arg("keepdim") = false,
+          R"doc(Variance reduction along the given dimension, writing into the provided output tensor.)doc");
+}
+
+} // namespace infinicore::ops
\ No newline at end of file
diff --git a/src/infinicore/pybind11/ops/var_mean.hpp b/src/infinicore/pybind11/ops/var_mean.hpp
new file mode 100644
index 000000000..3804675e6
--- /dev/null
+++ b/src/infinicore/pybind11/ops/var_mean.hpp
@@ -0,0 +1,43 @@
+#pragma once
+
+#include "infinicore/ops/var_mean.hpp"
"infinicore/ops/var_mean.hpp" +#include + +namespace py = pybind11; +namespace infinicore::ops { + +inline void bind_var_mean(py::module &m) { + m.def("var_mean_reduce", + &op::var_mean_reduce, + py::arg("input"), + py::arg("dim"), + py::arg("correction"), + py::arg("keepdim") = false, + R"doc(Returns the variance and mean of each row of the input tensor in the given dimension.)doc"); + + m.def("var_mean_reduce_", + &op::var_mean_reduce_, + py::arg("input"), + py::arg("out_var"), + py::arg("out_mean"), + py::arg("dim"), + py::arg("correction"), + py::arg("keepdim") = false, + R"doc(In-place version of var_mean_reduce.)doc"); + + m.def("var_mean_global", + &op::var_mean_global, + py::arg("input"), + py::arg("correction"), + R"doc(Returns the global variance and mean.)doc"); + + m.def("var_mean_global_", + &op::var_mean_global_, + py::arg("input"), + py::arg("out_var"), + py::arg("out_mean"), + py::arg("correction"), + R"doc(In-place global version.)doc"); +} + +} // namespace infinicore::ops \ No newline at end of file diff --git a/test/infinicore/ops/all.py b/test/infinicore/ops/all.py index bef8ba48b..4b5e8c8df 100644 --- a/test/infinicore/ops/all.py +++ b/test/infinicore/ops/all.py @@ -110,9 +110,9 @@ def get_test_cases(self): def torch_operator(self, *args, **kwargs): return torch.all(*args, **kwargs) - # def infinicore_operator(self, *args, **kwargs): - # """InfiniCore implementation (operator not yet available).""" - # return infinicore.all(*args, **kwargs) + def infinicore_operator(self, *args, **kwargs): + """InfiniCore implementation (operator not yet available).""" + return infinicore.all(*args, **kwargs) def main(): diff --git a/test/infinicore/ops/sum.py b/test/infinicore/ops/sum.py index 7cf4be80d..20658ba1b 100644 --- a/test/infinicore/ops/sum.py +++ b/test/infinicore/ops/sum.py @@ -77,9 +77,9 @@ def get_test_cases(self): def torch_operator(self, *args, **kwargs): return torch.sum(*args, **kwargs) - # def infinicore_operator(self, *args, **kwargs): - # """InfiniCore implementation (operator not yet available).""" - # return infinicore.sum(*args, **kwargs) + def infinicore_operator(self, *args, **kwargs): + """InfiniCore implementation (operator not yet available).""" + return infinicore.sum(*args, **kwargs) def main(): diff --git a/test/infinicore/ops/topk.py b/test/infinicore/ops/topk.py index b07f9ed7a..f82913a25 100644 --- a/test/infinicore/ops/topk.py +++ b/test/infinicore/ops/topk.py @@ -77,9 +77,9 @@ def get_test_cases(self): def torch_operator(self, *args, **kwargs): return torch.topk(*args, **kwargs) - # def infinicore_operator(self, *args, **kwargs): - # """InfiniCore implementation (operator not yet available).""" - # return infinicore.topk(*args, **kwargs) + def infinicore_operator(self, *args, **kwargs): + """InfiniCore implementation (operator not yet available).""" + return infinicore.topk(*args, **kwargs) def main(): diff --git a/test/infinicore/ops/var.py b/test/infinicore/ops/var.py index 1869085ec..d9777b003 100644 --- a/test/infinicore/ops/var.py +++ b/test/infinicore/ops/var.py @@ -76,9 +76,9 @@ def get_test_cases(self): def torch_operator(self, *args, **kwargs): return torch.var(*args, **kwargs) - # def infinicore_operator(self, *args, **kwargs): - # """InfiniCore implementation (operator not yet available).""" - # return infinicore.var(*args, **kwargs) + def infinicore_operator(self, *args, **kwargs): + """InfiniCore implementation (operator not yet available).""" + return infinicore.var(*args, **kwargs) def main(): diff --git 
diff --git a/test/infinicore/ops/var_mean.py b/test/infinicore/ops/var_mean.py
index 5a696fdf9..1c88e31d1 100644
--- a/test/infinicore/ops/var_mean.py
+++ b/test/infinicore/ops/var_mean.py
@@ -76,9 +76,9 @@ def get_test_cases(self):
     def torch_operator(self, *args, **kwargs):
         return torch.var_mean(*args, **kwargs)
 
-    # def infinicore_operator(self, *args, **kwargs):
-    #     """InfiniCore implementation (operator not yet available)."""
-    #     return infinicore.var_mean(*args, **kwargs)
+    def infinicore_operator(self, *args, **kwargs):
+        """InfiniCore implementation."""
+        return infinicore.var_mean(*args, **kwargs)
 
 
 def main():
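
For reference, the variance/mean semantics implemented by the CPU kernels in this patch (a two-pass mean and squared-deviation accumulation, with a correction-adjusted divisor that yields NaN when it is not positive) can be sketched in a few lines of plain Python. The helper name and the list input below are illustrative assumptions for exposition only and are not part of the patch:

import math

def var_mean_global(values, correction=1):
    # First pass: accumulate the mean.
    n = len(values)
    mean = sum(values) / n
    # Second pass: accumulate squared deviations from the mean.
    sum_sq_diff = sum((v - mean) ** 2 for v in values)
    # Divisor is n - correction (Bessel's correction when correction=1);
    # the kernels return NaN when the divisor is not positive.
    divisor = n - correction
    var = sum_sq_diff / divisor if divisor > 0 else math.nan
    return var, mean

print(var_mean_global([1.0, 2.0, 3.0, 4.0]))  # (1.666..., 2.5), matching torch.var_mean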