Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion apps/c_backend/pipeline_generator.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ class Pipeline : public Halide::Generator<Pipeline> {
Var x, y;

Func f, h;
f(x, y) = (input(clamp(x + 2, 0, input.dim(0).extent() - 1), clamp(y - 2, 0, input.dim(1).extent() - 1)) * 17) / 13;
f(x, y) = (input(clamp(x + 2, 0, input.dim(0).extent() - 1), clamp(y - 2, 0, input.dim(1).extent() - 1)) * 17) / 13 + cast<uint16_t>(x % 3.4f + fma(cast<float>(y), 0.5f, 1.2f));
h.define_extern("an_extern_stage", {f}, Int(16), 0, NameMangling::C);
output(x, y) = cast<uint16_t>(max(0, f(y, x) + f(x, y) + an_extern_func(x, y) + h()));

Expand Down
1 change: 1 addition & 0 deletions python_bindings/src/halide/halide_/PyIROperator.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -126,6 +126,7 @@ void define_operators(py::module &m) {
m.def("log", &log);
m.def("pow", &pow);
m.def("erf", &erf);
m.def("fma", &fma);
m.def("fast_sin", &fast_sin);
m.def("fast_cos", &fast_cos);
m.def("fast_log", &fast_log);
Expand Down
25 changes: 23 additions & 2 deletions src/CodeGen_C.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1351,7 +1351,12 @@ void CodeGen_C::visit(const Mod *op) {
string arg0 = print_expr(op->a);
string arg1 = print_expr(op->b);
ostringstream rhs;
rhs << "fmod(" << arg0 << ", " << arg1 << ")";
if (op->type.is_scalar()) {
rhs << "::halide_cpp_fmod(";
} else {
rhs << print_type(op->type) << "_ops::fmod(";
}
rhs << arg0 << ", " << arg1 << ")";
print_assignment(op->type, rhs.str());
} else {
visit_binop(op->type, op->a, op->b, "%");
Expand Down Expand Up @@ -1845,8 +1850,24 @@ void CodeGen_C::visit(const Call *op) {
<< " + " << print_expr(base_offset) << "), /*rw*/0, /*locality*/0), 0)";
} else if (op->is_intrinsic(Call::size_of_halide_buffer_t)) {
rhs << "(sizeof(halide_buffer_t))";
} else if (op->is_intrinsic(Call::strict_fma)) {
internal_assert(op->args.size() == 3)
<< "Wrong number of args for strict_fma: " << op->args.size();
if (op->type.is_scalar()) {
rhs << "::halide_cpp_fma("
<< print_expr(op->args[0]) << ", "
<< print_expr(op->args[1]) << ", "
<< print_expr(op->args[2]) << ")";
} else {
rhs << print_type(op->type) << "_ops::fma("
<< print_expr(op->args[0]) << ", "
<< print_expr(op->args[1]) << ", "
<< print_expr(op->args[2]) << ")";
}
} else if (op->is_strict_float_intrinsic()) {
// This depends on the generated C++ being compiled without -ffast-math
// This depends on the generated C++ being compiled without
// -ffast-math. Note that this would not be correct for strict_fma, so
// we handle it separately above.
Expr equiv = unstrictify_float(op);
rhs << print_expr(equiv);
} else if (op->is_intrinsic()) {
Expand Down
31 changes: 30 additions & 1 deletion src/CodeGen_C_prologue.template.cpp
Original file line number Diff line number Diff line change
@@ -1,9 +1,12 @@
/* MACHINE GENERATED By Halide. */

#if !(__cplusplus >= 201103L || _MSVC_LANG >= 201103L)
#error "This code requires C++11 (or later); please upgrade your compiler."
#endif

#if !defined(__has_builtin)
#define __has_builtin(x) 0
#endif

#include <assert.h>
#include <fenv.h>
#include <float.h>
Expand Down Expand Up @@ -257,6 +260,32 @@ inline T halide_cpp_min(const T &a, const T &b) {
return (a < b) ? a : b;
}

// Fused multiply-add: computes a * b + c with a single rounding step.
// Generated code uses this for the strict_fma intrinsic, where the
// multiply-add must not be split or contracted by the compiler.
template<typename T>
inline T halide_cpp_fma(const T &a, const T &b, const T &c) {
#if __has_builtin(__builtin_fma) && __has_builtin(__builtin_fmaf)
    if (sizeof(T) == sizeof(float)) {
        // Use the single-precision builtin directly: routing float through
        // the double-precision builtin would round twice, which can differ
        // from a true single-precision fma.
        return __builtin_fmaf(a, b, c);
    } else {
        return __builtin_fma(a, b, c);
    }
#else
    if (sizeof(T) == sizeof(float)) {
        return fmaf(a, b, c);
    } else {
        // NOTE(review): for types narrower than float (e.g. float16), doing
        // the fma in double and narrowing can round twice — the double result
        // may land on a tie that breaks differently than a direct fma in the
        // narrow type would. TODO: confirm the intended float16 semantics.
        return (T)fma((double)a, (double)b, (double)c);
    }
#endif
}

// Floating-point remainder of a / b, with the semantics of C's fmod (the
// result has the sign of a). Generated code uses this for Mod nodes on
// floating-point types.
template<typename T>
inline T halide_cpp_fmod(const T &a, const T &b) {
#if __has_builtin(__builtin_fmod)
    // fmod is exact (no rounding of the result), so computing in double and
    // implicitly narrowing back to T is lossless for narrower float types.
    return __builtin_fmod(a, b);
#else
    if (sizeof(T) == sizeof(float)) {
        // Call the single-precision variant explicitly, mirroring the use of
        // fmaf in halide_cpp_fma; plain fmod would promote to double.
        return fmodf(a, b);
    } else {
        return (T)fmod((double)a, (double)b);
    }
#endif
}

// Swallows a value solely to suppress unused-variable warnings in generated
// code; intentionally an empty no-op.
template<typename T>
inline void halide_maybe_unused(const T &) {
}
Expand Down
36 changes: 32 additions & 4 deletions src/CodeGen_C_vectors.template.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2,10 +2,6 @@
#define __has_attribute(x) 0
#endif

#if !defined(__has_builtin)
#define __has_builtin(x) 0
#endif

namespace {

// We can't use std::array because that has its own overload of operator<, etc,
Expand Down Expand Up @@ -150,6 +146,22 @@ class CppVectorOps {
return r;
}

// Lane-wise fused multiply-add: out[i] = fma(a[i], b[i], c[i]), computed
// with one scalar fma per lane so each lane rounds exactly once.
static Vec fma(const Vec &a, const Vec &b, const Vec &c) {
    Vec result;
    for (size_t lane = 0; lane < Lanes; lane++) {
        result[lane] = ::halide_cpp_fma(a[lane], b[lane], c[lane]);
    }
    return result;
}

// Lane-wise floating-point remainder: out[i] = fmod(a[i], b[i]).
static Vec fmod(const Vec &a, const Vec &b) {
    Vec result;
    for (size_t lane = 0; lane < Lanes; lane++) {
        result[lane] = ::halide_cpp_fmod(a[lane], b[lane]);
    }
    return result;
}

static Mask logical_or(const Vec &a, const Vec &b) {
CppVector<uint8_t, Lanes> r;
for (size_t i = 0; i < Lanes; i++) {
Expand Down Expand Up @@ -734,6 +746,22 @@ class NativeVectorOps {
#endif
}

// Element-wise fused multiply-add over the native vector type, lowered to
// one scalar fma per lane.
static Vec fma(const Vec a, const Vec b, const Vec c) {
    Vec out;
    for (size_t idx = 0; idx < Lanes; idx++) {
        out[idx] = ::halide_cpp_fma(a[idx], b[idx], c[idx]);
    }
    return out;
}

// Element-wise floating-point remainder over the native vector type.
static Vec fmod(const Vec a, const Vec b) {
    Vec out;
    for (size_t idx = 0; idx < Lanes; idx++) {
        out[idx] = ::halide_cpp_fmod(a[idx], b[idx]);
    }
    return out;
}

// The relational operators produce signed-int of same width as input; our codegen expects uint8.
static Mask logical_or(const Vec a, const Vec b) {
using T = typename NativeVectorComparisonType<ElementType>::type;
Expand Down
4 changes: 4 additions & 0 deletions src/CodeGen_D3D12Compute_Dev.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1257,6 +1257,10 @@ void CodeGen_D3D12Compute_Dev::CodeGen_D3D12Compute_C::add_kernel(Stmt s,
void CodeGen_D3D12Compute_Dev::init_module() {
debug(2) << "D3D12Compute device codegen init_module\n";

// TODO: we could support strict float intrinsics with the precise qualifier
internal_assert(!any_strict_float)
<< "strict float intrinsics not yet supported in d3d12compute backend";

// wipe the internal kernel source
src_stream.str("");
src_stream.clear();
Expand Down
14 changes: 14 additions & 0 deletions src/CodeGen_GPU_Dev.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -245,6 +245,20 @@ void CodeGen_GPU_C::visit(const Call *op) {
equiv.accept(this);
}
}
} else if (op->is_intrinsic(Call::strict_fma)) {
// All shader languages have fma
Expr equiv = Call::make(op->type, "fma", op->args, Call::PureExtern);
equiv.accept(this);
} else {
CodeGen_C::visit(op);
}
}

void CodeGen_GPU_C::visit(const Mod *op) {
if (op->type.is_float()) {
// All shader languages have fmod
Expr equiv = Call::make(op->type, "fmod", {op->a, op->b}, Call::PureExtern);
equiv.accept(this);
} else {
CodeGen_C::visit(op);
}
Expand Down
10 changes: 10 additions & 0 deletions src/CodeGen_GPU_Dev.h
Original file line number Diff line number Diff line change
Expand Up @@ -77,6 +77,15 @@ struct CodeGen_GPU_Dev {
Device = 1, // Device/global memory fence
Shared = 2 // Threadgroup/shared memory fence
};

/** Some GPU APIs need to know what floating point mode we're in at kernel
* emission time, to emit appropriate pragmas. */
bool any_strict_float = false;

public:
// Records whether any strict_float operations are present in the code being
// compiled, so the backend can emit the appropriate fp-math pragmas/flags at
// kernel emission time.
void set_any_strict_float(bool any_strict_float) {
    this->any_strict_float = any_strict_float;
}
};

/** A base class for GPU backends that require C-like shader output.
Expand All @@ -99,6 +108,7 @@ class CodeGen_GPU_C : public CodeGen_C {
using CodeGen_C::visit;
void visit(const Shuffle *op) override;
void visit(const Call *op) override;
void visit(const Mod *op) override;

std::string print_extern_call(const Call *op) override;

Expand Down
68 changes: 49 additions & 19 deletions src/CodeGen_LLVM.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -3306,28 +3306,52 @@ void CodeGen_LLVM::visit(const Call *op) {
// Evaluate the args first outside the strict scope, as they may use
// non-strict operations.
std::vector<Expr> new_args(op->args.size());
std::vector<std::string> to_pop;
for (size_t i = 0; i < op->args.size(); i++) {
const Expr &arg = op->args[i];
if (arg.as<Variable>() || is_const(arg)) {
new_args[i] = arg;
} else {
std::string name = unique_name('t');
sym_push(name, codegen(arg));
to_pop.push_back(name);
new_args[i] = Variable::make(arg.type(), name);
}
}

Expr call = Call::make(op->type, op->name, new_args, op->call_type);
{
ScopedValue<bool> old_in_strict_float(in_strict_float, true);
value = codegen(unstrictify_float(call.as<Call>()));
if (op->is_intrinsic(Call::strict_fma)) {
if (op->type.is_float() && op->type.bits() <= 16 &&
upgrade_type_for_arithmetic(op->type) != op->type) {
// For (b)float16 and below, doing the fma as a
// double-precision fma is exact and is what llvm does. A
// double has enough bits of precision such that the add in
// the fma has no rounding error in the cases where the fma
// is going to return a finite float16. We do this
// legalization manually so that we can use our custom
// vectorizable float16 casts instead of letting llvm call
// library functions.
Type wide_t = Float(64, op->type.lanes());
for (Expr &e : new_args) {
e = cast(wide_t, e);
}
Expr equiv = Call::make(wide_t, op->name, new_args, op->call_type);
equiv = cast(op->type, equiv);
value = codegen(equiv);
} else {
std::string name = "llvm.fma" + mangle_llvm_type(llvm_type_of(op->type));
value = call_intrin(op->type, op->type.lanes(), name, new_args);
}
} else {
// Lower to something other than a call node
Expr call = Call::make(op->type, op->name, new_args, op->call_type);
value = codegen(unstrictify_float(call.as<Call>()));
}
}

for (size_t i = 0; i < op->args.size(); i++) {
const Expr &arg = op->args[i];
if (!arg.as<Variable>() && !is_const(arg)) {
sym_pop(new_args[i].as<Variable>()->name);
}
for (const auto &s : to_pop) {
sym_pop(s);
}

} else if (is_float16_transcendental(op) && !supports_call_as_float16(op)) {
Expand Down Expand Up @@ -4739,23 +4763,29 @@ Value *CodeGen_LLVM::call_intrin(const Type &result_type, int intrin_lanes,
Value *CodeGen_LLVM::call_intrin(const llvm::Type *result_type, int intrin_lanes,
const string &name, vector<Value *> arg_values,
bool scalable_vector_result, bool is_reduction) {
auto fix_vector_lanes_of_type = [&](const llvm::Type *t) {
if (intrin_lanes == 1 || is_reduction) {
return t->getScalarType();
} else {
if (scalable_vector_result && effective_vscale != 0) {
return get_vector_type(result_type->getScalarType(),
intrin_lanes / effective_vscale, VectorTypeConstraint::VScale);
} else {
return get_vector_type(result_type->getScalarType(),
intrin_lanes, VectorTypeConstraint::Fixed);
}
}
};

llvm::Function *fn = module->getFunction(name);
if (!fn) {
vector<llvm::Type *> arg_types(arg_values.size());
for (size_t i = 0; i < arg_values.size(); i++) {
arg_types[i] = arg_values[i]->getType();
llvm::Type *t = arg_values[i]->getType();
arg_types[i] = fix_vector_lanes_of_type(t);
}

llvm::Type *intrinsic_result_type = result_type->getScalarType();
if (intrin_lanes > 1 && !is_reduction) {
if (scalable_vector_result && effective_vscale != 0) {
intrinsic_result_type = get_vector_type(result_type->getScalarType(),
intrin_lanes / effective_vscale, VectorTypeConstraint::VScale);
} else {
intrinsic_result_type = get_vector_type(result_type->getScalarType(),
intrin_lanes, VectorTypeConstraint::Fixed);
}
}
llvm::Type *intrinsic_result_type = fix_vector_lanes_of_type(result_type);
FunctionType *func_t = FunctionType::get(intrinsic_result_type, arg_types, false);
fn = llvm::Function::Create(func_t, llvm::Function::ExternalLinkage, name, module.get());
fn->setCallingConv(CallingConv::C);
Expand Down Expand Up @@ -4790,7 +4820,7 @@ Value *CodeGen_LLVM::call_intrin(const llvm::Type *result_type, int intrin_lanes
if (arg_i_lanes >= arg_lanes) {
// Horizontally reducing intrinsics may have
// arguments that have more lanes than the
// result. Assume that the horizontally reduce
// result. Assume that they horizontally reduce
// neighboring elements...
int reduce = arg_i_lanes / arg_lanes;
args.push_back(slice_vector(arg_values[i], start * reduce, intrin_lanes * reduce));
Expand Down
1 change: 1 addition & 0 deletions src/CodeGen_Metal_Dev.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -834,6 +834,7 @@ void CodeGen_Metal_Dev::init_module() {

// Write out the Halide math functions.
src_stream << "#pragma clang diagnostic ignored \"-Wunused-function\"\n"
<< "#pragma METAL fp math_mode(" << (any_strict_float ? "safe)\n" : "fast)\n")
<< "#include <metal_stdlib>\n"
<< "using namespace metal;\n" // Seems like the right way to go.
<< "namespace {\n"
Expand Down
2 changes: 1 addition & 1 deletion src/CodeGen_OpenCL_Dev.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1123,7 +1123,7 @@ void CodeGen_OpenCL_Dev::init_module() {
// This identifies the program as OpenCL C (as opposed to SPIR).
src_stream << "/*OpenCL C " << target.to_string() << "*/\n";

src_stream << "#pragma OPENCL FP_CONTRACT ON\n";
src_stream << "#pragma OPENCL FP_CONTRACT " << (any_strict_float ? "OFF\n" : "ON\n");

// Write out the Halide math functions.
src_stream << "inline float float_from_bits(unsigned int x) {return as_float(x);}\n"
Expand Down
25 changes: 20 additions & 5 deletions src/CodeGen_PTX_Dev.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -220,6 +220,12 @@ void CodeGen_PTX_Dev::add_kernel(Stmt stmt,
}

void CodeGen_PTX_Dev::init_module() {
// This class uses multiple inheritance. It's a GPU device code generator,
// and also an llvm-based one. Both of these track strict_float presence,
// but OffloadGPULoops only sets the GPU device code generator flag, so here
// we set the CodeGen_LLVM flag to match.
CodeGen_LLVM::any_strict_float = CodeGen_GPU_Dev::any_strict_float;

init_context();

module = get_initial_module_for_ptx_device(target, context);
Expand Down Expand Up @@ -249,6 +255,15 @@ void CodeGen_PTX_Dev::init_module() {
function_does_not_access_memory(fn);
fn->addFnAttr(llvm::Attribute::NoUnwind);
}

if (CodeGen_GPU_Dev::any_strict_float) {
debug(0) << "Setting strict fp math\n";
set_strict_fp_math();
in_strict_float = target.has_feature(Target::StrictFloat);
} else {
debug(0) << "Setting fast fp math\n";
set_fast_fp_math();
}
}

void CodeGen_PTX_Dev::visit(const Call *op) {
Expand Down Expand Up @@ -611,13 +626,13 @@ vector<char> CodeGen_PTX_Dev::compile_to_src() {
internal_assert(llvm_target) << "Could not create LLVM target for " << triple.str() << "\n";

TargetOptions options;
options.AllowFPOpFusion = FPOpFusion::Fast;
options.AllowFPOpFusion = CodeGen_GPU_Dev::any_strict_float ? llvm::FPOpFusion::Strict : llvm::FPOpFusion::Fast;
#if LLVM_VERSION < 210
options.UnsafeFPMath = true;
options.UnsafeFPMath = !CodeGen_GPU_Dev::any_strict_float;
#endif
options.NoInfsFPMath = true;
options.NoNaNsFPMath = true;
options.HonorSignDependentRoundingFPMathOption = false;
options.NoInfsFPMath = !CodeGen_GPU_Dev::any_strict_float;
options.NoNaNsFPMath = !CodeGen_GPU_Dev::any_strict_float;
options.HonorSignDependentRoundingFPMathOption = !CodeGen_GPU_Dev::any_strict_float;
options.NoZerosInBSS = false;
options.GuaranteedTailCallOpt = false;

Expand Down
Loading
Loading