Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion apps/c_backend/pipeline_generator.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ class Pipeline : public Halide::Generator<Pipeline> {
Var x, y;

Func f, h;
f(x, y) = (input(clamp(x + 2, 0, input.dim(0).extent() - 1), clamp(y - 2, 0, input.dim(1).extent() - 1)) * 17) / 13;
f(x, y) = (input(clamp(x + 2, 0, input.dim(0).extent() - 1), clamp(y - 2, 0, input.dim(1).extent() - 1)) * 17) / 13 + cast<uint16_t>(x % 3.4f + fma(cast<float>(y), 0.5f, 1.2f));
h.define_extern("an_extern_stage", {f}, Int(16), 0, NameMangling::C);
output(x, y) = cast<uint16_t>(max(0, f(y, x) + f(x, y) + an_extern_func(x, y) + h()));

Expand Down
1 change: 1 addition & 0 deletions python_bindings/src/halide/halide_/PyIROperator.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -126,6 +126,7 @@ void define_operators(py::module &m) {
m.def("log", &log);
m.def("pow", &pow);
m.def("erf", &erf);
m.def("fma", &fma);
m.def("fast_sin", &fast_sin);
m.def("fast_cos", &fast_cos);
m.def("fast_log", &fast_log);
Expand Down
25 changes: 23 additions & 2 deletions src/CodeGen_C.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1351,7 +1351,12 @@ void CodeGen_C::visit(const Mod *op) {
string arg0 = print_expr(op->a);
string arg1 = print_expr(op->b);
ostringstream rhs;
rhs << "fmod(" << arg0 << ", " << arg1 << ")";
if (op->type.is_scalar()) {
rhs << "::halide_cpp_fmod(";
} else {
rhs << print_type(op->type) << "_ops::fmod(";
}
rhs << arg0 << ", " << arg1 << ")";
print_assignment(op->type, rhs.str());
} else {
visit_binop(op->type, op->a, op->b, "%");
Expand Down Expand Up @@ -1845,8 +1850,24 @@ void CodeGen_C::visit(const Call *op) {
<< " + " << print_expr(base_offset) << "), /*rw*/0, /*locality*/0), 0)";
} else if (op->is_intrinsic(Call::size_of_halide_buffer_t)) {
rhs << "(sizeof(halide_buffer_t))";
} else if (op->is_intrinsic(Call::strict_fma)) {
internal_assert(op->args.size() == 3)
<< "Wrong number of args for strict_fma: " << op->args.size();
if (op->type.is_scalar()) {
rhs << "::halide_cpp_fma("
<< print_expr(op->args[0]) << ", "
<< print_expr(op->args[1]) << ", "
<< print_expr(op->args[2]) << ")";
} else {
rhs << print_type(op->type) << "_ops::fma("
<< print_expr(op->args[0]) << ", "
<< print_expr(op->args[1]) << ", "
<< print_expr(op->args[2]) << ")";
}
} else if (op->is_strict_float_intrinsic()) {
// This depends on the generated C++ being compiled without -ffast-math
// This depends on the generated C++ being compiled without
// -ffast-math. Note that this would not be correct for strict_fma, so
// we handle it separately above.
Expr equiv = unstrictify_float(op);
rhs << print_expr(equiv);
} else if (op->is_intrinsic()) {
Expand Down
31 changes: 30 additions & 1 deletion src/CodeGen_C_prologue.template.cpp
Original file line number Diff line number Diff line change
@@ -1,9 +1,12 @@
/* MACHINE GENERATED By Halide. */

#if !(__cplusplus >= 201103L || _MSVC_LANG >= 201103L)
#error "This code requires C++11 (or later); please upgrade your compiler."
#endif

#if !defined(__has_builtin)
#define __has_builtin(x) 0
#endif

#include <assert.h>
#include <fenv.h>
#include <float.h>
Expand Down Expand Up @@ -257,6 +260,32 @@ inline T halide_cpp_min(const T &a, const T &b) {
return (a < b) ? a : b;
}

// Fused multiply-add: computes a * b + c with a single rounding step.
// Generated code uses this for the strict_fma intrinsic, where the
// multiply-add must not be split or contracted by the compiler.
template<typename T>
inline T halide_cpp_fma(const T &a, const T &b, const T &c) {
#if __has_builtin(__builtin_fma) && __has_builtin(__builtin_fmaf)
    if (sizeof(T) == sizeof(float)) {
        // Use the single-precision builtin directly: routing float through
        // the double-precision builtin would round twice, which can differ
        // from a true single-precision fma.
        return __builtin_fmaf(a, b, c);
    } else {
        return __builtin_fma(a, b, c);
    }
#else
    if (sizeof(T) == sizeof(float)) {
        return fmaf(a, b, c);
    } else {
        // NOTE(review): for types narrower than float (e.g. float16), doing
        // the fma in double and narrowing can round twice — the double result
        // may land on a tie that breaks differently than a direct fma in the
        // narrow type would. TODO: confirm the intended float16 semantics.
        return (T)fma((double)a, (double)b, (double)c);
    }
#endif
}

// Floating-point remainder of a / b, with the semantics of C's fmod (the
// result has the sign of a). Generated code uses this for Mod nodes on
// floating-point types.
template<typename T>
inline T halide_cpp_fmod(const T &a, const T &b) {
#if __has_builtin(__builtin_fmod)
    // fmod is exact (no rounding of the result), so computing in double and
    // implicitly narrowing back to T is lossless for narrower float types.
    return __builtin_fmod(a, b);
#else
    if (sizeof(T) == sizeof(float)) {
        // Call the single-precision variant explicitly, mirroring the use of
        // fmaf in halide_cpp_fma; plain fmod would promote to double.
        return fmodf(a, b);
    } else {
        return (T)fmod((double)a, (double)b);
    }
#endif
}

// Swallows a value solely to suppress unused-variable warnings in generated
// code; intentionally an empty no-op.
template<typename T>
inline void halide_maybe_unused(const T &) {
}
Expand Down
36 changes: 32 additions & 4 deletions src/CodeGen_C_vectors.template.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2,10 +2,6 @@
#define __has_attribute(x) 0
#endif

#if !defined(__has_builtin)
#define __has_builtin(x) 0
#endif

namespace {

// We can't use std::array because that has its own overload of operator<, etc,
Expand Down Expand Up @@ -150,6 +146,22 @@ class CppVectorOps {
return r;
}

// Lane-wise fused multiply-add: out[i] = fma(a[i], b[i], c[i]), computed
// with one scalar fma per lane so each lane rounds exactly once.
static Vec fma(const Vec &a, const Vec &b, const Vec &c) {
    Vec result;
    for (size_t lane = 0; lane < Lanes; lane++) {
        result[lane] = ::halide_cpp_fma(a[lane], b[lane], c[lane]);
    }
    return result;
}

// Lane-wise floating-point remainder: out[i] = fmod(a[i], b[i]).
static Vec fmod(const Vec &a, const Vec &b) {
    Vec result;
    for (size_t lane = 0; lane < Lanes; lane++) {
        result[lane] = ::halide_cpp_fmod(a[lane], b[lane]);
    }
    return result;
}

static Mask logical_or(const Vec &a, const Vec &b) {
CppVector<uint8_t, Lanes> r;
for (size_t i = 0; i < Lanes; i++) {
Expand Down Expand Up @@ -734,6 +746,22 @@ class NativeVectorOps {
#endif
}

// Element-wise fused multiply-add over the native vector type, lowered to
// one scalar fma per lane.
static Vec fma(const Vec a, const Vec b, const Vec c) {
    Vec out;
    for (size_t idx = 0; idx < Lanes; idx++) {
        out[idx] = ::halide_cpp_fma(a[idx], b[idx], c[idx]);
    }
    return out;
}

// Element-wise floating-point remainder over the native vector type.
static Vec fmod(const Vec a, const Vec b) {
    Vec out;
    for (size_t idx = 0; idx < Lanes; idx++) {
        out[idx] = ::halide_cpp_fmod(a[idx], b[idx]);
    }
    return out;
}

// The relational operators produce signed-int of same width as input; our codegen expects uint8.
static Mask logical_or(const Vec a, const Vec b) {
using T = typename NativeVectorComparisonType<ElementType>::type;
Expand Down
4 changes: 4 additions & 0 deletions src/CodeGen_D3D12Compute_Dev.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1257,6 +1257,10 @@ void CodeGen_D3D12Compute_Dev::CodeGen_D3D12Compute_C::add_kernel(Stmt s,
void CodeGen_D3D12Compute_Dev::init_module() {
debug(2) << "D3D12Compute device codegen init_module\n";

// TODO: we could support strict float intrinsics with the precise qualifier
internal_assert(!any_strict_float)
<< "strict float intrinsics not yet supported in d3d12compute backend";

// wipe the internal kernel source
src_stream.str("");
src_stream.clear();
Expand Down
14 changes: 14 additions & 0 deletions src/CodeGen_GPU_Dev.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -245,6 +245,20 @@ void CodeGen_GPU_C::visit(const Call *op) {
equiv.accept(this);
}
}
} else if (op->is_intrinsic(Call::strict_fma)) {
// All shader languages have fma
Expr equiv = Call::make(op->type, "fma", op->args, Call::PureExtern);
equiv.accept(this);
} else {
CodeGen_C::visit(op);
}
}

void CodeGen_GPU_C::visit(const Mod *op) {
if (op->type.is_float()) {
// All shader languages have fmod
Expr equiv = Call::make(op->type, "fmod", {op->a, op->b}, Call::PureExtern);
equiv.accept(this);
} else {
CodeGen_C::visit(op);
}
Expand Down
10 changes: 10 additions & 0 deletions src/CodeGen_GPU_Dev.h
Original file line number Diff line number Diff line change
Expand Up @@ -77,6 +77,15 @@ struct CodeGen_GPU_Dev {
Device = 1, // Device/global memory fence
Shared = 2 // Threadgroup/shared memory fence
};

/** Some GPU APIs need to know what floating point mode we're in at kernel
* emission time, to emit appropriate pragmas. */
bool any_strict_float = false;

public:
// Records whether any strict_float operations are present in the code being
// compiled, so the backend can emit the appropriate fp-math pragmas/flags at
// kernel emission time.
void set_any_strict_float(bool any_strict_float) {
    this->any_strict_float = any_strict_float;
}
};

/** A base class for GPU backends that require C-like shader output.
Expand All @@ -99,6 +108,7 @@ class CodeGen_GPU_C : public CodeGen_C {
using CodeGen_C::visit;
void visit(const Shuffle *op) override;
void visit(const Call *op) override;
void visit(const Mod *op) override;

std::string print_extern_call(const Call *op) override;

Expand Down
68 changes: 49 additions & 19 deletions src/CodeGen_LLVM.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -3306,28 +3306,52 @@ void CodeGen_LLVM::visit(const Call *op) {
// Evaluate the args first outside the strict scope, as they may use
// non-strict operations.
std::vector<Expr> new_args(op->args.size());
std::vector<std::string> to_pop;
for (size_t i = 0; i < op->args.size(); i++) {
const Expr &arg = op->args[i];
if (arg.as<Variable>() || is_const(arg)) {
new_args[i] = arg;
} else {
std::string name = unique_name('t');
sym_push(name, codegen(arg));
to_pop.push_back(name);
new_args[i] = Variable::make(arg.type(), name);
}
}

Expr call = Call::make(op->type, op->name, new_args, op->call_type);
{
ScopedValue<bool> old_in_strict_float(in_strict_float, true);
value = codegen(unstrictify_float(call.as<Call>()));
if (op->is_intrinsic(Call::strict_fma)) {
if (op->type.is_float() && op->type.bits() <= 16 &&
upgrade_type_for_arithmetic(op->type) != op->type) {
// For (b)float16 and below, doing the fma as a
// double-precision fma is exact and is what llvm does. A
// double has enough bits of precision such that the add in
// the fma has no rounding error in the cases where the fma
// is going to return a finite float16. We do this
// legalization manually so that we can use our custom
// vectorizable float16 casts instead of letting llvm call
// library functions.
Type wide_t = Float(64, op->type.lanes());
for (Expr &e : new_args) {
e = cast(wide_t, e);
}
Expr equiv = Call::make(wide_t, op->name, new_args, op->call_type);
equiv = cast(op->type, equiv);
value = codegen(equiv);
} else {
std::string name = "llvm.fma" + mangle_llvm_type(llvm_type_of(op->type));
value = call_intrin(op->type, op->type.lanes(), name, new_args);
}
} else {
// Lower to something other than a call node
Expr call = Call::make(op->type, op->name, new_args, op->call_type);
value = codegen(unstrictify_float(call.as<Call>()));
}
}

for (size_t i = 0; i < op->args.size(); i++) {
const Expr &arg = op->args[i];
if (!arg.as<Variable>() && !is_const(arg)) {
sym_pop(new_args[i].as<Variable>()->name);
}
for (const auto &s : to_pop) {
sym_pop(s);
}

} else if (is_float16_transcendental(op) && !supports_call_as_float16(op)) {
Expand Down Expand Up @@ -4739,23 +4763,29 @@ Value *CodeGen_LLVM::call_intrin(const Type &result_type, int intrin_lanes,
Value *CodeGen_LLVM::call_intrin(const llvm::Type *result_type, int intrin_lanes,
const string &name, vector<Value *> arg_values,
bool scalable_vector_result, bool is_reduction) {
auto fix_vector_lanes_of_type = [&](const llvm::Type *t) {
if (intrin_lanes == 1 || is_reduction) {
return t->getScalarType();
} else {
if (scalable_vector_result && effective_vscale != 0) {
return get_vector_type(result_type->getScalarType(),
intrin_lanes / effective_vscale, VectorTypeConstraint::VScale);
} else {
return get_vector_type(result_type->getScalarType(),
intrin_lanes, VectorTypeConstraint::Fixed);
}
}
};

llvm::Function *fn = module->getFunction(name);
if (!fn) {
vector<llvm::Type *> arg_types(arg_values.size());
for (size_t i = 0; i < arg_values.size(); i++) {
arg_types[i] = arg_values[i]->getType();
llvm::Type *t = arg_values[i]->getType();
arg_types[i] = fix_vector_lanes_of_type(t);
}

llvm::Type *intrinsic_result_type = result_type->getScalarType();
if (intrin_lanes > 1 && !is_reduction) {
if (scalable_vector_result && effective_vscale != 0) {
intrinsic_result_type = get_vector_type(result_type->getScalarType(),
intrin_lanes / effective_vscale, VectorTypeConstraint::VScale);
} else {
intrinsic_result_type = get_vector_type(result_type->getScalarType(),
intrin_lanes, VectorTypeConstraint::Fixed);
}
}
llvm::Type *intrinsic_result_type = fix_vector_lanes_of_type(result_type);
FunctionType *func_t = FunctionType::get(intrinsic_result_type, arg_types, false);
fn = llvm::Function::Create(func_t, llvm::Function::ExternalLinkage, name, module.get());
fn->setCallingConv(CallingConv::C);
Expand Down Expand Up @@ -4790,7 +4820,7 @@ Value *CodeGen_LLVM::call_intrin(const llvm::Type *result_type, int intrin_lanes
if (arg_i_lanes >= arg_lanes) {
// Horizontally reducing intrinsics may have
// arguments that have more lanes than the
// result. Assume that the horizontally reduce
// result. Assume that they horizontally reduce
// neighboring elements...
int reduce = arg_i_lanes / arg_lanes;
args.push_back(slice_vector(arg_values[i], start * reduce, intrin_lanes * reduce));
Expand Down
1 change: 1 addition & 0 deletions src/CodeGen_Metal_Dev.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -834,6 +834,7 @@ void CodeGen_Metal_Dev::init_module() {

// Write out the Halide math functions.
src_stream << "#pragma clang diagnostic ignored \"-Wunused-function\"\n"
<< "#pragma METAL fp math_mode(" << (any_strict_float ? "safe)\n" : "fast)\n")
<< "#include <metal_stdlib>\n"
<< "using namespace metal;\n" // Seems like the right way to go.
<< "namespace {\n"
Expand Down
2 changes: 1 addition & 1 deletion src/CodeGen_OpenCL_Dev.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1123,7 +1123,7 @@ void CodeGen_OpenCL_Dev::init_module() {
// This identifies the program as OpenCL C (as opposed to SPIR).
src_stream << "/*OpenCL C " << target.to_string() << "*/\n";

src_stream << "#pragma OPENCL FP_CONTRACT ON\n";
src_stream << "#pragma OPENCL FP_CONTRACT " << (any_strict_float ? "OFF\n" : "ON\n");

// Write out the Halide math functions.
src_stream << "inline float float_from_bits(unsigned int x) {return as_float(x);}\n"
Expand Down
25 changes: 20 additions & 5 deletions src/CodeGen_PTX_Dev.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -220,6 +220,12 @@ void CodeGen_PTX_Dev::add_kernel(Stmt stmt,
}

void CodeGen_PTX_Dev::init_module() {
// This class uses multiple inheritance. It's a GPU device code generator,
// and also an llvm-based one. Both of these track strict_float presence,
// but OffloadGPULoops only sets the GPU device code generator flag, so here
// we set the CodeGen_LLVM flag to match.
CodeGen_LLVM::any_strict_float = CodeGen_GPU_Dev::any_strict_float;

init_context();

module = get_initial_module_for_ptx_device(target, context);
Expand Down Expand Up @@ -249,6 +255,15 @@ void CodeGen_PTX_Dev::init_module() {
function_does_not_access_memory(fn);
fn->addFnAttr(llvm::Attribute::NoUnwind);
}

if (CodeGen_GPU_Dev::any_strict_float) {
debug(0) << "Setting strict fp math\n";
set_strict_fp_math();
in_strict_float = target.has_feature(Target::StrictFloat);
} else {
debug(0) << "Setting fast fp math\n";
set_fast_fp_math();
}
}

void CodeGen_PTX_Dev::visit(const Call *op) {
Expand Down Expand Up @@ -611,13 +626,13 @@ vector<char> CodeGen_PTX_Dev::compile_to_src() {
internal_assert(llvm_target) << "Could not create LLVM target for " << triple.str() << "\n";

TargetOptions options;
options.AllowFPOpFusion = FPOpFusion::Fast;
options.AllowFPOpFusion = CodeGen_GPU_Dev::any_strict_float ? llvm::FPOpFusion::Strict : llvm::FPOpFusion::Fast;
#if LLVM_VERSION < 210
options.UnsafeFPMath = true;
options.UnsafeFPMath = !CodeGen_GPU_Dev::any_strict_float;
#endif
options.NoInfsFPMath = true;
options.NoNaNsFPMath = true;
options.HonorSignDependentRoundingFPMathOption = false;
options.NoInfsFPMath = !CodeGen_GPU_Dev::any_strict_float;
options.NoNaNsFPMath = !CodeGen_GPU_Dev::any_strict_float;
options.HonorSignDependentRoundingFPMathOption = !CodeGen_GPU_Dev::any_strict_float;
options.NoZerosInBSS = false;
options.GuaranteedTailCallOpt = false;

Expand Down
Loading
Loading