[libclc] Move fma to the CLC library (#126052)

This builtin is a little more involved than others as targets deal with fma in various different ways. Fundamentally, the CLC __clc_fma builtin compiles to __builtin_elementwise_fma, which compiles to the @llvm.fma intrinsic. However, in the case of fp32 fma some targets call the __clc_sw_fma function, which provides a software implementation of the builtin. This in principle is controlled by the __CLC_HAVE_HW_FMA32 macro and may be a runtime decision, depending on how the target defines that macro. All targets build the CLC fma functions for all types. This is to the CLC library can have a reliable internal implementation for its own purposes. For AMD/NVPTX targets there are no meaningful changes to the generated LLVM bytecode. Some blocks of code have moved around, which confounds llvm-diff. For the clspv and SPIR-V/Mesa targets, only fp32 fma is of interest. Its use in libclc is tightly controlled by checking __CLC_HAVE_HW_FMA32 first. This can either be a compile-time constant (1, for clspv) or a runtime function for SPIR-V/Mesa. The SPIR-V/Mesa target only provided fp32 fma in the OpenCL layer. It unconditionally mapped that to the __clc_sw_fma builtin, even though the generic version in theory had a runtime toggle through __CLC_HAVE_HW_FMA32 specifically for that target. Callers of fma, though, would end up using the ExtInst fma, *not* calling the _Z3fmafff function provided by libclc. This commit keeps this system in place in the OpenCL layer, by mapping fma to __clc_sw_fma. Where other builtins would previously call fma (i.e., result in the ExtInst), they now call __clc_fma. This function checks the __CLC_HAVE_HW_FMA32 runtime toggle, which selects between the slow version or the quick version. The quick version is the LLVM fma intrinsic which llvm-spirv translates to the ExtInst. The clspv target had its own software implementation of fp32 fma, which it called unconditionally - even though __CLC_HAVE_HW_FMA32 is 1 for that target. This is potentially just so its library ships a software version which it can fall back on. In the OpenCL layer, the target doesn't provide fp64 fma, and maps fp16 fma to fp32 mad. This commit keeps this system roughly in place: in the OpenCL layer it maps fp32 fma to __clc_sw_fma, and fp16 fma to mad. Where builtins would previously call into fma, they now call __clc_fma, which compiles to the LLVM intrinsic. If this goes through a translation to SPIR-V it will become the fma ExtInst, or the intrinsic could be replaced by the _Z3fmafff software implementation. The clspv and SPIR-V/Mesa targets could potentially be cleaned up later, depending on their needs.
2025-04-16 22:16:34 +00:00 · 2025-02-24 10:10:51 +00:00 · 2025-02-24 10:10:51 +00:00 · e7ad07ffb8
commit e7ad07ffb8
parent 3532651b6f
29 changed files with 538 additions and 443 deletions
--- a/libclc/CMakeLists.txt
+++ b/libclc/CMakeLists.txt
@ -28,6 +28,8 @@ set_property(DIRECTORY APPEND PROPERTY CMAKE_CONFIGURE_DEPENDS
  spirv/lib/SOURCES;
  # CLC internal libraries
  clc/lib/generic/SOURCES;
+  clc/lib/clspv/SOURCES;
+  clc/lib/spirv/SOURCES;
 )

 set( LIBCLC_MIN_LLVM 3.9.0 )
--- a/libclc/clc/include/clc/internal/math/clc_sw_fma.h
+++ b/libclc/clc/include/clc/internal/math/clc_sw_fma.h
@ -1,11 +1,12 @@
-#define __CLC_FUNCTION __clc_fma
-#define __CLC_INTRINSIC "llvm.fma"
-#include "math/ternary_intrin.inc"
+#ifndef __CLC_INTERNAL_MATH_CLC_SW_FMA_H__
+#define __CLC_INTERNAL_MATH_CLC_SW_FMA_H__

-#define __FLOAT_ONLY
 #define __CLC_FUNCTION __clc_sw_fma
 #define __CLC_BODY <clc/shared/ternary_decl.inc>
+
 #include <clc/math/gentype.inc>
+
 #undef __CLC_BODY
 #undef __CLC_FUNCTION
-#undef __FLOAT_ONLY
+
+#endif // __CLC_INTERNAL_MATH_CLC_SW_FMA_H__
--- a/libclc/clc/include/clc/math/clc_fma.h
+++ b/libclc/clc/include/clc/math/clc_fma.h
@ -0,0 +1,12 @@
+#ifndef __CLC_MATH_CLC_FMA_H__
+#define __CLC_MATH_CLC_FMA_H__
+
+#define __CLC_FUNCTION __clc_fma
+#define __CLC_BODY <clc/shared/ternary_decl.inc>
+
+#include <clc/math/gentype.inc>
+
+#undef __CLC_BODY
+#undef __CLC_FUNCTION
+
+#endif // __CLC_MATH_CLC_FMA_H__
--- a/libclc/clc/include/clc/math/math.h
+++ b/libclc/clc/include/clc/math/math.h
@ -39,12 +39,12 @@
 #define PINF 0x200

 #if (defined __AMDGCN__ || defined __R600__) && !defined __HAS_FMAF__
-#define HAVE_HW_FMA32() (0)
+#define __CLC_HAVE_HW_FMA32() (0)
 #elif defined(CLC_SPIRV)
 bool __attribute__((noinline)) __clc_runtime_has_hw_fma32(void);
-#define HAVE_HW_FMA32() __clc_runtime_has_hw_fma32()
+#define __CLC_HAVE_HW_FMA32() __clc_runtime_has_hw_fma32()
 #else
-#define HAVE_HW_FMA32() (1)
+#define __CLC_HAVE_HW_FMA32() (1)
 #endif

 #define HAVE_BITALIGN() (0)
--- a/libclc/clc/lib/clspv/SOURCES
+++ b/libclc/clc/lib/clspv/SOURCES
@ -0,0 +1 @@
+math/clc_sw_fma.cl
--- a/libclc/clc/lib/clspv/math/clc_sw_fma.cl
+++ b/libclc/clc/lib/clspv/math/clc_sw_fma.cl
@ -0,0 +1,287 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+// This version is derived from the generic fma software implementation
+// (__clc_sw_fma), but avoids the use of ulong in favor of uint2. The logic has
+// been updated as appropriate.
+
+#include <clc/clc_as_type.h>
+#include <clc/clcmacro.h>
+#include <clc/float/definitions.h>
+#include <clc/integer/clc_abs.h>
+#include <clc/integer/clc_clz.h>
+#include <clc/integer/clc_hadd.h>
+#include <clc/integer/clc_mul_hi.h>
+#include <clc/integer/definitions.h>
+#include <clc/math/clc_mad.h>
+#include <clc/math/math.h>
+#include <clc/relational/clc_isinf.h>
+#include <clc/relational/clc_isnan.h>
+#include <clc/shared/clc_max.h>
+
+struct fp {
+  uint2 mantissa;
+  int exponent;
+  uint sign;
+};
+
+static uint2 u2_set(uint hi, uint lo) {
+  uint2 res;
+  res.lo = lo;
+  res.hi = hi;
+  return res;
+}
+
+static uint2 u2_set_u(uint val) { return u2_set(0, val); }
+
+static uint2 u2_mul(uint a, uint b) {
+  uint2 res;
+  res.hi = __clc_mul_hi(a, b);
+  res.lo = a * b;
+  return res;
+}
+
+static uint2 u2_sll(uint2 val, uint shift) {
+  if (shift == 0)
+    return val;
+  if (shift < 32) {
+    val.hi <<= shift;
+    val.hi |= val.lo >> (32 - shift);
+    val.lo <<= shift;
+  } else {
+    val.hi = val.lo << (shift - 32);
+    val.lo = 0;
+  }
+  return val;
+}
+
+static uint2 u2_srl(uint2 val, uint shift) {
+  if (shift == 0)
+    return val;
+  if (shift < 32) {
+    val.lo >>= shift;
+    val.lo |= val.hi << (32 - shift);
+    val.hi >>= shift;
+  } else {
+    val.lo = val.hi >> (shift - 32);
+    val.hi = 0;
+  }
+  return val;
+}
+
+static uint2 u2_or(uint2 a, uint b) {
+  a.lo |= b;
+  return a;
+}
+
+static uint2 u2_and(uint2 a, uint2 b) {
+  a.lo &= b.lo;
+  a.hi &= b.hi;
+  return a;
+}
+
+static uint2 u2_add(uint2 a, uint2 b) {
+  uint carry = (__clc_hadd(a.lo, b.lo) >> 31) & 0x1;
+  a.lo += b.lo;
+  a.hi += b.hi + carry;
+  return a;
+}
+
+static uint2 u2_add_u(uint2 a, uint b) { return u2_add(a, u2_set_u(b)); }
+
+static uint2 u2_inv(uint2 a) {
+  a.lo = ~a.lo;
+  a.hi = ~a.hi;
+  return u2_add_u(a, 1);
+}
+
+static uint u2_clz(uint2 a) {
+  uint leading_zeroes = __clc_clz(a.hi);
+  if (leading_zeroes == 32) {
+    leading_zeroes += __clc_clz(a.lo);
+  }
+  return leading_zeroes;
+}
+
+static bool u2_eq(uint2 a, uint2 b) { return a.lo == b.lo && a.hi == b.hi; }
+
+static bool u2_zero(uint2 a) { return u2_eq(a, u2_set_u(0)); }
+
+static bool u2_gt(uint2 a, uint2 b) {
+  return a.hi > b.hi || (a.hi == b.hi && a.lo > b.lo);
+}
+
+_CLC_DEF _CLC_OVERLOAD float __clc_sw_fma(float a, float b, float c) {
+  /* special cases */
+  if (__clc_isnan(a) || __clc_isnan(b) || __clc_isnan(c) || __clc_isinf(a) ||
+      __clc_isinf(b)) {
+    return __clc_mad(a, b, c);
+  }
+
+  /* If only c is inf, and both a,b are regular numbers, the result is c*/
+  if (__clc_isinf(c)) {
+    return c;
+  }
+
+  a = __clc_flush_denormal_if_not_supported(a);
+  b = __clc_flush_denormal_if_not_supported(b);
+  c = __clc_flush_denormal_if_not_supported(c);
+
+  if (a == 0.0f || b == 0.0f) {
+    return c;
+  }
+
+  if (c == 0) {
+    return a * b;
+  }
+
+  struct fp st_a, st_b, st_c;
+
+  st_a.exponent = a == .0f ? 0 : ((__clc_as_uint(a) & 0x7f800000) >> 23) - 127;
+  st_b.exponent = b == .0f ? 0 : ((__clc_as_uint(b) & 0x7f800000) >> 23) - 127;
+  st_c.exponent = c == .0f ? 0 : ((__clc_as_uint(c) & 0x7f800000) >> 23) - 127;
+
+  st_a.mantissa =
+      u2_set_u(a == .0f ? 0 : (__clc_as_uint(a) & 0x7fffff) | 0x800000);
+  st_b.mantissa =
+      u2_set_u(b == .0f ? 0 : (__clc_as_uint(b) & 0x7fffff) | 0x800000);
+  st_c.mantissa =
+      u2_set_u(c == .0f ? 0 : (__clc_as_uint(c) & 0x7fffff) | 0x800000);
+
+  st_a.sign = __clc_as_uint(a) & 0x80000000;
+  st_b.sign = __clc_as_uint(b) & 0x80000000;
+  st_c.sign = __clc_as_uint(c) & 0x80000000;
+
+  // Multiplication.
+  // Move the product to the highest bits to maximize precision
+  // mantissa is 24 bits => product is 48 bits, 2bits non-fraction.
+  // Add one bit for future addition overflow,
+  // add another bit to detect subtraction underflow
+  struct fp st_mul;
+  st_mul.sign = st_a.sign ^ st_b.sign;
+  st_mul.mantissa = u2_sll(u2_mul(st_a.mantissa.lo, st_b.mantissa.lo), 14);
+  st_mul.exponent =
+      !u2_zero(st_mul.mantissa) ? st_a.exponent + st_b.exponent : 0;
+
+  // FIXME: Detecting a == 0 || b == 0 above crashed GCN isel
+  if (st_mul.exponent == 0 && u2_zero(st_mul.mantissa))
+    return c;
+
+// Mantissa is 23 fractional bits, shift it the same way as product mantissa
+#define C_ADJUST 37ul
+
+  // both exponents are bias adjusted
+  int exp_diff = st_mul.exponent - st_c.exponent;
+
+  st_c.mantissa = u2_sll(st_c.mantissa, C_ADJUST);
+  uint2 cutoff_bits = u2_set_u(0);
+  uint2 cutoff_mask = u2_add(u2_sll(u2_set_u(1), __clc_abs(exp_diff)),
+                             u2_set(0xffffffff, 0xffffffff));
+  if (exp_diff > 0) {
+    cutoff_bits =
+        exp_diff >= 64 ? st_c.mantissa : u2_and(st_c.mantissa, cutoff_mask);
+    st_c.mantissa =
+        exp_diff >= 64 ? u2_set_u(0) : u2_srl(st_c.mantissa, exp_diff);
+  } else {
+    cutoff_bits = -exp_diff >= 64 ? st_mul.mantissa
+                                  : u2_and(st_mul.mantissa, cutoff_mask);
+    st_mul.mantissa =
+        -exp_diff >= 64 ? u2_set_u(0) : u2_srl(st_mul.mantissa, -exp_diff);
+  }
+
+  struct fp st_fma;
+  st_fma.sign = st_mul.sign;
+  st_fma.exponent = __clc_max(st_mul.exponent, st_c.exponent);
+  if (st_c.sign == st_mul.sign) {
+    st_fma.mantissa = u2_add(st_mul.mantissa, st_c.mantissa);
+  } else {
+    // cutoff bits borrow one
+    st_fma.mantissa =
+        u2_add(u2_add(st_mul.mantissa, u2_inv(st_c.mantissa)),
+               (!u2_zero(cutoff_bits) && (st_mul.exponent > st_c.exponent)
+                    ? u2_set(0xffffffff, 0xffffffff)
+                    : u2_set_u(0)));
+  }
+
+  // underflow: st_c.sign != st_mul.sign, and magnitude switches the sign
+  if (u2_gt(st_fma.mantissa, u2_set(0x7fffffff, 0xffffffff))) {
+    st_fma.mantissa = u2_inv(st_fma.mantissa);
+    st_fma.sign = st_mul.sign ^ 0x80000000;
+  }
+
+  // detect overflow/underflow
+  int overflow_bits = 3 - u2_clz(st_fma.mantissa);
+
+  // adjust exponent
+  st_fma.exponent += overflow_bits;
+
+  // handle underflow
+  if (overflow_bits < 0) {
+    st_fma.mantissa = u2_sll(st_fma.mantissa, -overflow_bits);
+    overflow_bits = 0;
+  }
+
+  // rounding
+  uint2 trunc_mask = u2_add(u2_sll(u2_set_u(1), C_ADJUST + overflow_bits),
+                            u2_set(0xffffffff, 0xffffffff));
+  uint2 trunc_bits =
+      u2_or(u2_and(st_fma.mantissa, trunc_mask), !u2_zero(cutoff_bits));
+  uint2 last_bit =
+      u2_and(st_fma.mantissa, u2_sll(u2_set_u(1), C_ADJUST + overflow_bits));
+  uint2 grs_bits = u2_sll(u2_set_u(4), C_ADJUST - 3 + overflow_bits);
+
+  // round to nearest even
+  if (u2_gt(trunc_bits, grs_bits) ||
+      (u2_eq(trunc_bits, grs_bits) && !u2_zero(last_bit))) {
+    st_fma.mantissa =
+        u2_add(st_fma.mantissa, u2_sll(u2_set_u(1), C_ADJUST + overflow_bits));
+  }
+
+  // Shift mantissa back to bit 23
+  st_fma.mantissa = u2_srl(st_fma.mantissa, C_ADJUST + overflow_bits);
+
+  // Detect rounding overflow
+  if (u2_gt(st_fma.mantissa, u2_set_u(0xffffff))) {
+    ++st_fma.exponent;
+    st_fma.mantissa = u2_srl(st_fma.mantissa, 1);
+  }
+
+  if (u2_zero(st_fma.mantissa)) {
+    return 0.0f;
+  }
+
+  // Flating point range limit
+  if (st_fma.exponent > 127) {
+    return __clc_as_float(__clc_as_uint(INFINITY) | st_fma.sign);
+  }
+
+  // Flush denormals
+  if (st_fma.exponent <= -127) {
+    return __clc_as_float(st_fma.sign);
+  }
+
+  return __clc_as_float(st_fma.sign | ((st_fma.exponent + 127) << 23) |
+                        ((uint)st_fma.mantissa.lo & 0x7fffff));
+}
+
+_CLC_TERNARY_VECTORIZE(_CLC_DEF _CLC_OVERLOAD, float, __clc_sw_fma, float,
+                       float, float)
--- a/libclc/clc/lib/generic/SOURCES
+++ b/libclc/clc/lib/generic/SOURCES
@ -20,12 +20,14 @@ integer/clc_upsample.cl
 math/clc_ceil.cl
 math/clc_copysign.cl
 math/clc_fabs.cl
+math/clc_fma.cl
 math/clc_floor.cl
 math/clc_frexp.cl
 math/clc_mad.cl
 math/clc_modf.cl
 math/clc_nextafter.cl
 math/clc_rint.cl
+math/clc_sw_fma.cl
 math/clc_trunc.cl
 relational/clc_all.cl
 relational/clc_any.cl
--- a/libclc/clc/lib/generic/math/clc_fma.cl
+++ b/libclc/clc/lib/generic/math/clc_fma.cl
@ -0,0 +1,6 @@
+#include <clc/internal/clc.h>
+#include <clc/internal/math/clc_sw_fma.h>
+#include <clc/math/math.h>
+
+#define __CLC_BODY <clc_fma.inc>
+#include <clc/math/gentype.inc>
--- a/libclc/clc/lib/generic/math/clc_fma.inc
+++ b/libclc/clc/lib/generic/math/clc_fma.inc
@ -0,0 +1,8 @@
+_CLC_DEF _CLC_OVERLOAD __CLC_GENTYPE __clc_fma(__CLC_GENTYPE a, __CLC_GENTYPE b,
+                                               __CLC_GENTYPE c) {
+#if __CLC_FPSIZE == 32
+  if (!__CLC_HAVE_HW_FMA32())
+    return __clc_sw_fma(a, b, c);
+#endif
+  return __builtin_elementwise_fma(a, b, c);
+}
--- a/libclc/clc/lib/generic/math/clc_sw_fma.cl
+++ b/libclc/clc/lib/generic/math/clc_sw_fma.cl
@ -20,11 +20,14 @@
 * THE SOFTWARE.
 */

-#include <clc/clc.h>
+#include <clc/clc_as_type.h>
 #include <clc/clcmacro.h>
+#include <clc/float/definitions.h>
 #include <clc/integer/clc_abs.h>
 #include <clc/integer/clc_clz.h>
-#include <clc/math/clc_subnormal_config.h>
+#include <clc/integer/definitions.h>
+#include <clc/internal/clc.h>
+#include <clc/math/clc_mad.h>
 #include <clc/math/math.h>
 #include <clc/relational/clc_isinf.h>
 #include <clc/relational/clc_isnan.h>
@ -39,33 +42,36 @@ struct fp {
 _CLC_DEF _CLC_OVERLOAD float __clc_sw_fma(float a, float b, float c) {
  /* special cases */
  if (__clc_isnan(a) || __clc_isnan(b) || __clc_isnan(c) || __clc_isinf(a) ||
-      __clc_isinf(b))
-    return mad(a, b, c);
+      __clc_isinf(b)) {
+    return __clc_mad(a, b, c);
+  }

  /* If only c is inf, and both a,b are regular numbers, the result is c*/
-  if (__clc_isinf(c))
+  if (__clc_isinf(c)) {
    return c;
+  }

  a = __clc_flush_denormal_if_not_supported(a);
  b = __clc_flush_denormal_if_not_supported(b);
  c = __clc_flush_denormal_if_not_supported(c);

-  if (c == 0)
+  if (c == 0) {
    return a * b;
+  }

  struct fp st_a, st_b, st_c;

-  st_a.exponent = a == .0f ? 0 : ((as_uint(a) & 0x7f800000) >> 23) - 127;
-  st_b.exponent = b == .0f ? 0 : ((as_uint(b) & 0x7f800000) >> 23) - 127;
-  st_c.exponent = c == .0f ? 0 : ((as_uint(c) & 0x7f800000) >> 23) - 127;
+  st_a.exponent = a == .0f ? 0 : ((__clc_as_uint(a) & 0x7f800000) >> 23) - 127;
+  st_b.exponent = b == .0f ? 0 : ((__clc_as_uint(b) & 0x7f800000) >> 23) - 127;
+  st_c.exponent = c == .0f ? 0 : ((__clc_as_uint(c) & 0x7f800000) >> 23) - 127;

-  st_a.mantissa = a == .0f ? 0 : (as_uint(a) & 0x7fffff) | 0x800000;
-  st_b.mantissa = b == .0f ? 0 : (as_uint(b) & 0x7fffff) | 0x800000;
-  st_c.mantissa = c == .0f ? 0 : (as_uint(c) & 0x7fffff) | 0x800000;
+  st_a.mantissa = a == .0f ? 0 : (__clc_as_uint(a) & 0x7fffff) | 0x800000;
+  st_b.mantissa = b == .0f ? 0 : (__clc_as_uint(b) & 0x7fffff) | 0x800000;
+  st_c.mantissa = c == .0f ? 0 : (__clc_as_uint(c) & 0x7fffff) | 0x800000;

-  st_a.sign = as_uint(a) & 0x80000000;
-  st_b.sign = as_uint(b) & 0x80000000;
-  st_c.sign = as_uint(c) & 0x80000000;
+  st_a.sign = __clc_as_uint(a) & 0x80000000;
+  st_b.sign = __clc_as_uint(b) & 0x80000000;
+  st_c.sign = __clc_as_uint(c) & 0x80000000;

  // Multiplication.
  // Move the product to the highest bits to maximize precision
@ -137,8 +143,9 @@ _CLC_DEF _CLC_OVERLOAD float __clc_sw_fma(float a, float b, float c) {
  ulong grs_bits = (0x4ul << (C_ADJUST - 3 + overflow_bits));

  // round to nearest even
-  if ((trunc_bits > grs_bits) || (trunc_bits == grs_bits && last_bit != 0))
+  if ((trunc_bits > grs_bits) || (trunc_bits == grs_bits && last_bit != 0)) {
    st_fma.mantissa += (1ul << (C_ADJUST + overflow_bits));
+  }

  // Shift mantissa back to bit 23
  st_fma.mantissa = (st_fma.mantissa >> (C_ADJUST + overflow_bits));
@ -149,19 +156,23 @@ _CLC_DEF _CLC_OVERLOAD float __clc_sw_fma(float a, float b, float c) {
    st_fma.mantissa >>= 1;
  }

-  if (st_fma.mantissa == 0)
+  if (st_fma.mantissa == 0) {
    return .0f;
+  }

  // Flating point range limit
-  if (st_fma.exponent > 127)
-    return as_float(as_uint(INFINITY) | st_fma.sign);
+  if (st_fma.exponent > 127) {
+    return __clc_as_float(__clc_as_uint(INFINITY) | st_fma.sign);
+  }

  // Flush denormals
-  if (st_fma.exponent <= -127)
-    return as_float(st_fma.sign);
+  if (st_fma.exponent <= -127) {
+    return __clc_as_float(st_fma.sign);
+  }

-  return as_float(st_fma.sign | ((st_fma.exponent + 127) << 23) |
-                  ((uint)st_fma.mantissa & 0x7fffff));
+  return __clc_as_float(st_fma.sign | ((st_fma.exponent + 127) << 23) |
+                        ((uint)st_fma.mantissa & 0x7fffff));
 }
+
 _CLC_TERNARY_VECTORIZE(_CLC_DEF _CLC_OVERLOAD, float, __clc_sw_fma, float,
                       float, float)
--- a/libclc/clc/lib/spirv/SOURCES
+++ b/libclc/clc/lib/spirv/SOURCES
@ -0,0 +1 @@
+math/clc_runtime_has_hw_fma32.cl
--- a/libclc/clc/lib/spirv/math/clc_runtime_has_hw_fma32.cl
+++ b/libclc/clc/lib/spirv/math/clc_runtime_has_hw_fma32.cl
@ -0,0 +1 @@
+bool __clc_runtime_has_hw_fma32() { return false; }
--- a/libclc/clspv/lib/math/fma.cl
+++ b/libclc/clspv/lib/math/fma.cl
@ -1,274 +1,8 @@
-/*
- * Copyright (c) 2014 Advanced Micro Devices, Inc.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to deal
- * in the Software without restriction, including without limitation the rights
- * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
- * copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
- * THE SOFTWARE.
- */
-
-// This version is derived from the generic fma software implementation
-// (__clc_sw_fma), but avoids the use of ulong in favor of uint2. The logic has
-// been updated as appropriate.
-
 #include <clc/clc.h>
 #include <clc/clcmacro.h>
-#include <clc/math/math.h>
+#include <clc/internal/math/clc_sw_fma.h>

-struct fp {
-  uint2 mantissa;
-  int exponent;
-  uint sign;
-};
-
-static uint2 u2_set(uint hi, uint lo) {
-  uint2 res;
-  res.lo = lo;
-  res.hi = hi;
-  return res;
-}
-
-static uint2 u2_set_u(uint val) { return u2_set(0, val); }
-
-static uint2 u2_mul(uint a, uint b) {
-  uint2 res;
-  res.hi = mul_hi(a, b);
-  res.lo = a * b;
-  return res;
-}
-
-static uint2 u2_sll(uint2 val, uint shift) {
-  if (shift == 0)
-    return val;
-  if (shift < 32) {
-    val.hi <<= shift;
-    val.hi |= val.lo >> (32 - shift);
-    val.lo <<= shift;
-  } else {
-    val.hi = val.lo << (shift - 32);
-    val.lo = 0;
-  }
-  return val;
-}
-
-static uint2 u2_srl(uint2 val, uint shift) {
-  if (shift == 0)
-    return val;
-  if (shift < 32) {
-    val.lo >>= shift;
-    val.lo |= val.hi << (32 - shift);
-    val.hi >>= shift;
-  } else {
-    val.lo = val.hi >> (shift - 32);
-    val.hi = 0;
-  }
-  return val;
-}
-
-static uint2 u2_or(uint2 a, uint b) {
-  a.lo |= b;
-  return a;
-}
-
-static uint2 u2_and(uint2 a, uint2 b) {
-  a.lo &= b.lo;
-  a.hi &= b.hi;
-  return a;
-}
-
-static uint2 u2_add(uint2 a, uint2 b) {
-  uint carry = (hadd(a.lo, b.lo) >> 31) & 0x1;
-  a.lo += b.lo;
-  a.hi += b.hi + carry;
-  return a;
-}
-
-static uint2 u2_add_u(uint2 a, uint b) { return u2_add(a, u2_set_u(b)); }
-
-static uint2 u2_inv(uint2 a) {
-  a.lo = ~a.lo;
-  a.hi = ~a.hi;
-  return u2_add_u(a, 1);
-}
-
-static uint u2_clz(uint2 a) {
-  uint leading_zeroes = clz(a.hi);
-  if (leading_zeroes == 32) {
-    leading_zeroes += clz(a.lo);
-  }
-  return leading_zeroes;
-}
-
-static bool u2_eq(uint2 a, uint2 b) { return a.lo == b.lo && a.hi == b.hi; }
-
-static bool u2_zero(uint2 a) { return u2_eq(a, u2_set_u(0)); }
-
-static bool u2_gt(uint2 a, uint2 b) {
-  return a.hi > b.hi || (a.hi == b.hi && a.lo > b.lo);
-}
-
-_CLC_DEF _CLC_OVERLOAD float fma(float a, float b, float c) {
-  /* special cases */
-  if (isnan(a) || isnan(b) || isnan(c) || isinf(a) || isinf(b)) {
-    return mad(a, b, c);
-  }
-
-  /* If only c is inf, and both a,b are regular numbers, the result is c*/
-  if (isinf(c)) {
-    return c;
-  }
-
-  a = __clc_flush_denormal_if_not_supported(a);
-  b = __clc_flush_denormal_if_not_supported(b);
-  c = __clc_flush_denormal_if_not_supported(c);
-
-  if (a == 0.0f || b == 0.0f) {
-    return c;
-  }
-
-  if (c == 0) {
-    return a * b;
-  }
-
-  struct fp st_a, st_b, st_c;
-
-  st_a.exponent = a == .0f ? 0 : ((as_uint(a) & 0x7f800000) >> 23) - 127;
-  st_b.exponent = b == .0f ? 0 : ((as_uint(b) & 0x7f800000) >> 23) - 127;
-  st_c.exponent = c == .0f ? 0 : ((as_uint(c) & 0x7f800000) >> 23) - 127;
-
-  st_a.mantissa = u2_set_u(a == .0f ? 0 : (as_uint(a) & 0x7fffff) | 0x800000);
-  st_b.mantissa = u2_set_u(b == .0f ? 0 : (as_uint(b) & 0x7fffff) | 0x800000);
-  st_c.mantissa = u2_set_u(c == .0f ? 0 : (as_uint(c) & 0x7fffff) | 0x800000);
-
-  st_a.sign = as_uint(a) & 0x80000000;
-  st_b.sign = as_uint(b) & 0x80000000;
-  st_c.sign = as_uint(c) & 0x80000000;
-
-  // Multiplication.
-  // Move the product to the highest bits to maximize precision
-  // mantissa is 24 bits => product is 48 bits, 2bits non-fraction.
-  // Add one bit for future addition overflow,
-  // add another bit to detect subtraction underflow
-  struct fp st_mul;
-  st_mul.sign = st_a.sign ^ st_b.sign;
-  st_mul.mantissa = u2_sll(u2_mul(st_a.mantissa.lo, st_b.mantissa.lo), 14);
-  st_mul.exponent =
-      !u2_zero(st_mul.mantissa) ? st_a.exponent + st_b.exponent : 0;
-
-  // FIXME: Detecting a == 0 || b == 0 above crashed GCN isel
-  if (st_mul.exponent == 0 && u2_zero(st_mul.mantissa))
-    return c;
-
-// Mantissa is 23 fractional bits, shift it the same way as product mantissa
-#define C_ADJUST 37ul
-
-  // both exponents are bias adjusted
-  int exp_diff = st_mul.exponent - st_c.exponent;
-
-  st_c.mantissa = u2_sll(st_c.mantissa, C_ADJUST);
-  uint2 cutoff_bits = u2_set_u(0);
-  uint2 cutoff_mask = u2_add(u2_sll(u2_set_u(1), abs(exp_diff)),
-                             u2_set(0xffffffff, 0xffffffff));
-  if (exp_diff > 0) {
-    cutoff_bits =
-        exp_diff >= 64 ? st_c.mantissa : u2_and(st_c.mantissa, cutoff_mask);
-    st_c.mantissa =
-        exp_diff >= 64 ? u2_set_u(0) : u2_srl(st_c.mantissa, exp_diff);
-  } else {
-    cutoff_bits = -exp_diff >= 64 ? st_mul.mantissa
-                                  : u2_and(st_mul.mantissa, cutoff_mask);
-    st_mul.mantissa =
-        -exp_diff >= 64 ? u2_set_u(0) : u2_srl(st_mul.mantissa, -exp_diff);
-  }
-
-  struct fp st_fma;
-  st_fma.sign = st_mul.sign;
-  st_fma.exponent = max(st_mul.exponent, st_c.exponent);
-  if (st_c.sign == st_mul.sign) {
-    st_fma.mantissa = u2_add(st_mul.mantissa, st_c.mantissa);
-  } else {
-    // cutoff bits borrow one
-    st_fma.mantissa =
-        u2_add(u2_add(st_mul.mantissa, u2_inv(st_c.mantissa)),
-               (!u2_zero(cutoff_bits) && (st_mul.exponent > st_c.exponent)
-                    ? u2_set(0xffffffff, 0xffffffff)
-                    : u2_set_u(0)));
-  }
-
-  // underflow: st_c.sign != st_mul.sign, and magnitude switches the sign
-  if (u2_gt(st_fma.mantissa, u2_set(0x7fffffff, 0xffffffff))) {
-    st_fma.mantissa = u2_inv(st_fma.mantissa);
-    st_fma.sign = st_mul.sign ^ 0x80000000;
-  }
-
-  // detect overflow/underflow
-  int overflow_bits = 3 - u2_clz(st_fma.mantissa);
-
-  // adjust exponent
-  st_fma.exponent += overflow_bits;
-
-  // handle underflow
-  if (overflow_bits < 0) {
-    st_fma.mantissa = u2_sll(st_fma.mantissa, -overflow_bits);
-    overflow_bits = 0;
-  }
-
-  // rounding
-  uint2 trunc_mask = u2_add(u2_sll(u2_set_u(1), C_ADJUST + overflow_bits),
-                            u2_set(0xffffffff, 0xffffffff));
-  uint2 trunc_bits =
-      u2_or(u2_and(st_fma.mantissa, trunc_mask), !u2_zero(cutoff_bits));
-  uint2 last_bit =
-      u2_and(st_fma.mantissa, u2_sll(u2_set_u(1), C_ADJUST + overflow_bits));
-  uint2 grs_bits = u2_sll(u2_set_u(4), C_ADJUST - 3 + overflow_bits);
-
-  // round to nearest even
-  if (u2_gt(trunc_bits, grs_bits) ||
-      (u2_eq(trunc_bits, grs_bits) && !u2_zero(last_bit))) {
-    st_fma.mantissa =
-        u2_add(st_fma.mantissa, u2_sll(u2_set_u(1), C_ADJUST + overflow_bits));
-  }
-
-  // Shift mantissa back to bit 23
-  st_fma.mantissa = u2_srl(st_fma.mantissa, C_ADJUST + overflow_bits);
-
-  // Detect rounding overflow
-  if (u2_gt(st_fma.mantissa, u2_set_u(0xffffff))) {
-    ++st_fma.exponent;
-    st_fma.mantissa = u2_srl(st_fma.mantissa, 1);
-  }
-
-  if (u2_zero(st_fma.mantissa)) {
-    return 0.0f;
-  }
-
-  // Flating point range limit
-  if (st_fma.exponent > 127) {
-    return as_float(as_uint(INFINITY) | st_fma.sign);
-  }
-
-  // Flush denormals
-  if (st_fma.exponent <= -127) {
-    return as_float(st_fma.sign);
-  }
-
-  return as_float(st_fma.sign | ((st_fma.exponent + 127) << 23) |
-                  ((uint)st_fma.mantissa.lo & 0x7fffff));
-}
-_CLC_TERNARY_VECTORIZE(_CLC_DEF _CLC_OVERLOAD, float, fma, float, float, float)
+_CLC_DEFINE_TERNARY_BUILTIN(float, fma, __clc_sw_fma, float, float, float)

 #ifdef cl_khr_fp16

--- a/libclc/generic/lib/SOURCES
+++ b/libclc/generic/lib/SOURCES
@ -106,7 +106,6 @@ math/exp10.cl
 math/fabs.cl
 math/fdim.cl
 math/floor.cl
-math/clc_fma.cl
 math/fma.cl
 math/fmax.cl
 math/fmin.cl
--- a/libclc/generic/lib/math/clc_exp10.cl
+++ b/libclc/generic/lib/math/clc_exp10.cl
@ -23,6 +23,7 @@
 #include <clc/clc.h>
 #include <clc/clc_convert.h>
 #include <clc/clcmacro.h>
+#include <clc/math/clc_fma.h>
 #include <clc/math/clc_mad.h>
 #include <clc/math/clc_subnormal_config.h>
 #include <clc/math/math.h>
@ -123,23 +124,25 @@ _CLC_DEF _CLC_OVERLOAD double __clc_exp10(double x) {
  int j = n & 0x3f;
  int m = n >> 6;

-  double r =
-      R_LN10 * fma(-R_LOG10_2_BY_64_TL, dn, fma(-R_LOG10_2_BY_64_LD, dn, x));
+  double r = R_LN10 * __clc_fma(-R_LOG10_2_BY_64_TL, dn,
+                                __clc_fma(-R_LOG10_2_BY_64_LD, dn, x));

  // 6 term tail of Taylor expansion of e^r
  double z2 =
-      r *
-      fma(r,
-          fma(r,
-              fma(r,
-                  fma(r, fma(r, 0x1.6c16c16c16c17p-10, 0x1.1111111111111p-7),
-                      0x1.5555555555555p-5),
-                  0x1.5555555555555p-3),
-              0x1.0000000000000p-1),
-          1.0);
+      r * __clc_fma(
+              r,
+              __clc_fma(r,
+                        __clc_fma(r,
+                                  __clc_fma(r,
+                                            __clc_fma(r, 0x1.6c16c16c16c17p-10,
+                                                      0x1.1111111111111p-7),
+                                            0x1.5555555555555p-5),
+                                  0x1.5555555555555p-3),
+                        0x1.0000000000000p-1),
+              1.0);

  double2 tv = USE_TABLE(two_to_jby64_ep_tbl, j);
-  z2 = fma(tv.s0 + tv.s1, z2, tv.s1) + tv.s0;
+  z2 = __clc_fma(tv.s0 + tv.s1, z2, tv.s1) + tv.s0;

  int small_value = (m < -1022) || ((m == -1022) && (z2 < 1.0));

--- a/libclc/generic/lib/math/clc_fmod.cl
+++ b/libclc/generic/lib/math/clc_fmod.cl
@ -25,6 +25,7 @@
 #include <clc/clcmacro.h>
 #include <clc/integer/clc_clz.h>
 #include <clc/math/clc_floor.h>
+#include <clc/math/clc_fma.h>
 #include <clc/math/clc_subnormal_config.h>
 #include <clc/math/clc_trunc.h>
 #include <clc/math/math.h>
@ -124,7 +125,7 @@ _CLC_DEF _CLC_OVERLOAD double __clc_fmod(double x, double y) {

    // Compute w * t in quad precision
    p = w * t;
-    pp = fma(w, t, -p);
+    pp = __clc_fma(w, t, -p);

    // Subtract w * t from dx
    v = dx - p;
@ -144,7 +145,7 @@ _CLC_DEF _CLC_OVERLOAD double __clc_fmod(double x, double y) {
  int todd = lt & 1;

  p = w * t;
-  pp = fma(w, t, -p);
+  pp = __clc_fma(w, t, -p);
  v = dx - p;
  dx = v + (((dx - v) - p) - pp);
  i = dx < 0.0;
--- a/libclc/generic/lib/math/clc_hypot.cl
+++ b/libclc/generic/lib/math/clc_hypot.cl
@ -23,6 +23,7 @@
 #include <clc/clc.h>
 #include <clc/clcmacro.h>
 #include <clc/integer/clc_abs.h>
+#include <clc/math/clc_fma.h>
 #include <clc/math/clc_mad.h>
 #include <clc/math/clc_subnormal_config.h>
 #include <clc/math/math.h>
@ -80,7 +81,7 @@ _CLC_DEF _CLC_OVERLOAD double __clc_hypot(double x, double y) {
  double ay = y * preadjust;

  // The post adjust may overflow, but this can't be avoided in any case
-  double r = sqrt(fma(ax, ax, ay * ay)) * postadjust;
+  double r = sqrt(__clc_fma(ax, ax, ay * ay)) * postadjust;

  // If the difference in exponents between x and y is large
  double s = x + y;
--- a/libclc/generic/lib/math/clc_pow.cl
+++ b/libclc/generic/lib/math/clc_pow.cl
@ -24,6 +24,7 @@
 #include <clc/clc_convert.h>
 #include <clc/clcmacro.h>
 #include <clc/math/clc_fabs.h>
+#include <clc/math/clc_fma.h>
 #include <clc/math/clc_mad.h>
 #include <clc/math/clc_subnormal_config.h>
 #include <clc/math/math.h>
@ -283,26 +284,29 @@ _CLC_DEF _CLC_OVERLOAD double __clc_pow(double x, double y) {
    double log_t = tv.s1;
    double f_inv = (log_h + log_t) * f;
    double r1 = __clc_as_double(__clc_as_long(f_inv) & 0xfffffffff8000000L);
-    double r2 = fma(-F, r1, f) * (log_h + log_t);
+    double r2 = __clc_fma(-F, r1, f) * (log_h + log_t);
    double r = r1 + r2;

-    double poly = fma(
-        r, fma(r, fma(r, fma(r, 1.0 / 7.0, 1.0 / 6.0), 1.0 / 5.0), 1.0 / 4.0),
+    double poly = __clc_fma(
+        r,
+        __clc_fma(r,
+                  __clc_fma(r, __clc_fma(r, 1.0 / 7.0, 1.0 / 6.0), 1.0 / 5.0),
+                  1.0 / 4.0),
        1.0 / 3.0);
    poly = poly * r * r * r;

    double hr1r1 = 0.5 * r1 * r1;
    double poly0h = r1 + hr1r1;
    double poly0t = r1 - poly0h + hr1r1;
-    poly = fma(r1, r2, fma(0.5 * r2, r2, poly)) + r2 + poly0t;
+    poly = __clc_fma(r1, r2, __clc_fma(0.5 * r2, r2, poly)) + r2 + poly0t;

    tv = USE_TABLE(powlog_tbl, index);
    log_h = tv.s0;
    log_t = tv.s1;

-    double resT_t = fma(xexp, real_log2_tail, +log_t) - poly;
+    double resT_t = __clc_fma(xexp, real_log2_tail, +log_t) - poly;
    double resT = resT_t - poly0h;
-    double resH = fma(xexp, real_log2_lead, log_h);
+    double resH = __clc_fma(xexp, real_log2_lead, log_h);
    double resT_h = poly0h;

    double H = resT + resH;
@ -313,9 +317,9 @@ _CLC_DEF _CLC_OVERLOAD double __clc_pow(double x, double y) {
    double y_head = __clc_as_double(uy & 0xfffffffff8000000L);
    double y_tail = y - y_head;

-    double temp = fma(y_tail, H, fma(y_head, T, y_tail * T));
-    v = fma(y_head, H, temp);
-    vt = fma(y_head, H, -v) + temp;
+    double temp = __clc_fma(y_tail, H, __clc_fma(y_head, T, y_tail * T));
+    v = __clc_fma(y_head, H, temp);
+    vt = __clc_fma(y_head, H, -v) + temp;
  }

  // Now calculate exp of (v,vt)
@ -339,21 +343,22 @@ _CLC_DEF _CLC_OVERLOAD double __clc_pow(double x, double y) {
    double f2 = tv.s1;
    double f = f1 + f2;

-    double r1 = fma(dn, -lnof2_by_64_head, v);
+    double r1 = __clc_fma(dn, -lnof2_by_64_head, v);
    double r2 = dn * lnof2_by_64_tail;
    double r = (r1 + r2) + vt;

-    double q = fma(
-        r,
-        fma(r,
-            fma(r,
-                fma(r, 1.38889490863777199667e-03, 8.33336798434219616221e-03),
-                4.16666666662260795726e-02),
-            1.66666666665260878863e-01),
-        5.00000000000000008883e-01);
-    q = fma(r * r, q, r);
+    double q =
+        __clc_fma(r,
+                  __clc_fma(r,
+                            __clc_fma(r,
+                                      __clc_fma(r, 1.38889490863777199667e-03,
+                                                8.33336798434219616221e-03),
+                                      4.16666666662260795726e-02),
+                            1.66666666665260878863e-01),
+                  5.00000000000000008883e-01);
+    q = __clc_fma(r * r, q, r);

-    expv = fma(f, q, f2) + f1;
+    expv = __clc_fma(f, q, f2) + f1;
    expv = ldexp(expv, m);

    expv = v > max_exp_arg ? __clc_as_double(0x7FF0000000000000L) : expv;
--- a/libclc/generic/lib/math/clc_pown.cl
+++ b/libclc/generic/lib/math/clc_pown.cl
@ -24,6 +24,7 @@
 #include <clc/clc_convert.h>
 #include <clc/clcmacro.h>
 #include <clc/math/clc_fabs.h>
+#include <clc/math/clc_fma.h>
 #include <clc/math/clc_mad.h>
 #include <clc/math/clc_subnormal_config.h>
 #include <clc/math/math.h>
@ -267,26 +268,29 @@ _CLC_DEF _CLC_OVERLOAD double __clc_pown(double x, int ny) {
    double log_t = tv.s1;
    double f_inv = (log_h + log_t) * f;
    double r1 = __clc_as_double(__clc_as_long(f_inv) & 0xfffffffff8000000L);
-    double r2 = fma(-F, r1, f) * (log_h + log_t);
+    double r2 = __clc_fma(-F, r1, f) * (log_h + log_t);
    double r = r1 + r2;

-    double poly = fma(
-        r, fma(r, fma(r, fma(r, 1.0 / 7.0, 1.0 / 6.0), 1.0 / 5.0), 1.0 / 4.0),
+    double poly = __clc_fma(
+        r,
+        __clc_fma(r,
+                  __clc_fma(r, __clc_fma(r, 1.0 / 7.0, 1.0 / 6.0), 1.0 / 5.0),
+                  1.0 / 4.0),
        1.0 / 3.0);
    poly = poly * r * r * r;

    double hr1r1 = 0.5 * r1 * r1;
    double poly0h = r1 + hr1r1;
    double poly0t = r1 - poly0h + hr1r1;
-    poly = fma(r1, r2, fma(0.5 * r2, r2, poly)) + r2 + poly0t;
+    poly = __clc_fma(r1, r2, __clc_fma(0.5 * r2, r2, poly)) + r2 + poly0t;

    tv = USE_TABLE(powlog_tbl, index);
    log_h = tv.s0;
    log_t = tv.s1;

-    double resT_t = fma(xexp, real_log2_tail, +log_t) - poly;
+    double resT_t = __clc_fma(xexp, real_log2_tail, +log_t) - poly;
    double resT = resT_t - poly0h;
-    double resH = fma(xexp, real_log2_lead, log_h);
+    double resH = __clc_fma(xexp, real_log2_lead, log_h);
    double resT_h = poly0h;

    double H = resT + resH;
@ -303,9 +307,9 @@ _CLC_DEF _CLC_OVERLOAD double __clc_pown(double x, int ny) {
    double y_tail1 = (double)nyt;
    y_tail = mask_2_24 ? y_tail1 : y_tail;

-    double temp = fma(y_tail, H, fma(y_head, T, y_tail * T));
-    v = fma(y_head, H, temp);
-    vt = fma(y_head, H, -v) + temp;
+    double temp = __clc_fma(y_tail, H, __clc_fma(y_head, T, y_tail * T));
+    v = __clc_fma(y_head, H, temp);
+    vt = __clc_fma(y_head, H, -v) + temp;
  }

  // Now calculate exp of (v,vt)
@ -329,21 +333,22 @@ _CLC_DEF _CLC_OVERLOAD double __clc_pown(double x, int ny) {
    double f2 = tv.s1;
    double f = f1 + f2;

-    double r1 = fma(dn, -lnof2_by_64_head, v);
+    double r1 = __clc_fma(dn, -lnof2_by_64_head, v);
    double r2 = dn * lnof2_by_64_tail;
    double r = (r1 + r2) + vt;

-    double q = fma(
-        r,
-        fma(r,
-            fma(r,
-                fma(r, 1.38889490863777199667e-03, 8.33336798434219616221e-03),
-                4.16666666662260795726e-02),
-            1.66666666665260878863e-01),
-        5.00000000000000008883e-01);
-    q = fma(r * r, q, r);
+    double q =
+        __clc_fma(r,
+                  __clc_fma(r,
+                            __clc_fma(r,
+                                      __clc_fma(r, 1.38889490863777199667e-03,
+                                                8.33336798434219616221e-03),
+                                      4.16666666662260795726e-02),
+                            1.66666666665260878863e-01),
+                  5.00000000000000008883e-01);
+    q = __clc_fma(r * r, q, r);

-    expv = fma(f, q, f2) + f1;
+    expv = __clc_fma(f, q, f2) + f1;
    expv = ldexp(expv, m);

    expv = v > max_exp_arg ? __clc_as_double(0x7FF0000000000000L) : expv;
--- a/libclc/generic/lib/math/clc_powr.cl
+++ b/libclc/generic/lib/math/clc_powr.cl
@ -24,6 +24,7 @@
 #include <clc/clc_convert.h>
 #include <clc/clcmacro.h>
 #include <clc/math/clc_fabs.h>
+#include <clc/math/clc_fma.h>
 #include <clc/math/clc_mad.h>
 #include <clc/math/clc_subnormal_config.h>
 #include <clc/math/math.h>
@ -270,26 +271,29 @@ _CLC_DEF _CLC_OVERLOAD double __clc_powr(double x, double y) {
    double log_t = tv.s1;
    double f_inv = (log_h + log_t) * f;
    double r1 = __clc_as_double(__clc_as_long(f_inv) & 0xfffffffff8000000L);
-    double r2 = fma(-F, r1, f) * (log_h + log_t);
+    double r2 = __clc_fma(-F, r1, f) * (log_h + log_t);
    double r = r1 + r2;

-    double poly = fma(
-        r, fma(r, fma(r, fma(r, 1.0 / 7.0, 1.0 / 6.0), 1.0 / 5.0), 1.0 / 4.0),
+    double poly = __clc_fma(
+        r,
+        __clc_fma(r,
+                  __clc_fma(r, __clc_fma(r, 1.0 / 7.0, 1.0 / 6.0), 1.0 / 5.0),
+                  1.0 / 4.0),
        1.0 / 3.0);
    poly = poly * r * r * r;

    double hr1r1 = 0.5 * r1 * r1;
    double poly0h = r1 + hr1r1;
    double poly0t = r1 - poly0h + hr1r1;
-    poly = fma(r1, r2, fma(0.5 * r2, r2, poly)) + r2 + poly0t;
+    poly = __clc_fma(r1, r2, __clc_fma(0.5 * r2, r2, poly)) + r2 + poly0t;

    tv = USE_TABLE(powlog_tbl, index);
    log_h = tv.s0;
    log_t = tv.s1;

-    double resT_t = fma(xexp, real_log2_tail, +log_t) - poly;
+    double resT_t = __clc_fma(xexp, real_log2_tail, +log_t) - poly;
    double resT = resT_t - poly0h;
-    double resH = fma(xexp, real_log2_lead, log_h);
+    double resH = __clc_fma(xexp, real_log2_lead, log_h);
    double resT_h = poly0h;

    double H = resT + resH;
@ -300,9 +304,9 @@ _CLC_DEF _CLC_OVERLOAD double __clc_powr(double x, double y) {
    double y_head = __clc_as_double(uy & 0xfffffffff8000000L);
    double y_tail = y - y_head;

-    double temp = fma(y_tail, H, fma(y_head, T, y_tail * T));
-    v = fma(y_head, H, temp);
-    vt = fma(y_head, H, -v) + temp;
+    double temp = __clc_fma(y_tail, H, __clc_fma(y_head, T, y_tail * T));
+    v = __clc_fma(y_head, H, temp);
+    vt = __clc_fma(y_head, H, -v) + temp;
  }

  // Now calculate exp of (v,vt)
@ -326,21 +330,22 @@ _CLC_DEF _CLC_OVERLOAD double __clc_powr(double x, double y) {
    double f2 = tv.s1;
    double f = f1 + f2;

-    double r1 = fma(dn, -lnof2_by_64_head, v);
+    double r1 = __clc_fma(dn, -lnof2_by_64_head, v);
    double r2 = dn * lnof2_by_64_tail;
    double r = (r1 + r2) + vt;

-    double q = fma(
-        r,
-        fma(r,
-            fma(r,
-                fma(r, 1.38889490863777199667e-03, 8.33336798434219616221e-03),
-                4.16666666662260795726e-02),
-            1.66666666665260878863e-01),
-        5.00000000000000008883e-01);
-    q = fma(r * r, q, r);
+    double q =
+        __clc_fma(r,
+                  __clc_fma(r,
+                            __clc_fma(r,
+                                      __clc_fma(r, 1.38889490863777199667e-03,
+                                                8.33336798434219616221e-03),
+                                      4.16666666662260795726e-02),
+                            1.66666666665260878863e-01),
+                  5.00000000000000008883e-01);
+    q = __clc_fma(r * r, q, r);

-    expv = fma(f, q, f2) + f1;
+    expv = __clc_fma(f, q, f2) + f1;
    expv = ldexp(expv, m);

    expv = v > max_exp_arg ? __clc_as_double(0x7FF0000000000000L) : expv;
--- a/libclc/generic/lib/math/clc_remainder.cl
+++ b/libclc/generic/lib/math/clc_remainder.cl
@ -25,6 +25,7 @@
 #include <clc/clcmacro.h>
 #include <clc/integer/clc_clz.h>
 #include <clc/math/clc_floor.h>
+#include <clc/math/clc_fma.h>
 #include <clc/math/clc_subnormal_config.h>
 #include <clc/math/clc_trunc.h>
 #include <clc/math/math.h>
@ -136,7 +137,7 @@ _CLC_DEF _CLC_OVERLOAD double __clc_remainder(double x, double y) {

    // Compute w * t in quad precision
    p = w * t;
-    pp = fma(w, t, -p);
+    pp = __clc_fma(w, t, -p);

    // Subtract w * t from dx
    v = dx - p;
@ -156,7 +157,7 @@ _CLC_DEF _CLC_OVERLOAD double __clc_remainder(double x, double y) {
  int todd = lt & 1;

  p = w * t;
-  pp = fma(w, t, -p);
+  pp = __clc_fma(w, t, -p);
  v = dx - p;
  dx = v + (((dx - v) - p) - pp);
  i = dx < 0.0;
@ -197,7 +198,7 @@ _CLC_DEF _CLC_OVERLOAD double __clc_remainder(double x, double y) {
  c &= (yexp<1023 & 2.0 * dx> dy) | (dx > 0.5 * dy);
  // we could use a conversion here instead since qsgn = +-1
  p = qsgn == 1 ? -1.0 : 1.0;
-  t = fma(y, p, x);
+  t = __clc_fma(y, p, x);
  ret = c ? t : ret;

  // We don't need anything special for |x| == 0
--- a/libclc/generic/lib/math/clc_remquo.cl
+++ b/libclc/generic/lib/math/clc_remquo.cl
@ -25,6 +25,7 @@
 #include <clc/clcmacro.h>
 #include <clc/integer/clc_clz.h>
 #include <clc/math/clc_floor.h>
+#include <clc/math/clc_fma.h>
 #include <clc/math/clc_subnormal_config.h>
 #include <clc/math/clc_trunc.h>
 #include <clc/math/math.h>
@ -176,7 +177,7 @@ _CLC_DEF _CLC_OVERLOAD double __clc_remquo(double x, double y,

    // Compute w * t in quad precision
    p = w * t;
-    pp = fma(w, t, -p);
+    pp = __clc_fma(w, t, -p);

    // Subtract w * t from dx
    v = dx - p;
@ -196,7 +197,7 @@ _CLC_DEF _CLC_OVERLOAD double __clc_remquo(double x, double y,
  int todd = lt & 1;

  p = w * t;
-  pp = fma(w, t, -p);
+  pp = __clc_fma(w, t, -p);
  v = dx - p;
  dx = v + (((dx - v) - p) - pp);
  i = dx < 0.0;
@ -242,7 +243,7 @@ _CLC_DEF _CLC_OVERLOAD double __clc_remquo(double x, double y,
  quo = c ? qsgn : quo;
  // we could use a conversion here instead since qsgn = +-1
  p = qsgn == 1 ? -1.0 : 1.0;
-  t = fma(y, p, x);
+  t = __clc_fma(y, p, x);
  ret = c ? t : ret;

  // We don't need anything special for |x| == 0
--- a/libclc/generic/lib/math/clc_rootn.cl
+++ b/libclc/generic/lib/math/clc_rootn.cl
@ -24,6 +24,7 @@
 #include <clc/clc_convert.h>
 #include <clc/clcmacro.h>
 #include <clc/math/clc_fabs.h>
+#include <clc/math/clc_fma.h>
 #include <clc/math/clc_mad.h>
 #include <clc/math/clc_subnormal_config.h>
 #include <clc/math/math.h>
@ -271,26 +272,29 @@ _CLC_DEF _CLC_OVERLOAD double __clc_rootn(double x, int ny) {
    double log_t = tv.s1;
    double f_inv = (log_h + log_t) * f;
    double r1 = __clc_as_double(__clc_as_long(f_inv) & 0xfffffffff8000000L);
-    double r2 = fma(-F, r1, f) * (log_h + log_t);
+    double r2 = __clc_fma(-F, r1, f) * (log_h + log_t);
    double r = r1 + r2;

-    double poly = fma(
-        r, fma(r, fma(r, fma(r, 1.0 / 7.0, 1.0 / 6.0), 1.0 / 5.0), 1.0 / 4.0),
+    double poly = __clc_fma(
+        r,
+        __clc_fma(r,
+                  __clc_fma(r, __clc_fma(r, 1.0 / 7.0, 1.0 / 6.0), 1.0 / 5.0),
+                  1.0 / 4.0),
        1.0 / 3.0);
    poly = poly * r * r * r;

    double hr1r1 = 0.5 * r1 * r1;
    double poly0h = r1 + hr1r1;
    double poly0t = r1 - poly0h + hr1r1;
-    poly = fma(r1, r2, fma(0.5 * r2, r2, poly)) + r2 + poly0t;
+    poly = __clc_fma(r1, r2, __clc_fma(0.5 * r2, r2, poly)) + r2 + poly0t;

    tv = USE_TABLE(powlog_tbl, index);
    log_h = tv.s0;
    log_t = tv.s1;

-    double resT_t = fma(xexp, real_log2_tail, +log_t) - poly;
+    double resT_t = __clc_fma(xexp, real_log2_tail, +log_t) - poly;
    double resT = resT_t - poly0h;
-    double resH = fma(xexp, real_log2_lead, log_h);
+    double resH = __clc_fma(xexp, real_log2_lead, log_h);
    double resT_h = poly0h;

    double H = resT + resH;
@ -303,11 +307,11 @@ _CLC_DEF _CLC_OVERLOAD double __clc_rootn(double x, int ny) {

    double fnyh = __clc_as_double(__clc_as_long(dny) & 0xfffffffffff00000);
    double fnyt = (double)(ny - (int)fnyh);
-    y_tail = fma(-fnyt, y_head, fma(-fnyh, y_head, 1.0)) / dny;
+    y_tail = __clc_fma(-fnyt, y_head, __clc_fma(-fnyh, y_head, 1.0)) / dny;

-    double temp = fma(y_tail, H, fma(y_head, T, y_tail * T));
-    v = fma(y_head, H, temp);
-    vt = fma(y_head, H, -v) + temp;
+    double temp = __clc_fma(y_tail, H, __clc_fma(y_head, T, y_tail * T));
+    v = __clc_fma(y_head, H, temp);
+    vt = __clc_fma(y_head, H, -v) + temp;
  }

  // Now calculate exp of (v,vt)
@ -331,21 +335,22 @@ _CLC_DEF _CLC_OVERLOAD double __clc_rootn(double x, int ny) {
    double f2 = tv.s1;
    double f = f1 + f2;

-    double r1 = fma(dn, -lnof2_by_64_head, v);
+    double r1 = __clc_fma(dn, -lnof2_by_64_head, v);
    double r2 = dn * lnof2_by_64_tail;
    double r = (r1 + r2) + vt;

-    double q = fma(
-        r,
-        fma(r,
-            fma(r,
-                fma(r, 1.38889490863777199667e-03, 8.33336798434219616221e-03),
-                4.16666666662260795726e-02),
-            1.66666666665260878863e-01),
-        5.00000000000000008883e-01);
-    q = fma(r * r, q, r);
+    double q =
+        __clc_fma(r,
+                  __clc_fma(r,
+                            __clc_fma(r,
+                                      __clc_fma(r, 1.38889490863777199667e-03,
+                                                8.33336798434219616221e-03),
+                                      4.16666666662260795726e-02),
+                            1.66666666665260878863e-01),
+                  5.00000000000000008883e-01);
+    q = __clc_fma(r * r, q, r);

-    expv = fma(f, q, f2) + f1;
+    expv = __clc_fma(f, q, f2) + f1;
    expv = ldexp(expv, m);

    expv = v > max_exp_arg ? __clc_as_double(0x7FF0000000000000L) : expv;
--- a/libclc/generic/lib/math/fma.cl
+++ b/libclc/generic/lib/math/fma.cl
@ -1,7 +1,20 @@
 #include <clc/clc.h>
+#include <clc/clcmacro.h>
+#include <clc/math/clc_fma.h>
 #include <clc/math/math.h>

-#include "math/clc_fma.h"
+_CLC_DEFINE_TERNARY_BUILTIN(float, fma, __clc_fma, float, float, float)

-#define __CLC_BODY <fma.inc>
-#include <clc/math/gentype.inc>
+#ifdef cl_khr_fp64
+#pragma OPENCL EXTENSION cl_khr_fp64 : enable
+
+_CLC_DEFINE_TERNARY_BUILTIN(double, fma, __clc_fma, double, double, double)
+
+#endif
+
+#ifdef cl_khr_fp16
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+
+_CLC_DEFINE_TERNARY_BUILTIN(half, fma, __clc_fma, half, half, half)
+
+#endif
--- a/libclc/generic/lib/math/fma.inc
+++ b/libclc/generic/lib/math/fma.inc
@ -1,7 +0,0 @@
-_CLC_DEF _CLC_OVERLOAD __CLC_GENTYPE fma(__CLC_GENTYPE a, __CLC_GENTYPE b, __CLC_GENTYPE c) {
-#if __CLC_FPSIZE == 32 && HAVE_HW_FMA32() == 0
-	return __clc_sw_fma(a, b, c);
-#else
-	return __clc_fma(a, b, c);
-#endif
-}
--- a/libclc/generic/lib/math/sincos_helpers.cl
+++ b/libclc/generic/lib/math/sincos_helpers.cl
@ -24,6 +24,7 @@
 #include <clc/clc.h>
 #include <clc/integer/clc_clz.h>
 #include <clc/integer/clc_mul_hi.h>
+#include <clc/math/clc_fma.h>
 #include <clc/math/clc_mad.h>
 #include <clc/math/clc_trunc.h>
 #include <clc/math/math.h>
@ -121,10 +122,10 @@ _CLC_DEF float __clc_tanf_piby4(float x, int regn) {

 _CLC_DEF void __clc_fullMulS(private float *hi, private float *lo, float a,
                             float b, float bh, float bt) {
-  if (HAVE_HW_FMA32()) {
+  if (__CLC_HAVE_HW_FMA32()) {
    float ph = a * b;
    *hi = ph;
-    *lo = fma(a, b, -ph);
+    *lo = __clc_fma(a, b, -ph);
  } else {
    float ah = as_float(as_uint(a) & 0xfffff000U);
    float at = a - ah;
@ -310,9 +311,9 @@ _CLC_DEF int __clc_argReductionLargeS(private float *r, private float *rr,

  float rh, rt;

-  if (HAVE_HW_FMA32()) {
+  if (__CLC_HAVE_HW_FMA32()) {
    rh = q1 * pio2h;
-    rt = fma(q0, pio2h, fma(q1, pio2t, fma(q1, pio2h, -rh)));
+    rt = __clc_fma(q0, pio2h, __clc_fma(q1, pio2t, __clc_fma(q1, pio2h, -rh)));
  } else {
    float q1h = as_float(as_uint(q1) & 0xfffff000);
    float q1t = q1 - q1h;
@ -349,7 +350,7 @@ _CLC_DEF void __clc_remainder_piby2_medium(double x, private double *r,
                                           private int *regn) {
  // How many pi/2 is x a multiple of?
  const double two_by_pi = 0x1.45f306dc9c883p-1;
-  double dnpi2 = __clc_trunc(fma(x, two_by_pi, 0.5));
+  double dnpi2 = __clc_trunc(__clc_fma(x, two_by_pi, 0.5));

  const double piby2_h = -7074237752028440.0 / 0x1.0p+52;
  const double piby2_m = -2483878800010755.0 / 0x1.0p+105;
@ -357,11 +358,11 @@ _CLC_DEF void __clc_remainder_piby2_medium(double x, private double *r,

  // Compute product of npi2 with 159 bits of 2/pi
  double p_hh = piby2_h * dnpi2;
-  double p_ht = fma(piby2_h, dnpi2, -p_hh);
+  double p_ht = __clc_fma(piby2_h, dnpi2, -p_hh);
  double p_mh = piby2_m * dnpi2;
-  double p_mt = fma(piby2_m, dnpi2, -p_mh);
+  double p_mt = __clc_fma(piby2_m, dnpi2, -p_mh);
  double p_th = piby2_t * dnpi2;
-  double p_tt = fma(piby2_t, dnpi2, -p_th);
+  double p_tt = __clc_fma(piby2_t, dnpi2, -p_th);

  // Reduce to 159 bits
  double ph = p_hh;
@ -469,13 +470,13 @@ _CLC_DEF void __clc_remainder_piby2_large(double x, private double *r,

  // Exact multiply
  double f0h = p0 * x;
-  double f0l = fma(p0, x, -f0h);
+  double f0l = __clc_fma(p0, x, -f0h);
  double f1h = p1 * x;
-  double f1l = fma(p1, x, -f1h);
+  double f1l = __clc_fma(p1, x, -f1h);
  double f2h = p2 * x;
-  double f2l = fma(p2, x, -f2h);
+  double f2l = __clc_fma(p2, x, -f2h);
  double f3h = p3 * x;
-  double f3l = fma(p3, x, -f3h);
+  double f3l = __clc_fma(p3, x, -f3h);

  // Accumulate product into 4 doubles
  double s, t;
@ -533,7 +534,7 @@ _CLC_DEF void __clc_remainder_piby2_large(double x, private double *r,
  const double p2t = 4967757600021510.0 / 0x1.0p+106;

  double rhi = f3 * p2h;
-  double rlo = fma(f2, p2h, fma(f3, p2t, fma(f3, p2h, -rhi)));
+  double rlo = __clc_fma(f2, p2h, __clc_fma(f3, p2t, __clc_fma(f3, p2h, -rhi)));

  *r = rhi + rlo;
  *rr = rlo - (*r - rhi);
@ -581,15 +582,21 @@ _CLC_DEF double2 __clc_sincos_piby4(double x, double xx) {
  double r = 0.5 * x2;
  double t = 1.0 - r;

-  double sp = fma(fma(fma(fma(sc6, x2, sc5), x2, sc4), x2, sc3), x2, sc2);
+  double sp = __clc_fma(
+      __clc_fma(__clc_fma(__clc_fma(sc6, x2, sc5), x2, sc4), x2, sc3), x2, sc2);

  double cp =
-      t + fma(fma(fma(fma(fma(fma(cc6, x2, cc5), x2, cc4), x2, cc3), x2, cc2),
-                  x2, cc1),
-              x2 * x2, fma(x, xx, (1.0 - t) - r));
+      t +
+      __clc_fma(__clc_fma(__clc_fma(__clc_fma(__clc_fma(__clc_fma(cc6, x2, cc5),
+                                                        x2, cc4),
+                                              x2, cc3),
+                                    x2, cc2),
+                          x2, cc1),
+                x2 * x2, __clc_fma(x, xx, (1.0 - t) - r));

  double2 ret;
-  ret.lo = x - fma(-x3, sc1, fma(fma(-x3, sp, 0.5 * xx), x2, -xx));
+  ret.lo =
+      x - __clc_fma(-x3, sc1, __clc_fma(__clc_fma(-x3, sp, 0.5 * xx), x2, -xx));
  ret.hi = cp;

  return ret;
--- a/libclc/spirv/lib/SOURCES
+++ b/libclc/spirv/lib/SOURCES
@ -41,7 +41,6 @@ subnormal_config.cl
 ../../generic/lib/math/exp2.cl
 ../../generic/lib/math/clc_exp10.cl
 ../../generic/lib/math/exp10.cl
-../../generic/lib/math/clc_fma.cl
 math/fma.cl
 ../../generic/lib/math/clc_fmod.cl
 ../../generic/lib/math/fmod.cl
--- a/libclc/spirv/lib/math/fma.cl
+++ b/libclc/spirv/lib/math/fma.cl
@ -1,11 +1,5 @@
 #include <clc/clc.h>
-#include <math/clc_fma.h>
+#include <clc/clcmacro.h>
+#include <clc/internal/math/clc_sw_fma.h>

-#define __CLC_BODY <fma.inc>
-#define __FLOAT_ONLY
-#include <clc/math/gentype.inc>
-
-bool __clc_runtime_has_hw_fma32()
-{
-    return false;
-}
+_CLC_DEFINE_TERNARY_BUILTIN(float, fma, __clc_sw_fma, float, float, float)
--- a/libclc/spirv/lib/math/fma.inc
+++ b/libclc/spirv/lib/math/fma.inc
@ -1,3 +0,0 @@
-_CLC_DEF _CLC_OVERLOAD __CLC_GENTYPE fma(__CLC_GENTYPE a, __CLC_GENTYPE b, __CLC_GENTYPE c) {
-	return __clc_sw_fma(a, b, c);
-}
				`@ -0,0 +1 @@`
				`bool __clc_runtime_has_hw_fma32() { return false; }`