[libclc] Move sqrt to CLC library (#128748)

This is fairly straightforward for most targets: by default, CLC sqrt is implemented with the element-wise sqrt builtin. We also remove a legacy pre-filtering of the input argument (explicitly returning NaN for negative inputs), which the intrinsic now officially handles. AMDGPU provides its own implementation of sqrt for double types; this commit moves that implementation into CLC sqrt. The 'default' CLC sqrt is given weak linkage so that AMDGPU can override the builtin only for the types it cares about.
2025-04-15 22:16:30 +00:00 · 2025-02-27 12:30:24 +00:00 · 2025-02-27 12:30:24 +00:00 · 285b411e46
commit 285b411e46
parent 0865a3872c
12 changed files with 44 additions and 62 deletions
--- a/libclc/CMakeLists.txt
+++ b/libclc/CMakeLists.txt
@ -29,6 +29,7 @@ set_property(DIRECTORY APPEND PROPERTY CMAKE_CONFIGURE_DEPENDS
  # CLC internal libraries
  clc/lib/generic/SOURCES;
  clc/lib/amdgcn/SOURCES;
+  clc/lib/amdgpu/SOURCES;
  clc/lib/clspv/SOURCES;
  clc/lib/spirv/SOURCES;
 )
--- a/libclc/amdgpu/lib/SOURCES
+++ b/libclc/amdgpu/lib/SOURCES
@ -10,4 +10,3 @@ math/half_log2.cl
 math/half_recip.cl
 math/half_rsqrt.cl
 math/half_sqrt.cl
-math/sqrt.cl
--- a/libclc/clc/include/clc/float/definitions.h
+++ b/libclc/clc/include/clc/float/definitions.h
@ -1,7 +1,6 @@
 #define MAXFLOAT 0x1.fffffep127f
 #define HUGE_VALF __builtin_huge_valf()
 #define INFINITY __builtin_inff()
-#define NAN __builtin_nanf("")

 #define FLT_DIG 6
 #define FLT_MANT_DIG 24
@ -13,6 +12,7 @@
 #define FLT_MAX MAXFLOAT
 #define FLT_MIN 0x1.0p-126f
 #define FLT_EPSILON 0x1.0p-23f
+#define FLT_NAN __builtin_nanf("")

 #define FP_ILOGB0 (-2147483647 - 1)
 #define FP_ILOGBNAN 2147483647
@ -46,6 +46,7 @@
 #define DBL_MAX 0x1.fffffffffffffp1023
 #define DBL_MIN 0x1.0p-1022
 #define DBL_EPSILON 0x1.0p-52
+#define DBL_NAN __builtin_nan("")

 #define M_E 0x1.5bf0a8b145769p+1
 #define M_LOG2E 0x1.71547652b82fep+0
@ -80,6 +81,7 @@
 #define HALF_MAX 0x1.ffcp15h
 #define HALF_MIN 0x1.0p-14h
 #define HALF_EPSILON 0x1.0p-10h
+#define HALF_NAN __builtin_nanf16("")

 #define M_LOG2E_H 0x1.714p+0h

--- a/libclc/clc/include/clc/math/clc_sqrt.h
+++ b/libclc/clc/include/clc/math/clc_sqrt.h
@ -1,8 +1,12 @@
-#include <clc/clcfunc.h>
-#include <clc/clctypes.h>
+#ifndef __CLC_MATH_CLC_SQRT_H__
+#define __CLC_MATH_CLC_SQRT_H__

-#define __CLC_FUNCTION __clc_sqrt
 #define __CLC_BODY <clc/math/unary_decl.inc>
+#define __CLC_FUNCTION __clc_sqrt
+
 #include <clc/math/gentype.inc>
+
 #undef __CLC_BODY
 #undef __CLC_FUNCTION
+
+#endif // __CLC_MATH_CLC_SQRT_H__
--- a/libclc/clc/lib/amdgpu/SOURCES
+++ b/libclc/clc/lib/amdgpu/SOURCES
@ -0,0 +1 @@
+math/clc_sqrt_fp64.cl
--- a/libclc/clc/lib/amdgpu/math/clc_sqrt_fp64.cl
+++ b/libclc/clc/lib/amdgpu/math/clc_sqrt_fp64.cl
@ -20,52 +20,43 @@
 * THE SOFTWARE.
 */

-#include "math/clc_sqrt.h"
-#include <clc/clc.h>
 #include <clc/clcmacro.h>
-
-_CLC_DEFINE_UNARY_BUILTIN(float, sqrt, __clc_sqrt, float)
-
-#ifdef cl_khr_fp16
-
-#pragma OPENCL EXTENSION cl_khr_fp16 : enable
-_CLC_DEFINE_UNARY_BUILTIN(half, sqrt, __clc_sqrt, half)
-
-#endif
+#include <clc/internal/clc.h>
+#include <clc/math/clc_fma.h>
+#include <clc/math/clc_ldexp.h>

 #ifdef cl_khr_fp64

 #pragma OPENCL EXTENSION cl_khr_fp64 : enable

 #ifdef __AMDGCN__
-  #define __clc_builtin_rsq __builtin_amdgcn_rsq
+#define __clc_builtin_rsq __builtin_amdgcn_rsq
 #else
-  #define __clc_builtin_rsq __builtin_r600_recipsqrt_ieee
+#define __clc_builtin_rsq __builtin_r600_recipsqrt_ieee
 #endif

-_CLC_OVERLOAD _CLC_DEF double sqrt(double x) {
-
+_CLC_OVERLOAD _CLC_DEF double __clc_sqrt(double x) {
  uint vcc = x < 0x1p-767;
  uint exp0 = vcc ? 0x100 : 0;
  unsigned exp1 = vcc ? 0xffffff80 : 0;

-  double v01 = ldexp(x, exp0);
+  double v01 = __clc_ldexp(x, exp0);
  double v23 = __clc_builtin_rsq(v01);
  double v45 = v01 * v23;
  v23 = v23 * 0.5;

-  double v67 = fma(-v23, v45, 0.5);
-  v45 = fma(v45, v67, v45);
-  double v89 = fma(-v45, v45, v01);
-  v23 = fma(v23, v67, v23);
-  v45 = fma(v89, v23, v45);
-  v67 = fma(-v45, v45, v01);
-  v23 = fma(v67, v23, v45);
+  double v67 = __clc_fma(-v23, v45, 0.5);
+  v45 = __clc_fma(v45, v67, v45);
+  double v89 = __clc_fma(-v45, v45, v01);
+  v23 = __clc_fma(v23, v67, v23);
+  v45 = __clc_fma(v89, v23, v45);
+  v67 = __clc_fma(-v45, v45, v01);
+  v23 = __clc_fma(v67, v23, v45);

-  v23 = ldexp(v23, exp1);
-  return ((x == __builtin_inf()) || (x == 0.0)) ? v01 : v23;
+  v23 = __clc_ldexp(v23, exp1);
+  return (x == __builtin_inf() || (x == 0.0)) ? v01 : v23;
 }

-_CLC_UNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, double, sqrt, double);
+_CLC_UNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, double, __clc_sqrt, double);

 #endif
--- a/libclc/clc/lib/generic/SOURCES
+++ b/libclc/clc/lib/generic/SOURCES
@ -33,6 +33,7 @@ math/clc_nan.cl
 math/clc_nextafter.cl
 math/clc_rint.cl
 math/clc_round.cl
+math/clc_sqrt.cl
 math/clc_sw_fma.cl
 math/clc_trunc.cl
 relational/clc_all.cl
--- a/libclc/clc/lib/generic/math/clc_sqrt.cl
+++ b/libclc/clc/lib/generic/math/clc_sqrt.cl
@ -20,14 +20,8 @@
 * THE SOFTWARE.
 */

-#include <clc/clc.h>
+#include <clc/float/definitions.h>
+#include <clc/internal/clc.h>

-// Map the llvm sqrt intrinsic to an OpenCL function.
-#define __CLC_FUNCTION __clc_llvm_intr_sqrt
-#define __CLC_INTRINSIC "llvm.sqrt"
-#include <clc/math/unary_intrin.inc>
-#undef __CLC_FUNCTION
-#undef __CLC_INTRINSIC
-
-#define __CLC_BODY <clc_sqrt_impl.inc>
+#define __CLC_BODY <clc_sqrt.inc>
 #include <clc/math/gentype.inc>
--- a/libclc/generic/lib/math/clc_sqrt_impl.inc
+++ b/libclc/generic/lib/math/clc_sqrt_impl.inc
@ -20,20 +20,7 @@
 * THE SOFTWARE.
 */

-#if __CLC_FPSIZE == 64
-#define __CLC_NAN __builtin_nan("")
-#define ZERO 0.0
-#elif __CLC_FPSIZE == 32
-#define __CLC_NAN NAN
-#define ZERO 0.0f
-#elif __CLC_FPSIZE == 16
-#define __CLC_NAN (half)NAN
-#define ZERO 0.0h
-#endif
-
-_CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE __clc_sqrt(__CLC_GENTYPE val) {
-  return val < ZERO ? __CLC_NAN : __clc_llvm_intr_sqrt(val);
+__attribute__((weak)) _CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE
+__clc_sqrt(__CLC_GENTYPE val) {
+  return __builtin_elementwise_sqrt(val);
 }
-
-#undef __CLC_NAN
-#undef ZERO
--- a/libclc/generic/lib/SOURCES
+++ b/libclc/generic/lib/SOURCES
@ -179,7 +179,6 @@ math/sincos.cl
 math/sincos_helpers.cl
 math/sinh.cl
 math/sinpi.cl
-math/clc_sqrt.cl
 math/sqrt.cl
 math/clc_tan.cl
 math/tan.cl
--- a/libclc/generic/lib/math/clc_hypot.cl
+++ b/libclc/generic/lib/math/clc_hypot.cl
@ -27,6 +27,7 @@
 #include <clc/math/clc_mad.h>
 #include <clc/math/clc_subnormal_config.h>
 #include <clc/math/math.h>
+#include <clc/math/clc_sqrt.h>
 #include <clc/relational/clc_isnan.h>
 #include <clc/shared/clc_clamp.h>
 #include <math/clc_hypot.h>
@ -49,7 +50,7 @@ _CLC_DEF _CLC_OVERLOAD float __clc_hypot(float x, float y) {
  float fi_exp = as_float((-xexp + EXPBIAS_SP32) << EXPSHIFTBITS_SP32);
  float fx = as_float(ux) * fi_exp;
  float fy = as_float(uy) * fi_exp;
-  retval = sqrt(__clc_mad(fx, fx, fy * fy)) * fx_exp;
+  retval = __clc_sqrt(__clc_mad(fx, fx, fy * fy)) * fx_exp;

  retval = ux > PINFBITPATT_SP32 | uy == 0 ? as_float(ux) : retval;
  retval = ux == PINFBITPATT_SP32 | uy == PINFBITPATT_SP32
@ -81,7 +82,7 @@ _CLC_DEF _CLC_OVERLOAD double __clc_hypot(double x, double y) {
  double ay = y * preadjust;

  // The post adjust may overflow, but this can't be avoided in any case
-  double r = sqrt(__clc_fma(ax, ax, ay * ay)) * postadjust;
+  double r = __clc_sqrt(__clc_fma(ax, ax, ay * ay)) * postadjust;

  // If the difference in exponents between x and y is large
  double s = x + y;
--- a/libclc/generic/lib/math/sqrt.cl
+++ b/libclc/generic/lib/math/sqrt.cl
@ -21,7 +21,9 @@
 */

 #include <clc/clc.h>
-#include "math/clc_sqrt.h"
+#include <clc/math/clc_sqrt.h>

-#define __CLC_FUNCTION sqrt
-#include <clc/math/unary_builtin.inc>
+#define FUNCTION sqrt
+#define __CLC_BODY <clc/shared/unary_def.inc>
+
+#include <clc/math/gentype.inc>