[libclc] Move sinpi/cospi/tanpi to the CLC library (#133889)

Additionally, these builtins are now vectorized. This also moves the native_recip and native_divide builtins as they are used by the tanpi builtin.
2025-04-15 22:06:32 +00:00 · 2025-04-01 12:03:21 +01:00 · 2025-04-01 12:03:21 +01:00 · 13a313fe58
commit 13a313fe58
parent 1d9ad99305
36 changed files with 797 additions and 568 deletions
--- a/libclc/CMakeLists.txt
+++ b/libclc/CMakeLists.txt
@ -265,11 +265,13 @@ endif()
 set_source_files_properties(
  # CLC builtins
  ${CMAKE_CURRENT_SOURCE_DIR}/clc/lib/generic/math/clc_native_cos.cl
+  ${CMAKE_CURRENT_SOURCE_DIR}/clc/lib/generic/math/clc_native_divide.cl
  ${CMAKE_CURRENT_SOURCE_DIR}/clc/lib/generic/math/clc_native_exp2.cl
  ${CMAKE_CURRENT_SOURCE_DIR}/clc/lib/generic/math/clc_native_exp.cl
  ${CMAKE_CURRENT_SOURCE_DIR}/clc/lib/generic/math/clc_native_log10.cl
  ${CMAKE_CURRENT_SOURCE_DIR}/clc/lib/generic/math/clc_native_log2.cl
  ${CMAKE_CURRENT_SOURCE_DIR}/clc/lib/generic/math/clc_native_log.cl
+  ${CMAKE_CURRENT_SOURCE_DIR}/clc/lib/generic/math/clc_native_recip.cl
  ${CMAKE_CURRENT_SOURCE_DIR}/clc/lib/generic/math/clc_native_rsqrt.cl
  ${CMAKE_CURRENT_SOURCE_DIR}/clc/lib/generic/math/clc_native_sin.cl
  ${CMAKE_CURRENT_SOURCE_DIR}/clc/lib/generic/math/clc_native_sqrt.cl
--- a/libclc/clc/include/clc/math/clc_cospi.h
+++ b/libclc/clc/include/clc/math/clc_cospi.h
@ -0,0 +1,20 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef __CLC_MATH_CLC_COSPI_H__
+#define __CLC_MATH_CLC_COSPI_H__
+
+#define __CLC_BODY <clc/math/unary_decl.inc>
+#define __CLC_FUNCTION __clc_cospi
+
+#include <clc/math/gentype.inc>
+
+#undef __CLC_BODY
+#undef __CLC_FUNCTION
+
+#endif // __CLC_MATH_CLC_COSPI_H__
--- a/libclc/clc/include/clc/math/clc_native_divide.h
+++ b/libclc/clc/include/clc/math/clc_native_divide.h
@ -0,0 +1,22 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef __CLC_MATH_CLC_NATIVE_DIVIDE_H__
+#define __CLC_MATH_CLC_NATIVE_DIVIDE_H__
+
+#define __FLOAT_ONLY
+#define __CLC_FUNCTION __clc_native_divide
+#define __CLC_BODY <clc/shared/binary_decl.inc>
+
+#include <clc/math/gentype.inc>
+
+#undef __CLC_BODY
+#undef __CLC_FUNCTION
+#undef __FLOAT_ONLY
+
+#endif // __CLC_MATH_CLC_NATIVE_DIVIDE_H__
--- a/libclc/clc/include/clc/math/clc_native_recip.h
+++ b/libclc/clc/include/clc/math/clc_native_recip.h
@ -0,0 +1,22 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef __CLC_MATH_CLC_NATIVE_RECIP_H__
+#define __CLC_MATH_CLC_NATIVE_RECIP_H__
+
+#define __FLOAT_ONLY
+#define __CLC_FUNCTION __clc_native_recip
+#define __CLC_BODY <clc/shared/unary_decl.inc>
+
+#include <clc/math/gentype.inc>
+
+#undef __CLC_BODY
+#undef __CLC_FUNCTION
+#undef __FLOAT_ONLY
+
+#endif // __CLC_MATH_CLC_NATIVE_RECIP_H__
--- a/libclc/clc/include/clc/math/clc_sincos_helpers.inc
+++ b/libclc/clc/include/clc/math/clc_sincos_helpers.inc
@ -10,6 +10,8 @@ _CLC_DECL _CLC_OVERLOAD __CLC_FLOATN __clc_sinf_piby4(__CLC_FLOATN x,
                                                      __CLC_FLOATN y);
 _CLC_DECL _CLC_OVERLOAD __CLC_FLOATN __clc_cosf_piby4(__CLC_FLOATN x,
                                                      __CLC_FLOATN y);
+_CLC_DECL _CLC_OVERLOAD __CLC_FLOATN __clc_tanf_piby4(__CLC_FLOATN x,
+                                                      __CLC_INTN regn);

 _CLC_DECL _CLC_OVERLOAD __CLC_INTN __clc_argReductionS(private __CLC_FLOATN *r,
                                                       private __CLC_FLOATN *rr,
--- a/libclc/clc/include/clc/math/clc_sincos_piby4.h
+++ b/libclc/clc/include/clc/math/clc_sincos_piby4.h
@ -0,0 +1,14 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include <clc/math/clc_fma.h>
+#include <clc/math/clc_mad.h>
+#include <clc/math/math.h>
+
+#define __CLC_BODY <clc/math/clc_sincos_piby4.inc>
+#include <clc/math/gentype.inc>
--- a/libclc/clc/include/clc/math/clc_sincos_piby4.inc
+++ b/libclc/clc/include/clc/math/clc_sincos_piby4.inc
@ -0,0 +1,174 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#if __CLC_FPSIZE == 32
+
+// Evaluate the single-precision sine and cosine of a value in the interval
+// [-pi/4, pi/4]. The sine is stored to *sinval and the cosine to *cosval.
+_CLC_INLINE _CLC_OVERLOAD void
+__clc_sincos_piby4(__CLC_GENTYPE x, private __CLC_GENTYPE *sinval,
+                   private __CLC_GENTYPE *cosval) {
+  // Taylor series for sin(x) is x - x^3/3! + x^5/5! - x^7/7! ...
+  // = x * (1 - x^2/3! + x^4/5! - x^6/7! ...)
+  // = x * f(w)
+  // where w = x*x and f(w) = (1 - w/3! + w^2/5! - w^3/7! ...)
+  // We use a minimax approximation of (f(w) - 1) / w
+  // because this produces an expansion in even powers of x.
+
+  // Taylor series for cos(x) is 1 - x^2/2! + x^4/4! - x^6/6! ...
+  // = f(w)
+  // where w = x*x and f(w) = (1 - w/2! + w^2/4! - w^3/6! ...)
+  // We use a minimax approximation of (f(w) - 1 + w/2) / (w*w)
+  // because this produces an expansion in even powers of x.
+
+  // Coefficients of the sine polynomial in w.
+  const __CLC_GENTYPE sc1 = -0.166666666638608441788607926e0F;
+  const __CLC_GENTYPE sc2 = 0.833333187633086262120839299e-2F;
+  const __CLC_GENTYPE sc3 = -0.198400874359527693921333720e-3F;
+  const __CLC_GENTYPE sc4 = 0.272500015145584081596826911e-5F;
+
+  // Coefficients of the cosine polynomial in w.
+  const __CLC_GENTYPE cc1 = 0.41666666664325175238031e-1F;
+  const __CLC_GENTYPE cc2 = -0.13888887673175665567647e-2F;
+  const __CLC_GENTYPE cc3 = 0.24800600878112441958053e-4F;
+  const __CLC_GENTYPE cc4 = -0.27301013343179832472841e-6F;
+
+  __CLC_GENTYPE x2 = x * x;
+
+  // sin(x) ~= x + x^3 * (sc1 + sc2*w + sc3*w^2 + sc4*w^3), Horner form.
+  *sinval = __clc_mad(
+      x * x2, __clc_mad(x2, __clc_mad(x2, __clc_mad(x2, sc4, sc3), sc2), sc1),
+      x);
+  // cos(x) ~= (1 - w/2) + w^2 * (cc1 + cc2*w + cc3*w^2 + cc4*w^3).
+  *cosval = __clc_mad(
+      x2 * x2, __clc_mad(x2, __clc_mad(x2, __clc_mad(x2, cc4, cc3), cc2), cc1),
+      __clc_mad(x2, -0.5f, 1.0f));
+}
+
+#elif __CLC_FPSIZE == 64
+
+// Evaluate the double-precision sine and cosine of x + xx, where x is the
+// leading part of the argument and xx is its (tiny) tail. The sine is stored
+// to *sinval and the cosine to *cosval. As the name suggests, the argument is
+// expected to be already reduced to roughly [-pi/4, pi/4].
+_CLC_INLINE _CLC_OVERLOAD void
+__clc_sincos_piby4(__CLC_GENTYPE x, __CLC_GENTYPE xx,
+                   private __CLC_GENTYPE *sinval,
+                   private __CLC_GENTYPE *cosval) {
+  // Taylor series for sin(x) is x - x^3/3! + x^5/5! - x^7/7! ...
+  //                      = x * (1 - x^2/3! + x^4/5! - x^6/7! ...)
+  //                      = x * f(w)
+  // where w = x*x and f(w) = (1 - w/3! + w^2/5! - w^3/7! ...)
+  // We use a minimax approximation of (f(w) - 1) / w
+  // because this produces an expansion in even powers of x.
+  // If xx (the tail of x) is non-zero, we add a correction
+  // term g(x,xx) = (1-x*x/2)*xx to the result, where g(x,xx)
+  // is an approximation to cos(x)*sin(xx) valid because
+  // xx is tiny relative to x.
+
+  // Taylor series for cos(x) is 1 - x^2/2! + x^4/4! - x^6/6! ...
+  //                      = f(w)
+  // where w = x*x and f(w) = (1 - w/2! + w^2/4! - w^3/6! ...)
+  // We use a minimax approximation of (f(w) - 1 + w/2) / (w*w)
+  // because this produces an expansion in even powers of x.
+  // If xx (the tail of x) is non-zero, we subtract a correction
+  // term g(x,xx) = x*xx to the result, where g(x,xx)
+  // is an approximation to sin(x)*sin(xx) valid because
+  // xx is tiny relative to x.
+
+  // Coefficients of the sine polynomial in w.
+  const __CLC_GENTYPE sc1 = -0.166666666666666646259241729;
+  const __CLC_GENTYPE sc2 = 0.833333333333095043065222816e-2;
+  const __CLC_GENTYPE sc3 = -0.19841269836761125688538679e-3;
+  const __CLC_GENTYPE sc4 = 0.275573161037288022676895908448e-5;
+  const __CLC_GENTYPE sc5 = -0.25051132068021699772257377197e-7;
+  const __CLC_GENTYPE sc6 = 0.159181443044859136852668200e-9;
+
+  // Coefficients of the cosine polynomial in w.
+  const __CLC_GENTYPE cc1 = 0.41666666666666665390037e-1;
+  const __CLC_GENTYPE cc2 = -0.13888888888887398280412e-2;
+  const __CLC_GENTYPE cc3 = 0.248015872987670414957399e-4;
+  const __CLC_GENTYPE cc4 = -0.275573172723441909470836e-6;
+  const __CLC_GENTYPE cc5 = 0.208761463822329611076335e-8;
+  const __CLC_GENTYPE cc6 = -0.113826398067944859590880e-10;
+
+  __CLC_GENTYPE x2 = x * x;
+  __CLC_GENTYPE x3 = x2 * x;
+  // r = w/2 and t = 1 - w/2; (1.0 - t) - r below recovers the rounding error
+  // committed when forming t.
+  __CLC_GENTYPE r = (__CLC_GENTYPE)0.5 * x2;
+  __CLC_GENTYPE t = (__CLC_GENTYPE)1.0 - r;
+
+  // Upper part of the sine polynomial: sc2 + sc3*w + ... + sc6*w^4.
+  __CLC_GENTYPE sp = __clc_fma(
+      __clc_fma(__clc_fma(__clc_fma(sc6, x2, sc5), x2, sc4), x2, sc3), x2, sc2);
+
+  // cos(x + xx) ~= t + w^2 * poly(w) + ((1 - t) - r) - x*xx.
+  // The tail correction x*xx must be SUBTRACTED (cos(x + xx) ~= cos(x) -
+  // sin(x)*xx ~= cos(x) - x*xx), hence the negated multiplicand in the
+  // innermost fma. With xx == 0 this term vanishes either way.
+  __CLC_GENTYPE cp =
+      t +
+      __clc_fma(__clc_fma(__clc_fma(__clc_fma(__clc_fma(__clc_fma(cc6, x2, cc5),
+                                                        x2, cc4),
+                                              x2, cc3),
+                                    x2, cc2),
+                          x2, cc1),
+                x2 * x2, __clc_fma(-x, xx, (1.0 - t) - r));
+
+  // sin(x + xx) ~= x + x^3 * (sc1 + w*sp) + xx * (1 - w/2).
+  *sinval =
+      x - __clc_fma(-x3, sc1, __clc_fma(__clc_fma(-x3, sp, 0.5 * xx), x2, -xx));
+  *cosval = cp;
+}
+
+// Evaluate the double-precision tangent of x + xx (x is the leading part, xx
+// the tail of the argument). Two results are produced:
+//   *leadval receives the approximation of tan(x + xx);
+//   *tailval receives the approximation of -1 / tan(x + xx).
+_CLC_INLINE _CLC_OVERLOAD void __clc_tan_piby4(__CLC_GENTYPE x,
+                                               __CLC_GENTYPE xx,
+                                               private __CLC_GENTYPE *leadval,
+                                               private __CLC_GENTYPE *tailval) {
+  // 0x3fe921fb54442d18
+  const __CLC_GENTYPE piby4_lead = 7.85398163397448278999e-01;
+  // 0x3c81a62633145c06
+  const __CLC_GENTYPE piby4_tail = 3.06161699786838240164e-17;
+
+  // In order to maintain relative precision transform using the identity:
+  // tan(pi/4-x) = (1-tan(x))/(1+tan(x)) for arguments close to pi/4.
+  // Similarly use tan(x-pi/4) = (tan(x)-1)/(tan(x)+1) close to -pi/4.
+
+  // transform is +1 for x > 0.68, -1 for x < -0.68, and 0 otherwise.
+  __CLC_LONGN ca = x > 0.68;
+  __CLC_LONGN cb = x < -0.68;
+  __CLC_GENTYPE transform = ca ? 1.0 : 0.0;
+  transform = cb ? -1.0 : transform;
+
+  // tx = pi/4 - transform * (x + xx), accumulated from lead and tail parts.
+  __CLC_GENTYPE tx = __clc_fma(-transform, x, piby4_lead) +
+                     __clc_fma(-transform, xx, piby4_tail);
+  __CLC_LONGN c = ca | cb;
+  // When transformed, continue with tx and drop the tail.
+  x = c ? tx : x;
+  xx = c ? 0.0 : xx;
+
+  // Core Remez [2,3] approximation to tan(x+xx) on the interval [0,0.68].
+  __CLC_GENTYPE t1 = x;
+  // r = x*x + 2*x*xx, i.e. (x + xx)^2 without the xx^2 term.
+  __CLC_GENTYPE r = __clc_fma(2.0, x * xx, x * x);
+
+  // Numerator polynomial in r.
+  __CLC_GENTYPE a = __clc_fma(r,
+                              __clc_fma(r, 0.224044448537022097264602535574e-3,
+                                        -0.229345080057565662883358588111e-1),
+                              0.372379159759792203640806338901e0);
+
+  // Denominator polynomial in r.
+  __CLC_GENTYPE b =
+      __clc_fma(r,
+                __clc_fma(r,
+                          __clc_fma(r, -0.232371494088563558304549252913e-3,
+                                    0.260656620398645407524064091208e-1),
+                          -0.515658515729031149329237816945e0),
+                0.111713747927937668539901657944e1);
+
+  // t1 + t2 is the tangent approximation: x + (a/b) * x * r + xx.
+  __CLC_GENTYPE t2 = __clc_fma(MATH_DIVIDE(a, b), x * r, xx);
+
+  __CLC_GENTYPE tp = t1 + t2;
+
+  // Compute -1.0/(t1 + t2) accurately
+  // z1 is tp with its low 32 bits cleared; z2 is the remainder so that
+  // z1 + z2 represents t1 + t2.
+  __CLC_GENTYPE z1 =
+      __CLC_AS_GENTYPE(__CLC_AS_ULONGN(tp) & 0xffffffff00000000L);
+  __CLC_GENTYPE z2 = t2 - (z1 - t1);
+  __CLC_GENTYPE trec = -MATH_RECIP(tp);
+  // trec_top is the initial reciprocal estimate with its low 32 bits cleared.
+  __CLC_GENTYPE trec_top =
+      __CLC_AS_GENTYPE(__CLC_AS_ULONGN(trec) & 0xffffffff00000000L);
+
+  // One correction step: trec_top + trec * (1 + trec_top * (z1 + z2)).
+  __CLC_GENTYPE tpr = __clc_fma(
+      __clc_fma(trec_top, z2, __clc_fma(trec_top, z1, 1.0)), trec, trec_top);
+
+  // Results for the transformed case, undoing the pi/4 shift:
+  // tpt  = transform * (1 - tp) / (1 + tp)
+  // tptr = transform * (tp + 1) / (tp - 1)
+  __CLC_GENTYPE tpt = transform * (1.0 - MATH_DIVIDE(2.0 * tp, 1.0 + tp));
+  __CLC_GENTYPE tptr = transform * (MATH_DIVIDE(2.0 * tp, tp - 1.0) - 1.0);
+
+  *leadval = c ? tpt : tp;
+  *tailval = c ? tptr : tpr;
+}
+
+#endif
--- a/libclc/clc/include/clc/math/clc_sinpi.h
+++ b/libclc/clc/include/clc/math/clc_sinpi.h
@ -0,0 +1,20 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef __CLC_MATH_CLC_SINPI_H__
+#define __CLC_MATH_CLC_SINPI_H__
+
+#define __CLC_BODY <clc/math/unary_decl.inc>
+#define __CLC_FUNCTION __clc_sinpi
+
+#include <clc/math/gentype.inc>
+
+#undef __CLC_BODY
+#undef __CLC_FUNCTION
+
+#endif // __CLC_MATH_CLC_SINPI_H__
--- a/libclc/clc/include/clc/math/clc_tanpi.h
+++ b/libclc/clc/include/clc/math/clc_tanpi.h
@ -0,0 +1,20 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef __CLC_MATH_CLC_TANPI_H__
+#define __CLC_MATH_CLC_TANPI_H__
+
+#define __CLC_BODY <clc/math/unary_decl.inc>
+#define __CLC_FUNCTION __clc_tanpi
+
+#include <clc/math/gentype.inc>
+
+#undef __CLC_BODY
+#undef __CLC_FUNCTION
+
+#endif // __CLC_MATH_CLC_TANPI_H__
--- a/libclc/clc/lib/generic/SOURCES
+++ b/libclc/clc/lib/generic/SOURCES
@ -30,6 +30,7 @@ math/clc_atanh.cl
 math/clc_atanpi.cl
 math/clc_ceil.cl
 math/clc_copysign.cl
+math/clc_cospi.cl
 math/clc_ep_log.cl
 math/clc_fabs.cl
 math/clc_fma.cl
@ -46,12 +47,14 @@ math/clc_mad.cl
 math/clc_modf.cl
 math/clc_nan.cl
 math/clc_native_cos.cl
+math/clc_native_divide.cl
 math/clc_native_exp.cl
 math/clc_native_exp2.cl
 math/clc_native_log.cl
 math/clc_native_log10.cl
 math/clc_native_log2.cl
 math/clc_native_rsqrt.cl
+math/clc_native_recip.cl
 math/clc_native_sin.cl
 math/clc_native_sqrt.cl
 math/clc_nextafter.cl
@ -65,9 +68,11 @@ math/clc_rootn.cl
 math/clc_round.cl
 math/clc_rsqrt.cl
 math/clc_sincos_helpers.cl
+math/clc_sinpi.cl
 math/clc_sqrt.cl
 math/clc_sw_fma.cl
 math/clc_tables.cl
+math/clc_tanpi.cl
 math/clc_trunc.cl
 relational/clc_all.cl
 relational/clc_any.cl
--- a/libclc/clc/lib/generic/math/clc_cospi.cl
+++ b/libclc/clc/lib/generic/math/clc_cospi.cl
@ -0,0 +1,18 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include <clc/clc_convert.h>
+#include <clc/float/definitions.h>
+#include <clc/internal/clc.h>
+#include <clc/math/clc_fabs.h>
+#include <clc/math/clc_sincos_helpers.h>
+#include <clc/math/clc_sincos_piby4.h>
+#include <clc/math/math.h>
+
+#define __CLC_BODY <clc_cospi.inc>
+#include <clc/math/gentype.inc>
--- a/libclc/clc/lib/generic/math/clc_cospi.inc
+++ b/libclc/clc/lib/generic/math/clc_cospi.inc
@ -0,0 +1,116 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#if __CLC_FPSIZE == 32
+
+// Single-precision cospi(x) = cos(pi * x).
+// |x| is split into an integer part and a fraction r; depending on which
+// quarter r falls in, either the sine or the cosine of a reduced argument is
+// used. The result is assembled as a raw bit pattern so that all special
+// cases can be handled with selects.
+_CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE __clc_cospi(__CLC_GENTYPE x) {
+  __CLC_GENTYPE absx = __clc_fabs(x);
+  __CLC_INTN ix = __CLC_AS_INTN(absx);
+  // Integer part of |x| (as produced by the integer conversion) and the
+  // remaining fraction r.
+  __CLC_INTN iax = __CLC_CONVERT_INTN(absx);
+  __CLC_GENTYPE r = absx - __CLC_CONVERT_GENTYPE(iax);
+  // Sign-bit mask that is set when the integer part is odd.
+  __CLC_INTN xodd = (iax & 0x1) != 0 ? (__CLC_INTN)0x80000000 : (__CLC_INTN)0;
+
+  // Initialize with return for +-Inf and NaN
+  __CLC_INTN ir = QNANBITPATT_SP32;
+
+  // 2^24 <= |x| < Inf, the result is always even integer
+  // 0x3f800000 is the bit pattern of 1.0f.
+  ir = ix < PINFBITPATT_SP32 ? 0x3f800000 : ir;
+
+  // 2^23 <= |x| < 2^24, the result is always integer
+  // The result is +1 or -1 depending on the parity of the integer.
+  ir = ix < 0x4b800000 ? xodd | 0x3f800000 : ir;
+
+  // |x| < 2^23, result depends on which 0.25 interval r falls in.
+  // a is the reduced argument (before scaling by pi), e selects cosine
+  // (non-zero) or sine (zero), and s is the sign mask applied to the result.
+  // The selects below are ordered so that later, narrower ranges override
+  // earlier ones.
+
+  // r < 1.0
+  __CLC_GENTYPE a = 1.0f - r;
+  __CLC_INTN e = 1;
+  __CLC_INTN s = xodd ^ (__CLC_INTN)0x80000000;
+
+  // r <= 0.75
+  __CLC_INTN c = r <= 0.75f;
+  a = c ? r - 0.5f : a;
+  e = c ? 0 : e;
+
+  // r < 0.5
+  c = r < 0.5f;
+  a = c ? 0.5f - r : a;
+  s = c ? xodd : s;
+
+  // r <= 0.25
+  c = r <= 0.25f;
+  a = c ? r : a;
+  e = c ? 1 : e;
+
+  __CLC_GENTYPE sinval, cosval;
+  __clc_sincos_piby4(a * M_PI_F, &sinval, &cosval);
+  // Pick sine or cosine and apply the sign by XOR-ing the sign bit.
+  __CLC_INTN jr = s ^ __CLC_AS_INTN(e != 0 ? cosval : sinval);
+
+  // 0x4b000000 is the bit pattern of 2^23.
+  ir = ix < 0x4b000000 ? jr : ir;
+
+  return __CLC_AS_GENTYPE(ir);
+}
+
+#elif __CLC_FPSIZE == 64
+
+// Double-precision cospi(x) = cos(pi * x).
+// Same scheme as the single-precision variant: split |x| into an integer
+// part and a fraction r, pick sine or cosine of a reduced argument by the
+// quarter r falls in, and assemble the result as a raw bit pattern.
+_CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE __clc_cospi(__CLC_GENTYPE x) {
+  __CLC_GENTYPE absx = __clc_fabs(x);
+  __CLC_LONGN ix = __CLC_AS_LONGN(absx);
+  // Integer part of |x| (as produced by the integer conversion) and the
+  // remaining fraction r.
+  __CLC_LONGN iax = __CLC_CONVERT_LONGN(absx);
+  __CLC_GENTYPE r = absx - __CLC_CONVERT_GENTYPE(iax);
+  // Sign-bit mask that is set when the integer part is odd.
+  __CLC_LONGN xodd =
+      (iax & 0x1L) != 0 ? (__CLC_LONGN)0x8000000000000000L : (__CLC_LONGN)0L;
+
+  // Initialize with return for +-Inf and NaN
+  __CLC_LONGN ir = QNANBITPATT_DP64;
+
+  // 2^53 <= |x| < Inf, the result is always even integer
+  // 0x3ff0000000000000 is the bit pattern of 1.0.
+  ir = ix < PINFBITPATT_DP64 ? 0x3ff0000000000000L : ir;
+
+  // 2^52 <= |x| < 2^53, the result is always integer
+  // The result is +1 or -1 depending on the parity of the integer.
+  ir = absx < 0x1.0p+53 ? xodd | 0x3ff0000000000000L : ir;
+
+  // |x| < 2^52, result depends on which 0.25 interval r falls in.
+  // a is the reduced argument (before scaling by pi), e selects cosine
+  // (non-zero) or sine (zero), and s is the sign mask applied to the result.
+  // The selects below are ordered so that later, narrower ranges override
+  // earlier ones.
+
+  // r < 1.0
+  __CLC_GENTYPE a = 1.0 - r;
+  __CLC_LONGN e = 1;
+  __CLC_LONGN s = xodd ^ (__CLC_LONGN)0x8000000000000000L;
+
+  // r <= 0.75
+  __CLC_LONGN c = r <= 0.75;
+  __CLC_GENTYPE t = r - 0.5;
+  a = c ? t : a;
+  e = c ? 0 : e;
+
+  // r < 0.5
+  c = r < 0.5;
+  t = 0.5 - r;
+  a = c ? t : a;
+  s = c ? xodd : s;
+
+  // r <= 0.25
+  c = r <= 0.25;
+  a = c ? r : a;
+  e = c ? 1 : e;
+
+  __CLC_GENTYPE sinval, cosval;
+  // The reduced argument has no tail part, so 0.0 is passed for it.
+  __clc_sincos_piby4(a * M_PI, 0.0, &sinval, &cosval);
+  // Pick sine or cosine and apply the sign by XOR-ing the sign bit.
+  __CLC_LONGN jr = s ^ __CLC_AS_LONGN(e != 0 ? cosval : sinval);
+
+  ir = absx < 0x1.0p+52 ? jr : ir;
+
+  return __CLC_AS_GENTYPE(ir);
+}
+
+#elif __CLC_FPSIZE == 16
+
+// Half-precision cospi: convert the argument to float, evaluate the float
+// overload, and convert the result back to the half type.
+_CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE __clc_cospi(__CLC_GENTYPE x) {
+  return __CLC_CONVERT_GENTYPE(__clc_cospi(__CLC_CONVERT_FLOATN(x)));
+}
+
+#endif
--- a/libclc/clc/lib/generic/math/clc_native_divide.cl
+++ b/libclc/clc/lib/generic/math/clc_native_divide.cl
@ -0,0 +1,14 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include <clc/internal/clc.h>
+
+#define __FLOAT_ONLY
+#define __CLC_BODY <clc_native_divide.inc>
+
+#include <clc/math/gentype.inc>
--- a/libclc/clc/lib/generic/math/clc_native_divide.inc
+++ b/libclc/clc/lib/generic/math/clc_native_divide.inc
@ -0,0 +1,12 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// Returns x / y using the plain floating-point division operator.
+_CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE __clc_native_divide(__CLC_GENTYPE x,
+                                                         __CLC_GENTYPE y) {
+  return x / y;
+}
--- a/libclc/clc/lib/generic/math/clc_native_recip.cl
+++ b/libclc/clc/lib/generic/math/clc_native_recip.cl
@ -6,6 +6,9 @@
 //
 //===----------------------------------------------------------------------===//

-_CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE native_divide(__CLC_GENTYPE x, __CLC_GENTYPE y) {
-  return x / y;
-}
+#include <clc/internal/clc.h>
+
+#define __FLOAT_ONLY
+#define __CLC_BODY <clc_native_recip.inc>
+
+#include <clc/math/gentype.inc>
--- a/libclc/clc/lib/generic/math/clc_native_recip.inc
+++ b/libclc/clc/lib/generic/math/clc_native_recip.inc
@ -6,6 +6,6 @@
 //
 //===----------------------------------------------------------------------===//

-_CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE native_recip(__CLC_GENTYPE val) {
+// Returns the reciprocal 1.0f / val using plain floating-point division.
+_CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE __clc_native_recip(__CLC_GENTYPE val) {
   return 1.0f / val;
 }
--- a/libclc/clc/lib/generic/math/clc_sincos_helpers.cl
+++ b/libclc/clc/lib/generic/math/clc_sincos_helpers.cl
@ -12,6 +12,7 @@
 #include <clc/internal/clc.h>
 #include <clc/math/clc_fma.h>
 #include <clc/math/clc_mad.h>
+#include <clc/math/clc_native_divide.h>
 #include <clc/math/clc_trunc.h>
 #include <clc/math/math.h>

--- a/libclc/clc/lib/generic/math/clc_sincos_helpers.inc
+++ b/libclc/clc/lib/generic/math/clc_sincos_helpers.inc
@ -74,6 +74,25 @@ _CLC_DEF _CLC_OVERLOAD __CLC_FLOATN __clc_cosf_piby4(__CLC_FLOATN x,
  return ret;
 }

+// Evaluate the single-precision tangent of x on the reduced interval.
+// Returns tan(x) when the low bit of regn is clear and -1/tan(x) when it is
+// set.
+_CLC_DEF _CLC_OVERLOAD __CLC_FLOATN __clc_tanf_piby4(__CLC_FLOATN x,
+                                                     __CLC_INTN regn) {
+  // Core Remez [1,2] approximation to tan(x) on the interval [0,pi/4].
+  __CLC_FLOATN r = x * x;
+
+  // Numerator polynomial in r.
+  __CLC_FLOATN a =
+      __clc_mad(r, -0.0172032480471481694693109f, 0.385296071263995406715129f);
+
+  // Denominator polynomial in r.
+  __CLC_FLOATN b = __clc_mad(
+      r,
+      __clc_mad(r, 0.01844239256901656082986661f, -0.51396505478854532132342f),
+      1.15588821434688393452299f);
+
+  // t = x + x^3 * a / b
+  __CLC_FLOATN t = __clc_mad(x * r, __clc_native_divide(a, b), x);
+  __CLC_FLOATN tr = -MATH_RECIP(t);
+
+  // Compare explicitly against zero: for vector types the ?: operator selects
+  // on the most significant bit of each condition component, which a bare
+  // `regn & 1` never sets.
+  return (regn & 1) != 0 ? tr : t;
+}
+
 _CLC_DEF _CLC_OVERLOAD void __clc_fullMulS(private __CLC_FLOATN *hi,
                                           private __CLC_FLOATN *lo,
                                           __CLC_FLOATN a, __CLC_FLOATN b,
--- a/libclc/clc/lib/generic/math/clc_sinpi.cl
+++ b/libclc/clc/lib/generic/math/clc_sinpi.cl
@ -0,0 +1,18 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include <clc/clc_convert.h>
+#include <clc/float/definitions.h>
+#include <clc/internal/clc.h>
+#include <clc/math/clc_fabs.h>
+#include <clc/math/clc_sincos_helpers.h>
+#include <clc/math/clc_sincos_piby4.h>
+#include <clc/math/math.h>
+
+#define __CLC_BODY <clc_sinpi.inc>
+#include <clc/math/gentype.inc>
--- a/libclc/clc/lib/generic/math/clc_sinpi.inc
+++ b/libclc/clc/lib/generic/math/clc_sinpi.inc
@ -0,0 +1,114 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#if __CLC_FPSIZE == 32
+
+// Single-precision sinpi(x) = sin(pi * x).
+// |x| is split into an integer part and a fraction r; depending on which
+// quarter r falls in, either the sine or the cosine of a reduced argument is
+// used. The result is assembled as a raw bit pattern so that all special
+// cases can be handled with selects.
+_CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE __clc_sinpi(__CLC_GENTYPE x) {
+  __CLC_INTN ix = __CLC_AS_INTN(x);
+  // Separate the sign bit; ix then holds the bits of |x|.
+  __CLC_INTN xsgn = ix & (__CLC_INTN)0x80000000;
+  ix ^= xsgn;
+  __CLC_GENTYPE absx = __clc_fabs(x);
+  // Integer part of |x| (as produced by the integer conversion) and the
+  // remaining fraction r.
+  __CLC_INTN iax = __CLC_CONVERT_INTN(absx);
+  __CLC_GENTYPE r = absx - __CLC_CONVERT_GENTYPE(iax);
+  // Result sign mask: the sign of x, flipped when the integer part is odd.
+  __CLC_INTN xodd =
+      xsgn ^ ((iax & 0x1) != 0 ? (__CLC_INTN)0x80000000 : (__CLC_INTN)0);
+
+  // Initialize with return for +-Inf and NaN
+  __CLC_INTN ir = QNANBITPATT_SP32;
+
+  // 2^23 <= |x| < Inf, the result is always integer
+  // The result is a zero carrying the sign of x.
+  ir = ix < PINFBITPATT_SP32 ? xsgn : ir;
+
+  // |x| < 2^23, result depends on which 0.25 interval r falls in.
+  // a is the reduced argument (before scaling by pi) and e selects cosine
+  // (non-zero) or sine (zero). The selects below are ordered so that later,
+  // narrower ranges override earlier ones.
+
+  // r < 1.0
+  __CLC_GENTYPE a = 1.0f - r;
+  __CLC_INTN e = 0;
+
+  // r <= 0.75
+  __CLC_INTN c = r <= 0.75f;
+  a = c ? r - 0.5f : a;
+  e = c ? 1 : e;
+
+  // r < 0.5
+  c = r < 0.5f;
+  a = c ? 0.5f - r : a;
+
+  // 0 < r <= 0.25
+  c = r <= 0.25f;
+  a = c ? r : a;
+  e = c ? 0 : e;
+
+  __CLC_GENTYPE sinval, cosval;
+  __clc_sincos_piby4(a * M_PI_F, &sinval, &cosval);
+  // Pick sine or cosine and apply the sign by XOR-ing the sign bit.
+  __CLC_INTN jr = xodd ^ __CLC_AS_INTN(e != 0 ? cosval : sinval);
+
+  // 0x4b000000 is the bit pattern of 2^23.
+  ir = ix < 0x4b000000 ? jr : ir;
+
+  return __CLC_AS_GENTYPE(ir);
+}
+
+#elif __CLC_FPSIZE == 64
+
+// Double-precision sinpi(x) = sin(pi * x).
+// Same scheme as the single-precision variant: split |x| into an integer
+// part and a fraction r, pick sine or cosine of a reduced argument by the
+// quarter r falls in, and assemble the result as a raw bit pattern.
+_CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE __clc_sinpi(__CLC_GENTYPE x) {
+  __CLC_LONGN ix = __CLC_AS_LONGN(x);
+  // Separate the sign bit; ix then holds the bits of |x|.
+  __CLC_LONGN xsgn = ix & (__CLC_LONGN)0x8000000000000000L;
+  ix ^= xsgn;
+  __CLC_GENTYPE absx = __clc_fabs(x);
+  // Integer part of |x| (as produced by the integer conversion) and the
+  // remaining fraction r.
+  __CLC_LONGN iax = __CLC_CONVERT_LONGN(absx);
+  __CLC_GENTYPE r = absx - __CLC_CONVERT_GENTYPE(iax);
+  // Result sign mask: the sign of x, flipped when the integer part is odd.
+  __CLC_LONGN xodd =
+      xsgn ^
+      ((iax & 0x1L) != 0 ? (__CLC_LONGN)0x8000000000000000L : (__CLC_LONGN)0L);
+
+  // Initialize with return for +-Inf and NaN
+  __CLC_LONGN ir = QNANBITPATT_DP64;
+
+  // 2^52 <= |x| < Inf, the result is always integer
+  // The result is a zero carrying the sign of x.
+  ir = ix < PINFBITPATT_DP64 ? xsgn : ir;
+
+  // |x| < 2^52, result depends on which 0.25 interval r falls in.
+  // a is the reduced argument (before scaling by pi) and e selects cosine
+  // (non-zero) or sine (zero). The selects below are ordered so that later,
+  // narrower ranges override earlier ones.
+
+  // r < 1.0
+  __CLC_GENTYPE a = 1.0 - r;
+  __CLC_LONGN e = 0;
+
+  //  r <= 0.75
+  __CLC_LONGN c = r <= 0.75;
+  __CLC_GENTYPE t = r - 0.5;
+  a = c ? t : a;
+  e = c ? 1 : e;
+
+  // r < 0.5
+  c = r < 0.5;
+  t = 0.5 - r;
+  a = c ? t : a;
+
+  // r <= 0.25
+  c = r <= 0.25;
+  a = c ? r : a;
+  e = c ? 0 : e;
+
+  __CLC_GENTYPE api = a * M_PI;
+
+  __CLC_GENTYPE sinval, cosval;
+  // The reduced argument has no tail part, so 0.0 is passed for it.
+  __clc_sincos_piby4(api, 0.0, &sinval, &cosval);
+  // Pick sine or cosine and apply the sign by XOR-ing the sign bit.
+  __CLC_LONGN jr = xodd ^ __CLC_AS_LONGN(e != 0 ? cosval : sinval);
+
+  ir = absx < 0x1.0p+52 ? jr : ir;
+
+  return __CLC_AS_GENTYPE(ir);
+}
+
+#elif __CLC_FPSIZE == 16
+
+// Half-precision sinpi: convert the argument to float, evaluate the float
+// overload, and convert the result back to the half type.
+_CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE __clc_sinpi(__CLC_GENTYPE x) {
+  return __CLC_CONVERT_GENTYPE(__clc_sinpi(__CLC_CONVERT_FLOATN(x)));
+}
+
+#endif
--- a/libclc/clc/lib/generic/math/clc_tanpi.cl
+++ b/libclc/clc/lib/generic/math/clc_tanpi.cl
@ -0,0 +1,19 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include <clc/clc_convert.h>
+#include <clc/float/definitions.h>
+#include <clc/internal/clc.h>
+#include <clc/math/clc_fabs.h>
+#include <clc/math/clc_native_recip.h>
+#include <clc/math/clc_sincos_helpers.h>
+#include <clc/math/clc_sincos_piby4.h>
+#include <clc/math/math.h>
+
+#define __CLC_BODY <clc_tanpi.inc>
+#include <clc/math/gentype.inc>
--- a/libclc/clc/lib/generic/math/clc_tanpi.inc
+++ b/libclc/clc/lib/generic/math/clc_tanpi.inc
@ -0,0 +1,132 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#if __CLC_FPSIZE == 32
+
+// Single-precision tanpi(x) = tan(pi * x).
+// |x| is split into an integer part and a fraction r; depending on which
+// quarter r falls in, either the tangent or its negated reciprocal of a
+// reduced argument is used. The result is assembled as a raw bit pattern so
+// that all special cases can be handled with selects.
+_CLC_DEF _CLC_OVERLOAD __CLC_GENTYPE __clc_tanpi(__CLC_GENTYPE x) {
+  __CLC_INTN ix = __CLC_AS_INTN(x);
+  // xsgn is the sign bit of x, xnsgn the opposite sign bit; ix then holds the
+  // bits of |x|.
+  __CLC_INTN xsgn = ix & (__CLC_INTN)SIGNBIT_SP32;
+  __CLC_INTN xnsgn = xsgn ^ (__CLC_INTN)SIGNBIT_SP32;
+  ix ^= xsgn;
+  __CLC_GENTYPE absx = __clc_fabs(x);
+  // Integer part of |x| (as produced by the integer conversion) and the
+  // remaining fraction r.
+  __CLC_INTN iax = __CLC_CONVERT_INTN(absx);
+  __CLC_GENTYPE r = absx - __CLC_CONVERT_GENTYPE(iax);
+  // The sign of x, flipped when the integer part is odd.
+  __CLC_INTN xodd = xsgn ^ __CLC_AS_INTN((iax & 0x1) != 0 ? SIGNBIT_SP32 : 0);
+
+  // Initialize with return for +-Inf and NaN
+  __CLC_INTN ir = QNANBITPATT_SP32;
+
+  // 2^24 <= |x| < Inf, the result is always even integer
+  // The result is a zero carrying the sign of x.
+  ir = ix < PINFBITPATT_SP32 ? xsgn : ir;
+
+  // 2^23 <= |x| < 2^24, the result is always integer
+  // The result is a zero whose sign also depends on the parity.
+  ir = ix < 0x4b800000 ? xodd : ir;
+
+  // |x| < 2^23, result depends on which 0.25 interval r falls in.
+  // a is the reduced argument (before scaling by pi), e selects the negated
+  // reciprocal (non-zero) or the tangent itself (zero), and s is the sign
+  // mask applied to the result. The selects below are ordered so that later,
+  // narrower ranges override earlier ones.
+
+  // r < 1.0
+  __CLC_GENTYPE a = 1.0f - r;
+  __CLC_INTN e = 0;
+  __CLC_INTN s = xnsgn;
+
+  // r <= 0.75
+  __CLC_INTN c = r <= 0.75f;
+  a = c ? r - 0.5f : a;
+  e = c ? 1 : e;
+  s = c ? xsgn : s;
+
+  // r < 0.5
+  c = r < 0.5f;
+  a = c ? 0.5f - r : a;
+  s = c ? xnsgn : s;
+
+  // 0 < r <= 0.25
+  c = r <= 0.25f;
+  a = c ? r : a;
+  e = c ? 0 : e;
+  s = c ? xsgn : s;
+
+  // regn is passed as 0, so the helper returns the tangent itself; the
+  // negated reciprocal is formed here.
+  __CLC_GENTYPE t = __clc_tanf_piby4(a * M_PI_F, 0);
+  __CLC_GENTYPE tr = -__clc_native_recip(t);
+  __CLC_INTN jr = s ^ __CLC_AS_INTN(e != 0 ? tr : t);
+
+  // At r == 0.5 the result is an infinity (0x7f800000) with sign xodd.
+  jr = r == 0.5f ? xodd | 0x7f800000 : jr;
+
+  // 0x4b000000 is the bit pattern of 2^23.
+  ir = ix < 0x4b000000 ? jr : ir;
+
+  return __CLC_AS_GENTYPE(ir);
+}
+
+#elif __CLC_FPSIZE == 64
+
+// Double-precision tanpi(x) = tan(pi * x).
+// Same scheme as the single-precision variant: split |x| into an integer
+// part and a fraction r, pick the tangent or its negated reciprocal of a
+// reduced argument by the quarter r falls in, and assemble the result as a
+// raw bit pattern.
+_CLC_DEF _CLC_OVERLOAD __CLC_GENTYPE __clc_tanpi(__CLC_GENTYPE x) {
+  __CLC_LONGN ix = __CLC_AS_LONGN(x);
+  // xsgn is the sign bit of x, xnsgn the opposite sign bit; ix then holds the
+  // bits of |x|.
+  __CLC_LONGN xsgn = ix & (__CLC_LONGN)0x8000000000000000L;
+  __CLC_LONGN xnsgn = xsgn ^ (__CLC_LONGN)0x8000000000000000L;
+  ix ^= xsgn;
+  __CLC_GENTYPE absx = __clc_fabs(x);
+  // Integer part of |x| (as produced by the integer conversion) and the
+  // remaining fraction r.
+  __CLC_LONGN iax = __CLC_CONVERT_LONGN(absx);
+  __CLC_GENTYPE r = absx - __CLC_CONVERT_GENTYPE(iax);
+  // The sign of x, flipped when the integer part is odd.
+  __CLC_LONGN xodd =
+      xsgn ^ __CLC_AS_LONGN((iax & 0x1) != 0 ? 0x8000000000000000L : 0L);
+
+  // Initialize with return for +-Inf and NaN
+  __CLC_LONGN ir = QNANBITPATT_DP64;
+
+  // 2^53 <= |x| < Inf, the result is always even integer
+  // The result is a zero carrying the sign of x.
+  ir = ix < PINFBITPATT_DP64 ? xsgn : ir;
+
+  // 2^52 <= |x| < 2^53, the result is always integer
+  // 0x4340000000000000 is the bit pattern of 2^53.
+  ir = ix < 0x4340000000000000L ? xodd : ir;
+
+  // |x| < 2^52, result depends on which 0.25 interval r falls in.
+  // a is the reduced argument (before scaling by pi), e selects the negated
+  // reciprocal (non-zero) or the tangent itself (zero), and s is the sign
+  // mask applied to the result. The selects below are ordered so that later,
+  // narrower ranges override earlier ones.
+
+  // r < 1.0
+  __CLC_GENTYPE a = 1.0 - r;
+  __CLC_LONGN e = 0;
+  __CLC_LONGN s = xnsgn;
+
+  // r <= 0.75
+  __CLC_LONGN c = r <= 0.75;
+  __CLC_GENTYPE t = r - 0.5;
+  a = c ? t : a;
+  e = c ? 1 : e;
+  s = c ? xsgn : s;
+
+  // r < 0.5
+  c = r < 0.5;
+  t = 0.5 - r;
+  a = c ? t : a;
+  s = c ? xnsgn : s;
+
+  // r <= 0.25
+  c = r <= 0.25;
+  a = c ? r : a;
+  e = c ? 0 : e;
+  s = c ? xsgn : s;
+
+  __CLC_GENTYPE api = a * M_PI;
+  // Despite the names, lo receives the tangent and hi receives the negated
+  // reciprocal of the tangent (the helper's two outputs, in that order).
+  __CLC_GENTYPE lo, hi;
+  __clc_tan_piby4(api, 0.0, &lo, &hi);
+  __CLC_LONGN jr = s ^ __CLC_AS_LONGN(e != 0 ? hi : lo);
+
+  // At r == 0.5 the result is an infinity (0x7ff0000000000000) with sign xodd.
+  __CLC_LONGN si = xodd | 0x7ff0000000000000L;
+  jr = r == 0.5 ? si : jr;
+
+  // 0x4330000000000000 is the bit pattern of 2^52.
+  ir = ix < 0x4330000000000000L ? jr : ir;
+
+  return __CLC_AS_GENTYPE(ir);
+}
+
+#elif __CLC_FPSIZE == 16
+
+// Half-precision tanpi: convert the argument to float, evaluate the float
+// overload, and convert the result back to the half type.
+_CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE __clc_tanpi(__CLC_GENTYPE x) {
+  return __CLC_CONVERT_GENTYPE(__clc_tanpi(__CLC_CONVERT_FLOATN(x)));
+}
+
+#endif
--- a/libclc/clspv/lib/SOURCES
+++ b/libclc/clspv/lib/SOURCES
@ -18,7 +18,6 @@ subnormal_config.cl
 ../../generic/lib/math/cbrt.cl
 ../../generic/lib/math/clc_exp10.cl
 ../../generic/lib/math/clc_tan.cl
-../../generic/lib/math/clc_tanpi.cl
 ../../generic/lib/math/cos.cl
 ../../generic/lib/math/cosh.cl
 ../../generic/lib/math/cospi.cl
--- a/libclc/generic/lib/SOURCES
+++ b/libclc/generic/lib/SOURCES
@ -174,7 +174,6 @@ math/sqrt.cl
 math/clc_tan.cl
 math/tan.cl
 math/tanh.cl
-math/clc_tanpi.cl
 math/tanpi.cl
 math/tgamma.cl
 math/trunc.cl
--- a/libclc/generic/lib/math/clc_tan.cl
+++ b/libclc/generic/lib/math/clc_tan.cl
@ -35,7 +35,7 @@ _CLC_DEF _CLC_OVERLOAD float __clc_tan(float x) {
 _CLC_UNARY_VECTORIZE(_CLC_DEF _CLC_OVERLOAD, float, __clc_tan, float);

 #ifdef cl_khr_fp64
-#include "sincosD_piby4.h"
+#include <clc/math/clc_sincos_piby4.h>

 _CLC_DEF _CLC_OVERLOAD double __clc_tan(double x) {
  double y = __clc_fabs(x);
@ -48,9 +48,10 @@ _CLC_DEF _CLC_OVERLOAD double __clc_tan(double x) {
  else
    __clc_remainder_piby2_large(y, &r, &rr, &regn);

-  double2 tt = __clc_tan_piby4(r, rr);
+  double lead, tail;
+  __clc_tan_piby4(r, rr, &lead, &tail);

-  int2 t = as_int2(regn & 1 ? tt.y : tt.x);
+  int2 t = as_int2(regn & 1 ? tail : lead);
  t.hi ^= (x < 0.0) << 31;

  return __clc_isnan(x) || __clc_isinf(x) ? as_double(QNANBITPATT_DP64)
--- a/libclc/generic/lib/math/clc_tanpi.cl
+++ b/libclc/generic/lib/math/clc_tanpi.cl
@ -1,132 +0,0 @@
-//===----------------------------------------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#include "sincos_helpers.h"
-#include <clc/clc.h>
-#include <clc/clcmacro.h>
-#include <clc/math/math.h>
-#include <clc/math/tables.h>
-
-_CLC_DEF _CLC_OVERLOAD float __clc_tanpi(float x)
-{
-    int ix = as_int(x);
-    int xsgn = ix & 0x80000000;
-    int xnsgn = xsgn ^ 0x80000000;
-    ix ^= xsgn;
-    float ax = as_float(ix);
-    int iax = (int)ax;
-    float r = ax - iax;
-    int xodd = xsgn ^ (iax & 0x1 ? 0x80000000 : 0);
-
-    // Initialize with return for +-Inf and NaN
-    int ir = 0x7fc00000;
-
-    // 2^24 <= |x| < Inf, the result is always even integer
-    ir = ix < 0x7f800000 ? xsgn : ir;
-
-    // 2^23 <= |x| < 2^24, the result is always integer
-    ir = ix < 0x4b800000 ? xodd : ir;
-
-    // 0x1.0p-7 <= |x| < 2^23, result depends on which 0.25 interval
-
-    // r < 1.0
-    float a = 1.0f - r;
-    int e = 0;
-    int s = xnsgn;
-
-    // r <= 0.75
-    int c = r <= 0.75f;
-    a = c ? r - 0.5f : a;
-    e = c ? 1 : e;
-    s = c ? xsgn : s;
-
-    // r < 0.5
-    c = r < 0.5f;
-    a = c ? 0.5f - r : a;
-    s = c ? xnsgn : s;
-
-    // 0 < r <= 0.25
-    c = r <= 0.25f;
-    a = c ? r : a;
-    e = c ? 0 : e;
-    s = c ? xsgn : s;
-
-    float t = __clc_tanf_piby4(a * M_PI_F, 0);
-    float tr = -native_recip(t);
-    int jr = s ^ as_int(e ? tr : t);
-
-    jr = r == 0.5f ? xodd | 0x7f800000 : jr;
-
-    ir = ix < 0x4b000000 ? jr : ir;
-
-    return as_float(ir);
-}
-_CLC_UNARY_VECTORIZE(_CLC_DEF _CLC_OVERLOAD, float, __clc_tanpi, float);
-
-#ifdef cl_khr_fp64
-#include "sincosD_piby4.h"
-
-_CLC_DEF _CLC_OVERLOAD double __clc_tanpi(double x)
-{
-    long ix = as_long(x);
-    long xsgn = ix & 0x8000000000000000L;
-    long xnsgn = xsgn ^ 0x8000000000000000L;
-    ix ^= xsgn;
-    double ax = as_double(ix);
-    long iax = (long)ax;
-    double r = ax - iax;
-    long xodd = xsgn ^ (iax & 0x1 ? 0x8000000000000000L : 0L);
-
-    // Initialize with return for +-Inf and NaN
-    long ir = 0x7ff8000000000000L;
-
-    // 2^53 <= |x| < Inf, the result is always even integer
-    ir = ix < 0x7ff0000000000000L ? xsgn : ir;
-
-    // 2^52 <= |x| < 2^53, the result is always integer
-    ir = ix < 0x4340000000000000L ? xodd : ir;
-
-    // 0x1.0p-14 <= |x| < 2^53, result depends on which 0.25 interval
-
-    // r < 1.0
-    double a = 1.0 - r;
-    int e = 0;
-    long s = xnsgn;
-
-    // r <= 0.75
-    int c = r <= 0.75;
-    double t = r - 0.5;
-    a = c ? t : a;
-    e = c ? 1 : e;
-    s = c ? xsgn : s;
-
-    // r < 0.5
-    c = r < 0.5;
-    t = 0.5 - r;
-    a = c ? t : a;
-    s = c ? xnsgn : s;
-
-    // r <= 0.25
-    c = r <= 0.25;
-    a = c ? r : a;
-    e = c ? 0 : e;
-    s = c ? xsgn : s;
-
-    double api = a * M_PI;
-    double2 tt = __clc_tan_piby4(api, 0.0);
-    long jr = s ^ as_long(e ? tt.hi : tt.lo);
-
-    long si = xodd | 0x7ff0000000000000L;
-    jr = r == 0.5 ? si : jr;
-
-    ir = ix < 0x4330000000000000L ? jr : ir;
-
-    return as_double(ir);
-}
-_CLC_UNARY_VECTORIZE(_CLC_DEF _CLC_OVERLOAD, double, __clc_tanpi, double);
-#endif
--- a/libclc/generic/lib/math/cospi.cl
+++ b/libclc/generic/lib/math/cospi.cl
@ -7,124 +7,9 @@
 //===----------------------------------------------------------------------===//

 #include <clc/clc.h>
-#include <clc/clcmacro.h>
-#include <clc/math/math.h>
+#include <clc/math/clc_cospi.h>

-#include "sincos_helpers.h"
-#include "sincospiF_piby4.h"
-#ifdef cl_khr_fp64
-#include "sincosD_piby4.h"
-#endif
+#define FUNCTION cospi
+#define __CLC_BODY <clc/shared/unary_def.inc>

-_CLC_OVERLOAD _CLC_DEF float cospi(float x)
-{
-    int ix = as_int(x) & 0x7fffffff;
-    float ax = as_float(ix);
-    int iax = (int)ax;
-    float r = ax - iax;
-    int xodd = iax & 0x1 ? 0x80000000 : 0;
-
-    // Initialize with return for +-Inf and NaN
-    int ir = 0x7fc00000;
-
-    // 2^24 <= |x| < Inf, the result is always even integer
-    ir = ix < 0x7f800000 ? 0x3f800000 : ir;
-
-    // 2^23 <= |x| < 2^24, the result is always integer
-    ir = ix < 0x4b800000 ? xodd | 0x3f800000 : ir;
-
-    // 0x1.0p-7 <= |x| < 2^23, result depends on which 0.25 interval
-
-    // r < 1.0
-    float a = 1.0f - r;
-    int e = 1;
-    int s = xodd ^ 0x80000000;
-
-    // r <= 0.75
-    int c = r <= 0.75f;
-    a = c ? r - 0.5f : a;
-    e = c ? 0 : e;
-
-    // r < 0.5
-    c = r < 0.5f;
-    a = c ? 0.5f - r : a;
-    s = c ? xodd : s;
-
-    // r <= 0.25
-    c = r <= 0.25f;
-    a = c ? r : a;
-    e = c ? 1 : e;
-
-    float2 t = __libclc__sincosf_piby4(a * M_PI_F);
-    int jr = s ^ as_int(e ? t.hi : t.lo);
-
-    ir = ix < 0x4b000000 ? jr : ir;
-
-    return as_float(ir);
-}
-
-
-_CLC_UNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, float, cospi, float);
-
-#ifdef cl_khr_fp64
-
-#pragma OPENCL EXTENSION cl_khr_fp64 : enable
-
-_CLC_OVERLOAD _CLC_DEF double cospi(double x) {
-
-    long ix = as_long(x) & 0x7fffffffffffffffL;
-    double ax = as_double(ix);
-    long iax = (long)ax;
-    double r = ax - (double)iax;
-    long xodd = iax & 0x1L ? 0x8000000000000000L : 0L;
-
-    // Initialize with return for +-Inf and NaN
-    long ir = 0x7ff8000000000000L;
-
-    // 2^53 <= |x| < Inf, the result is always even integer
-    ir = ix < 0x7ff0000000000000 ? 0x3ff0000000000000L : ir;
-
-    // 2^52 <= |x| < 2^53, the result is always integer
-    ir = ax < 0x1.0p+53 ? xodd | 0x3ff0000000000000L : ir;
-
-    // 0x1.0p-7 <= |x| < 2^52, result depends on which 0.25 interval
-
-    // r < 1.0
-    double a = 1.0 - r;
-    int e = 1;
-    long s = xodd ^ 0x8000000000000000L;
-
-    // r <= 0.75
-    int c = r <= 0.75;
-    double t = r - 0.5;
-    a = c ? t : a;
-    e = c ? 0 : e;
-
-    // r < 0.5
-    c = r < 0.5;
-    t = 0.5 - r;
-    a = c ? t : a;
-    s = c ? xodd : s;
-
-    // r <= 0.25
-    c = r <= 0.25;
-    a = c ? r : a;
-    e = c ? 1 : e;
-
-    double2 sc = __libclc__sincos_piby4(a * M_PI, 0.0);
-    long jr = s ^ as_long(e ? sc.hi : sc.lo);
-
-    ir = ax < 0x1.0p+52 ? jr : ir;
-
-    return as_double(ir);
-}
-_CLC_UNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, double, cospi, double);
-#endif
-
-#ifdef cl_khr_fp16
-
-#pragma OPENCL EXTENSION cl_khr_fp16 : enable
-
-_CLC_DEFINE_UNARY_BUILTIN_FP16(cospi)
-
-#endif
+#include <clc/math/gentype.inc>
--- a/libclc/generic/lib/math/native_divide.cl
+++ b/libclc/generic/lib/math/native_divide.cl
@ -7,7 +7,10 @@
 //===----------------------------------------------------------------------===//

 #include <clc/clc.h>
+#include <clc/math/clc_native_divide.h>

-#define __CLC_BODY <native_divide.inc>
 #define __FLOAT_ONLY
+#define FUNCTION native_divide
+#define __CLC_BODY <clc/shared/binary_def.inc>
+
 #include <clc/math/gentype.inc>
--- a/libclc/generic/lib/math/native_recip.cl
+++ b/libclc/generic/lib/math/native_recip.cl
@ -7,7 +7,10 @@
 //===----------------------------------------------------------------------===//

 #include <clc/clc.h>
+#include <clc/math/clc_native_recip.h>

-#define __CLC_BODY <native_recip.inc>
 #define __FLOAT_ONLY
+#define FUNCTION native_recip
+#define __CLC_BODY <clc/shared/unary_def.inc>
+
 #include <clc/math/gentype.inc>
--- a/libclc/generic/lib/math/sincosD_piby4.h
+++ b/libclc/generic/lib/math/sincosD_piby4.h
@ -1,119 +0,0 @@
-//===----------------------------------------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#pragma OPENCL EXTENSION cl_khr_fp64 : enable
-
-_CLC_INLINE double2
-__libclc__sincos_piby4(double x, double xx)
-{
-    // Taylor series for sin(x) is x - x^3/3! + x^5/5! - x^7/7! ...
-    //                      = x * (1 - x^2/3! + x^4/5! - x^6/7! ...
-    //                      = x * f(w)
-    // where w = x*x and f(w) = (1 - w/3! + w^2/5! - w^3/7! ...
-    // We use a minimax approximation of (f(w) - 1) / w
-    // because this produces an expansion in even powers of x.
-    // If xx (the tail of x) is non-zero, we add a correction
-    // term g(x,xx) = (1-x*x/2)*xx to the result, where g(x,xx)
-    // is an approximation to cos(x)*sin(xx) valid because
-    // xx is tiny relative to x.
-
-    // Taylor series for cos(x) is 1 - x^2/2! + x^4/4! - x^6/6! ...
-    //                      = f(w)
-    // where w = x*x and f(w) = (1 - w/2! + w^2/4! - w^3/6! ...
-    // We use a minimax approximation of (f(w) - 1 + w/2) / (w*w)
-    // because this produces an expansion in even powers of x.
-    // If xx (the tail of x) is non-zero, we subtract a correction
-    // term g(x,xx) = x*xx to the result, where g(x,xx)
-    // is an approximation to sin(x)*sin(xx) valid because
-    // xx is tiny relative to x.
-
-    const double sc1 = -0.166666666666666646259241729;
-    const double sc2 =  0.833333333333095043065222816e-2;
-    const double sc3 = -0.19841269836761125688538679e-3;
-    const double sc4 =  0.275573161037288022676895908448e-5;
-    const double sc5 = -0.25051132068021699772257377197e-7;
-    const double sc6 =  0.159181443044859136852668200e-9;
-
-    const double cc1 =  0.41666666666666665390037e-1;
-    const double cc2 = -0.13888888888887398280412e-2;
-    const double cc3 =  0.248015872987670414957399e-4;
-    const double cc4 = -0.275573172723441909470836e-6;
-    const double cc5 =  0.208761463822329611076335e-8;
-    const double cc6 = -0.113826398067944859590880e-10;
-
-    double x2 = x * x;
-    double x3 = x2 * x;
-    double r = 0.5 * x2;
-    double t = 1.0 - r;
-
-    double sp = fma(fma(fma(fma(sc6, x2, sc5), x2, sc4), x2, sc3), x2, sc2);
-
-    double cp = t + fma(fma(fma(fma(fma(fma(cc6, x2, cc5), x2, cc4), x2, cc3), x2, cc2), x2, cc1),
-                        x2*x2, fma(x, xx, (1.0 - t) - r));
-
-    double2 ret;
-    ret.lo = x - fma(-x3, sc1, fma(fma(-x3, sp, 0.5*xx), x2, -xx));
-    ret.hi = cp;
-
-    return ret;
-}
-
-_CLC_INLINE double2
-__clc_tan_piby4(double x, double xx)
-{
-    const double piby4_lead = 7.85398163397448278999e-01; // 0x3fe921fb54442d18
-    const double piby4_tail = 3.06161699786838240164e-17; // 0x3c81a62633145c06
-
-    // In order to maintain relative precision transform using the identity:
-    // tan(pi/4-x) = (1-tan(x))/(1+tan(x)) for arguments close to pi/4.
-    // Similarly use tan(x-pi/4) = (tan(x)-1)/(tan(x)+1) close to -pi/4.
-
-    int ca = x >  0.68;
-    int cb = x < -0.68;
-    double transform = ca ?  1.0 : 0.0;
-    transform = cb ? -1.0 : transform;
-
-    double tx = fma(-transform, x, piby4_lead) + fma(-transform, xx, piby4_tail);
-    int c = ca | cb;
-    x = c ? tx : x;
-    xx = c ? 0.0 : xx;
-
-    // Core Remez [2,3] approximation to tan(x+xx) on the interval [0,0.68].
-    double t1 = x;
-    double r = fma(2.0, x*xx, x*x);
-
-    double a = fma(r,
-                   fma(r, 0.224044448537022097264602535574e-3, -0.229345080057565662883358588111e-1),
-                   0.372379159759792203640806338901e0);
-
-    double b = fma(r,
-                   fma(r,
-                       fma(r, -0.232371494088563558304549252913e-3, 0.260656620398645407524064091208e-1),
-                       -0.515658515729031149329237816945e0),
-                   0.111713747927937668539901657944e1);
-
-    double t2 = fma(MATH_DIVIDE(a, b), x*r, xx);
-
-    double tp = t1 + t2;
-
-    // Compute -1.0/(t1 + t2) accurately
-    double z1 = as_double(as_long(tp) & 0xffffffff00000000L);
-    double z2 = t2 - (z1 - t1);
-    double trec = -MATH_RECIP(tp);
-    double trec_top = as_double(as_long(trec) & 0xffffffff00000000L);
-
-    double tpr = fma(fma(trec_top, z2, fma(trec_top, z1, 1.0)), trec, trec_top);
-
-    double tpt = transform * (1.0 - MATH_DIVIDE(2.0*tp, 1.0 + tp));
-    double tptr = transform * (MATH_DIVIDE(2.0*tp, tp - 1.0) - 1.0);
-
-    double2 ret;
-    ret.lo = c ? tpt : tp;
-    ret.hi = c ? tptr : tpr;
-    return ret;
-}
--- a/libclc/generic/lib/math/sincos_helpers.cl
+++ b/libclc/generic/lib/math/sincos_helpers.cl
@ -17,31 +17,13 @@
 #include <clc/math/tables.h>
 #include <clc/shared/clc_max.h>

-#define bytealign(src0, src1, src2)                                            \
-  ((uint)(((((long)(src0)) << 32) | (long)(src1)) >> (((src2) & 3) * 8)))
-
-_CLC_DEF float __clc_tanf_piby4(float x, int regn) {
-  // Core Remez [1,2] approximation to tan(x) on the interval [0,pi/4].
-  float r = x * x;
-
-  float a =
-      __clc_mad(r, -0.0172032480471481694693109f, 0.385296071263995406715129f);
-
-  float b = __clc_mad(
-      r,
-      __clc_mad(r, 0.01844239256901656082986661f, -0.51396505478854532132342f),
-      1.15588821434688393452299f);
-
-  float t = __clc_mad(x * r, native_divide(a, b), x);
-  float tr = -MATH_RECIP(t);
-
-  return regn & 1 ? tr : t;
-}
-
 #ifdef cl_khr_fp64

 #pragma OPENCL EXTENSION cl_khr_fp64 : enable

+#define bytealign(src0, src1, src2)                                            \
+  ((uint)(((((long)(src0)) << 32) | (long)(src1)) >> (((src2) & 3) * 8)))
+
 // Reduction for medium sized arguments
 _CLC_DEF void __clc_remainder_piby2_medium(double x, private double *r,
                                           private double *rr,
--- a/libclc/generic/lib/math/sincos_helpers.h
+++ b/libclc/generic/lib/math/sincos_helpers.h
@ -9,8 +9,6 @@
 #include <clc/clcfunc.h>
 #include <clc/clctypes.h>

-_CLC_DECL float __clc_tanf_piby4(float x, int y);
-
 #ifdef cl_khr_fp64

 #pragma OPENCL EXTENSION cl_khr_fp64 : enable
--- a/libclc/generic/lib/math/sincospiF_piby4.h
+++ b/libclc/generic/lib/math/sincospiF_piby4.h
@ -1,46 +0,0 @@
-//===----------------------------------------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#include <clc/math/clc_mad.h>
-
-// Evaluate single precisions in and cos of value in interval [-pi/4, pi/4]
-_CLC_INLINE float2 __libclc__sincosf_piby4(float x) {
-  // Taylor series for sin(x) is x - x^3/3! + x^5/5! - x^7/7! ...
-  // = x * (1 - x^2/3! + x^4/5! - x^6/7! ...
-  // = x * f(w)
-  // where w = x*x and f(w) = (1 - w/3! + w^2/5! - w^3/7! ...
-  // We use a minimax approximation of (f(w) - 1) / w
-  // because this produces an expansion in even powers of x.
-
-  // Taylor series for cos(x) is 1 - x^2/2! + x^4/4! - x^6/6! ...
-  // = f(w)
-  // where w = x*x and f(w) = (1 - w/2! + w^2/4! - w^3/6! ...
-  // We use a minimax approximation of (f(w) - 1 + w/2) / (w*w)
-  // because this produces an expansion in even powers of x.
-
-  const float sc1 = -0.166666666638608441788607926e0F;
-  const float sc2 = 0.833333187633086262120839299e-2F;
-  const float sc3 = -0.198400874359527693921333720e-3F;
-  const float sc4 = 0.272500015145584081596826911e-5F;
-
-  const float cc1 = 0.41666666664325175238031e-1F;
-  const float cc2 = -0.13888887673175665567647e-2F;
-  const float cc3 = 0.24800600878112441958053e-4F;
-  const float cc4 = -0.27301013343179832472841e-6F;
-
-  float x2 = x * x;
-
-  float2 ret;
-  ret.x = __clc_mad(
-      x * x2, __clc_mad(x2, __clc_mad(x2, __clc_mad(x2, sc4, sc3), sc2), sc1),
-      x);
-  ret.y = __clc_mad(
-      x2 * x2, __clc_mad(x2, __clc_mad(x2, __clc_mad(x2, cc4, cc3), cc2), cc1),
-      __clc_mad(x2, -0.5f, 1.0f));
-  return ret;
-}
--- a/libclc/generic/lib/math/sinpi.cl
+++ b/libclc/generic/lib/math/sinpi.cl
@ -7,119 +7,9 @@
 //===----------------------------------------------------------------------===//

 #include <clc/clc.h>
-#include <clc/clcmacro.h>
-#include <clc/math/math.h>
+#include <clc/math/clc_sinpi.h>

-#include "sincospiF_piby4.h"
-#ifdef cl_khr_fp64
-#include "sincosD_piby4.h"
-#endif
+#define FUNCTION sinpi
+#define __CLC_BODY <clc/shared/unary_def.inc>

-_CLC_OVERLOAD _CLC_DEF float sinpi(float x)
-{
-    int ix = as_int(x);
-    int xsgn = ix & 0x80000000;
-    ix ^= xsgn;
-    float ax = as_float(ix);
-    int iax = (int)ax;
-    float r = ax - iax;
-    int xodd = xsgn ^ (iax & 0x1 ? 0x80000000 : 0);
-
-    // Initialize with return for +-Inf and NaN
-    int ir = 0x7fc00000;
-
-    // 2^23 <= |x| < Inf, the result is always integer
-    ir = ix < 0x7f800000 ? xsgn : ir;
-
-    // 0x1.0p-7 <= |x| < 2^23, result depends on which 0.25 interval
-
-    // r < 1.0
-    float a = 1.0f - r;
-    int e = 0;
-
-    // r <= 0.75
-    int c = r <= 0.75f;
-    a = c ? r - 0.5f : a;
-    e = c ? 1 : e;
-
-    // r < 0.5
-    c = r < 0.5f;
-    a = c ? 0.5f - r : a;
-
-    // 0 < r <= 0.25
-    c = r <= 0.25f;
-    a = c ? r : a;
-    e = c ? 0 : e;
-
-    float2 t = __libclc__sincosf_piby4(a * M_PI_F);
-    int jr = xodd ^ as_int(e ? t.hi : t.lo);
-
-    ir = ix < 0x4b000000 ? jr : ir;
-
-    return as_float(ir);
-}
-
-_CLC_UNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, float, sinpi, float);
-
-#ifdef cl_khr_fp64
-
-#pragma OPENCL EXTENSION cl_khr_fp64 : enable
-
-_CLC_OVERLOAD _CLC_DEF double sinpi(double x)
-{
-    long ix = as_long(x);
-    long xsgn = ix & 0x8000000000000000L;
-    ix ^= xsgn;
-    double ax = as_double(ix);
-    long iax = (long)ax;
-    double r = ax - (double)iax;
-    long xodd = xsgn ^ (iax & 0x1L ? 0x8000000000000000L : 0L);
-
-    // Initialize with return for +-Inf and NaN
-    long ir = 0x7ff8000000000000L;
-
-    // 2^23 <= |x| < Inf, the result is always integer
-    ir = ix < 0x7ff0000000000000 ? xsgn : ir;
-
-    // 0x1.0p-7 <= |x| < 2^23, result depends on which 0.25 interval
-
-    // r < 1.0
-    double a = 1.0 - r;
-    int e = 0;
-
-    //  r <= 0.75
-    int c = r <= 0.75;
-    double t = r - 0.5;
-    a = c ? t : a;
-    e = c ? 1 : e;
-
-    // r < 0.5
-    c = r < 0.5;
-    t = 0.5 - r;
-    a = c ? t : a;
-
-    // r <= 0.25
-    c = r <= 0.25;
-    a = c ? r : a;
-    e = c ? 0 : e;
-
-    double api = a * M_PI;
-    double2 sc = __libclc__sincos_piby4(api, 0.0);
-    long jr = xodd ^ as_long(e ? sc.hi : sc.lo);
-
-    ir = ax < 0x1.0p+52 ? jr : ir;
-
-    return as_double(ir);
-}
-
-_CLC_UNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, double, sinpi, double)
-
-#endif
-
-#ifdef cl_khr_fp16
-
-#pragma OPENCL EXTENSION cl_khr_fp16 : enable
-
-_CLC_DEFINE_UNARY_BUILTIN_FP16(sinpi)
-
-#endif
+#include <clc/math/gentype.inc>
--- a/libclc/generic/lib/math/tanpi.cl
+++ b/libclc/generic/lib/math/tanpi.cl
@ -7,9 +7,9 @@
 //===----------------------------------------------------------------------===//

 #include <clc/clc.h>
+#include <clc/math/clc_tanpi.h>

-#include <math/clc_tanpi.h>
+#define FUNCTION tanpi
+#define __CLC_BODY <clc/shared/unary_def.inc>

-#define __CLC_FUNC tanpi
-#define __CLC_BODY <clc_sw_unary.inc>
 #include <clc/math/gentype.inc>
--- a/libclc/spirv/lib/SOURCES
+++ b/libclc/spirv/lib/SOURCES
@ -72,7 +72,6 @@ math/fma.cl
 ../../generic/lib/math/clc_tan.cl
 ../../generic/lib/math/tan.cl
 ../../generic/lib/math/tanh.cl
-../../generic/lib/math/clc_tanpi.cl
 ../../generic/lib/math/tanpi.cl
 ../../generic/lib/math/tgamma.cl
 ../../generic/lib/shared/vload.cl