// llvm-project/clang/test/CodeGenCUDA/link-builtin-bitcode-denormal-fp-mode.cu

// Verify the behavior of the denormal-fp-math attributes when linking bitcode
// that was built the way rocm-device-libs should be built. The library bitcode
// should be compiled with denormal-fp-math-f32=dynamic, and that dynamic mode
// should be replaced with the denormal mode of the final TU.

// Build the fake device library in the way rocm-device-libs should be built.
//
// RUN: %clang_cc1 -x cl -triple amdgcn-amd-amdhsa -fdenormal-fp-math-f32=dynamic \
// RUN:   -mcode-object-version=none -emit-llvm-bc \
// RUN:   %S/Inputs/ocml-sample.cl -o %t.dynamic.f32.bc
//
// RUN: %clang_cc1 -x cl -triple amdgcn-amd-amdhsa -fdenormal-fp-math=dynamic \
// RUN:   -mcode-object-version=none -emit-llvm-bc \
// RUN:   %S/Inputs/ocml-sample.cl -o %t.dynamic.full.bc


// Check the default behavior with no denormal-fp-math arguments.
// RUN: %clang_cc1 -x hip -triple amdgcn-amd-amdhsa -target-cpu gfx803 -fcuda-is-device \
// RUN:   -mlink-builtin-bitcode %t.dynamic.f32.bc \
// RUN:   -emit-llvm %s -o - | FileCheck -implicit-check-not=denormal-fp-math %s --check-prefixes=CHECK,INTERNALIZE


// Check an explicit full ieee request
// RUN: %clang_cc1 -x hip -triple amdgcn-amd-amdhsa -target-cpu gfx803 -fcuda-is-device \
// RUN:    -fdenormal-fp-math=ieee \
// RUN:   -mlink-builtin-bitcode %t.dynamic.f32.bc \
// RUN:   -emit-llvm %s -o - | FileCheck -implicit-check-not=denormal-fp-math %s --check-prefixes=CHECK,INTERNALIZE


// Check explicit f32-only flushing request
// RUN: %clang_cc1 -x hip -triple amdgcn-amd-amdhsa -target-cpu gfx803 \
// RUN:   -fcuda-is-device -fdenormal-fp-math-f32=preserve-sign \
// RUN:   -mlink-builtin-bitcode %t.dynamic.f32.bc -emit-llvm %s -o - \
// RUN: | FileCheck -implicit-check-not=denormal-fp-math --enable-var-scope %s --check-prefixes=CHECK,INTERNALIZE,IEEEF64-PSZF32


// Check explicit flush all request. Only the f32 component of the library is
// dynamic, so the linked functions should use IEEE as the base mode and the new
// functions should use preserve-sign.
// RUN: %clang_cc1 -x hip -triple amdgcn-amd-amdhsa -target-cpu gfx803 \
// RUN:   -fcuda-is-device -fdenormal-fp-math=preserve-sign \
// RUN:   -mlink-builtin-bitcode %t.dynamic.f32.bc -emit-llvm %s -o - \
// RUN: | FileCheck -implicit-check-not=denormal-fp-math --enable-var-scope %s --check-prefixes=CHECK,INTERNALIZE,PSZ


// Check explicit f32-only, ieee-other flushing request
// RUN: %clang_cc1 -x hip -triple amdgcn-amd-amdhsa -target-cpu gfx803 \
// RUN:   -fcuda-is-device -fdenormal-fp-math=ieee -fdenormal-fp-math-f32=preserve-sign \
// RUN:   -mlink-builtin-bitcode %t.dynamic.f32.bc -emit-llvm %s -o - \
// RUN: | FileCheck -implicit-check-not=denormal-fp-math --enable-var-scope %s --check-prefixes=CHECK,INTERNALIZE,IEEEF64-PSZF32


// Check the inverse of normal usage: request IEEE f32 with flushed f16/f64.
// RUN: %clang_cc1 -x hip -triple amdgcn-amd-amdhsa -target-cpu gfx803 \
// RUN:   -fcuda-is-device -fdenormal-fp-math=preserve-sign -fdenormal-fp-math-f32=ieee \
// RUN:   -mlink-builtin-bitcode %t.dynamic.f32.bc -emit-llvm %s -o - \
// RUN: | FileCheck -implicit-check-not=denormal-fp-math --enable-var-scope %s --check-prefixes=CHECK,INTERNALIZE,IEEEF32-PSZF64-DYNF32


// Check backwards from the normal usage where both library components can be
// overridden.
// RUN: %clang_cc1 -x hip -triple amdgcn-amd-amdhsa -target-cpu gfx803 \
// RUN:   -fcuda-is-device -fdenormal-fp-math=preserve-sign -fdenormal-fp-math-f32=ieee \
// RUN:   -mlink-builtin-bitcode %t.dynamic.full.bc -emit-llvm %s -o - \
// RUN: | FileCheck -implicit-check-not=denormal-fp-math --enable-var-scope %s --check-prefixes=CHECK,INTERNALIZE,IEEEF32-PSZF64-DYNFULL


// Check the case where no internalization is performed
// RUN: %clang_cc1 -x hip -triple amdgcn-amd-amdhsa -target-cpu gfx803 \
// RUN:   -fcuda-is-device -fdenormal-fp-math=preserve-sign -fdenormal-fp-math-f32=ieee \
// RUN:   -mlink-bitcode-file %t.dynamic.full.bc -emit-llvm %s -o - \
// RUN: | FileCheck -implicit-check-not=denormal-fp-math --enable-var-scope %s --check-prefixes=CHECK,NOINTERNALIZE,NOINTERNALIZE-IEEEF32-PSZF64-DYNFULL


// No header providing the device/kernel qualifiers is included in this test,
// so define them directly in terms of the attributes they stand for.
#define __device__ __attribute__((device))
#define __global__ __attribute__((global))

// Short name for the 16-bit float type used by the f16 library prototype.
typedef _Float16 half;

extern "C" {
// Prototypes only: no bodies are given in this file. The command lines at the
// top link a bitcode file built from Inputs/ocml-sample.cl, and the checks at
// the bottom expect definitions of these functions in the output. There is
// one function per floating-point width called by the kernels below.
__device__ half do_f16_stuff(half a, half  b, half c);
__device__ float do_f32_stuff(float a, float b, float c);

// Currently all library functions are internalized. Check a weak function in
// case we ever choose to not internalize these. In that case, the safest thing
// to do would likely be to preserve the dynamic denormal-fp-math.
__attribute__((weak)) __device__ float weak_do_f32_stuff(float a, float b, float c);
__device__ double do_f64_stuff(double a, double b, double c);


  // CHECK: kernel_f16({{.*}}) #[[$KERNELATTR:[0-9]+]]
// Kernel that calls the f16 library function. It passes element 0 of each
// input to do_f16_stuff and writes the result to out[0]. The inputs are float,
// so they are implicitly converted to half for the call, and the half result
// is converted back to float when stored.
__global__ void kernel_f16(float* out, float* a, float* b, float* c) {
  const int idx = 0;
  const half result = do_f16_stuff(a[idx], b[idx], c[idx]);
  out[idx] = result;
}

// CHECK: kernel_f32({{.*}}) #[[$KERNELATTR]]
// Kernel that calls both f32 library functions, the regular one and the weak
// one. out[0] first receives the do_f32_stuff result; the weak_do_f32_stuff
// result is then added to it.
__global__ void kernel_f32(float* out, float* a, float* b, float* c) {
  const int idx = 0;
  const float strongPart = do_f32_stuff(a[idx], b[idx], c[idx]);
  out[idx] = strongPart;
  // The inputs are re-read after the store above, and out[0] is read only
  // after the second call has returned, as with the compound assignment.
  const float weakPart = weak_do_f32_stuff(a[idx], b[idx], c[idx]);
  out[idx] = out[idx] + weakPart;
}

// CHECK: kernel_f64({{.*}}) #[[$KERNELATTR]]
// Kernel that calls the f64 library function. It passes element 0 of each
// input to do_f64_stuff and writes the result to out[0].
__global__ void kernel_f64(double* out, double* a, double* b, double* c) {
  const int idx = 0;
  const double result = do_f64_stuff(a[idx], b[idx], c[idx]);
  out[idx] = result;
}
}

// INTERNALIZE: define internal {{(noundef )?}}half @do_f16_stuff({{.*}}) #[[$FUNCATTR:[0-9]+]]
// INTERNALIZE: define internal {{(noundef )?}}float @do_f32_stuff({{.*}}) #[[$FUNCATTR]]
// INTERNALIZE: define internal {{(noundef )?}}double @do_f64_stuff({{.*}}) #[[$FUNCATTR]]
// INTERNALIZE: define internal {{(noundef )?}}float @weak_do_f32_stuff({{.*}}) #[[$WEAK_FUNCATTR:[0-9]+]]


// NOINTERNALIZE: define dso_local {{(noundef )?}}half @do_f16_stuff({{.*}}) #[[$FUNCATTR:[0-9]+]]
// NOINTERNALIZE: define dso_local {{(noundef )?}}float @do_f32_stuff({{.*}}) #[[$FUNCATTR]]
// NOINTERNALIZE: define dso_local {{(noundef )?}}double @do_f64_stuff({{.*}}) #[[$FUNCATTR]]
// NOINTERNALIZE: define weak {{(noundef )?}}float @weak_do_f32_stuff({{.*}}) #[[$WEAK_FUNCATTR:[0-9]+]]


// We should not be littering call sites with the attribute.
// Everything should use the default ieee with no explicit attribute.

// FIXME: Should check-not "denormal-fp-math" within the denormal-fp-math-f32
// lines.

// Default mode relies on the implicit check-not for the denormal-fp-math.

// PSZ: #[[$KERNELATTR]] = { {{.*}} "denormal-fp-math"="preserve-sign,preserve-sign"
// PSZ-SAME: "target-cpu"="gfx803"
// PSZ: #[[$FUNCATTR]] = { {{.*}} "denormal-fp-math-f32"="preserve-sign,preserve-sign"
// PSZ-SAME: "target-cpu"="gfx803"
// PSZ: #[[$WEAK_FUNCATTR]] = { {{.*}} "denormal-fp-math-f32"="preserve-sign,preserve-sign"
// PSZ-SAME: "target-cpu"="gfx803"

// FIXME: Should check-not "denormal-fp-math" within the line
// IEEEF64-PSZF32: #[[$KERNELATTR]] = { {{.*}} "denormal-fp-math-f32"="preserve-sign,preserve-sign"
// IEEEF64-PSZF32-SAME: "target-cpu"="gfx803"
// IEEEF64-PSZF32: #[[$FUNCATTR]] = { {{.*}} "denormal-fp-math-f32"="preserve-sign,preserve-sign"
// IEEEF64-PSZF32-SAME: "target-cpu"="gfx803"
// IEEEF64-PSZF32: #[[$WEAK_FUNCATTR]] = { {{.*}} "denormal-fp-math-f32"="preserve-sign,preserve-sign"
// IEEEF64-PSZF32-SAME: "target-cpu"="gfx803"

// IEEEF32-PSZF64-DYNF32: #[[$KERNELATTR]] = { {{.*}} "denormal-fp-math"="preserve-sign,preserve-sign" "denormal-fp-math-f32"="ieee,ieee" {{.*}} "target-cpu"="gfx803" {{.*}}  }
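// There are no positive checks for the linked functions' attribute groups in
// this configuration; presumably they should carry no denormal-fp-math
// attributes at all, which the implicit check-not is expected to catch.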


// IEEEF32-PSZF64-DYNFULL: #[[$KERNELATTR]] = { {{.*}} "denormal-fp-math"="preserve-sign,preserve-sign" "denormal-fp-math-f32"="ieee,ieee"
// IEEEF32-PSZF64-DYNFULL-SAME: "target-cpu"="gfx803"
// IEEEF32-PSZF64-DYNFULL: #[[$FUNCATTR]] = { {{.*}} "denormal-fp-math"="preserve-sign,preserve-sign" "denormal-fp-math-f32"="ieee,ieee"
// IEEEF32-PSZF64-DYNFULL-SAME: "target-cpu"="gfx803"
// IEEEF32-PSZF64-DYNFULL: #[[$WEAK_FUNCATTR]] = { {{.*}} "denormal-fp-math"="preserve-sign,preserve-sign" "denormal-fp-math-f32"="ieee,ieee"
// IEEEF32-PSZF64-DYNFULL-SAME: "target-cpu"="gfx803"

// -mlink-bitcode-file doesn't internalize or propagate attributes.
// NOINTERNALIZE-IEEEF32-PSZF64-DYNFULL: #[[$KERNELATTR]] = { {{.*}} "denormal-fp-math"="preserve-sign,preserve-sign" "denormal-fp-math-f32"="ieee,ieee" {{.*}} "target-cpu"="gfx803" {{.*}} }
// NOINTERNALIZE-IEEEF32-PSZF64-DYNFULL: #[[$FUNCATTR]] = { {{.*}} "denormal-fp-math"="dynamic,dynamic" {{.*}} }
// NOINTERNALIZE-IEEEF32-PSZF64-DYNFULL: #[[$WEAK_FUNCATTR]] = { {{.*}} "denormal-fp-math"="dynamic,dynamic" {{.*}} }