[SLPVectorizer] Support SLPVectorizer cases of tan across all backends (#95517)

This PR is intended to address the limited SLPVectorizer support of tan
raised in the comments of this PR:
https://github.com/llvm/llvm-project/pull/94559.

Right now, emitting the tan intrinsic allows you to vectorize tan, but
emitting the libfunc does not. To address this, the libcall needs to be
mapped to the intrinsic, and the libcall and function name need to be
marked appropriately so they can be optimized or defined as a call
lowering.
This commit is contained in:
Farzon Lotfi 2024-06-27 12:15:13 -07:00 committed by GitHub
parent aa8409fdca
commit 918313d17d
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
6 changed files with 43 additions and 18 deletions

View File

@ -410,10 +410,12 @@ public:
return false;
switch (F) {
default: break;
// clang-format off
case LibFunc_copysign: case LibFunc_copysignf: case LibFunc_copysignl:
case LibFunc_fabs: case LibFunc_fabsf: case LibFunc_fabsl:
case LibFunc_sin: case LibFunc_sinf: case LibFunc_sinl:
case LibFunc_cos: case LibFunc_cosf: case LibFunc_cosl:
case LibFunc_tan: case LibFunc_tanf: case LibFunc_tanl:
case LibFunc_sqrt: case LibFunc_sqrtf: case LibFunc_sqrtl:
case LibFunc_sqrt_finite: case LibFunc_sqrtf_finite:
case LibFunc_sqrtl_finite:
@ -432,6 +434,7 @@ public:
case LibFunc_memcmp: case LibFunc_bcmp: case LibFunc_strcmp:
case LibFunc_strcpy: case LibFunc_stpcpy: case LibFunc_strlen:
case LibFunc_strnlen: case LibFunc_memchr: case LibFunc_mempcpy:
// clang-format on
return true;
}
return false;

View File

@ -156,14 +156,17 @@ public:
StringRef Name = F->getName();
// These will all likely lower to a single selection DAG node.
// clang-format off
if (Name == "copysign" || Name == "copysignf" || Name == "copysignl" ||
Name == "fabs" || Name == "fabsf" || Name == "fabsl" || Name == "sin" ||
Name == "fabs" || Name == "fabsf" || Name == "fabsl" ||
Name == "fmin" || Name == "fminf" || Name == "fminl" ||
Name == "fmax" || Name == "fmaxf" || Name == "fmaxl" ||
Name == "sinf" || Name == "sinl" || Name == "cos" || Name == "cosf" ||
Name == "cosl" || Name == "sqrt" || Name == "sqrtf" || Name == "sqrtl")
Name == "sin" || Name == "sinf" || Name == "sinl" ||
Name == "cos" || Name == "cosf" || Name == "cosl" ||
Name == "tan" || Name == "tanf" || Name == "tanl" ||
Name == "sqrt" || Name == "sqrtf" || Name == "sqrtl")
return false;
// clang-format on
// These are all likely to be optimized into something smaller.
if (Name == "pow" || Name == "powf" || Name == "powl" || Name == "exp2" ||
Name == "exp2l" || Name == "exp2f" || Name == "floor" ||

View File

@ -3994,6 +3994,10 @@ Intrinsic::ID llvm::getIntrinsicForCallSite(const CallBase &CB,
case LibFunc_cosf:
case LibFunc_cosl:
return Intrinsic::cos;
case LibFunc_tan:
case LibFunc_tanf:
case LibFunc_tanl:
return Intrinsic::tan;
case LibFunc_exp:
case LibFunc_expf:
case LibFunc_expl:

View File

@ -548,13 +548,11 @@ define <4 x float> @tan_4x(ptr %a) {
; NOACCELERATE-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1
; NOACCELERATE-NEXT: [[TMP2:%.*]] = tail call fast float @tanf(float [[VECEXT_1]])
; NOACCELERATE-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1
; NOACCELERATE-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2
; NOACCELERATE-NEXT: [[TMP3:%.*]] = tail call fast float @tanf(float [[VECEXT_2]])
; NOACCELERATE-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2
; NOACCELERATE-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3
; NOACCELERATE-NEXT: [[TMP4:%.*]] = tail call fast float @tanf(float [[VECEXT_3]])
; NOACCELERATE-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3
; NOACCELERATE-NEXT: ret <4 x float> [[VECINS_3]]
; NOACCELERATE-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <2 x i32> <i32 2, i32 3>
; NOACCELERATE-NEXT: [[TMP4:%.*]] = call fast <2 x float> @llvm.tan.v2f32(<2 x float> [[TMP3]])
; NOACCELERATE-NEXT: [[TMP5:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
; NOACCELERATE-NEXT: [[VECINS_31:%.*]] = shufflevector <4 x float> [[VECINS_1]], <4 x float> [[TMP5]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
; NOACCELERATE-NEXT: ret <4 x float> [[VECINS_31]]
;
entry:
%0 = load <4 x float>, ptr %a, align 16

View File

@ -548,13 +548,11 @@ define <4 x float> @tan_4x(ptr %a) {
; NOACCELERATE-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1
; NOACCELERATE-NEXT: [[TMP2:%.*]] = tail call fast float @tanf(float [[VECEXT_1]])
; NOACCELERATE-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1
; NOACCELERATE-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2
; NOACCELERATE-NEXT: [[TMP3:%.*]] = tail call fast float @tanf(float [[VECEXT_2]])
; NOACCELERATE-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2
; NOACCELERATE-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3
; NOACCELERATE-NEXT: [[TMP4:%.*]] = tail call fast float @tanf(float [[VECEXT_3]])
; NOACCELERATE-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3
; NOACCELERATE-NEXT: ret <4 x float> [[VECINS_3]]
; NOACCELERATE-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <2 x i32> <i32 2, i32 3>
; NOACCELERATE-NEXT: [[TMP4:%.*]] = call fast <2 x float> @llvm.tan.v2f32(<2 x float> [[TMP3]])
; NOACCELERATE-NEXT: [[TMP5:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
; NOACCELERATE-NEXT: [[VECINS_31:%.*]] = shufflevector <4 x float> [[VECINS_1]], <4 x float> [[TMP5]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
; NOACCELERATE-NEXT: ret <4 x float> [[VECINS_31]]
;
entry:
%0 = load <4 x float>, ptr %a, align 16

View File

@ -6,6 +6,7 @@ target triple = "x86_64-apple-macosx10.8.0"
declare double @sin(double) nounwind willreturn
declare double @cos(double) nounwind willreturn
declare double @tan(double) nounwind willreturn
declare double @pow(double, double) nounwind willreturn
declare double @exp2(double) nounwind willreturn
declare double @sqrt(double) nounwind willreturn
@ -48,6 +49,24 @@ define void @cos_libm(ptr %a, ptr %b) {
ret void
}
; Test: two scalar calls to the libm function @tan on adjacent doubles.
; The body loads %a[0] and %a[1], calls @tan on each value, and stores the
; results to %b[0] and %b[1]. The expected output below is a single
; <2 x double> load, one call to @llvm.tan.v2f64, and a single vector store.
define void @tan_libm(ptr %a, ptr %b) {
; CHECK-LABEL: @tan_libm(
; CHECK-NEXT: [[TMP2:%.*]] = load <2 x double>, ptr [[A:%.*]], align 8
; CHECK-NEXT: [[TMP3:%.*]] = call <2 x double> @llvm.tan.v2f64(<2 x double> [[TMP2]])
; CHECK-NEXT: store <2 x double> [[TMP3]], ptr [[B:%.*]], align 8
; CHECK-NEXT: ret void
;
; Load the two consecutive input elements from %a.
%a0 = load double, ptr %a, align 8
%idx1 = getelementptr inbounds double, ptr %a, i64 1
%a1 = load double, ptr %idx1, align 8
; Scalar libm calls; both carry the nounwind and readnone call-site attributes.
%tan1 = tail call double @tan(double %a0) nounwind readnone
%tan2 = tail call double @tan(double %a1) nounwind readnone
; Store the two results to consecutive elements of %b.
store double %tan1, ptr %b, align 8
%idx2 = getelementptr inbounds double, ptr %b, i64 1
store double %tan2, ptr %idx2, align 8
ret void
}
define void @pow_libm(ptr %a, ptr %b) {
; CHECK-LABEL: @pow_libm(
; CHECK-NEXT: [[TMP2:%.*]] = load <2 x double>, ptr [[A:%.*]], align 8