mirror of
https://github.com/llvm/llvm-project.git
synced 2025-04-29 18:36:06 +00:00
[SLPVectorizer] Support SLPVectorizer cases of tan across all backends (#95517)
This PR is intended to address the limited SLPVectorizer support of tan raised in the comments of this PR: https://github.com/llvm/llvm-project/pull/94559. Right now, emitting the tan intrinsic allows you to vectorize tan, but emitting the libfunc does not. To address this, the libcall needs to be mapped to the intrinsic, and the libcall and function name need to be marked appropriately so they can be optimized or defined as a call lowering.
This commit is contained in:
parent
aa8409fdca
commit
918313d17d
@ -410,10 +410,12 @@ public:
|
||||
return false;
|
||||
switch (F) {
|
||||
default: break;
|
||||
// clang-format off
|
||||
case LibFunc_copysign: case LibFunc_copysignf: case LibFunc_copysignl:
|
||||
case LibFunc_fabs: case LibFunc_fabsf: case LibFunc_fabsl:
|
||||
case LibFunc_sin: case LibFunc_sinf: case LibFunc_sinl:
|
||||
case LibFunc_cos: case LibFunc_cosf: case LibFunc_cosl:
|
||||
case LibFunc_tan: case LibFunc_tanf: case LibFunc_tanl:
|
||||
case LibFunc_sqrt: case LibFunc_sqrtf: case LibFunc_sqrtl:
|
||||
case LibFunc_sqrt_finite: case LibFunc_sqrtf_finite:
|
||||
case LibFunc_sqrtl_finite:
|
||||
@ -432,6 +434,7 @@ public:
|
||||
case LibFunc_memcmp: case LibFunc_bcmp: case LibFunc_strcmp:
|
||||
case LibFunc_strcpy: case LibFunc_stpcpy: case LibFunc_strlen:
|
||||
case LibFunc_strnlen: case LibFunc_memchr: case LibFunc_mempcpy:
|
||||
// clang-format on
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
|
@ -156,14 +156,17 @@ public:
|
||||
StringRef Name = F->getName();
|
||||
|
||||
// These will all likely lower to a single selection DAG node.
|
||||
// clang-format off
|
||||
if (Name == "copysign" || Name == "copysignf" || Name == "copysignl" ||
|
||||
Name == "fabs" || Name == "fabsf" || Name == "fabsl" || Name == "sin" ||
|
||||
Name == "fabs" || Name == "fabsf" || Name == "fabsl" ||
|
||||
Name == "fmin" || Name == "fminf" || Name == "fminl" ||
|
||||
Name == "fmax" || Name == "fmaxf" || Name == "fmaxl" ||
|
||||
Name == "sinf" || Name == "sinl" || Name == "cos" || Name == "cosf" ||
|
||||
Name == "cosl" || Name == "sqrt" || Name == "sqrtf" || Name == "sqrtl")
|
||||
Name == "sin" || Name == "sinf" || Name == "sinl" ||
|
||||
Name == "cos" || Name == "cosf" || Name == "cosl" ||
|
||||
Name == "tan" || Name == "tanf" || Name == "tanl" ||
|
||||
Name == "sqrt" || Name == "sqrtf" || Name == "sqrtl")
|
||||
return false;
|
||||
|
||||
// clang-format on
|
||||
// These are all likely to be optimized into something smaller.
|
||||
if (Name == "pow" || Name == "powf" || Name == "powl" || Name == "exp2" ||
|
||||
Name == "exp2l" || Name == "exp2f" || Name == "floor" ||
|
||||
|
@ -3994,6 +3994,10 @@ Intrinsic::ID llvm::getIntrinsicForCallSite(const CallBase &CB,
|
||||
case LibFunc_cosf:
|
||||
case LibFunc_cosl:
|
||||
return Intrinsic::cos;
|
||||
case LibFunc_tan:
|
||||
case LibFunc_tanf:
|
||||
case LibFunc_tanl:
|
||||
return Intrinsic::tan;
|
||||
case LibFunc_exp:
|
||||
case LibFunc_expf:
|
||||
case LibFunc_expl:
|
||||
|
@ -548,13 +548,11 @@ define <4 x float> @tan_4x(ptr %a) {
|
||||
; NOACCELERATE-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1
|
||||
; NOACCELERATE-NEXT: [[TMP2:%.*]] = tail call fast float @tanf(float [[VECEXT_1]])
|
||||
; NOACCELERATE-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1
|
||||
; NOACCELERATE-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2
|
||||
; NOACCELERATE-NEXT: [[TMP3:%.*]] = tail call fast float @tanf(float [[VECEXT_2]])
|
||||
; NOACCELERATE-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2
|
||||
; NOACCELERATE-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3
|
||||
; NOACCELERATE-NEXT: [[TMP4:%.*]] = tail call fast float @tanf(float [[VECEXT_3]])
|
||||
; NOACCELERATE-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3
|
||||
; NOACCELERATE-NEXT: ret <4 x float> [[VECINS_3]]
|
||||
; NOACCELERATE-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <2 x i32> <i32 2, i32 3>
|
||||
; NOACCELERATE-NEXT: [[TMP4:%.*]] = call fast <2 x float> @llvm.tan.v2f32(<2 x float> [[TMP3]])
|
||||
; NOACCELERATE-NEXT: [[TMP5:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
|
||||
; NOACCELERATE-NEXT: [[VECINS_31:%.*]] = shufflevector <4 x float> [[VECINS_1]], <4 x float> [[TMP5]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
|
||||
; NOACCELERATE-NEXT: ret <4 x float> [[VECINS_31]]
|
||||
;
|
||||
entry:
|
||||
%0 = load <4 x float>, ptr %a, align 16
|
||||
|
@ -548,13 +548,11 @@ define <4 x float> @tan_4x(ptr %a) {
|
||||
; NOACCELERATE-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1
|
||||
; NOACCELERATE-NEXT: [[TMP2:%.*]] = tail call fast float @tanf(float [[VECEXT_1]])
|
||||
; NOACCELERATE-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1
|
||||
; NOACCELERATE-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2
|
||||
; NOACCELERATE-NEXT: [[TMP3:%.*]] = tail call fast float @tanf(float [[VECEXT_2]])
|
||||
; NOACCELERATE-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2
|
||||
; NOACCELERATE-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3
|
||||
; NOACCELERATE-NEXT: [[TMP4:%.*]] = tail call fast float @tanf(float [[VECEXT_3]])
|
||||
; NOACCELERATE-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3
|
||||
; NOACCELERATE-NEXT: ret <4 x float> [[VECINS_3]]
|
||||
; NOACCELERATE-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <2 x i32> <i32 2, i32 3>
|
||||
; NOACCELERATE-NEXT: [[TMP4:%.*]] = call fast <2 x float> @llvm.tan.v2f32(<2 x float> [[TMP3]])
|
||||
; NOACCELERATE-NEXT: [[TMP5:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
|
||||
; NOACCELERATE-NEXT: [[VECINS_31:%.*]] = shufflevector <4 x float> [[VECINS_1]], <4 x float> [[TMP5]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
|
||||
; NOACCELERATE-NEXT: ret <4 x float> [[VECINS_31]]
|
||||
;
|
||||
entry:
|
||||
%0 = load <4 x float>, ptr %a, align 16
|
||||
|
@ -6,6 +6,7 @@ target triple = "x86_64-apple-macosx10.8.0"
|
||||
|
||||
declare double @sin(double) nounwind willreturn
|
||||
declare double @cos(double) nounwind willreturn
|
||||
declare double @tan(double) nounwind willreturn
|
||||
declare double @pow(double, double) nounwind willreturn
|
||||
declare double @exp2(double) nounwind willreturn
|
||||
declare double @sqrt(double) nounwind willreturn
|
||||
@ -48,6 +49,24 @@ define void @cos_libm(ptr %a, ptr %b) {
|
||||
ret void
|
||||
}
|
||||
|
||||
; Test: two scalar calls to the libm function @tan operate on adjacent doubles
; loaded from %a, and their results are stored to adjacent slots of %b. The
; CHECK lines expect a single <2 x double> load, one @llvm.tan.v2f64 call and a
; single <2 x double> store in place of the scalar sequence.
define void @tan_libm(ptr %a, ptr %b) {
|
||||
; CHECK-LABEL: @tan_libm(
|
||||
; CHECK-NEXT: [[TMP2:%.*]] = load <2 x double>, ptr [[A:%.*]], align 8
|
||||
; CHECK-NEXT: [[TMP3:%.*]] = call <2 x double> @llvm.tan.v2f64(<2 x double> [[TMP2]])
|
||||
; CHECK-NEXT: store <2 x double> [[TMP3]], ptr [[B:%.*]], align 8
|
||||
; CHECK-NEXT: ret void
|
||||
;
|
||||
%a0 = load double, ptr %a, align 8
|
||||
%idx1 = getelementptr inbounds double, ptr %a, i64 1
|
||||
%a1 = load double, ptr %idx1, align 8
|
||||
%tan1 = tail call double @tan(double %a0) nounwind readnone
|
||||
%tan2 = tail call double @tan(double %a1) nounwind readnone
|
||||
store double %tan1, ptr %b, align 8
|
||||
%idx2 = getelementptr inbounds double, ptr %b, i64 1
|
||||
store double %tan2, ptr %idx2, align 8
|
||||
ret void
|
||||
}
|
||||
|
||||
define void @pow_libm(ptr %a, ptr %b) {
|
||||
; CHECK-LABEL: @pow_libm(
|
||||
; CHECK-NEXT: [[TMP2:%.*]] = load <2 x double>, ptr [[A:%.*]], align 8
|
||||
|
Loading…
x
Reference in New Issue
Block a user