mirror of
https://github.com/llvm/llvm-project.git
synced 2025-04-16 11:06:33 +00:00
[X86] combineConcatVectorOps - require free concatenation of at least one operand of UNPCKL\H (#135366)
Stop just replacing 2*UNPCK+INSERT_SUBVECTOR with 2*INSERT_SUBVECTOR+UNPCK. Currently limited to sub-64-bit element cases until we've accounted for the remaining regressions from some build_vector style patterns.
This commit is contained in:
parent
58211f55c5
commit
07b439605a
@ -58245,17 +58245,24 @@ static SDValue combineConcatVectorOps(const SDLoc &DL, MVT VT,
|
||||
case X86ISD::UNPCKL: {
|
||||
// TODO: UNPCK should use CombineSubOperand
|
||||
// Don't concatenate build_vector patterns.
|
||||
if (!IsSplat && EltSizeInBits >= 32 &&
|
||||
((VT.is256BitVector() && Subtarget.hasInt256()) ||
|
||||
(VT.is512BitVector() && Subtarget.useAVX512Regs())) &&
|
||||
if (!IsSplat &&
|
||||
((VT.is256BitVector() &&
|
||||
(EltSizeInBits >= 32 || Subtarget.hasInt256())) ||
|
||||
(VT.is512BitVector() && Subtarget.useAVX512Regs() &&
|
||||
(EltSizeInBits >= 32 || Subtarget.useBWIRegs()))) &&
|
||||
none_of(Ops, [](SDValue Op) {
|
||||
return peekThroughBitcasts(Op.getOperand(0)).getOpcode() ==
|
||||
ISD::SCALAR_TO_VECTOR ||
|
||||
peekThroughBitcasts(Op.getOperand(1)).getOpcode() ==
|
||||
ISD::SCALAR_TO_VECTOR;
|
||||
})) {
|
||||
return DAG.getNode(Opcode, DL, VT, ConcatSubOperand(VT, Ops, 0),
|
||||
ConcatSubOperand(VT, Ops, 1));
|
||||
SDValue Concat0 = CombineSubOperand(VT, Ops, 0);
|
||||
SDValue Concat1 = CombineSubOperand(VT, Ops, 1);
|
||||
if (Concat0 || Concat1 ||
|
||||
(Subtarget.hasInt256() && EltSizeInBits == 64))
|
||||
return DAG.getNode(Opcode, DL, VT,
|
||||
Concat0 ? Concat0 : ConcatSubOperand(VT, Ops, 0),
|
||||
Concat1 ? Concat1 : ConcatSubOperand(VT, Ops, 1));
|
||||
}
|
||||
break;
|
||||
}
|
||||
|
File diff suppressed because it is too large
Load Diff
@ -2442,7 +2442,7 @@ define <16 x i16> @cvt_16f32_to_16i16(<16 x float> %a0) nounwind {
|
||||
;
|
||||
; AVX2-LABEL: cvt_16f32_to_16i16:
|
||||
; AVX2: # %bb.0:
|
||||
; AVX2-NEXT: subq $184, %rsp
|
||||
; AVX2-NEXT: subq $152, %rsp
|
||||
; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
|
||||
; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
|
||||
; AVX2-NEXT: vextractf128 $1, %ymm1, %xmm0
|
||||
@ -2450,51 +2450,28 @@ define <16 x i16> @cvt_16f32_to_16i16(<16 x float> %a0) nounwind {
|
||||
; AVX2-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
|
||||
; AVX2-NEXT: vzeroupper
|
||||
; AVX2-NEXT: callq __truncsfhf2@PLT
|
||||
; AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
|
||||
; AVX2-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
|
||||
; AVX2-NEXT: # xmm0 = mem[1,0]
|
||||
; AVX2-NEXT: callq __truncsfhf2@PLT
|
||||
; AVX2-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
|
||||
; AVX2-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
|
||||
; AVX2-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
|
||||
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
|
||||
; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm0
|
||||
; AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
|
||||
; AVX2-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
|
||||
; AVX2-NEXT: vzeroupper
|
||||
; AVX2-NEXT: callq __truncsfhf2@PLT
|
||||
; AVX2-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
|
||||
; AVX2-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
|
||||
; AVX2-NEXT: # xmm0 = mem[1,0]
|
||||
; AVX2-NEXT: callq __truncsfhf2@PLT
|
||||
; AVX2-NEXT: vpunpcklwd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
|
||||
; AVX2-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
|
||||
; AVX2-NEXT: vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload
|
||||
; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
|
||||
; AVX2-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill
|
||||
; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
|
||||
; AVX2-NEXT: vzeroupper
|
||||
; AVX2-NEXT: callq __truncsfhf2@PLT
|
||||
; AVX2-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
|
||||
; AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
|
||||
; AVX2-NEXT: vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
|
||||
; AVX2-NEXT: # xmm0 = mem[1,1,3,3]
|
||||
; AVX2-NEXT: callq __truncsfhf2@PLT
|
||||
; AVX2-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload
|
||||
; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
|
||||
; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
|
||||
; AVX2-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
|
||||
; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
|
||||
; AVX2-NEXT: callq __truncsfhf2@PLT
|
||||
; AVX2-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
|
||||
; AVX2-NEXT: vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
|
||||
; AVX2-NEXT: # xmm0 = mem[1,1,3,3]
|
||||
; AVX2-NEXT: callq __truncsfhf2@PLT
|
||||
; AVX2-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload
|
||||
; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
|
||||
; AVX2-NEXT: vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload
|
||||
; AVX2-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
|
||||
; AVX2-NEXT: # ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5]
|
||||
; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
|
||||
; AVX2-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
|
||||
; AVX2-NEXT: # xmm0 = mem[3,3,3,3]
|
||||
; AVX2-NEXT: vpunpckldq (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
|
||||
; AVX2-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
|
||||
; AVX2-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill
|
||||
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
|
||||
; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm0
|
||||
; AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
|
||||
; AVX2-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
|
||||
; AVX2-NEXT: vzeroupper
|
||||
; AVX2-NEXT: callq __truncsfhf2@PLT
|
||||
; AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
|
||||
@ -2504,17 +2481,29 @@ define <16 x i16> @cvt_16f32_to_16i16(<16 x float> %a0) nounwind {
|
||||
; AVX2-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
|
||||
; AVX2-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
|
||||
; AVX2-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
|
||||
; AVX2-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
|
||||
; AVX2-NEXT: # xmm0 = mem[3,3,3,3]
|
||||
; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
|
||||
; AVX2-NEXT: callq __truncsfhf2@PLT
|
||||
; AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
|
||||
; AVX2-NEXT: vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
|
||||
; AVX2-NEXT: # xmm0 = mem[1,1,3,3]
|
||||
; AVX2-NEXT: callq __truncsfhf2@PLT
|
||||
; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
|
||||
; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
|
||||
; AVX2-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
|
||||
; AVX2-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
|
||||
; AVX2-NEXT: vinserti128 $1, (%rsp), %ymm0, %ymm0 # 16-byte Folded Reload
|
||||
; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
|
||||
; AVX2-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
|
||||
; AVX2-NEXT: # xmm0 = mem[3,3,3,3]
|
||||
; AVX2-NEXT: vzeroupper
|
||||
; AVX2-NEXT: callq __truncsfhf2@PLT
|
||||
; AVX2-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
|
||||
; AVX2-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
|
||||
; AVX2-NEXT: # xmm0 = mem[1,0]
|
||||
; AVX2-NEXT: callq __truncsfhf2@PLT
|
||||
; AVX2-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
|
||||
; AVX2-NEXT: vpunpcklwd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
|
||||
; AVX2-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
|
||||
; AVX2-NEXT: vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload
|
||||
; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
|
||||
; AVX2-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill
|
||||
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
|
||||
; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
|
||||
; AVX2-NEXT: vzeroupper
|
||||
@ -2525,7 +2514,19 @@ define <16 x i16> @cvt_16f32_to_16i16(<16 x float> %a0) nounwind {
|
||||
; AVX2-NEXT: callq __truncsfhf2@PLT
|
||||
; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
|
||||
; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
|
||||
; AVX2-NEXT: vpunpckldq (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
|
||||
; AVX2-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
|
||||
; AVX2-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
|
||||
; AVX2-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
|
||||
; AVX2-NEXT: # xmm0 = mem[3,3,3,3]
|
||||
; AVX2-NEXT: callq __truncsfhf2@PLT
|
||||
; AVX2-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
|
||||
; AVX2-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
|
||||
; AVX2-NEXT: # xmm0 = mem[1,0]
|
||||
; AVX2-NEXT: callq __truncsfhf2@PLT
|
||||
; AVX2-NEXT: vpunpcklwd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
|
||||
; AVX2-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
|
||||
; AVX2-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill
|
||||
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
|
||||
; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
|
||||
; AVX2-NEXT: vzeroupper
|
||||
@ -2536,12 +2537,12 @@ define <16 x i16> @cvt_16f32_to_16i16(<16 x float> %a0) nounwind {
|
||||
; AVX2-NEXT: callq __truncsfhf2@PLT
|
||||
; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
|
||||
; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
|
||||
; AVX2-NEXT: vpunpckldq (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
|
||||
; AVX2-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
|
||||
; AVX2-NEXT: vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload
|
||||
; AVX2-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
|
||||
; AVX2-NEXT: # ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5]
|
||||
; AVX2-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
|
||||
; AVX2-NEXT: # ymm0 = ymm0[0],mem[0],ymm0[2],mem[2]
|
||||
; AVX2-NEXT: addq $184, %rsp
|
||||
; AVX2-NEXT: addq $152, %rsp
|
||||
; AVX2-NEXT: retq
|
||||
;
|
||||
; F16C-LABEL: cvt_16f32_to_16i16:
|
||||
@ -2925,7 +2926,7 @@ define void @store_cvt_16f32_to_16i16(<16 x float> %a0, ptr %a1) nounwind {
|
||||
; AVX2-LABEL: store_cvt_16f32_to_16i16:
|
||||
; AVX2: # %bb.0:
|
||||
; AVX2-NEXT: pushq %rbx
|
||||
; AVX2-NEXT: subq $176, %rsp
|
||||
; AVX2-NEXT: subq $144, %rsp
|
||||
; AVX2-NEXT: movq %rdi, %rbx
|
||||
; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
|
||||
; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
|
||||
@ -2934,51 +2935,28 @@ define void @store_cvt_16f32_to_16i16(<16 x float> %a0, ptr %a1) nounwind {
|
||||
; AVX2-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
|
||||
; AVX2-NEXT: vzeroupper
|
||||
; AVX2-NEXT: callq __truncsfhf2@PLT
|
||||
; AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
|
||||
; AVX2-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
|
||||
; AVX2-NEXT: # xmm0 = mem[1,0]
|
||||
; AVX2-NEXT: callq __truncsfhf2@PLT
|
||||
; AVX2-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
|
||||
; AVX2-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
|
||||
; AVX2-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
|
||||
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
|
||||
; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm0
|
||||
; AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
|
||||
; AVX2-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
|
||||
; AVX2-NEXT: vzeroupper
|
||||
; AVX2-NEXT: callq __truncsfhf2@PLT
|
||||
; AVX2-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
|
||||
; AVX2-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
|
||||
; AVX2-NEXT: # xmm0 = mem[1,0]
|
||||
; AVX2-NEXT: callq __truncsfhf2@PLT
|
||||
; AVX2-NEXT: vpunpcklwd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
|
||||
; AVX2-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
|
||||
; AVX2-NEXT: vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload
|
||||
; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
|
||||
; AVX2-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill
|
||||
; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
|
||||
; AVX2-NEXT: vzeroupper
|
||||
; AVX2-NEXT: callq __truncsfhf2@PLT
|
||||
; AVX2-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
|
||||
; AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
|
||||
; AVX2-NEXT: vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
|
||||
; AVX2-NEXT: # xmm0 = mem[1,1,3,3]
|
||||
; AVX2-NEXT: callq __truncsfhf2@PLT
|
||||
; AVX2-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload
|
||||
; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
|
||||
; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
|
||||
; AVX2-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
|
||||
; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
|
||||
; AVX2-NEXT: callq __truncsfhf2@PLT
|
||||
; AVX2-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
|
||||
; AVX2-NEXT: vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
|
||||
; AVX2-NEXT: # xmm0 = mem[1,1,3,3]
|
||||
; AVX2-NEXT: callq __truncsfhf2@PLT
|
||||
; AVX2-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload
|
||||
; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
|
||||
; AVX2-NEXT: vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload
|
||||
; AVX2-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
|
||||
; AVX2-NEXT: # ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5]
|
||||
; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
|
||||
; AVX2-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
|
||||
; AVX2-NEXT: # xmm0 = mem[3,3,3,3]
|
||||
; AVX2-NEXT: vpunpckldq (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
|
||||
; AVX2-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
|
||||
; AVX2-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill
|
||||
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
|
||||
; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm0
|
||||
; AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
|
||||
; AVX2-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
|
||||
; AVX2-NEXT: vzeroupper
|
||||
; AVX2-NEXT: callq __truncsfhf2@PLT
|
||||
; AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
|
||||
@ -2988,17 +2966,29 @@ define void @store_cvt_16f32_to_16i16(<16 x float> %a0, ptr %a1) nounwind {
|
||||
; AVX2-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
|
||||
; AVX2-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
|
||||
; AVX2-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
|
||||
; AVX2-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
|
||||
; AVX2-NEXT: # xmm0 = mem[3,3,3,3]
|
||||
; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
|
||||
; AVX2-NEXT: callq __truncsfhf2@PLT
|
||||
; AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
|
||||
; AVX2-NEXT: vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
|
||||
; AVX2-NEXT: # xmm0 = mem[1,1,3,3]
|
||||
; AVX2-NEXT: callq __truncsfhf2@PLT
|
||||
; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
|
||||
; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
|
||||
; AVX2-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
|
||||
; AVX2-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
|
||||
; AVX2-NEXT: vinserti128 $1, (%rsp), %ymm0, %ymm0 # 16-byte Folded Reload
|
||||
; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
|
||||
; AVX2-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
|
||||
; AVX2-NEXT: # xmm0 = mem[3,3,3,3]
|
||||
; AVX2-NEXT: vzeroupper
|
||||
; AVX2-NEXT: callq __truncsfhf2@PLT
|
||||
; AVX2-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
|
||||
; AVX2-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
|
||||
; AVX2-NEXT: # xmm0 = mem[1,0]
|
||||
; AVX2-NEXT: callq __truncsfhf2@PLT
|
||||
; AVX2-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
|
||||
; AVX2-NEXT: vpunpcklwd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
|
||||
; AVX2-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
|
||||
; AVX2-NEXT: vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload
|
||||
; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
|
||||
; AVX2-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill
|
||||
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
|
||||
; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
|
||||
; AVX2-NEXT: vzeroupper
|
||||
@ -3009,7 +2999,19 @@ define void @store_cvt_16f32_to_16i16(<16 x float> %a0, ptr %a1) nounwind {
|
||||
; AVX2-NEXT: callq __truncsfhf2@PLT
|
||||
; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
|
||||
; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
|
||||
; AVX2-NEXT: vpunpckldq (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
|
||||
; AVX2-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
|
||||
; AVX2-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
|
||||
; AVX2-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
|
||||
; AVX2-NEXT: # xmm0 = mem[3,3,3,3]
|
||||
; AVX2-NEXT: callq __truncsfhf2@PLT
|
||||
; AVX2-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
|
||||
; AVX2-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
|
||||
; AVX2-NEXT: # xmm0 = mem[1,0]
|
||||
; AVX2-NEXT: callq __truncsfhf2@PLT
|
||||
; AVX2-NEXT: vpunpcklwd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
|
||||
; AVX2-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
|
||||
; AVX2-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill
|
||||
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
|
||||
; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
|
||||
; AVX2-NEXT: vzeroupper
|
||||
@ -3020,13 +3022,13 @@ define void @store_cvt_16f32_to_16i16(<16 x float> %a0, ptr %a1) nounwind {
|
||||
; AVX2-NEXT: callq __truncsfhf2@PLT
|
||||
; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
|
||||
; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
|
||||
; AVX2-NEXT: vpunpckldq (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
|
||||
; AVX2-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
|
||||
; AVX2-NEXT: vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload
|
||||
; AVX2-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
|
||||
; AVX2-NEXT: # ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5]
|
||||
; AVX2-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
|
||||
; AVX2-NEXT: # ymm0 = ymm0[0],mem[0],ymm0[2],mem[2]
|
||||
; AVX2-NEXT: vmovdqa %ymm0, (%rbx)
|
||||
; AVX2-NEXT: addq $176, %rsp
|
||||
; AVX2-NEXT: addq $144, %rsp
|
||||
; AVX2-NEXT: popq %rbx
|
||||
; AVX2-NEXT: vzeroupper
|
||||
; AVX2-NEXT: retq
|
||||
@ -4672,7 +4674,7 @@ define void @store_cvt_32f32_to_32f16(<32 x float> %a0, ptr %a1) nounwind {
|
||||
; AVX2-LABEL: store_cvt_32f32_to_32f16:
|
||||
; AVX2: # %bb.0:
|
||||
; AVX2-NEXT: pushq %rbx
|
||||
; AVX2-NEXT: subq $240, %rsp
|
||||
; AVX2-NEXT: subq $208, %rsp
|
||||
; AVX2-NEXT: movq %rdi, %rbx
|
||||
; AVX2-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
|
||||
; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
|
||||
@ -4689,23 +4691,8 @@ define void @store_cvt_32f32_to_32f16(<32 x float> %a0, ptr %a1) nounwind {
|
||||
; AVX2-NEXT: callq __truncsfhf2@PLT
|
||||
; AVX2-NEXT: vpunpcklwd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
|
||||
; AVX2-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
|
||||
; AVX2-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
|
||||
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
|
||||
; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm0
|
||||
; AVX2-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
|
||||
; AVX2-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
|
||||
; AVX2-NEXT: vzeroupper
|
||||
; AVX2-NEXT: callq __truncsfhf2@PLT
|
||||
; AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
|
||||
; AVX2-NEXT: vpermilpd $1, (%rsp), %xmm0 # 16-byte Folded Reload
|
||||
; AVX2-NEXT: # xmm0 = mem[1,0]
|
||||
; AVX2-NEXT: callq __truncsfhf2@PLT
|
||||
; AVX2-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
|
||||
; AVX2-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
|
||||
; AVX2-NEXT: vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload
|
||||
; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
|
||||
; AVX2-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill
|
||||
; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
|
||||
; AVX2-NEXT: vzeroupper
|
||||
; AVX2-NEXT: callq __truncsfhf2@PLT
|
||||
; AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
|
||||
; AVX2-NEXT: vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
|
||||
@ -4713,18 +4700,33 @@ define void @store_cvt_32f32_to_32f16(<32 x float> %a0, ptr %a1) nounwind {
|
||||
; AVX2-NEXT: callq __truncsfhf2@PLT
|
||||
; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
|
||||
; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
|
||||
; AVX2-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
|
||||
; AVX2-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload
|
||||
; AVX2-NEXT: vpunpckldq (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
|
||||
; AVX2-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
|
||||
; AVX2-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill
|
||||
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
|
||||
; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm0
|
||||
; AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
|
||||
; AVX2-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
|
||||
; AVX2-NEXT: vzeroupper
|
||||
; AVX2-NEXT: callq __truncsfhf2@PLT
|
||||
; AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
|
||||
; AVX2-NEXT: vmovshdup (%rsp), %xmm0 # 16-byte Folded Reload
|
||||
; AVX2-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
|
||||
; AVX2-NEXT: # xmm0 = mem[1,0]
|
||||
; AVX2-NEXT: callq __truncsfhf2@PLT
|
||||
; AVX2-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
|
||||
; AVX2-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
|
||||
; AVX2-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
|
||||
; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
|
||||
; AVX2-NEXT: callq __truncsfhf2@PLT
|
||||
; AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
|
||||
; AVX2-NEXT: vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
|
||||
; AVX2-NEXT: # xmm0 = mem[1,1,3,3]
|
||||
; AVX2-NEXT: callq __truncsfhf2@PLT
|
||||
; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
|
||||
; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
|
||||
; AVX2-NEXT: vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload
|
||||
; AVX2-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
|
||||
; AVX2-NEXT: # ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5]
|
||||
; AVX2-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
|
||||
; AVX2-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
|
||||
; AVX2-NEXT: vinserti128 $1, (%rsp), %ymm0, %ymm0 # 16-byte Folded Reload
|
||||
; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
|
||||
; AVX2-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
|
||||
; AVX2-NEXT: # xmm0 = mem[3,3,3,3]
|
||||
@ -4737,28 +4739,29 @@ define void @store_cvt_32f32_to_32f16(<32 x float> %a0, ptr %a1) nounwind {
|
||||
; AVX2-NEXT: vpunpcklwd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
|
||||
; AVX2-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
|
||||
; AVX2-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill
|
||||
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
|
||||
; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
|
||||
; AVX2-NEXT: vzeroupper
|
||||
; AVX2-NEXT: callq __truncsfhf2@PLT
|
||||
; AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
|
||||
; AVX2-NEXT: vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
|
||||
; AVX2-NEXT: # xmm0 = mem[1,1,3,3]
|
||||
; AVX2-NEXT: callq __truncsfhf2@PLT
|
||||
; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
|
||||
; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
|
||||
; AVX2-NEXT: vpunpckldq (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
|
||||
; AVX2-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
|
||||
; AVX2-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
|
||||
; AVX2-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
|
||||
; AVX2-NEXT: # xmm0 = mem[3,3,3,3]
|
||||
; AVX2-NEXT: callq __truncsfhf2@PLT
|
||||
; AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
|
||||
; AVX2-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
|
||||
; AVX2-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
|
||||
; AVX2-NEXT: # xmm0 = mem[1,0]
|
||||
; AVX2-NEXT: callq __truncsfhf2@PLT
|
||||
; AVX2-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
|
||||
; AVX2-NEXT: vpunpcklwd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
|
||||
; AVX2-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
|
||||
; AVX2-NEXT: vinserti128 $1, (%rsp), %ymm0, %ymm0 # 16-byte Folded Reload
|
||||
; AVX2-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill
|
||||
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
|
||||
; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
|
||||
; AVX2-NEXT: vzeroupper
|
||||
; AVX2-NEXT: callq __truncsfhf2@PLT
|
||||
; AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
|
||||
; AVX2-NEXT: vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
|
||||
; AVX2-NEXT: # xmm0 = mem[1,1,3,3]
|
||||
; AVX2-NEXT: callq __truncsfhf2@PLT
|
||||
; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
|
||||
; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
|
||||
; AVX2-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
|
||||
; AVX2-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill
|
||||
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
|
||||
; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
|
||||
; AVX2-NEXT: vzeroupper
|
||||
@ -4769,9 +4772,9 @@ define void @store_cvt_32f32_to_32f16(<32 x float> %a0, ptr %a1) nounwind {
|
||||
; AVX2-NEXT: callq __truncsfhf2@PLT
|
||||
; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
|
||||
; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
|
||||
; AVX2-NEXT: vpunpckldq (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
|
||||
; AVX2-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
|
||||
; AVX2-NEXT: vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload
|
||||
; AVX2-NEXT: vpunpckldq (%rsp), %ymm0, %ymm0 # 32-byte Folded Reload
|
||||
; AVX2-NEXT: # ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5]
|
||||
; AVX2-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
|
||||
; AVX2-NEXT: # ymm0 = ymm0[0],mem[0],ymm0[2],mem[2]
|
||||
; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
|
||||
@ -4787,75 +4790,53 @@ define void @store_cvt_32f32_to_32f16(<32 x float> %a0, ptr %a1) nounwind {
|
||||
; AVX2-NEXT: callq __truncsfhf2@PLT
|
||||
; AVX2-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
|
||||
; AVX2-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
|
||||
; AVX2-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill
|
||||
; AVX2-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
|
||||
; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
|
||||
; AVX2-NEXT: callq __truncsfhf2@PLT
|
||||
; AVX2-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
|
||||
; AVX2-NEXT: vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
|
||||
; AVX2-NEXT: # xmm0 = mem[1,1,3,3]
|
||||
; AVX2-NEXT: callq __truncsfhf2@PLT
|
||||
; AVX2-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload
|
||||
; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
|
||||
; AVX2-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
|
||||
; AVX2-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
|
||||
; AVX2-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
|
||||
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
|
||||
; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm0
|
||||
; AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
|
||||
; AVX2-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
|
||||
; AVX2-NEXT: vzeroupper
|
||||
; AVX2-NEXT: callq __truncsfhf2@PLT
|
||||
; AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
|
||||
; AVX2-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
|
||||
; AVX2-NEXT: # xmm0 = mem[1,0]
|
||||
; AVX2-NEXT: callq __truncsfhf2@PLT
|
||||
; AVX2-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
|
||||
; AVX2-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
|
||||
; AVX2-NEXT: vinserti128 $1, (%rsp), %ymm0, %ymm0 # 16-byte Folded Reload
|
||||
; AVX2-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill
|
||||
; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
|
||||
; AVX2-NEXT: vzeroupper
|
||||
; AVX2-NEXT: callq __truncsfhf2@PLT
|
||||
; AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
|
||||
; AVX2-NEXT: vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
|
||||
; AVX2-NEXT: # xmm0 = mem[1,1,3,3]
|
||||
; AVX2-NEXT: callq __truncsfhf2@PLT
|
||||
; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
|
||||
; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
|
||||
; AVX2-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
|
||||
; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
|
||||
; AVX2-NEXT: callq __truncsfhf2@PLT
|
||||
; AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
|
||||
; AVX2-NEXT: vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
|
||||
; AVX2-NEXT: # xmm0 = mem[1,1,3,3]
|
||||
; AVX2-NEXT: callq __truncsfhf2@PLT
|
||||
; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
|
||||
; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
|
||||
; AVX2-NEXT: vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload
|
||||
; AVX2-NEXT: vpunpckldq (%rsp), %ymm0, %ymm0 # 32-byte Folded Reload
|
||||
; AVX2-NEXT: # ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5]
|
||||
; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
|
||||
; AVX2-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
|
||||
; AVX2-NEXT: # xmm0 = mem[3,3,3,3]
|
||||
; AVX2-NEXT: vzeroupper
|
||||
; AVX2-NEXT: callq __truncsfhf2@PLT
|
||||
; AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
|
||||
; AVX2-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
|
||||
; AVX2-NEXT: # xmm0 = mem[1,0]
|
||||
; AVX2-NEXT: callq __truncsfhf2@PLT
|
||||
; AVX2-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
|
||||
; AVX2-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
|
||||
; AVX2-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
|
||||
; AVX2-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
|
||||
; AVX2-NEXT: # xmm0 = mem[3,3,3,3]
|
||||
; AVX2-NEXT: callq __truncsfhf2@PLT
|
||||
; AVX2-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
|
||||
; AVX2-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
|
||||
; AVX2-NEXT: # xmm0 = mem[1,0]
|
||||
; AVX2-NEXT: callq __truncsfhf2@PLT
|
||||
; AVX2-NEXT: vpunpcklwd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
|
||||
; AVX2-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
|
||||
; AVX2-NEXT: vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload
|
||||
; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
|
||||
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
|
||||
; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
|
||||
; AVX2-NEXT: vzeroupper
|
||||
; AVX2-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill
|
||||
; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
|
||||
; AVX2-NEXT: callq __truncsfhf2@PLT
|
||||
; AVX2-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
|
||||
; AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
|
||||
; AVX2-NEXT: vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
|
||||
; AVX2-NEXT: # xmm0 = mem[1,1,3,3]
|
||||
; AVX2-NEXT: callq __truncsfhf2@PLT
|
||||
; AVX2-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload
|
||||
; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
|
||||
; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
|
||||
; AVX2-NEXT: vpunpckldq (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
|
||||
; AVX2-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
|
||||
; AVX2-NEXT: vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload
|
||||
; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
|
||||
; AVX2-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
|
||||
; AVX2-NEXT: # xmm0 = mem[3,3,3,3]
|
||||
; AVX2-NEXT: vzeroupper
|
||||
; AVX2-NEXT: callq __truncsfhf2@PLT
|
||||
; AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
|
||||
; AVX2-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
|
||||
; AVX2-NEXT: # xmm0 = mem[1,0]
|
||||
; AVX2-NEXT: callq __truncsfhf2@PLT
|
||||
; AVX2-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
|
||||
; AVX2-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
|
||||
; AVX2-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
|
||||
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
|
||||
; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
|
||||
@ -4867,15 +4848,38 @@ define void @store_cvt_32f32_to_32f16(<32 x float> %a0, ptr %a1) nounwind {
|
||||
; AVX2-NEXT: callq __truncsfhf2@PLT
|
||||
; AVX2-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload
|
||||
; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
|
||||
; AVX2-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
|
||||
; AVX2-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
|
||||
; AVX2-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
|
||||
; AVX2-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
|
||||
; AVX2-NEXT: # xmm0 = mem[3,3,3,3]
|
||||
; AVX2-NEXT: callq __truncsfhf2@PLT
|
||||
; AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
|
||||
; AVX2-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
|
||||
; AVX2-NEXT: # xmm0 = mem[1,0]
|
||||
; AVX2-NEXT: callq __truncsfhf2@PLT
|
||||
; AVX2-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
|
||||
; AVX2-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
|
||||
; AVX2-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
|
||||
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
|
||||
; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
|
||||
; AVX2-NEXT: vzeroupper
|
||||
; AVX2-NEXT: callq __truncsfhf2@PLT
|
||||
; AVX2-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
|
||||
; AVX2-NEXT: vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
|
||||
; AVX2-NEXT: # xmm0 = mem[1,1,3,3]
|
||||
; AVX2-NEXT: callq __truncsfhf2@PLT
|
||||
; AVX2-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload
|
||||
; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
|
||||
; AVX2-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
|
||||
; AVX2-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
|
||||
; AVX2-NEXT: vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload
|
||||
; AVX2-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
|
||||
; AVX2-NEXT: # ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5]
|
||||
; AVX2-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
|
||||
; AVX2-NEXT: # ymm0 = ymm0[0],mem[0],ymm0[2],mem[2]
|
||||
; AVX2-NEXT: vmovdqa %ymm0, 32(%rbx)
|
||||
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
|
||||
; AVX2-NEXT: vmovaps %ymm0, (%rbx)
|
||||
; AVX2-NEXT: addq $240, %rsp
|
||||
; AVX2-NEXT: addq $208, %rsp
|
||||
; AVX2-NEXT: popq %rbx
|
||||
; AVX2-NEXT: vzeroupper
|
||||
; AVX2-NEXT: retq
|
||||
|
@ -130,14 +130,13 @@ define void @store_i16_stride4_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
|
||||
;
|
||||
; AVX512BW-FCP-LABEL: store_i16_stride4_vf2:
|
||||
; AVX512BW-FCP: # %bb.0:
|
||||
; AVX512BW-FCP-NEXT: vmovdqa (%rdx), %xmm0
|
||||
; AVX512BW-FCP-NEXT: vmovdqa (%rcx), %xmm1
|
||||
; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm2 = [8,24,0,16,9,25,1,17]
|
||||
; AVX512BW-FCP-NEXT: vinserti128 $1, (%rsi), %ymm1, %ymm1
|
||||
; AVX512BW-FCP-NEXT: vinserti128 $1, (%rdi), %ymm0, %ymm0
|
||||
; AVX512BW-FCP-NEXT: vpermt2w %ymm1, %ymm2, %ymm0
|
||||
; AVX512BW-FCP-NEXT: vmovdqa %xmm0, (%r8)
|
||||
; AVX512BW-FCP-NEXT: vzeroupper
|
||||
; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %xmm0
|
||||
; AVX512BW-FCP-NEXT: vmovdqa (%rdx), %xmm1
|
||||
; AVX512BW-FCP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
|
||||
; AVX512BW-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1]
|
||||
; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm2 = [0,2,8,10,1,3,9,11]
|
||||
; AVX512BW-FCP-NEXT: vpermi2w %xmm1, %xmm0, %xmm2
|
||||
; AVX512BW-FCP-NEXT: vmovdqa %xmm2, (%r8)
|
||||
; AVX512BW-FCP-NEXT: retq
|
||||
;
|
||||
; AVX512DQ-BW-LABEL: store_i16_stride4_vf2:
|
||||
@ -153,14 +152,13 @@ define void @store_i16_stride4_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
|
||||
;
|
||||
; AVX512DQ-BW-FCP-LABEL: store_i16_stride4_vf2:
|
||||
; AVX512DQ-BW-FCP: # %bb.0:
|
||||
; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdx), %xmm0
|
||||
; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rcx), %xmm1
|
||||
; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm2 = [8,24,0,16,9,25,1,17]
|
||||
; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, (%rsi), %ymm1, %ymm1
|
||||
; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, (%rdi), %ymm0, %ymm0
|
||||
; AVX512DQ-BW-FCP-NEXT: vpermt2w %ymm1, %ymm2, %ymm0
|
||||
; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm0, (%r8)
|
||||
; AVX512DQ-BW-FCP-NEXT: vzeroupper
|
||||
; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %xmm0
|
||||
; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdx), %xmm1
|
||||
; AVX512DQ-BW-FCP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
|
||||
; AVX512DQ-BW-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1]
|
||||
; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm2 = [0,2,8,10,1,3,9,11]
|
||||
; AVX512DQ-BW-FCP-NEXT: vpermi2w %xmm1, %xmm0, %xmm2
|
||||
; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm2, (%r8)
|
||||
; AVX512DQ-BW-FCP-NEXT: retq
|
||||
%in.vec0 = load <2 x i16>, ptr %in.vecptr0, align 64
|
||||
%in.vec1 = load <2 x i16>, ptr %in.vecptr1, align 64
|
||||
|
@ -84,13 +84,13 @@ define void @store_i16_stride7_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
|
||||
; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax
|
||||
; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %r10
|
||||
; AVX2-NEXT: vmovdqa (%rdi), %xmm0
|
||||
; AVX2-NEXT: vmovdqa (%rsi), %xmm1
|
||||
; AVX2-NEXT: vmovdqa (%rdx), %xmm2
|
||||
; AVX2-NEXT: vmovdqa (%rdx), %xmm1
|
||||
; AVX2-NEXT: vmovdqa (%r8), %xmm2
|
||||
; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
|
||||
; AVX2-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1]
|
||||
; AVX2-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[1],mem[1]
|
||||
; AVX2-NEXT: vinserti128 $1, (%r9), %ymm1, %ymm1
|
||||
; AVX2-NEXT: vinserti128 $1, (%r8), %ymm0, %ymm0
|
||||
; AVX2-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
|
||||
; AVX2-NEXT: vinserti128 $1, (%r10), %ymm2, %ymm1
|
||||
; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
|
||||
; AVX2-NEXT: vinserti128 $1, (%r10), %ymm1, %ymm1
|
||||
; AVX2-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
|
||||
; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,ymm0[2,3],zero,zero,zero,zero,zero,zero,ymm0[18,19,22,23,26,27],zero,zero,zero,zero
|
||||
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1]
|
||||
@ -109,13 +109,13 @@ define void @store_i16_stride7_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
|
||||
; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %rax
|
||||
; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %r10
|
||||
; AVX2-FP-NEXT: vmovdqa (%rdi), %xmm0
|
||||
; AVX2-FP-NEXT: vmovdqa (%rsi), %xmm1
|
||||
; AVX2-FP-NEXT: vmovdqa (%rdx), %xmm2
|
||||
; AVX2-FP-NEXT: vmovdqa (%rdx), %xmm1
|
||||
; AVX2-FP-NEXT: vmovdqa (%r8), %xmm2
|
||||
; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
|
||||
; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1]
|
||||
; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[1],mem[1]
|
||||
; AVX2-FP-NEXT: vinserti128 $1, (%r9), %ymm1, %ymm1
|
||||
; AVX2-FP-NEXT: vinserti128 $1, (%r8), %ymm0, %ymm0
|
||||
; AVX2-FP-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
|
||||
; AVX2-FP-NEXT: vinserti128 $1, (%r10), %ymm2, %ymm1
|
||||
; AVX2-FP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
|
||||
; AVX2-FP-NEXT: vinserti128 $1, (%r10), %ymm1, %ymm1
|
||||
; AVX2-FP-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
|
||||
; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,ymm0[2,3],zero,zero,zero,zero,zero,zero,ymm0[18,19,22,23,26,27],zero,zero,zero,zero
|
||||
; AVX2-FP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1]
|
||||
@ -134,13 +134,13 @@ define void @store_i16_stride7_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
|
||||
; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
|
||||
; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10
|
||||
; AVX2-FCP-NEXT: vmovdqa (%rdi), %xmm0
|
||||
; AVX2-FCP-NEXT: vmovdqa (%rsi), %xmm1
|
||||
; AVX2-FCP-NEXT: vmovdqa (%rdx), %xmm2
|
||||
; AVX2-FCP-NEXT: vmovdqa (%rdx), %xmm1
|
||||
; AVX2-FCP-NEXT: vmovdqa (%r8), %xmm2
|
||||
; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
|
||||
; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1]
|
||||
; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[1],mem[1]
|
||||
; AVX2-FCP-NEXT: vinserti128 $1, (%r9), %ymm1, %ymm1
|
||||
; AVX2-FCP-NEXT: vinserti128 $1, (%r8), %ymm0, %ymm0
|
||||
; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
|
||||
; AVX2-FCP-NEXT: vinserti128 $1, (%r10), %ymm2, %ymm1
|
||||
; AVX2-FCP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
|
||||
; AVX2-FCP-NEXT: vinserti128 $1, (%r10), %ymm1, %ymm1
|
||||
; AVX2-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
|
||||
; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,ymm0[2,3],zero,zero,zero,zero,zero,zero,ymm0[18,19,22,23,26,27],zero,zero,zero,zero
|
||||
; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1]
|
||||
@ -159,13 +159,13 @@ define void @store_i16_stride7_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
|
||||
; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax
|
||||
; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r10
|
||||
; AVX512-NEXT: vmovdqa (%rdi), %xmm0
|
||||
; AVX512-NEXT: vmovdqa (%rsi), %xmm1
|
||||
; AVX512-NEXT: vmovdqa (%rdx), %xmm2
|
||||
; AVX512-NEXT: vmovdqa (%rdx), %xmm1
|
||||
; AVX512-NEXT: vmovdqa (%r8), %xmm2
|
||||
; AVX512-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
|
||||
; AVX512-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1]
|
||||
; AVX512-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[1],mem[1]
|
||||
; AVX512-NEXT: vinserti128 $1, (%r9), %ymm1, %ymm1
|
||||
; AVX512-NEXT: vinserti128 $1, (%r8), %ymm0, %ymm0
|
||||
; AVX512-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
|
||||
; AVX512-NEXT: vinserti128 $1, (%r10), %ymm2, %ymm1
|
||||
; AVX512-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
|
||||
; AVX512-NEXT: vinserti128 $1, (%r10), %ymm1, %ymm1
|
||||
; AVX512-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
|
||||
; AVX512-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,ymm0[2,3],zero,zero,zero,zero,zero,zero,ymm0[18,19,22,23,26,27,u,u,u,u]
|
||||
; AVX512-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1]
|
||||
@ -184,13 +184,13 @@ define void @store_i16_stride7_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
|
||||
; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
|
||||
; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10
|
||||
; AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm0
|
||||
; AVX512-FCP-NEXT: vmovdqa (%rsi), %xmm1
|
||||
; AVX512-FCP-NEXT: vmovdqa (%rdx), %xmm2
|
||||
; AVX512-FCP-NEXT: vmovdqa (%rdx), %xmm1
|
||||
; AVX512-FCP-NEXT: vmovdqa (%r8), %xmm2
|
||||
; AVX512-FCP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
|
||||
; AVX512-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1]
|
||||
; AVX512-FCP-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[1],mem[1]
|
||||
; AVX512-FCP-NEXT: vinserti128 $1, (%r9), %ymm1, %ymm1
|
||||
; AVX512-FCP-NEXT: vinserti128 $1, (%r8), %ymm0, %ymm0
|
||||
; AVX512-FCP-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
|
||||
; AVX512-FCP-NEXT: vinserti128 $1, (%r10), %ymm2, %ymm1
|
||||
; AVX512-FCP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
|
||||
; AVX512-FCP-NEXT: vinserti128 $1, (%r10), %ymm1, %ymm1
|
||||
; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
|
||||
; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,ymm0[2,3],zero,zero,zero,zero,zero,zero,ymm0[18,19,22,23,26,27,u,u,u,u]
|
||||
; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1]
|
||||
@ -209,13 +209,13 @@ define void @store_i16_stride7_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
|
||||
; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax
|
||||
; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %r10
|
||||
; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm0
|
||||
; AVX512DQ-NEXT: vmovdqa (%rsi), %xmm1
|
||||
; AVX512DQ-NEXT: vmovdqa (%rdx), %xmm2
|
||||
; AVX512DQ-NEXT: vmovdqa (%rdx), %xmm1
|
||||
; AVX512DQ-NEXT: vmovdqa (%r8), %xmm2
|
||||
; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
|
||||
; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1]
|
||||
; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[1],mem[1]
|
||||
; AVX512DQ-NEXT: vinserti128 $1, (%r9), %ymm1, %ymm1
|
||||
; AVX512DQ-NEXT: vinserti128 $1, (%r8), %ymm0, %ymm0
|
||||
; AVX512DQ-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
|
||||
; AVX512DQ-NEXT: vinserti128 $1, (%r10), %ymm2, %ymm1
|
||||
; AVX512DQ-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
|
||||
; AVX512DQ-NEXT: vinserti128 $1, (%r10), %ymm1, %ymm1
|
||||
; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
|
||||
; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,ymm0[2,3],zero,zero,zero,zero,zero,zero,ymm0[18,19,22,23,26,27,u,u,u,u]
|
||||
; AVX512DQ-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1]
|
||||
@ -234,13 +234,13 @@ define void @store_i16_stride7_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
|
||||
; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
|
||||
; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10
|
||||
; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %xmm0
|
||||
; AVX512DQ-FCP-NEXT: vmovdqa (%rsi), %xmm1
|
||||
; AVX512DQ-FCP-NEXT: vmovdqa (%rdx), %xmm2
|
||||
; AVX512DQ-FCP-NEXT: vmovdqa (%rdx), %xmm1
|
||||
; AVX512DQ-FCP-NEXT: vmovdqa (%r8), %xmm2
|
||||
; AVX512DQ-FCP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
|
||||
; AVX512DQ-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1]
|
||||
; AVX512DQ-FCP-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[1],mem[1]
|
||||
; AVX512DQ-FCP-NEXT: vinserti128 $1, (%r9), %ymm1, %ymm1
|
||||
; AVX512DQ-FCP-NEXT: vinserti128 $1, (%r8), %ymm0, %ymm0
|
||||
; AVX512DQ-FCP-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
|
||||
; AVX512DQ-FCP-NEXT: vinserti128 $1, (%r10), %ymm2, %ymm1
|
||||
; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
|
||||
; AVX512DQ-FCP-NEXT: vinserti128 $1, (%r10), %ymm1, %ymm1
|
||||
; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
|
||||
; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,ymm0[2,3],zero,zero,zero,zero,zero,zero,ymm0[18,19,22,23,26,27,u,u,u,u]
|
||||
; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1]
|
||||
@ -259,13 +259,13 @@ define void @store_i16_stride7_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
|
||||
; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax
|
||||
; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r10
|
||||
; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0
|
||||
; AVX512BW-NEXT: vmovdqa (%rsi), %xmm1
|
||||
; AVX512BW-NEXT: vmovdqa (%rdx), %xmm2
|
||||
; AVX512BW-NEXT: vmovdqa (%rdx), %xmm1
|
||||
; AVX512BW-NEXT: vmovdqa (%r8), %xmm2
|
||||
; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
|
||||
; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1]
|
||||
; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[1],mem[1]
|
||||
; AVX512BW-NEXT: vinserti128 $1, (%r9), %ymm1, %ymm1
|
||||
; AVX512BW-NEXT: vinserti128 $1, (%r8), %ymm0, %ymm0
|
||||
; AVX512BW-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
|
||||
; AVX512BW-NEXT: vinserti128 $1, (%r10), %ymm2, %ymm1
|
||||
; AVX512BW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
|
||||
; AVX512BW-NEXT: vinserti128 $1, (%r10), %ymm1, %ymm1
|
||||
; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm2 = [0,2,16,18,8,10,24,1,3,17,19,9,11,25,0,0]
|
||||
; AVX512BW-NEXT: vpermi2w %ymm1, %ymm0, %ymm2
|
||||
; AVX512BW-NEXT: vextracti128 $1, %ymm2, %xmm0
|
||||
@ -280,13 +280,13 @@ define void @store_i16_stride7_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
|
||||
; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
|
||||
; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10
|
||||
; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %xmm0
|
||||
; AVX512BW-FCP-NEXT: vmovdqa (%rsi), %xmm1
|
||||
; AVX512BW-FCP-NEXT: vmovdqa (%rdx), %xmm2
|
||||
; AVX512BW-FCP-NEXT: vmovdqa (%rdx), %xmm1
|
||||
; AVX512BW-FCP-NEXT: vmovdqa (%r8), %xmm2
|
||||
; AVX512BW-FCP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
|
||||
; AVX512BW-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1]
|
||||
; AVX512BW-FCP-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[1],mem[1]
|
||||
; AVX512BW-FCP-NEXT: vinserti128 $1, (%r9), %ymm1, %ymm1
|
||||
; AVX512BW-FCP-NEXT: vinserti128 $1, (%r8), %ymm0, %ymm0
|
||||
; AVX512BW-FCP-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
|
||||
; AVX512BW-FCP-NEXT: vinserti128 $1, (%r10), %ymm2, %ymm1
|
||||
; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
|
||||
; AVX512BW-FCP-NEXT: vinserti128 $1, (%r10), %ymm1, %ymm1
|
||||
; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [0,2,16,18,8,10,24,1,3,17,19,9,11,25,0,0]
|
||||
; AVX512BW-FCP-NEXT: vpermi2w %ymm1, %ymm0, %ymm2
|
||||
; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm2, %xmm0
|
||||
@ -301,13 +301,13 @@ define void @store_i16_stride7_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
|
||||
; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %rax
|
||||
; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %r10
|
||||
; AVX512DQ-BW-NEXT: vmovdqa (%rdi), %xmm0
|
||||
; AVX512DQ-BW-NEXT: vmovdqa (%rsi), %xmm1
|
||||
; AVX512DQ-BW-NEXT: vmovdqa (%rdx), %xmm2
|
||||
; AVX512DQ-BW-NEXT: vmovdqa (%rdx), %xmm1
|
||||
; AVX512DQ-BW-NEXT: vmovdqa (%r8), %xmm2
|
||||
; AVX512DQ-BW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
|
||||
; AVX512DQ-BW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1]
|
||||
; AVX512DQ-BW-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[1],mem[1]
|
||||
; AVX512DQ-BW-NEXT: vinserti128 $1, (%r9), %ymm1, %ymm1
|
||||
; AVX512DQ-BW-NEXT: vinserti128 $1, (%r8), %ymm0, %ymm0
|
||||
; AVX512DQ-BW-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
|
||||
; AVX512DQ-BW-NEXT: vinserti128 $1, (%r10), %ymm2, %ymm1
|
||||
; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
|
||||
; AVX512DQ-BW-NEXT: vinserti128 $1, (%r10), %ymm1, %ymm1
|
||||
; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm2 = [0,2,16,18,8,10,24,1,3,17,19,9,11,25,0,0]
|
||||
; AVX512DQ-BW-NEXT: vpermi2w %ymm1, %ymm0, %ymm2
|
||||
; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm2, %xmm0
|
||||
@ -322,13 +322,13 @@ define void @store_i16_stride7_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
|
||||
; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
|
||||
; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10
|
||||
; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %xmm0
|
||||
; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rsi), %xmm1
|
||||
; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdx), %xmm2
|
||||
; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdx), %xmm1
|
||||
; AVX512DQ-BW-FCP-NEXT: vmovdqa (%r8), %xmm2
|
||||
; AVX512DQ-BW-FCP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
|
||||
; AVX512DQ-BW-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1]
|
||||
; AVX512DQ-BW-FCP-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[1],mem[1]
|
||||
; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, (%r9), %ymm1, %ymm1
|
||||
; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, (%r8), %ymm0, %ymm0
|
||||
; AVX512DQ-BW-FCP-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
|
||||
; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, (%r10), %ymm2, %ymm1
|
||||
; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
|
||||
; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, (%r10), %ymm1, %ymm1
|
||||
; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [0,2,16,18,8,10,24,1,3,17,19,9,11,25,0,0]
|
||||
; AVX512DQ-BW-FCP-NEXT: vpermi2w %ymm1, %ymm0, %ymm2
|
||||
; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm2, %xmm0
|
||||
|
@ -77,16 +77,16 @@ define void @store_i16_stride8_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
|
||||
; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %r10
|
||||
; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %r11
|
||||
; AVX2-NEXT: vmovdqa (%rdi), %xmm0
|
||||
; AVX2-NEXT: vmovdqa (%rsi), %xmm1
|
||||
; AVX2-NEXT: vmovdqa (%rdx), %xmm2
|
||||
; AVX2-NEXT: vmovdqa (%rcx), %xmm3
|
||||
; AVX2-NEXT: vinserti128 $1, (%r11), %ymm3, %ymm3
|
||||
; AVX2-NEXT: vinserti128 $1, (%r10), %ymm2, %ymm2
|
||||
; AVX2-NEXT: vpunpckldq {{.*#+}} ymm2 = ymm2[0],ymm3[0],ymm2[1],ymm3[1],ymm2[4],ymm3[4],ymm2[5],ymm3[5]
|
||||
; AVX2-NEXT: vinserti128 $1, (%r9), %ymm1, %ymm1
|
||||
; AVX2-NEXT: vinserti128 $1, (%r8), %ymm0, %ymm0
|
||||
; AVX2-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
|
||||
; AVX2-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[2],ymm2[2]
|
||||
; AVX2-NEXT: vmovdqa (%rdx), %xmm1
|
||||
; AVX2-NEXT: vmovdqa (%r8), %xmm2
|
||||
; AVX2-NEXT: vmovdqa (%r11), %xmm3
|
||||
; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
|
||||
; AVX2-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1]
|
||||
; AVX2-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[1],mem[1]
|
||||
; AVX2-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm3[0],mem[0],xmm3[1],mem[1]
|
||||
; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1
|
||||
; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
|
||||
; AVX2-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
|
||||
; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,2,3,6,7,10,11,14,15,16,17,20,21,24,25,28,29,18,19,22,23,26,27,30,31]
|
||||
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
|
||||
; AVX2-NEXT: vmovdqa %ymm0, (%rax)
|
||||
@ -99,16 +99,16 @@ define void @store_i16_stride8_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
|
||||
; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %r10
|
||||
; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %r11
|
||||
; AVX2-FP-NEXT: vmovdqa (%rdi), %xmm0
|
||||
; AVX2-FP-NEXT: vmovdqa (%rsi), %xmm1
|
||||
; AVX2-FP-NEXT: vmovdqa (%rdx), %xmm2
|
||||
; AVX2-FP-NEXT: vmovdqa (%rcx), %xmm3
|
||||
; AVX2-FP-NEXT: vinserti128 $1, (%r11), %ymm3, %ymm3
|
||||
; AVX2-FP-NEXT: vinserti128 $1, (%r10), %ymm2, %ymm2
|
||||
; AVX2-FP-NEXT: vpunpckldq {{.*#+}} ymm2 = ymm2[0],ymm3[0],ymm2[1],ymm3[1],ymm2[4],ymm3[4],ymm2[5],ymm3[5]
|
||||
; AVX2-FP-NEXT: vinserti128 $1, (%r9), %ymm1, %ymm1
|
||||
; AVX2-FP-NEXT: vinserti128 $1, (%r8), %ymm0, %ymm0
|
||||
; AVX2-FP-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
|
||||
; AVX2-FP-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[2],ymm2[2]
|
||||
; AVX2-FP-NEXT: vmovdqa (%rdx), %xmm1
|
||||
; AVX2-FP-NEXT: vmovdqa (%r8), %xmm2
|
||||
; AVX2-FP-NEXT: vmovdqa (%r11), %xmm3
|
||||
; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
|
||||
; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1]
|
||||
; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[1],mem[1]
|
||||
; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm3[0],mem[0],xmm3[1],mem[1]
|
||||
; AVX2-FP-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1
|
||||
; AVX2-FP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
|
||||
; AVX2-FP-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
|
||||
; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,2,3,6,7,10,11,14,15,16,17,20,21,24,25,28,29,18,19,22,23,26,27,30,31]
|
||||
; AVX2-FP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
|
||||
; AVX2-FP-NEXT: vmovdqa %ymm0, (%rax)
|
||||
@ -121,16 +121,16 @@ define void @store_i16_stride8_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
|
||||
; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10
|
||||
; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r11
|
||||
; AVX2-FCP-NEXT: vmovdqa (%rdi), %xmm0
|
||||
; AVX2-FCP-NEXT: vmovdqa (%rsi), %xmm1
|
||||
; AVX2-FCP-NEXT: vmovdqa (%rdx), %xmm2
|
||||
; AVX2-FCP-NEXT: vmovdqa (%rcx), %xmm3
|
||||
; AVX2-FCP-NEXT: vinserti128 $1, (%r11), %ymm3, %ymm3
|
||||
; AVX2-FCP-NEXT: vinserti128 $1, (%r10), %ymm2, %ymm2
|
||||
; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} ymm2 = ymm2[0],ymm3[0],ymm2[1],ymm3[1],ymm2[4],ymm3[4],ymm2[5],ymm3[5]
|
||||
; AVX2-FCP-NEXT: vinserti128 $1, (%r9), %ymm1, %ymm1
|
||||
; AVX2-FCP-NEXT: vinserti128 $1, (%r8), %ymm0, %ymm0
|
||||
; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
|
||||
; AVX2-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[2],ymm2[2]
|
||||
; AVX2-FCP-NEXT: vmovdqa (%rdx), %xmm1
|
||||
; AVX2-FCP-NEXT: vmovdqa (%r8), %xmm2
|
||||
; AVX2-FCP-NEXT: vmovdqa (%r11), %xmm3
|
||||
; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
|
||||
; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1]
|
||||
; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[1],mem[1]
|
||||
; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm3[0],mem[0],xmm3[1],mem[1]
|
||||
; AVX2-FCP-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1
|
||||
; AVX2-FCP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
|
||||
; AVX2-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
|
||||
; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,2,3,6,7,10,11,14,15,16,17,20,21,24,25,28,29,18,19,22,23,26,27,30,31]
|
||||
; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
|
||||
; AVX2-FCP-NEXT: vmovdqa %ymm0, (%rax)
|
||||
@ -143,16 +143,16 @@ define void @store_i16_stride8_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
|
||||
; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r10
|
||||
; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r11
|
||||
; AVX512-NEXT: vmovdqa (%rdi), %xmm0
|
||||
; AVX512-NEXT: vmovdqa (%rsi), %xmm1
|
||||
; AVX512-NEXT: vmovdqa (%rdx), %xmm2
|
||||
; AVX512-NEXT: vmovdqa (%rcx), %xmm3
|
||||
; AVX512-NEXT: vinserti128 $1, (%r11), %ymm3, %ymm3
|
||||
; AVX512-NEXT: vinserti128 $1, (%r10), %ymm2, %ymm2
|
||||
; AVX512-NEXT: vpunpckldq {{.*#+}} ymm2 = ymm2[0],ymm3[0],ymm2[1],ymm3[1],ymm2[4],ymm3[4],ymm2[5],ymm3[5]
|
||||
; AVX512-NEXT: vinserti128 $1, (%r9), %ymm1, %ymm1
|
||||
; AVX512-NEXT: vinserti128 $1, (%r8), %ymm0, %ymm0
|
||||
; AVX512-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
|
||||
; AVX512-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[2],ymm2[2]
|
||||
; AVX512-NEXT: vmovdqa (%rdx), %xmm1
|
||||
; AVX512-NEXT: vmovdqa (%r8), %xmm2
|
||||
; AVX512-NEXT: vmovdqa (%r11), %xmm3
|
||||
; AVX512-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
|
||||
; AVX512-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1]
|
||||
; AVX512-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[1],mem[1]
|
||||
; AVX512-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm3[0],mem[0],xmm3[1],mem[1]
|
||||
; AVX512-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1
|
||||
; AVX512-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
|
||||
; AVX512-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
|
||||
; AVX512-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,2,3,6,7,10,11,14,15,16,17,20,21,24,25,28,29,18,19,22,23,26,27,30,31]
|
||||
; AVX512-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
|
||||
; AVX512-NEXT: vmovdqa %ymm0, (%rax)
|
||||
@ -165,16 +165,16 @@ define void @store_i16_stride8_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
|
||||
; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10
|
||||
; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r11
|
||||
; AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm0
|
||||
; AVX512-FCP-NEXT: vmovdqa (%rsi), %xmm1
|
||||
; AVX512-FCP-NEXT: vmovdqa (%rdx), %xmm2
|
||||
; AVX512-FCP-NEXT: vmovdqa (%rcx), %xmm3
|
||||
; AVX512-FCP-NEXT: vinserti128 $1, (%r11), %ymm3, %ymm3
|
||||
; AVX512-FCP-NEXT: vinserti128 $1, (%r10), %ymm2, %ymm2
|
||||
; AVX512-FCP-NEXT: vpunpckldq {{.*#+}} ymm2 = ymm2[0],ymm3[0],ymm2[1],ymm3[1],ymm2[4],ymm3[4],ymm2[5],ymm3[5]
|
||||
; AVX512-FCP-NEXT: vinserti128 $1, (%r9), %ymm1, %ymm1
|
||||
; AVX512-FCP-NEXT: vinserti128 $1, (%r8), %ymm0, %ymm0
|
||||
; AVX512-FCP-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
|
||||
; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[2],ymm2[2]
|
||||
; AVX512-FCP-NEXT: vmovdqa (%rdx), %xmm1
|
||||
; AVX512-FCP-NEXT: vmovdqa (%r8), %xmm2
|
||||
; AVX512-FCP-NEXT: vmovdqa (%r11), %xmm3
|
||||
; AVX512-FCP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
|
||||
; AVX512-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1]
|
||||
; AVX512-FCP-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[1],mem[1]
|
||||
; AVX512-FCP-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm3[0],mem[0],xmm3[1],mem[1]
|
||||
; AVX512-FCP-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1
|
||||
; AVX512-FCP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
|
||||
; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
|
||||
; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,2,3,6,7,10,11,14,15,16,17,20,21,24,25,28,29,18,19,22,23,26,27,30,31]
|
||||
; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
|
||||
; AVX512-FCP-NEXT: vmovdqa %ymm0, (%rax)
|
||||
@ -187,16 +187,16 @@ define void @store_i16_stride8_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
|
||||
; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %r10
|
||||
; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %r11
|
||||
; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm0
|
||||
; AVX512DQ-NEXT: vmovdqa (%rsi), %xmm1
|
||||
; AVX512DQ-NEXT: vmovdqa (%rdx), %xmm2
|
||||
; AVX512DQ-NEXT: vmovdqa (%rcx), %xmm3
|
||||
; AVX512DQ-NEXT: vinserti128 $1, (%r11), %ymm3, %ymm3
|
||||
; AVX512DQ-NEXT: vinserti128 $1, (%r10), %ymm2, %ymm2
|
||||
; AVX512DQ-NEXT: vpunpckldq {{.*#+}} ymm2 = ymm2[0],ymm3[0],ymm2[1],ymm3[1],ymm2[4],ymm3[4],ymm2[5],ymm3[5]
|
||||
; AVX512DQ-NEXT: vinserti128 $1, (%r9), %ymm1, %ymm1
|
||||
; AVX512DQ-NEXT: vinserti128 $1, (%r8), %ymm0, %ymm0
|
||||
; AVX512DQ-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
|
||||
; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[2],ymm2[2]
|
||||
; AVX512DQ-NEXT: vmovdqa (%rdx), %xmm1
|
||||
; AVX512DQ-NEXT: vmovdqa (%r8), %xmm2
|
||||
; AVX512DQ-NEXT: vmovdqa (%r11), %xmm3
|
||||
; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
|
||||
; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1]
|
||||
; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[1],mem[1]
|
||||
; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm3[0],mem[0],xmm3[1],mem[1]
|
||||
; AVX512DQ-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1
|
||||
; AVX512DQ-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
|
||||
; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
|
||||
; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,2,3,6,7,10,11,14,15,16,17,20,21,24,25,28,29,18,19,22,23,26,27,30,31]
|
||||
; AVX512DQ-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
|
||||
; AVX512DQ-NEXT: vmovdqa %ymm0, (%rax)
|
||||
@ -209,16 +209,16 @@ define void @store_i16_stride8_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
|
||||
; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10
|
||||
; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r11
|
||||
; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %xmm0
|
||||
; AVX512DQ-FCP-NEXT: vmovdqa (%rsi), %xmm1
|
||||
; AVX512DQ-FCP-NEXT: vmovdqa (%rdx), %xmm2
|
||||
; AVX512DQ-FCP-NEXT: vmovdqa (%rcx), %xmm3
|
||||
; AVX512DQ-FCP-NEXT: vinserti128 $1, (%r11), %ymm3, %ymm3
|
||||
; AVX512DQ-FCP-NEXT: vinserti128 $1, (%r10), %ymm2, %ymm2
|
||||
; AVX512DQ-FCP-NEXT: vpunpckldq {{.*#+}} ymm2 = ymm2[0],ymm3[0],ymm2[1],ymm3[1],ymm2[4],ymm3[4],ymm2[5],ymm3[5]
|
||||
; AVX512DQ-FCP-NEXT: vinserti128 $1, (%r9), %ymm1, %ymm1
|
||||
; AVX512DQ-FCP-NEXT: vinserti128 $1, (%r8), %ymm0, %ymm0
|
||||
; AVX512DQ-FCP-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
|
||||
; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[2],ymm2[2]
|
||||
; AVX512DQ-FCP-NEXT: vmovdqa (%rdx), %xmm1
|
||||
; AVX512DQ-FCP-NEXT: vmovdqa (%r8), %xmm2
|
||||
; AVX512DQ-FCP-NEXT: vmovdqa (%r11), %xmm3
|
||||
; AVX512DQ-FCP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
|
||||
; AVX512DQ-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1]
|
||||
; AVX512DQ-FCP-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[1],mem[1]
|
||||
; AVX512DQ-FCP-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm3[0],mem[0],xmm3[1],mem[1]
|
||||
; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1
|
||||
; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
|
||||
; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
|
||||
; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,2,3,6,7,10,11,14,15,16,17,20,21,24,25,28,29,18,19,22,23,26,27,30,31]
|
||||
; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
|
||||
; AVX512DQ-FCP-NEXT: vmovdqa %ymm0, (%rax)
|
||||
@ -231,17 +231,17 @@ define void @store_i16_stride8_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
|
||||
; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r10
|
||||
; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r11
|
||||
; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0
|
||||
; AVX512BW-NEXT: vmovdqa (%rsi), %xmm1
|
||||
; AVX512BW-NEXT: vmovdqa (%rdx), %xmm2
|
||||
; AVX512BW-NEXT: vmovdqa (%rcx), %xmm3
|
||||
; AVX512BW-NEXT: vinserti128 $1, (%r9), %ymm1, %ymm1
|
||||
; AVX512BW-NEXT: vinserti128 $1, (%r11), %ymm3, %ymm3
|
||||
; AVX512BW-NEXT: vinserti64x4 $1, %ymm1, %zmm3, %zmm1
|
||||
; AVX512BW-NEXT: vinserti128 $1, (%r8), %ymm0, %ymm0
|
||||
; AVX512BW-NEXT: vinserti128 $1, (%r10), %ymm2, %ymm2
|
||||
; AVX512BW-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0
|
||||
; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm2 = [16,48,0,32,24,56,8,40,17,49,1,33,25,57,9,41]
|
||||
; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm2
|
||||
; AVX512BW-NEXT: vmovdqa (%rdx), %xmm1
|
||||
; AVX512BW-NEXT: vmovdqa (%r8), %xmm2
|
||||
; AVX512BW-NEXT: vmovdqa (%r11), %xmm3
|
||||
; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
|
||||
; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1]
|
||||
; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[1],mem[1]
|
||||
; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm3[0],mem[0],xmm3[1],mem[1]
|
||||
; AVX512BW-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1
|
||||
; AVX512BW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
|
||||
; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm2 = [0,2,16,18,8,10,24,26,1,3,17,19,9,11,25,27]
|
||||
; AVX512BW-NEXT: vpermi2w %ymm1, %ymm0, %ymm2
|
||||
; AVX512BW-NEXT: vmovdqa %ymm2, (%rax)
|
||||
; AVX512BW-NEXT: vzeroupper
|
||||
; AVX512BW-NEXT: retq
|
||||
@ -252,17 +252,17 @@ define void @store_i16_stride8_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
|
||||
; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10
|
||||
; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r11
|
||||
; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %xmm0
|
||||
; AVX512BW-FCP-NEXT: vmovdqa (%rsi), %xmm1
|
||||
; AVX512BW-FCP-NEXT: vmovdqa (%rdx), %xmm2
|
||||
; AVX512BW-FCP-NEXT: vmovdqa (%rcx), %xmm3
|
||||
; AVX512BW-FCP-NEXT: vinserti128 $1, (%r9), %ymm1, %ymm1
|
||||
; AVX512BW-FCP-NEXT: vinserti128 $1, (%r11), %ymm3, %ymm3
|
||||
; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm3, %zmm1
|
||||
; AVX512BW-FCP-NEXT: vinserti128 $1, (%r8), %ymm0, %ymm0
|
||||
; AVX512BW-FCP-NEXT: vinserti128 $1, (%r10), %ymm2, %ymm2
|
||||
; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0
|
||||
; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [16,48,0,32,24,56,8,40,17,49,1,33,25,57,9,41]
|
||||
; AVX512BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm2
|
||||
; AVX512BW-FCP-NEXT: vmovdqa (%rdx), %xmm1
|
||||
; AVX512BW-FCP-NEXT: vmovdqa (%r8), %xmm2
|
||||
; AVX512BW-FCP-NEXT: vmovdqa (%r11), %xmm3
|
||||
; AVX512BW-FCP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
|
||||
; AVX512BW-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1]
|
||||
; AVX512BW-FCP-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[1],mem[1]
|
||||
; AVX512BW-FCP-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm3[0],mem[0],xmm3[1],mem[1]
|
||||
; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1
|
||||
; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
|
||||
; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [0,2,16,18,8,10,24,26,1,3,17,19,9,11,25,27]
|
||||
; AVX512BW-FCP-NEXT: vpermi2w %ymm1, %ymm0, %ymm2
|
||||
; AVX512BW-FCP-NEXT: vmovdqa %ymm2, (%rax)
|
||||
; AVX512BW-FCP-NEXT: vzeroupper
|
||||
; AVX512BW-FCP-NEXT: retq
|
||||
@ -273,17 +273,17 @@ define void @store_i16_stride8_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
|
||||
; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %r10
|
||||
; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %r11
|
||||
; AVX512DQ-BW-NEXT: vmovdqa (%rdi), %xmm0
|
||||
; AVX512DQ-BW-NEXT: vmovdqa (%rsi), %xmm1
|
||||
; AVX512DQ-BW-NEXT: vmovdqa (%rdx), %xmm2
|
||||
; AVX512DQ-BW-NEXT: vmovdqa (%rcx), %xmm3
|
||||
; AVX512DQ-BW-NEXT: vinserti128 $1, (%r9), %ymm1, %ymm1
|
||||
; AVX512DQ-BW-NEXT: vinserti128 $1, (%r11), %ymm3, %ymm3
|
||||
; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm1, %zmm3, %zmm1
|
||||
; AVX512DQ-BW-NEXT: vinserti128 $1, (%r8), %ymm0, %ymm0
|
||||
; AVX512DQ-BW-NEXT: vinserti128 $1, (%r10), %ymm2, %ymm2
|
||||
; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0
|
||||
; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm2 = [16,48,0,32,24,56,8,40,17,49,1,33,25,57,9,41]
|
||||
; AVX512DQ-BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm2
|
||||
; AVX512DQ-BW-NEXT: vmovdqa (%rdx), %xmm1
|
||||
; AVX512DQ-BW-NEXT: vmovdqa (%r8), %xmm2
|
||||
; AVX512DQ-BW-NEXT: vmovdqa (%r11), %xmm3
|
||||
; AVX512DQ-BW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
|
||||
; AVX512DQ-BW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1]
|
||||
; AVX512DQ-BW-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[1],mem[1]
|
||||
; AVX512DQ-BW-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm3[0],mem[0],xmm3[1],mem[1]
|
||||
; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1
|
||||
; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
|
||||
; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm2 = [0,2,16,18,8,10,24,26,1,3,17,19,9,11,25,27]
|
||||
; AVX512DQ-BW-NEXT: vpermi2w %ymm1, %ymm0, %ymm2
|
||||
; AVX512DQ-BW-NEXT: vmovdqa %ymm2, (%rax)
|
||||
; AVX512DQ-BW-NEXT: vzeroupper
|
||||
; AVX512DQ-BW-NEXT: retq
|
||||
@ -294,17 +294,17 @@ define void @store_i16_stride8_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
|
||||
; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10
|
||||
; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r11
|
||||
; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %xmm0
|
||||
; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rsi), %xmm1
|
||||
; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdx), %xmm2
|
||||
; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rcx), %xmm3
|
||||
; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, (%r9), %ymm1, %ymm1
|
||||
; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, (%r11), %ymm3, %ymm3
|
||||
; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm3, %zmm1
|
||||
; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, (%r8), %ymm0, %ymm0
|
||||
; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, (%r10), %ymm2, %ymm2
|
||||
; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0
|
||||
; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [16,48,0,32,24,56,8,40,17,49,1,33,25,57,9,41]
|
||||
; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm2
|
||||
; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdx), %xmm1
|
||||
; AVX512DQ-BW-FCP-NEXT: vmovdqa (%r8), %xmm2
|
||||
; AVX512DQ-BW-FCP-NEXT: vmovdqa (%r11), %xmm3
|
||||
; AVX512DQ-BW-FCP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
|
||||
; AVX512DQ-BW-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1]
|
||||
; AVX512DQ-BW-FCP-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[1],mem[1]
|
||||
; AVX512DQ-BW-FCP-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm3[0],mem[0],xmm3[1],mem[1]
|
||||
; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1
|
||||
; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
|
||||
; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [0,2,16,18,8,10,24,26,1,3,17,19,9,11,25,27]
|
||||
; AVX512DQ-BW-FCP-NEXT: vpermi2w %ymm1, %ymm0, %ymm2
|
||||
; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm2, (%rax)
|
||||
; AVX512DQ-BW-FCP-NEXT: vzeroupper
|
||||
; AVX512DQ-BW-FCP-NEXT: retq
|
||||
|
@ -407,13 +407,13 @@ define void @store_i8_stride7_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp
|
||||
; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax
|
||||
; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %r10
|
||||
; AVX2-NEXT: vmovdqa (%rdi), %xmm0
|
||||
; AVX2-NEXT: vmovdqa (%rsi), %xmm1
|
||||
; AVX2-NEXT: vmovdqa (%rdx), %xmm2
|
||||
; AVX2-NEXT: vmovdqa (%rdx), %xmm1
|
||||
; AVX2-NEXT: vmovdqa (%r8), %xmm2
|
||||
; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
|
||||
; AVX2-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1]
|
||||
; AVX2-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[1],mem[1]
|
||||
; AVX2-NEXT: vinserti128 $1, (%r9), %ymm1, %ymm1
|
||||
; AVX2-NEXT: vinserti128 $1, (%r8), %ymm0, %ymm0
|
||||
; AVX2-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
|
||||
; AVX2-NEXT: vinserti128 $1, (%r10), %ymm2, %ymm1
|
||||
; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
|
||||
; AVX2-NEXT: vinserti128 $1, (%r10), %ymm1, %ymm1
|
||||
; AVX2-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
|
||||
; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[0,4,8,12],zero,zero,zero,ymm0[1,5,9,13],zero,zero,zero,ymm0[2,6],zero,zero,ymm0[18,22,26],zero,zero,zero,zero,ymm0[19,23,27],zero,zero,zero,zero
|
||||
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1]
|
||||
@ -431,13 +431,13 @@ define void @store_i8_stride7_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp
|
||||
; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %rax
|
||||
; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %r10
|
||||
; AVX2-FP-NEXT: vmovdqa (%rdi), %xmm0
|
||||
; AVX2-FP-NEXT: vmovdqa (%rsi), %xmm1
|
||||
; AVX2-FP-NEXT: vmovdqa (%rdx), %xmm2
|
||||
; AVX2-FP-NEXT: vmovdqa (%rdx), %xmm1
|
||||
; AVX2-FP-NEXT: vmovdqa (%r8), %xmm2
|
||||
; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
|
||||
; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1]
|
||||
; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[1],mem[1]
|
||||
; AVX2-FP-NEXT: vinserti128 $1, (%r9), %ymm1, %ymm1
|
||||
; AVX2-FP-NEXT: vinserti128 $1, (%r8), %ymm0, %ymm0
|
||||
; AVX2-FP-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
|
||||
; AVX2-FP-NEXT: vinserti128 $1, (%r10), %ymm2, %ymm1
|
||||
; AVX2-FP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
|
||||
; AVX2-FP-NEXT: vinserti128 $1, (%r10), %ymm1, %ymm1
|
||||
; AVX2-FP-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
|
||||
; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[0,4,8,12],zero,zero,zero,ymm0[1,5,9,13],zero,zero,zero,ymm0[2,6],zero,zero,ymm0[18,22,26],zero,zero,zero,zero,ymm0[19,23,27],zero,zero,zero,zero
|
||||
; AVX2-FP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1]
|
||||
@ -455,13 +455,13 @@ define void @store_i8_stride7_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp
|
||||
; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
|
||||
; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10
|
||||
; AVX2-FCP-NEXT: vmovdqa (%rdi), %xmm0
|
||||
; AVX2-FCP-NEXT: vmovdqa (%rsi), %xmm1
|
||||
; AVX2-FCP-NEXT: vmovdqa (%rdx), %xmm2
|
||||
; AVX2-FCP-NEXT: vmovdqa (%rdx), %xmm1
|
||||
; AVX2-FCP-NEXT: vmovdqa (%r8), %xmm2
|
||||
; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
|
||||
; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1]
|
||||
; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[1],mem[1]
|
||||
; AVX2-FCP-NEXT: vinserti128 $1, (%r9), %ymm1, %ymm1
|
||||
; AVX2-FCP-NEXT: vinserti128 $1, (%r8), %ymm0, %ymm0
|
||||
; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
|
||||
; AVX2-FCP-NEXT: vinserti128 $1, (%r10), %ymm2, %ymm1
|
||||
; AVX2-FCP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
|
||||
; AVX2-FCP-NEXT: vinserti128 $1, (%r10), %ymm1, %ymm1
|
||||
; AVX2-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
|
||||
; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[0,4,8,12],zero,zero,zero,ymm0[1,5,9,13],zero,zero,zero,ymm0[2,6],zero,zero,ymm0[18,22,26],zero,zero,zero,zero,ymm0[19,23,27],zero,zero,zero,zero
|
||||
; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1]
|
||||
@ -479,13 +479,13 @@ define void @store_i8_stride7_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp
|
||||
; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax
|
||||
; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r10
|
||||
; AVX512-NEXT: vmovdqa (%rdi), %xmm0
|
||||
; AVX512-NEXT: vmovdqa (%rsi), %xmm1
|
||||
; AVX512-NEXT: vmovdqa (%rdx), %xmm2
|
||||
; AVX512-NEXT: vmovdqa (%rdx), %xmm1
|
||||
; AVX512-NEXT: vmovdqa (%r8), %xmm2
|
||||
; AVX512-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
|
||||
; AVX512-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1]
|
||||
; AVX512-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[1],mem[1]
|
||||
; AVX512-NEXT: vinserti128 $1, (%r9), %ymm1, %ymm1
|
||||
; AVX512-NEXT: vinserti128 $1, (%r8), %ymm0, %ymm0
|
||||
; AVX512-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
|
||||
; AVX512-NEXT: vinserti128 $1, (%r10), %ymm2, %ymm1
|
||||
; AVX512-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
|
||||
; AVX512-NEXT: vinserti128 $1, (%r10), %ymm1, %ymm1
|
||||
; AVX512-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
|
||||
; AVX512-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[0,4,8,12],zero,zero,zero,ymm0[1,5,9,13],zero,zero,zero,ymm0[2,6],zero,zero,ymm0[18,22,26],zero,zero,zero,zero,ymm0[19,23,27,u,u,u,u]
|
||||
; AVX512-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1]
|
||||
@ -503,13 +503,13 @@ define void @store_i8_stride7_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp
|
||||
; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
|
||||
; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10
|
||||
; AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm0
|
||||
; AVX512-FCP-NEXT: vmovdqa (%rsi), %xmm1
|
||||
; AVX512-FCP-NEXT: vmovdqa (%rdx), %xmm2
|
||||
; AVX512-FCP-NEXT: vmovdqa (%rdx), %xmm1
|
||||
; AVX512-FCP-NEXT: vmovdqa (%r8), %xmm2
|
||||
; AVX512-FCP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
|
||||
; AVX512-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1]
|
||||
; AVX512-FCP-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[1],mem[1]
|
||||
; AVX512-FCP-NEXT: vinserti128 $1, (%r9), %ymm1, %ymm1
|
||||
; AVX512-FCP-NEXT: vinserti128 $1, (%r8), %ymm0, %ymm0
|
||||
; AVX512-FCP-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
|
||||
; AVX512-FCP-NEXT: vinserti128 $1, (%r10), %ymm2, %ymm1
|
||||
; AVX512-FCP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
|
||||
; AVX512-FCP-NEXT: vinserti128 $1, (%r10), %ymm1, %ymm1
|
||||
; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
|
||||
; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[0,4,8,12],zero,zero,zero,ymm0[1,5,9,13],zero,zero,zero,ymm0[2,6],zero,zero,ymm0[18,22,26],zero,zero,zero,zero,ymm0[19,23,27,u,u,u,u]
|
||||
; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1]
|
||||
@ -527,13 +527,13 @@ define void @store_i8_stride7_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp
|
||||
; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax
|
||||
; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %r10
|
||||
; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm0
|
||||
; AVX512DQ-NEXT: vmovdqa (%rsi), %xmm1
|
||||
; AVX512DQ-NEXT: vmovdqa (%rdx), %xmm2
|
||||
; AVX512DQ-NEXT: vmovdqa (%rdx), %xmm1
|
||||
; AVX512DQ-NEXT: vmovdqa (%r8), %xmm2
|
||||
; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
|
||||
; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1]
|
||||
; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[1],mem[1]
|
||||
; AVX512DQ-NEXT: vinserti128 $1, (%r9), %ymm1, %ymm1
|
||||
; AVX512DQ-NEXT: vinserti128 $1, (%r8), %ymm0, %ymm0
|
||||
; AVX512DQ-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
|
||||
; AVX512DQ-NEXT: vinserti128 $1, (%r10), %ymm2, %ymm1
|
||||
; AVX512DQ-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
|
||||
; AVX512DQ-NEXT: vinserti128 $1, (%r10), %ymm1, %ymm1
|
||||
; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
|
||||
; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[0,4,8,12],zero,zero,zero,ymm0[1,5,9,13],zero,zero,zero,ymm0[2,6],zero,zero,ymm0[18,22,26],zero,zero,zero,zero,ymm0[19,23,27,u,u,u,u]
|
||||
; AVX512DQ-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1]
|
||||
@ -551,13 +551,13 @@ define void @store_i8_stride7_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp
|
||||
; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
|
||||
; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10
|
||||
; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %xmm0
|
||||
; AVX512DQ-FCP-NEXT: vmovdqa (%rsi), %xmm1
|
||||
; AVX512DQ-FCP-NEXT: vmovdqa (%rdx), %xmm2
|
||||
; AVX512DQ-FCP-NEXT: vmovdqa (%rdx), %xmm1
|
||||
; AVX512DQ-FCP-NEXT: vmovdqa (%r8), %xmm2
|
||||
; AVX512DQ-FCP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
|
||||
; AVX512DQ-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1]
|
||||
; AVX512DQ-FCP-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[1],mem[1]
|
||||
; AVX512DQ-FCP-NEXT: vinserti128 $1, (%r9), %ymm1, %ymm1
|
||||
; AVX512DQ-FCP-NEXT: vinserti128 $1, (%r8), %ymm0, %ymm0
|
||||
; AVX512DQ-FCP-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
|
||||
; AVX512DQ-FCP-NEXT: vinserti128 $1, (%r10), %ymm2, %ymm1
|
||||
; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
|
||||
; AVX512DQ-FCP-NEXT: vinserti128 $1, (%r10), %ymm1, %ymm1
|
||||
; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
|
||||
; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[0,4,8,12],zero,zero,zero,ymm0[1,5,9,13],zero,zero,zero,ymm0[2,6],zero,zero,ymm0[18,22,26],zero,zero,zero,zero,ymm0[19,23,27,u,u,u,u]
|
||||
; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1]
|
||||
@ -575,13 +575,13 @@ define void @store_i8_stride7_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp
|
||||
; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax
|
||||
; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r10
|
||||
; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0
|
||||
; AVX512BW-NEXT: vmovdqa (%rsi), %xmm1
|
||||
; AVX512BW-NEXT: vmovdqa (%rdx), %xmm2
|
||||
; AVX512BW-NEXT: vmovdqa (%rdx), %xmm1
|
||||
; AVX512BW-NEXT: vmovdqa (%r8), %xmm2
|
||||
; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
|
||||
; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1]
|
||||
; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[1],mem[1]
|
||||
; AVX512BW-NEXT: vinserti128 $1, (%r9), %ymm1, %ymm1
|
||||
; AVX512BW-NEXT: vinserti128 $1, (%r8), %ymm0, %ymm0
|
||||
; AVX512BW-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
|
||||
; AVX512BW-NEXT: vinserti128 $1, (%r10), %ymm2, %ymm1
|
||||
; AVX512BW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
|
||||
; AVX512BW-NEXT: vinserti128 $1, (%r10), %ymm1, %ymm1
|
||||
; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
|
||||
; AVX512BW-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[0,4,8,12],zero,zero,zero,ymm0[1,5,9,13],zero,zero,zero,ymm0[2,6],zero,zero,ymm0[18,22,26],zero,zero,zero,zero,ymm0[19,23,27],zero,zero,zero,zero
|
||||
; AVX512BW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1]
|
||||
@ -599,13 +599,13 @@ define void @store_i8_stride7_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp
|
||||
; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
|
||||
; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10
|
||||
; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %xmm0
|
||||
; AVX512BW-FCP-NEXT: vmovdqa (%rsi), %xmm1
|
||||
; AVX512BW-FCP-NEXT: vmovdqa (%rdx), %xmm2
|
||||
; AVX512BW-FCP-NEXT: vmovdqa (%rdx), %xmm1
|
||||
; AVX512BW-FCP-NEXT: vmovdqa (%r8), %xmm2
|
||||
; AVX512BW-FCP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
|
||||
; AVX512BW-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1]
|
||||
; AVX512BW-FCP-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[1],mem[1]
|
||||
; AVX512BW-FCP-NEXT: vinserti128 $1, (%r9), %ymm1, %ymm1
|
||||
; AVX512BW-FCP-NEXT: vinserti128 $1, (%r8), %ymm0, %ymm0
|
||||
; AVX512BW-FCP-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
|
||||
; AVX512BW-FCP-NEXT: vinserti128 $1, (%r10), %ymm2, %ymm1
|
||||
; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
|
||||
; AVX512BW-FCP-NEXT: vinserti128 $1, (%r10), %ymm1, %ymm1
|
||||
; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
|
||||
; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[0,4,8,12],zero,zero,zero,ymm0[1,5,9,13],zero,zero,zero,ymm0[2,6],zero,zero,ymm0[18,22,26],zero,zero,zero,zero,ymm0[19,23,27],zero,zero,zero,zero
|
||||
; AVX512BW-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1]
|
||||
@ -623,13 +623,13 @@ define void @store_i8_stride7_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp
|
||||
; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %rax
|
||||
; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %r10
|
||||
; AVX512DQ-BW-NEXT: vmovdqa (%rdi), %xmm0
|
||||
; AVX512DQ-BW-NEXT: vmovdqa (%rsi), %xmm1
|
||||
; AVX512DQ-BW-NEXT: vmovdqa (%rdx), %xmm2
|
||||
; AVX512DQ-BW-NEXT: vmovdqa (%rdx), %xmm1
|
||||
; AVX512DQ-BW-NEXT: vmovdqa (%r8), %xmm2
|
||||
; AVX512DQ-BW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
|
||||
; AVX512DQ-BW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1]
|
||||
; AVX512DQ-BW-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[1],mem[1]
|
||||
; AVX512DQ-BW-NEXT: vinserti128 $1, (%r9), %ymm1, %ymm1
|
||||
; AVX512DQ-BW-NEXT: vinserti128 $1, (%r8), %ymm0, %ymm0
|
||||
; AVX512DQ-BW-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
|
||||
; AVX512DQ-BW-NEXT: vinserti128 $1, (%r10), %ymm2, %ymm1
|
||||
; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
|
||||
; AVX512DQ-BW-NEXT: vinserti128 $1, (%r10), %ymm1, %ymm1
|
||||
; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
|
||||
; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[0,4,8,12],zero,zero,zero,ymm0[1,5,9,13],zero,zero,zero,ymm0[2,6],zero,zero,ymm0[18,22,26],zero,zero,zero,zero,ymm0[19,23,27],zero,zero,zero,zero
|
||||
; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1]
|
||||
@ -647,13 +647,13 @@ define void @store_i8_stride7_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp
|
||||
; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
|
||||
; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10
|
||||
; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %xmm0
|
||||
; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rsi), %xmm1
|
||||
; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdx), %xmm2
|
||||
; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdx), %xmm1
|
||||
; AVX512DQ-BW-FCP-NEXT: vmovdqa (%r8), %xmm2
|
||||
; AVX512DQ-BW-FCP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
|
||||
; AVX512DQ-BW-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1]
|
||||
; AVX512DQ-BW-FCP-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[1],mem[1]
|
||||
; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, (%r9), %ymm1, %ymm1
|
||||
; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, (%r8), %ymm0, %ymm0
|
||||
; AVX512DQ-BW-FCP-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
|
||||
; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, (%r10), %ymm2, %ymm1
|
||||
; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
|
||||
; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, (%r10), %ymm1, %ymm1
|
||||
; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
|
||||
; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[0,4,8,12],zero,zero,zero,ymm0[1,5,9,13],zero,zero,zero,ymm0[2,6],zero,zero,ymm0[18,22,26],zero,zero,zero,zero,ymm0[19,23,27],zero,zero,zero,zero
|
||||
; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1]
|
||||
|
@ -426,16 +426,16 @@ define void @store_i8_stride8_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp
|
||||
; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %r10
|
||||
; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %r11
|
||||
; AVX2-NEXT: vmovdqa (%rdi), %xmm0
|
||||
; AVX2-NEXT: vmovdqa (%rsi), %xmm1
|
||||
; AVX2-NEXT: vmovdqa (%rdx), %xmm2
|
||||
; AVX2-NEXT: vmovdqa (%rcx), %xmm3
|
||||
; AVX2-NEXT: vinserti128 $1, (%r11), %ymm3, %ymm3
|
||||
; AVX2-NEXT: vinserti128 $1, (%r10), %ymm2, %ymm2
|
||||
; AVX2-NEXT: vpunpckldq {{.*#+}} ymm2 = ymm2[0],ymm3[0],ymm2[1],ymm3[1],ymm2[4],ymm3[4],ymm2[5],ymm3[5]
|
||||
; AVX2-NEXT: vinserti128 $1, (%r9), %ymm1, %ymm1
|
||||
; AVX2-NEXT: vinserti128 $1, (%r8), %ymm0, %ymm0
|
||||
; AVX2-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
|
||||
; AVX2-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[2],ymm2[2]
|
||||
; AVX2-NEXT: vmovdqa (%rdx), %xmm1
|
||||
; AVX2-NEXT: vmovdqa (%r8), %xmm2
|
||||
; AVX2-NEXT: vmovdqa (%r11), %xmm3
|
||||
; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
|
||||
; AVX2-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1]
|
||||
; AVX2-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[1],mem[1]
|
||||
; AVX2-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm3[0],mem[0],xmm3[1],mem[1]
|
||||
; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1
|
||||
; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
|
||||
; AVX2-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
|
||||
; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12,1,5,9,13,2,6,10,14,3,7,11,15,16,20,24,28,17,21,25,29,18,22,26,30,19,23,27,31]
|
||||
; AVX2-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,4,1,5,2,6,3,7]
|
||||
; AVX2-NEXT: vpermd %ymm0, %ymm1, %ymm0
|
||||
@ -449,16 +449,16 @@ define void @store_i8_stride8_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp
|
||||
; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %r10
|
||||
; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %r11
|
||||
; AVX2-FP-NEXT: vmovdqa (%rdi), %xmm0
|
||||
; AVX2-FP-NEXT: vmovdqa (%rsi), %xmm1
|
||||
; AVX2-FP-NEXT: vmovdqa (%rdx), %xmm2
|
||||
; AVX2-FP-NEXT: vmovdqa (%rcx), %xmm3
|
||||
; AVX2-FP-NEXT: vinserti128 $1, (%r11), %ymm3, %ymm3
|
||||
; AVX2-FP-NEXT: vinserti128 $1, (%r10), %ymm2, %ymm2
|
||||
; AVX2-FP-NEXT: vpunpckldq {{.*#+}} ymm2 = ymm2[0],ymm3[0],ymm2[1],ymm3[1],ymm2[4],ymm3[4],ymm2[5],ymm3[5]
|
||||
; AVX2-FP-NEXT: vinserti128 $1, (%r9), %ymm1, %ymm1
|
||||
; AVX2-FP-NEXT: vinserti128 $1, (%r8), %ymm0, %ymm0
|
||||
; AVX2-FP-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
|
||||
; AVX2-FP-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[2],ymm2[2]
|
||||
; AVX2-FP-NEXT: vmovdqa (%rdx), %xmm1
|
||||
; AVX2-FP-NEXT: vmovdqa (%r8), %xmm2
|
||||
; AVX2-FP-NEXT: vmovdqa (%r11), %xmm3
|
||||
; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
|
||||
; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1]
|
||||
; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[1],mem[1]
|
||||
; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm3[0],mem[0],xmm3[1],mem[1]
|
||||
; AVX2-FP-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1
|
||||
; AVX2-FP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
|
||||
; AVX2-FP-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
|
||||
; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12,1,5,9,13,2,6,10,14,3,7,11,15,16,20,24,28,17,21,25,29,18,22,26,30,19,23,27,31]
|
||||
; AVX2-FP-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,4,1,5,2,6,3,7]
|
||||
; AVX2-FP-NEXT: vpermd %ymm0, %ymm1, %ymm0
|
||||
@ -472,16 +472,16 @@ define void @store_i8_stride8_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp
|
||||
; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10
|
||||
; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r11
|
||||
; AVX2-FCP-NEXT: vmovdqa (%rdi), %xmm0
|
||||
; AVX2-FCP-NEXT: vmovdqa (%rsi), %xmm1
|
||||
; AVX2-FCP-NEXT: vmovdqa (%rdx), %xmm2
|
||||
; AVX2-FCP-NEXT: vmovdqa (%rcx), %xmm3
|
||||
; AVX2-FCP-NEXT: vinserti128 $1, (%r11), %ymm3, %ymm3
|
||||
; AVX2-FCP-NEXT: vinserti128 $1, (%r10), %ymm2, %ymm2
|
||||
; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} ymm2 = ymm2[0],ymm3[0],ymm2[1],ymm3[1],ymm2[4],ymm3[4],ymm2[5],ymm3[5]
|
||||
; AVX2-FCP-NEXT: vinserti128 $1, (%r9), %ymm1, %ymm1
|
||||
; AVX2-FCP-NEXT: vinserti128 $1, (%r8), %ymm0, %ymm0
|
||||
; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
|
||||
; AVX2-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[2],ymm2[2]
|
||||
; AVX2-FCP-NEXT: vmovdqa (%rdx), %xmm1
|
||||
; AVX2-FCP-NEXT: vmovdqa (%r8), %xmm2
|
||||
; AVX2-FCP-NEXT: vmovdqa (%r11), %xmm3
|
||||
; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
|
||||
; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1]
|
||||
; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[1],mem[1]
|
||||
; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm3[0],mem[0],xmm3[1],mem[1]
|
||||
; AVX2-FCP-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1
|
||||
; AVX2-FCP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
|
||||
; AVX2-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
|
||||
; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12,1,5,9,13,2,6,10,14,3,7,11,15,16,20,24,28,17,21,25,29,18,22,26,30,19,23,27,31]
|
||||
; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,4,1,5,2,6,3,7]
|
||||
; AVX2-FCP-NEXT: vpermd %ymm0, %ymm1, %ymm0
|
||||
@ -495,16 +495,16 @@ define void @store_i8_stride8_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp
|
||||
; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r10
|
||||
; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r11
|
||||
; AVX512-NEXT: vmovdqa (%rdi), %xmm0
|
||||
; AVX512-NEXT: vmovdqa (%rsi), %xmm1
|
||||
; AVX512-NEXT: vmovdqa (%rdx), %xmm2
|
||||
; AVX512-NEXT: vmovdqa (%rcx), %xmm3
|
||||
; AVX512-NEXT: vinserti128 $1, (%r11), %ymm3, %ymm3
|
||||
; AVX512-NEXT: vinserti128 $1, (%r10), %ymm2, %ymm2
|
||||
; AVX512-NEXT: vpunpckldq {{.*#+}} ymm2 = ymm2[0],ymm3[0],ymm2[1],ymm3[1],ymm2[4],ymm3[4],ymm2[5],ymm3[5]
|
||||
; AVX512-NEXT: vinserti128 $1, (%r9), %ymm1, %ymm1
|
||||
; AVX512-NEXT: vinserti128 $1, (%r8), %ymm0, %ymm0
|
||||
; AVX512-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
|
||||
; AVX512-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[2],ymm2[2]
|
||||
; AVX512-NEXT: vmovdqa (%rdx), %xmm1
|
||||
; AVX512-NEXT: vmovdqa (%r8), %xmm2
|
||||
; AVX512-NEXT: vmovdqa (%r11), %xmm3
|
||||
; AVX512-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
|
||||
; AVX512-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1]
|
||||
; AVX512-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[1],mem[1]
|
||||
; AVX512-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm3[0],mem[0],xmm3[1],mem[1]
|
||||
; AVX512-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1
|
||||
; AVX512-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
|
||||
; AVX512-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
|
||||
; AVX512-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12,1,5,9,13,2,6,10,14,3,7,11,15,16,20,24,28,17,21,25,29,18,22,26,30,19,23,27,31]
|
||||
; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,4,1,5,2,6,3,7]
|
||||
; AVX512-NEXT: vpermd %ymm0, %ymm1, %ymm0
|
||||
@ -518,16 +518,16 @@ define void @store_i8_stride8_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp
|
||||
; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10
|
||||
; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r11
|
||||
; AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm0
|
||||
; AVX512-FCP-NEXT: vmovdqa (%rsi), %xmm1
|
||||
; AVX512-FCP-NEXT: vmovdqa (%rdx), %xmm2
|
||||
; AVX512-FCP-NEXT: vmovdqa (%rcx), %xmm3
|
||||
; AVX512-FCP-NEXT: vinserti128 $1, (%r11), %ymm3, %ymm3
|
||||
; AVX512-FCP-NEXT: vinserti128 $1, (%r10), %ymm2, %ymm2
|
||||
; AVX512-FCP-NEXT: vpunpckldq {{.*#+}} ymm2 = ymm2[0],ymm3[0],ymm2[1],ymm3[1],ymm2[4],ymm3[4],ymm2[5],ymm3[5]
|
||||
; AVX512-FCP-NEXT: vinserti128 $1, (%r9), %ymm1, %ymm1
|
||||
; AVX512-FCP-NEXT: vinserti128 $1, (%r8), %ymm0, %ymm0
|
||||
; AVX512-FCP-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
|
||||
; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[2],ymm2[2]
|
||||
; AVX512-FCP-NEXT: vmovdqa (%rdx), %xmm1
|
||||
; AVX512-FCP-NEXT: vmovdqa (%r8), %xmm2
|
||||
; AVX512-FCP-NEXT: vmovdqa (%r11), %xmm3
|
||||
; AVX512-FCP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
|
||||
; AVX512-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1]
|
||||
; AVX512-FCP-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[1],mem[1]
|
||||
; AVX512-FCP-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm3[0],mem[0],xmm3[1],mem[1]
|
||||
; AVX512-FCP-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1
|
||||
; AVX512-FCP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
|
||||
; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
|
||||
; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12,1,5,9,13,2,6,10,14,3,7,11,15,16,20,24,28,17,21,25,29,18,22,26,30,19,23,27,31]
|
||||
; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,4,1,5,2,6,3,7]
|
||||
; AVX512-FCP-NEXT: vpermd %ymm0, %ymm1, %ymm0
|
||||
@ -541,16 +541,16 @@ define void @store_i8_stride8_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp
|
||||
; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %r10
|
||||
; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %r11
|
||||
; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm0
|
||||
; AVX512DQ-NEXT: vmovdqa (%rsi), %xmm1
|
||||
; AVX512DQ-NEXT: vmovdqa (%rdx), %xmm2
|
||||
; AVX512DQ-NEXT: vmovdqa (%rcx), %xmm3
|
||||
; AVX512DQ-NEXT: vinserti128 $1, (%r11), %ymm3, %ymm3
|
||||
; AVX512DQ-NEXT: vinserti128 $1, (%r10), %ymm2, %ymm2
|
||||
; AVX512DQ-NEXT: vpunpckldq {{.*#+}} ymm2 = ymm2[0],ymm3[0],ymm2[1],ymm3[1],ymm2[4],ymm3[4],ymm2[5],ymm3[5]
|
||||
; AVX512DQ-NEXT: vinserti128 $1, (%r9), %ymm1, %ymm1
|
||||
; AVX512DQ-NEXT: vinserti128 $1, (%r8), %ymm0, %ymm0
|
||||
; AVX512DQ-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
|
||||
; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[2],ymm2[2]
|
||||
; AVX512DQ-NEXT: vmovdqa (%rdx), %xmm1
|
||||
; AVX512DQ-NEXT: vmovdqa (%r8), %xmm2
|
||||
; AVX512DQ-NEXT: vmovdqa (%r11), %xmm3
|
||||
; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
|
||||
; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1]
|
||||
; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[1],mem[1]
|
||||
; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm3[0],mem[0],xmm3[1],mem[1]
|
||||
; AVX512DQ-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1
|
||||
; AVX512DQ-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
|
||||
; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
|
||||
; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12,1,5,9,13,2,6,10,14,3,7,11,15,16,20,24,28,17,21,25,29,18,22,26,30,19,23,27,31]
|
||||
; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,4,1,5,2,6,3,7]
|
||||
; AVX512DQ-NEXT: vpermd %ymm0, %ymm1, %ymm0
|
||||
@ -564,16 +564,16 @@ define void @store_i8_stride8_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp
|
||||
; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10
|
||||
; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r11
|
||||
; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %xmm0
|
||||
; AVX512DQ-FCP-NEXT: vmovdqa (%rsi), %xmm1
|
||||
; AVX512DQ-FCP-NEXT: vmovdqa (%rdx), %xmm2
|
||||
; AVX512DQ-FCP-NEXT: vmovdqa (%rcx), %xmm3
|
||||
; AVX512DQ-FCP-NEXT: vinserti128 $1, (%r11), %ymm3, %ymm3
|
||||
; AVX512DQ-FCP-NEXT: vinserti128 $1, (%r10), %ymm2, %ymm2
|
||||
; AVX512DQ-FCP-NEXT: vpunpckldq {{.*#+}} ymm2 = ymm2[0],ymm3[0],ymm2[1],ymm3[1],ymm2[4],ymm3[4],ymm2[5],ymm3[5]
|
||||
; AVX512DQ-FCP-NEXT: vinserti128 $1, (%r9), %ymm1, %ymm1
|
||||
; AVX512DQ-FCP-NEXT: vinserti128 $1, (%r8), %ymm0, %ymm0
|
||||
; AVX512DQ-FCP-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
|
||||
; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[2],ymm2[2]
|
||||
; AVX512DQ-FCP-NEXT: vmovdqa (%rdx), %xmm1
|
||||
; AVX512DQ-FCP-NEXT: vmovdqa (%r8), %xmm2
|
||||
; AVX512DQ-FCP-NEXT: vmovdqa (%r11), %xmm3
|
||||
; AVX512DQ-FCP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
|
||||
; AVX512DQ-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1]
|
||||
; AVX512DQ-FCP-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[1],mem[1]
|
||||
; AVX512DQ-FCP-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm3[0],mem[0],xmm3[1],mem[1]
|
||||
; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1
|
||||
; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
|
||||
; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
|
||||
; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12,1,5,9,13,2,6,10,14,3,7,11,15,16,20,24,28,17,21,25,29,18,22,26,30,19,23,27,31]
|
||||
; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,4,1,5,2,6,3,7]
|
||||
; AVX512DQ-FCP-NEXT: vpermd %ymm0, %ymm1, %ymm0
|
||||
@ -587,16 +587,16 @@ define void @store_i8_stride8_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp
|
||||
; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r10
|
||||
; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r11
|
||||
; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0
|
||||
; AVX512BW-NEXT: vmovdqa (%rsi), %xmm1
|
||||
; AVX512BW-NEXT: vmovdqa (%rdx), %xmm2
|
||||
; AVX512BW-NEXT: vmovdqa (%rcx), %xmm3
|
||||
; AVX512BW-NEXT: vinserti128 $1, (%r11), %ymm3, %ymm3
|
||||
; AVX512BW-NEXT: vinserti128 $1, (%r10), %ymm2, %ymm2
|
||||
; AVX512BW-NEXT: vpunpckldq {{.*#+}} ymm2 = ymm2[0],ymm3[0],ymm2[1],ymm3[1],ymm2[4],ymm3[4],ymm2[5],ymm3[5]
|
||||
; AVX512BW-NEXT: vinserti128 $1, (%r9), %ymm1, %ymm1
|
||||
; AVX512BW-NEXT: vinserti128 $1, (%r8), %ymm0, %ymm0
|
||||
; AVX512BW-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
|
||||
; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[2],ymm2[2]
|
||||
; AVX512BW-NEXT: vmovdqa (%rdx), %xmm1
|
||||
; AVX512BW-NEXT: vmovdqa (%r8), %xmm2
|
||||
; AVX512BW-NEXT: vmovdqa (%r11), %xmm3
|
||||
; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
|
||||
; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1]
|
||||
; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[1],mem[1]
|
||||
; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm3[0],mem[0],xmm3[1],mem[1]
|
||||
; AVX512BW-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1
|
||||
; AVX512BW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
|
||||
; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
|
||||
; AVX512BW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12,1,5,9,13,2,6,10,14,3,7,11,15,16,20,24,28,17,21,25,29,18,22,26,30,19,23,27,31]
|
||||
; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,4,1,5,2,6,3,7]
|
||||
; AVX512BW-NEXT: vpermd %ymm0, %ymm1, %ymm0
|
||||
@ -610,16 +610,16 @@ define void @store_i8_stride8_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp
|
||||
; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10
|
||||
; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r11
|
||||
; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %xmm0
|
||||
; AVX512BW-FCP-NEXT: vmovdqa (%rsi), %xmm1
|
||||
; AVX512BW-FCP-NEXT: vmovdqa (%rdx), %xmm2
|
||||
; AVX512BW-FCP-NEXT: vmovdqa (%rcx), %xmm3
|
||||
; AVX512BW-FCP-NEXT: vinserti128 $1, (%r11), %ymm3, %ymm3
|
||||
; AVX512BW-FCP-NEXT: vinserti128 $1, (%r10), %ymm2, %ymm2
|
||||
; AVX512BW-FCP-NEXT: vpunpckldq {{.*#+}} ymm2 = ymm2[0],ymm3[0],ymm2[1],ymm3[1],ymm2[4],ymm3[4],ymm2[5],ymm3[5]
|
||||
; AVX512BW-FCP-NEXT: vinserti128 $1, (%r9), %ymm1, %ymm1
|
||||
; AVX512BW-FCP-NEXT: vinserti128 $1, (%r8), %ymm0, %ymm0
|
||||
; AVX512BW-FCP-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
|
||||
; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[2],ymm2[2]
|
||||
; AVX512BW-FCP-NEXT: vmovdqa (%rdx), %xmm1
|
||||
; AVX512BW-FCP-NEXT: vmovdqa (%r8), %xmm2
|
||||
; AVX512BW-FCP-NEXT: vmovdqa (%r11), %xmm3
|
||||
; AVX512BW-FCP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
|
||||
; AVX512BW-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1]
|
||||
; AVX512BW-FCP-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[1],mem[1]
|
||||
; AVX512BW-FCP-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm3[0],mem[0],xmm3[1],mem[1]
|
||||
; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1
|
||||
; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
|
||||
; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
|
||||
; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12,1,5,9,13,2,6,10,14,3,7,11,15,16,20,24,28,17,21,25,29,18,22,26,30,19,23,27,31]
|
||||
; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,4,1,5,2,6,3,7]
|
||||
; AVX512BW-FCP-NEXT: vpermd %ymm0, %ymm1, %ymm0
|
||||
@ -633,16 +633,16 @@ define void @store_i8_stride8_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp
|
||||
; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %r10
|
||||
; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %r11
|
||||
; AVX512DQ-BW-NEXT: vmovdqa (%rdi), %xmm0
|
||||
; AVX512DQ-BW-NEXT: vmovdqa (%rsi), %xmm1
|
||||
; AVX512DQ-BW-NEXT: vmovdqa (%rdx), %xmm2
|
||||
; AVX512DQ-BW-NEXT: vmovdqa (%rcx), %xmm3
|
||||
; AVX512DQ-BW-NEXT: vinserti128 $1, (%r11), %ymm3, %ymm3
|
||||
; AVX512DQ-BW-NEXT: vinserti128 $1, (%r10), %ymm2, %ymm2
|
||||
; AVX512DQ-BW-NEXT: vpunpckldq {{.*#+}} ymm2 = ymm2[0],ymm3[0],ymm2[1],ymm3[1],ymm2[4],ymm3[4],ymm2[5],ymm3[5]
|
||||
; AVX512DQ-BW-NEXT: vinserti128 $1, (%r9), %ymm1, %ymm1
|
||||
; AVX512DQ-BW-NEXT: vinserti128 $1, (%r8), %ymm0, %ymm0
|
||||
; AVX512DQ-BW-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
|
||||
; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[2],ymm2[2]
|
||||
; AVX512DQ-BW-NEXT: vmovdqa (%rdx), %xmm1
|
||||
; AVX512DQ-BW-NEXT: vmovdqa (%r8), %xmm2
|
||||
; AVX512DQ-BW-NEXT: vmovdqa (%r11), %xmm3
|
||||
; AVX512DQ-BW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
|
||||
; AVX512DQ-BW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1]
|
||||
; AVX512DQ-BW-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[1],mem[1]
|
||||
; AVX512DQ-BW-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm3[0],mem[0],xmm3[1],mem[1]
|
||||
; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1
|
||||
; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
|
||||
; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
|
||||
; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12,1,5,9,13,2,6,10,14,3,7,11,15,16,20,24,28,17,21,25,29,18,22,26,30,19,23,27,31]
|
||||
; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,4,1,5,2,6,3,7]
|
||||
; AVX512DQ-BW-NEXT: vpermd %ymm0, %ymm1, %ymm0
|
||||
@ -656,16 +656,16 @@ define void @store_i8_stride8_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp
|
||||
; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10
|
||||
; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r11
|
||||
; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %xmm0
|
||||
; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rsi), %xmm1
|
||||
; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdx), %xmm2
|
||||
; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rcx), %xmm3
|
||||
; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, (%r11), %ymm3, %ymm3
|
||||
; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, (%r10), %ymm2, %ymm2
|
||||
; AVX512DQ-BW-FCP-NEXT: vpunpckldq {{.*#+}} ymm2 = ymm2[0],ymm3[0],ymm2[1],ymm3[1],ymm2[4],ymm3[4],ymm2[5],ymm3[5]
|
||||
; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, (%r9), %ymm1, %ymm1
|
||||
; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, (%r8), %ymm0, %ymm0
|
||||
; AVX512DQ-BW-FCP-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
|
||||
; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[2],ymm2[2]
|
||||
; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdx), %xmm1
|
||||
; AVX512DQ-BW-FCP-NEXT: vmovdqa (%r8), %xmm2
|
||||
; AVX512DQ-BW-FCP-NEXT: vmovdqa (%r11), %xmm3
|
||||
; AVX512DQ-BW-FCP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
|
||||
; AVX512DQ-BW-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1]
|
||||
; AVX512DQ-BW-FCP-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[1],mem[1]
|
||||
; AVX512DQ-BW-FCP-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm3[0],mem[0],xmm3[1],mem[1]
|
||||
; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1
|
||||
; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
|
||||
; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
|
||||
; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12,1,5,9,13,2,6,10,14,3,7,11,15,16,20,24,28,17,21,25,29,18,22,26,30,19,23,27,31]
|
||||
; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,4,1,5,2,6,3,7]
|
||||
; AVX512DQ-BW-FCP-NEXT: vpermd %ymm0, %ymm1, %ymm0
|
||||
|
Loading…
x
Reference in New Issue
Block a user