llvm-project/llvm/test/CodeGen/PowerPC/vector-reduce-mul.ll
Stefan Pintilie e9d12c2480 [PowerPC][NFC] Add a series of codegen tests for vector reductions.
This patch only adds tests for PowerPC. The purpose of these tests
is to track what code is generated for various vector reductions.

Reviewed By: nemanjai, #powerpc

Differential Revision: https://reviews.llvm.org/D113801
2021-11-19 15:03:01 -06:00

205 lines
6.4 KiB
LLVM

; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -verify-machineinstrs -ppc-asm-full-reg-names -ppc-vsr-nums-as-vr \
; RUN: -mcpu=pwr9 -mtriple=powerpc64le < %s | FileCheck %s --check-prefix=PWR9LE
; RUN: llc -verify-machineinstrs -ppc-asm-full-reg-names -ppc-vsr-nums-as-vr \
; RUN: -mcpu=pwr9 -mtriple=powerpc64 < %s | FileCheck %s --check-prefix=PWR9BE
; RUN: llc -verify-machineinstrs -ppc-asm-full-reg-names -ppc-vsr-nums-as-vr \
; RUN: -mcpu=pwr10 -mtriple=powerpc64le < %s | FileCheck %s --check-prefix=PWR10LE
; RUN: llc -verify-machineinstrs -ppc-asm-full-reg-names -ppc-vsr-nums-as-vr \
; RUN: -mcpu=pwr10 -mtriple=powerpc64 < %s | FileCheck %s --check-prefix=PWR10BE
define dso_local i32 @v2i32(<2 x i32> %a) local_unnamed_addr #0 {
; PWR9LE-LABEL: v2i32:
; PWR9LE: # %bb.0: # %entry
; PWR9LE-NEXT: xxspltw v3, v2, 2
; PWR9LE-NEXT: li r3, 0
; PWR9LE-NEXT: vmuluwm v2, v2, v3
; PWR9LE-NEXT: vextuwrx r3, r3, v2
; PWR9LE-NEXT: blr
;
; PWR9BE-LABEL: v2i32:
; PWR9BE: # %bb.0: # %entry
; PWR9BE-NEXT: xxspltw v3, v2, 1
; PWR9BE-NEXT: li r3, 0
; PWR9BE-NEXT: vmuluwm v2, v2, v3
; PWR9BE-NEXT: vextuwlx r3, r3, v2
; PWR9BE-NEXT: blr
;
; PWR10LE-LABEL: v2i32:
; PWR10LE: # %bb.0: # %entry
; PWR10LE-NEXT: xxspltw v3, v2, 2
; PWR10LE-NEXT: li r3, 0
; PWR10LE-NEXT: vmuluwm v2, v2, v3
; PWR10LE-NEXT: vextuwrx r3, r3, v2
; PWR10LE-NEXT: blr
;
; PWR10BE-LABEL: v2i32:
; PWR10BE: # %bb.0: # %entry
; PWR10BE-NEXT: xxspltw v3, v2, 1
; PWR10BE-NEXT: li r3, 0
; PWR10BE-NEXT: vmuluwm v2, v2, v3
; PWR10BE-NEXT: vextuwlx r3, r3, v2
; PWR10BE-NEXT: blr
entry:
%0 = call i32 @llvm.vector.reduce.mul.v2i32(<2 x i32> %a)
ret i32 %0
}
define dso_local i32 @v4i32(<4 x i32> %a) local_unnamed_addr #0 {
; PWR9LE-LABEL: v4i32:
; PWR9LE: # %bb.0: # %entry
; PWR9LE-NEXT: xxswapd v3, v2
; PWR9LE-NEXT: li r3, 0
; PWR9LE-NEXT: vmuluwm v2, v2, v3
; PWR9LE-NEXT: xxspltw v3, v2, 2
; PWR9LE-NEXT: vmuluwm v2, v2, v3
; PWR9LE-NEXT: vextuwrx r3, r3, v2
; PWR9LE-NEXT: blr
;
; PWR9BE-LABEL: v4i32:
; PWR9BE: # %bb.0: # %entry
; PWR9BE-NEXT: xxswapd v3, v2
; PWR9BE-NEXT: li r3, 0
; PWR9BE-NEXT: vmuluwm v2, v2, v3
; PWR9BE-NEXT: xxspltw v3, v2, 1
; PWR9BE-NEXT: vmuluwm v2, v2, v3
; PWR9BE-NEXT: vextuwlx r3, r3, v2
; PWR9BE-NEXT: blr
;
; PWR10LE-LABEL: v4i32:
; PWR10LE: # %bb.0: # %entry
; PWR10LE-NEXT: xxswapd v3, v2
; PWR10LE-NEXT: li r3, 0
; PWR10LE-NEXT: vmuluwm v2, v2, v3
; PWR10LE-NEXT: xxspltw v3, v2, 2
; PWR10LE-NEXT: vmuluwm v2, v2, v3
; PWR10LE-NEXT: vextuwrx r3, r3, v2
; PWR10LE-NEXT: blr
;
; PWR10BE-LABEL: v4i32:
; PWR10BE: # %bb.0: # %entry
; PWR10BE-NEXT: xxswapd v3, v2
; PWR10BE-NEXT: li r3, 0
; PWR10BE-NEXT: vmuluwm v2, v2, v3
; PWR10BE-NEXT: xxspltw v3, v2, 1
; PWR10BE-NEXT: vmuluwm v2, v2, v3
; PWR10BE-NEXT: vextuwlx r3, r3, v2
; PWR10BE-NEXT: blr
entry:
%0 = call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32> %a)
ret i32 %0
}
define dso_local i32 @v8i32(<8 x i32> %a) local_unnamed_addr #0 {
; PWR9LE-LABEL: v8i32:
; PWR9LE: # %bb.0: # %entry
; PWR9LE-NEXT: vmuluwm v2, v2, v3
; PWR9LE-NEXT: li r3, 0
; PWR9LE-NEXT: xxswapd v3, v2
; PWR9LE-NEXT: vmuluwm v2, v2, v3
; PWR9LE-NEXT: xxspltw v3, v2, 2
; PWR9LE-NEXT: vmuluwm v2, v2, v3
; PWR9LE-NEXT: vextuwrx r3, r3, v2
; PWR9LE-NEXT: blr
;
; PWR9BE-LABEL: v8i32:
; PWR9BE: # %bb.0: # %entry
; PWR9BE-NEXT: vmuluwm v2, v2, v3
; PWR9BE-NEXT: li r3, 0
; PWR9BE-NEXT: xxswapd v3, v2
; PWR9BE-NEXT: vmuluwm v2, v2, v3
; PWR9BE-NEXT: xxspltw v3, v2, 1
; PWR9BE-NEXT: vmuluwm v2, v2, v3
; PWR9BE-NEXT: vextuwlx r3, r3, v2
; PWR9BE-NEXT: blr
;
; PWR10LE-LABEL: v8i32:
; PWR10LE: # %bb.0: # %entry
; PWR10LE-NEXT: vmuluwm v2, v2, v3
; PWR10LE-NEXT: li r3, 0
; PWR10LE-NEXT: xxswapd v3, v2
; PWR10LE-NEXT: vmuluwm v2, v2, v3
; PWR10LE-NEXT: xxspltw v3, v2, 2
; PWR10LE-NEXT: vmuluwm v2, v2, v3
; PWR10LE-NEXT: vextuwrx r3, r3, v2
; PWR10LE-NEXT: blr
;
; PWR10BE-LABEL: v8i32:
; PWR10BE: # %bb.0: # %entry
; PWR10BE-NEXT: vmuluwm v2, v2, v3
; PWR10BE-NEXT: li r3, 0
; PWR10BE-NEXT: xxswapd v3, v2
; PWR10BE-NEXT: vmuluwm v2, v2, v3
; PWR10BE-NEXT: xxspltw v3, v2, 1
; PWR10BE-NEXT: vmuluwm v2, v2, v3
; PWR10BE-NEXT: vextuwlx r3, r3, v2
; PWR10BE-NEXT: blr
entry:
%0 = call i32 @llvm.vector.reduce.mul.v8i32(<8 x i32> %a)
ret i32 %0
}
define dso_local i32 @v16i32(<16 x i32> %a) local_unnamed_addr #0 {
; PWR9LE-LABEL: v16i32:
; PWR9LE: # %bb.0: # %entry
; PWR9LE-NEXT: vmuluwm v3, v3, v5
; PWR9LE-NEXT: vmuluwm v2, v2, v4
; PWR9LE-NEXT: li r3, 0
; PWR9LE-NEXT: vmuluwm v2, v2, v3
; PWR9LE-NEXT: xxswapd v3, v2
; PWR9LE-NEXT: vmuluwm v2, v2, v3
; PWR9LE-NEXT: xxspltw v3, v2, 2
; PWR9LE-NEXT: vmuluwm v2, v2, v3
; PWR9LE-NEXT: vextuwrx r3, r3, v2
; PWR9LE-NEXT: blr
;
; PWR9BE-LABEL: v16i32:
; PWR9BE: # %bb.0: # %entry
; PWR9BE-NEXT: vmuluwm v3, v3, v5
; PWR9BE-NEXT: vmuluwm v2, v2, v4
; PWR9BE-NEXT: li r3, 0
; PWR9BE-NEXT: vmuluwm v2, v2, v3
; PWR9BE-NEXT: xxswapd v3, v2
; PWR9BE-NEXT: vmuluwm v2, v2, v3
; PWR9BE-NEXT: xxspltw v3, v2, 1
; PWR9BE-NEXT: vmuluwm v2, v2, v3
; PWR9BE-NEXT: vextuwlx r3, r3, v2
; PWR9BE-NEXT: blr
;
; PWR10LE-LABEL: v16i32:
; PWR10LE: # %bb.0: # %entry
; PWR10LE-NEXT: vmuluwm v3, v3, v5
; PWR10LE-NEXT: vmuluwm v2, v2, v4
; PWR10LE-NEXT: li r3, 0
; PWR10LE-NEXT: vmuluwm v2, v2, v3
; PWR10LE-NEXT: xxswapd v3, v2
; PWR10LE-NEXT: vmuluwm v2, v2, v3
; PWR10LE-NEXT: xxspltw v3, v2, 2
; PWR10LE-NEXT: vmuluwm v2, v2, v3
; PWR10LE-NEXT: vextuwrx r3, r3, v2
; PWR10LE-NEXT: blr
;
; PWR10BE-LABEL: v16i32:
; PWR10BE: # %bb.0: # %entry
; PWR10BE-NEXT: vmuluwm v3, v3, v5
; PWR10BE-NEXT: vmuluwm v2, v2, v4
; PWR10BE-NEXT: li r3, 0
; PWR10BE-NEXT: vmuluwm v2, v2, v3
; PWR10BE-NEXT: xxswapd v3, v2
; PWR10BE-NEXT: vmuluwm v2, v2, v3
; PWR10BE-NEXT: xxspltw v3, v2, 1
; PWR10BE-NEXT: vmuluwm v2, v2, v3
; PWR10BE-NEXT: vextuwlx r3, r3, v2
; PWR10BE-NEXT: blr
entry:
%0 = call i32 @llvm.vector.reduce.mul.v16i32(<16 x i32> %a)
ret i32 %0
}
declare i32 @llvm.vector.reduce.mul.v2i32(<2 x i32>) #0
declare i32 @llvm.vector.reduce.mul.v4i32(<4 x i32>) #0
declare i32 @llvm.vector.reduce.mul.v8i32(<8 x i32>) #0
declare i32 @llvm.vector.reduce.mul.v16i32(<16 x i32>) #0
attributes #0 = { nounwind }