[RISCV] Allow hoisting VXRM writes out of loops speculatively (#110044)
Change the intersect used by the anticipated algorithm to ignore the Unknown state when anticipating. This effectively allows VXRM writes to be inserted speculatively: a write may execute even on branches where VXRM is not needed. This matters because VXRM writes cause pipeline flushes on some micro-architectures, so more aggressive hoisting is worthwhile even if it slightly degrades the slow path. An example is this code:

```
typedef unsigned char uint8_t;

__attribute__((noipa))
void foo(uint8_t *dst, int i_dst_stride,
         uint8_t *src1, int i_src1_stride,
         uint8_t *src2, int i_src2_stride,
         int i_width, int i_height) {
  for (int y = 0; y < i_height; y++) {
    for (int x = 0; x < i_width; x++)
      dst[x] = (src1[x] + src2[x] + 1) >> 1;
    dst += i_dst_stride;
    src1 += i_src1_stride;
    src2 += i_src2_stride;
  }
}
```

With this patch, the VXRM write for the code above is hoisted out of the outer loop.
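To make the lattice change concrete, here is a minimal, self-contained C++ sketch of the idea (toy types and free functions, not the pass's actual VXRMInfo class): the conservative merge gives up as soon as one side is Unknown, while the speculative merge used for anticipation keeps the concrete rounding mode. The scenario in `main()` mirrors the example above, where the vectorized path needs vxrm = 0 (rnu, for `vaaddu.vv`) and the loop-exit path has no VXRM requirement.

```
#include <cassert>
#include <cstdint>
#include <cstdio>

// Toy three-state lattice in the spirit of the pass's VXRMInfo: no
// information yet (Invalid), one known rounding mode (Static), or
// conflicting/absent requirements (Unknown).
struct Info {
  enum State : uint8_t { Invalid, Static, Unknown };
  State S = Invalid;
  uint8_t RM = 0; // rounding mode, meaningful only when S == Static

  static Info unknown() { Info I; I.S = Unknown; return I; }
  static Info rm(uint8_t R) { Info I; I.S = Static; I.RM = R; return I; }

  bool operator==(const Info &O) const { return S == O.S && RM == O.RM; }
};

// Conservative merge: any disagreement, including Unknown, loses the value.
Info intersect(Info A, Info B) {
  if (B.S == Info::Invalid) return A;
  if (A.S == Info::Invalid) return B;
  if (A == B) return A;
  return Info::unknown();
}

// Speculative merge for the backward "anticipated" direction: an Unknown on
// one side is ignored, so a rounding mode demanded on only one successor
// path survives and the csrwi can be hoisted above the branch.
Info intersectAnticipated(Info A, Info B) {
  if (B.S == Info::Invalid) return A;
  if (A.S == Info::Invalid) return B;
  if (A.S == Info::Unknown) return B;
  if (B.S == Info::Unknown) return A;
  if (A == B) return A;
  return Info::unknown();
}

int main() {
  // One successor (the vectorized inner loop) anticipates vxrm = 0; the
  // other (the loop exit) has no VXRM requirement at all.
  Info NeedsRnu = Info::rm(0);
  Info Exit = Info::unknown();

  assert(intersect(NeedsRnu, Exit).S == Info::Unknown);     // cannot hoist
  assert(intersectAnticipated(NeedsRnu, Exit) == NeedsRnu); // hoist is fine
  std::puts("speculative merge keeps the rounding mode across the branch");
  return 0;
}
```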
commit e3fdc3aa81
parent febbf9105f
@@ -1453,6 +1453,9 @@ def TuneSiFive7 : SubtargetFeature<"sifive7", "RISCVProcFamily", "SiFive7",
 def TuneVentanaVeyron : SubtargetFeature<"ventana-veyron", "RISCVProcFamily", "VentanaVeyron",
                                          "Ventana Veyron-Series processors">;
 
+def TuneVXRMPipelineFlush : SubtargetFeature<"vxrm-pipeline-flush", "HasVXRMPipelineFlush",
+                                             "true", "VXRM writes causes pipeline flush">;
+
 // Assume that lock-free native-width atomics are available, even if the target
 // and operating system combination would not usually provide them. The user
 // is responsible for providing any necessary __sync implementations. Code
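The new tune feature encodes a simple cost trade-off, and the sketch below works through the arithmetic with made-up numbers; the flush cost and trip count are assumptions for illustration only, not measurements of any listed core. In the test added by this patch, the baseline RV32/RV64 output executes `csrwi vxrm` once per outer-loop iteration that takes the vectorized path (in %vector.ph), while the tuned sifive-p670 and spacemit-x60 outputs execute it once at %entry.

```
#include <cstdio>

int main() {
  // Illustrative assumptions only, not measurements of any core:
  const long FlushCost = 20;    // assumed cycles lost per pipeline-flushing csrwi vxrm
  const long OuterTrips = 1080; // assumed outer-loop trip count (i_height)

  // Baseline: the write sits in %vector.ph, so it runs once per outer-loop
  // iteration that reaches the vectorized path.
  long InLoop = OuterTrips * FlushCost;

  // With TuneVXRMPipelineFlush: the write is hoisted to the entry block and
  // runs once, even if some iterations end up on the scalar path that never
  // needed VXRM.
  long Hoisted = 1 * FlushCost;

  std::printf("write per outer iteration: %ld cycles, hoisted write: %ld cycles\n",
              InLoop, Hoisted);
  return 0;
}
```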
@@ -109,6 +109,35 @@ public:
     return VXRMInfo::getUnknown();
   }
 
+  // Calculate the VXRMInfo visible to a block assuming this and Other
+  // are both predecessors. To allow speculatively running WriteVXRM
+  // we will ignore Unknowns if one of this and Other have valid
+  // WriteVXRM. Rationale: WriteVXRM causes a pipeline flush in some
+  // uarchs and moving it outside loops is very important for some
+  // workloads.
+  VXRMInfo intersectAnticipated(const VXRMInfo &Other) const {
+    // If the new value isn't valid, ignore it.
+    if (!Other.isValid())
+      return *this;
+
+    // If this value isn't valid, this must be the first predecessor, use it.
+    if (!isValid())
+      return Other;
+
+    // If either is unknown, the result is the other one.
+    if (isUnknown())
+      return Other;
+    if (Other.isUnknown())
+      return *this;
+
+    // If we have an exact match, return this.
+    if (*this == Other)
+      return *this;
+
+    // Otherwise the result is unknown.
+    return VXRMInfo::getUnknown();
+  }
+
 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
   /// Support for debugging, callable in GDB: V->dump()
   LLVM_DUMP_METHOD void dump() const {
@@ -187,7 +216,7 @@ public:
 private:
   bool computeVXRMChanges(const MachineBasicBlock &MBB);
   void computeAvailable(const MachineBasicBlock &MBB);
-  void computeAnticipated(const MachineBasicBlock &MBB);
+  void computeAnticipated(const MachineFunction &MF, const MachineBasicBlock &MBB);
   void emitWriteVXRM(MachineBasicBlock &MBB);
 };
 
@@ -279,8 +308,9 @@ void RISCVInsertWriteVXRM::computeAvailable(const MachineBasicBlock &MBB) {
   }
 }
 
-void RISCVInsertWriteVXRM::computeAnticipated(const MachineBasicBlock &MBB) {
+void RISCVInsertWriteVXRM::computeAnticipated(const MachineFunction &MF, const MachineBasicBlock &MBB) {
   BlockData &BBInfo = BlockInfo[MBB.getNumber()];
+  const RISCVSubtarget &ST = MF.getSubtarget<RISCVSubtarget>();
 
   BBInfo.InQueue = false;
 
@@ -289,7 +319,11 @@ void RISCVInsertWriteVXRM::computeAnticipated(const MachineBasicBlock &MBB) {
     Anticipated.setUnknown();
   } else {
     for (const MachineBasicBlock *S : MBB.successors())
-      Anticipated =
-          Anticipated.intersect(BlockInfo[S->getNumber()].AnticipatedIn);
+      if (ST.hasVXRMPipelineFlush())
+        Anticipated =
+            Anticipated.intersectAnticipated(BlockInfo[S->getNumber()].AnticipatedIn);
+      else
+        Anticipated =
+            Anticipated.intersect(BlockInfo[S->getNumber()].AnticipatedIn);
   }
 
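A simplified, self-contained model of how the gated merge above combines the successors' AnticipatedIn values (toy int encoding and plain containers instead of MachineBasicBlock and BlockData; this is a sketch of the idea, not the pass itself):

```
#include <cstdio>
#include <vector>

// Encoding: -1 = invalid (no info yet), -2 = unknown, >= 0 = a concrete
// VXRM value.
constexpr int Invalid = -1, Unknown = -2;

int intersect(int A, int B) {
  if (B == Invalid) return A;
  if (A == Invalid) return B;
  return A == B ? A : Unknown;
}

int intersectAnticipated(int A, int B) {
  if (B == Invalid) return A;
  if (A == Invalid) return B;
  if (A == Unknown) return B;
  if (B == Unknown) return A;
  return A == B ? A : Unknown;
}

// Merge the AnticipatedIn values of a block's successors, choosing the
// speculative merge when the subtarget says VXRM writes flush the pipeline.
int anticipatedOut(const std::vector<int> &SuccIns, bool HasVXRMPipelineFlush) {
  int Out = Invalid;
  for (int In : SuccIns)
    Out = HasVXRMPipelineFlush ? intersectAnticipated(Out, In)
                               : intersect(Out, In);
  return Out;
}

int main() {
  // Outer-loop block with two successors, as in the test: the vectorized
  // inner loop anticipates vxrm = 0, the scalar/exit path anticipates nothing.
  std::vector<int> SuccIns = {0, Unknown};
  std::printf("conservative: %d, speculative: %d\n",
              anticipatedOut(SuccIns, false),  // -2: the write stays in the loop
              anticipatedOut(SuccIns, true));  //  0: the write can be hoisted
  return 0;
}
```

With the conservative merge the anticipated value collapses to Unknown, which is why the csrwi stays inside the loop in the RV32/RV64 output below, while the sifive-p670 and spacemit-x60 runs hoist it to the entry block.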
@@ -453,7 +487,7 @@ bool RISCVInsertWriteVXRM::runOnMachineFunction(MachineFunction &MF) {
   while (!WorkList.empty()) {
     const MachineBasicBlock &MBB = *WorkList.front();
     WorkList.pop();
-    computeAnticipated(MBB);
+    computeAnticipated(MF, MBB);
   }
 
   // Phase 4 - Emit VXRM writes at the earliest place possible.
@@ -277,7 +277,8 @@ def SIFIVE_P470 : RISCVProcessorModel<"sifive-p470", SiFiveP400Model,
                                        FeatureUnalignedScalarMem,
                                        FeatureUnalignedVectorMem]),
                                       !listconcat(SiFiveP400TuneFeatures,
-                                                  [TuneNoSinkSplatOperands])>;
+                                                  [TuneNoSinkSplatOperands,
+                                                   TuneVXRMPipelineFlush])>;
 
 
 def SIFIVE_P670 : RISCVProcessorModel<"sifive-p670", SiFiveP600Model,
@@ -298,6 +299,7 @@ def SIFIVE_P670 : RISCVProcessorModel<"sifive-p670", SiFiveP600Model,
                                        TuneLUIADDIFusion,
                                        TuneAUIPCADDIFusion,
                                        TuneNoSinkSplatOperands,
+                                       TuneVXRMPipelineFlush,
                                        FeaturePostRAScheduler]>;
 
 def SYNTACORE_SCR1_BASE : RISCVProcessorModel<"syntacore-scr1-base",
@@ -510,7 +512,8 @@ def SPACEMIT_X60 : RISCVProcessorModel<"spacemit-x60",
                                         [TuneDLenFactor2,
                                          TuneOptimizedNF2SegmentLoadStore,
                                          TuneOptimizedNF3SegmentLoadStore,
-                                         TuneOptimizedNF4SegmentLoadStore]> {
+                                         TuneOptimizedNF4SegmentLoadStore,
+                                         TuneVXRMPipelineFlush]> {
   let MVendorID = 0x710;
   let MArchID = 0x8000000058000001;
   let MImpID = 0x1000000049772200;
llvm/test/CodeGen/RISCV/rvv/vxrm-insert-out-of-loop.ll (new file, 666 lines)
@@ -0,0 +1,666 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2

; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+v,+m \
; RUN:   -verify-machineinstrs | FileCheck %s --check-prefixes=RV32
; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+v,+m -mcpu=sifive-p670 \
; RUN:   -verify-machineinstrs | FileCheck %s --check-prefixes=RV64P670
; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+v,+m -mcpu=spacemit-x60 \
; RUN:   -verify-machineinstrs | FileCheck %s --check-prefixes=RV64X60
; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+v,+m \
; RUN:   -verify-machineinstrs | FileCheck %s --check-prefixes=RV64


; test1
define void @test1(ptr nocapture noundef writeonly %dst, i32 noundef signext %i_dst_stride, ptr nocapture noundef readonly %src1, i32 noundef signext %i_src1_stride, ptr nocapture noundef readonly %src2, i32 noundef signext %i_src2_stride, i32 noundef signext %i_width, i32 noundef signext %i_height) {
; RV32-LABEL: test1:
|
||||
; RV32: # %bb.0: # %entry
|
||||
; RV32-NEXT: blez a7, .LBB0_17
|
||||
; RV32-NEXT: # %bb.1: # %for.cond1.preheader.lr.ph
|
||||
; RV32-NEXT: blez a6, .LBB0_17
|
||||
; RV32-NEXT: # %bb.2: # %for.cond1.preheader.us.preheader
|
||||
; RV32-NEXT: addi t0, a7, -1
|
||||
; RV32-NEXT: csrr t2, vlenb
|
||||
; RV32-NEXT: mul t3, a1, t0
|
||||
; RV32-NEXT: mul t4, a3, t0
|
||||
; RV32-NEXT: mul t5, a5, t0
|
||||
; RV32-NEXT: slli t1, t2, 1
|
||||
; RV32-NEXT: li t6, 32
|
||||
; RV32-NEXT: mv t0, t1
|
||||
; RV32-NEXT: bnez zero, .LBB0_4
|
||||
; RV32-NEXT: # %bb.3: # %for.cond1.preheader.us.preheader
|
||||
; RV32-NEXT: li t0, 32
|
||||
; RV32-NEXT: .LBB0_4: # %for.cond1.preheader.us.preheader
|
||||
; RV32-NEXT: addi sp, sp, -16
|
||||
; RV32-NEXT: .cfi_def_cfa_offset 16
|
||||
; RV32-NEXT: sw s0, 12(sp) # 4-byte Folded Spill
|
||||
; RV32-NEXT: sw s1, 8(sp) # 4-byte Folded Spill
|
||||
; RV32-NEXT: sw s2, 4(sp) # 4-byte Folded Spill
|
||||
; RV32-NEXT: .cfi_offset s0, -4
|
||||
; RV32-NEXT: .cfi_offset s1, -8
|
||||
; RV32-NEXT: .cfi_offset s2, -12
|
||||
; RV32-NEXT: add t3, a0, t3
|
||||
; RV32-NEXT: add t4, a2, t4
|
||||
; RV32-NEXT: add s0, a4, t5
|
||||
; RV32-NEXT: bltu t6, t1, .LBB0_6
|
||||
; RV32-NEXT: # %bb.5: # %for.cond1.preheader.us.preheader
|
||||
; RV32-NEXT: li t1, 32
|
||||
; RV32-NEXT: .LBB0_6: # %for.cond1.preheader.us.preheader
|
||||
; RV32-NEXT: add t3, t3, a6
|
||||
; RV32-NEXT: add t5, t4, a6
|
||||
; RV32-NEXT: add t4, s0, a6
|
||||
; RV32-NEXT: beqz zero, .LBB0_8
|
||||
; RV32-NEXT: # %bb.7: # %for.cond1.preheader.us.preheader
|
||||
; RV32-NEXT: mv t1, t0
|
||||
; RV32-NEXT: .LBB0_8: # %for.cond1.preheader.us.preheader
|
||||
; RV32-NEXT: li t0, 0
|
||||
; RV32-NEXT: sltu t5, a0, t5
|
||||
; RV32-NEXT: sltu t6, a2, t3
|
||||
; RV32-NEXT: and t5, t5, t6
|
||||
; RV32-NEXT: sltu t4, a0, t4
|
||||
; RV32-NEXT: sltu t3, a4, t3
|
||||
; RV32-NEXT: and t3, t4, t3
|
||||
; RV32-NEXT: or t4, a1, a3
|
||||
; RV32-NEXT: slti t4, t4, 0
|
||||
; RV32-NEXT: or t4, t5, t4
|
||||
; RV32-NEXT: or t5, a1, a5
|
||||
; RV32-NEXT: sltu t1, a6, t1
|
||||
; RV32-NEXT: slti t5, t5, 0
|
||||
; RV32-NEXT: or t3, t3, t5
|
||||
; RV32-NEXT: or t3, t4, t3
|
||||
; RV32-NEXT: or t1, t1, t3
|
||||
; RV32-NEXT: andi t1, t1, 1
|
||||
; RV32-NEXT: slli t2, t2, 1
|
||||
; RV32-NEXT: j .LBB0_10
|
||||
; RV32-NEXT: .LBB0_9: # %for.cond1.for.cond.cleanup3_crit_edge.us
|
||||
; RV32-NEXT: # in Loop: Header=BB0_10 Depth=1
|
||||
; RV32-NEXT: add a0, a0, a1
|
||||
; RV32-NEXT: add a2, a2, a3
|
||||
; RV32-NEXT: addi t0, t0, 1
|
||||
; RV32-NEXT: add a4, a4, a5
|
||||
; RV32-NEXT: beq t0, a7, .LBB0_16
|
||||
; RV32-NEXT: .LBB0_10: # %for.cond1.preheader.us
|
||||
; RV32-NEXT: # =>This Loop Header: Depth=1
|
||||
; RV32-NEXT: # Child Loop BB0_13 Depth 2
|
||||
; RV32-NEXT: # Child Loop BB0_15 Depth 2
|
||||
; RV32-NEXT: beqz t1, .LBB0_12
|
||||
; RV32-NEXT: # %bb.11: # in Loop: Header=BB0_10 Depth=1
|
||||
; RV32-NEXT: li t4, 0
|
||||
; RV32-NEXT: li t3, 0
|
||||
; RV32-NEXT: j .LBB0_15
|
||||
; RV32-NEXT: .LBB0_12: # %vector.ph
|
||||
; RV32-NEXT: # in Loop: Header=BB0_10 Depth=1
|
||||
; RV32-NEXT: li t3, 0
|
||||
; RV32-NEXT: neg t4, t2
|
||||
; RV32-NEXT: and t4, t4, a6
|
||||
; RV32-NEXT: csrwi vxrm, 0
|
||||
; RV32-NEXT: li t6, 0
|
||||
; RV32-NEXT: li t5, 0
|
||||
; RV32-NEXT: vsetvli s0, zero, e8, m2, ta, ma
|
||||
; RV32-NEXT: .LBB0_13: # %vector.body
|
||||
; RV32-NEXT: # Parent Loop BB0_10 Depth=1
|
||||
; RV32-NEXT: # => This Inner Loop Header: Depth=2
|
||||
; RV32-NEXT: add s0, a2, t6
|
||||
; RV32-NEXT: add s1, a4, t6
|
||||
; RV32-NEXT: vl2r.v v8, (s0)
|
||||
; RV32-NEXT: add s0, a0, t6
|
||||
; RV32-NEXT: vl2r.v v10, (s1)
|
||||
; RV32-NEXT: add s1, t6, t2
|
||||
; RV32-NEXT: sltu t6, s1, t6
|
||||
; RV32-NEXT: add t5, t5, t6
|
||||
; RV32-NEXT: xor t6, s1, t4
|
||||
; RV32-NEXT: vaaddu.vv v8, v8, v10
|
||||
; RV32-NEXT: or s2, t6, t5
|
||||
; RV32-NEXT: vs2r.v v8, (s0)
|
||||
; RV32-NEXT: mv t6, s1
|
||||
; RV32-NEXT: bnez s2, .LBB0_13
|
||||
; RV32-NEXT: # %bb.14: # %middle.block
|
||||
; RV32-NEXT: # in Loop: Header=BB0_10 Depth=1
|
||||
; RV32-NEXT: beq t4, a6, .LBB0_9
|
||||
; RV32-NEXT: .LBB0_15: # %for.body4.us
|
||||
; RV32-NEXT: # Parent Loop BB0_10 Depth=1
|
||||
; RV32-NEXT: # => This Inner Loop Header: Depth=2
|
||||
; RV32-NEXT: add t5, a2, t4
|
||||
; RV32-NEXT: add t6, a4, t4
|
||||
; RV32-NEXT: add s0, a0, t4
|
||||
; RV32-NEXT: lbu t5, 0(t5)
|
||||
; RV32-NEXT: lbu t6, 0(t6)
|
||||
; RV32-NEXT: addi t4, t4, 1
|
||||
; RV32-NEXT: seqz s1, t4
|
||||
; RV32-NEXT: add t3, t3, s1
|
||||
; RV32-NEXT: add t5, t5, t6
|
||||
; RV32-NEXT: xor t6, t4, a6
|
||||
; RV32-NEXT: addi t5, t5, 1
|
||||
; RV32-NEXT: srli t5, t5, 1
|
||||
; RV32-NEXT: or t6, t6, t3
|
||||
; RV32-NEXT: sb t5, 0(s0)
|
||||
; RV32-NEXT: bnez t6, .LBB0_15
|
||||
; RV32-NEXT: j .LBB0_9
|
||||
; RV32-NEXT: .LBB0_16:
|
||||
; RV32-NEXT: lw s0, 12(sp) # 4-byte Folded Reload
|
||||
; RV32-NEXT: lw s1, 8(sp) # 4-byte Folded Reload
|
||||
; RV32-NEXT: lw s2, 4(sp) # 4-byte Folded Reload
|
||||
; RV32-NEXT: .cfi_restore s0
|
||||
; RV32-NEXT: .cfi_restore s1
|
||||
; RV32-NEXT: .cfi_restore s2
|
||||
; RV32-NEXT: addi sp, sp, 16
|
||||
; RV32-NEXT: .cfi_def_cfa_offset 0
|
||||
; RV32-NEXT: .LBB0_17: # %for.cond.cleanup
|
||||
; RV32-NEXT: ret
|
||||
;
|
||||
; RV64P670-LABEL: test1:
|
||||
; RV64P670: # %bb.0: # %entry
|
||||
; RV64P670-NEXT: csrwi vxrm, 0
|
||||
; RV64P670-NEXT: blez a7, .LBB0_12
|
||||
; RV64P670-NEXT: # %bb.1: # %for.cond1.preheader.lr.ph
|
||||
; RV64P670-NEXT: blez a6, .LBB0_12
|
||||
; RV64P670-NEXT: # %bb.2: # %for.cond1.preheader.us.preheader
|
||||
; RV64P670-NEXT: addi sp, sp, -48
|
||||
; RV64P670-NEXT: .cfi_def_cfa_offset 48
|
||||
; RV64P670-NEXT: sd s0, 40(sp) # 8-byte Folded Spill
|
||||
; RV64P670-NEXT: sd s1, 32(sp) # 8-byte Folded Spill
|
||||
; RV64P670-NEXT: sd s2, 24(sp) # 8-byte Folded Spill
|
||||
; RV64P670-NEXT: sd s3, 16(sp) # 8-byte Folded Spill
|
||||
; RV64P670-NEXT: sd s4, 8(sp) # 8-byte Folded Spill
|
||||
; RV64P670-NEXT: .cfi_offset s0, -8
|
||||
; RV64P670-NEXT: .cfi_offset s1, -16
|
||||
; RV64P670-NEXT: .cfi_offset s2, -24
|
||||
; RV64P670-NEXT: .cfi_offset s3, -32
|
||||
; RV64P670-NEXT: .cfi_offset s4, -40
|
||||
; RV64P670-NEXT: addi s1, a7, -1
|
||||
; RV64P670-NEXT: add s0, a0, a6
|
||||
; RV64P670-NEXT: li t0, 0
|
||||
; RV64P670-NEXT: li t1, 0
|
||||
; RV64P670-NEXT: zext.w s1, s1
|
||||
; RV64P670-NEXT: mul t2, a1, s1
|
||||
; RV64P670-NEXT: add t4, s0, t2
|
||||
; RV64P670-NEXT: mul t2, a3, s1
|
||||
; RV64P670-NEXT: add s0, a2, a6
|
||||
; RV64P670-NEXT: mul s1, a5, s1
|
||||
; RV64P670-NEXT: add t3, s0, t2
|
||||
; RV64P670-NEXT: add s0, a4, a6
|
||||
; RV64P670-NEXT: csrr t2, vlenb
|
||||
; RV64P670-NEXT: add t5, s0, s1
|
||||
; RV64P670-NEXT: sltu s1, a0, t3
|
||||
; RV64P670-NEXT: sltu s0, a2, t4
|
||||
; RV64P670-NEXT: slli t3, t2, 1
|
||||
; RV64P670-NEXT: and s0, s0, s1
|
||||
; RV64P670-NEXT: or s1, a1, a3
|
||||
; RV64P670-NEXT: slti s1, s1, 0
|
||||
; RV64P670-NEXT: or t6, s0, s1
|
||||
; RV64P670-NEXT: sltu s1, a0, t5
|
||||
; RV64P670-NEXT: sltu s0, a4, t4
|
||||
; RV64P670-NEXT: mv t5, a0
|
||||
; RV64P670-NEXT: and s0, s0, s1
|
||||
; RV64P670-NEXT: or s1, a1, a5
|
||||
; RV64P670-NEXT: slti s1, s1, 0
|
||||
; RV64P670-NEXT: or s0, s0, s1
|
||||
; RV64P670-NEXT: li s1, 32
|
||||
; RV64P670-NEXT: maxu s1, t3, s1
|
||||
; RV64P670-NEXT: or s0, t6, s0
|
||||
; RV64P670-NEXT: sltu s1, a6, s1
|
||||
; RV64P670-NEXT: or s0, s0, s1
|
||||
; RV64P670-NEXT: andi t4, s0, 1
|
||||
; RV64P670-NEXT: j .LBB0_4
|
||||
; RV64P670-NEXT: .LBB0_3: # %for.cond1.for.cond.cleanup3_crit_edge.us
|
||||
; RV64P670-NEXT: # in Loop: Header=BB0_4 Depth=1
|
||||
; RV64P670-NEXT: add t5, t5, a1
|
||||
; RV64P670-NEXT: add a2, a2, a3
|
||||
; RV64P670-NEXT: add a4, a4, a5
|
||||
; RV64P670-NEXT: addiw t1, t1, 1
|
||||
; RV64P670-NEXT: addi t0, t0, 1
|
||||
; RV64P670-NEXT: beq t1, a7, .LBB0_11
|
||||
; RV64P670-NEXT: .LBB0_4: # %for.cond1.preheader.us
|
||||
; RV64P670-NEXT: # =>This Loop Header: Depth=1
|
||||
; RV64P670-NEXT: # Child Loop BB0_7 Depth 2
|
||||
; RV64P670-NEXT: # Child Loop BB0_10 Depth 2
|
||||
; RV64P670-NEXT: beqz t4, .LBB0_6
|
||||
; RV64P670-NEXT: # %bb.5: # in Loop: Header=BB0_4 Depth=1
|
||||
; RV64P670-NEXT: li t6, 0
|
||||
; RV64P670-NEXT: j .LBB0_9
|
||||
; RV64P670-NEXT: .LBB0_6: # %vector.ph
|
||||
; RV64P670-NEXT: # in Loop: Header=BB0_4 Depth=1
|
||||
; RV64P670-NEXT: slli s1, t2, 28
|
||||
; RV64P670-NEXT: mv s2, a2
|
||||
; RV64P670-NEXT: mv s3, a4
|
||||
; RV64P670-NEXT: mv s4, t5
|
||||
; RV64P670-NEXT: sub s1, s1, t3
|
||||
; RV64P670-NEXT: vsetvli s0, zero, e8, m2, ta, ma
|
||||
; RV64P670-NEXT: and t6, s1, a6
|
||||
; RV64P670-NEXT: mv s1, t6
|
||||
; RV64P670-NEXT: .LBB0_7: # %vector.body
|
||||
; RV64P670-NEXT: # Parent Loop BB0_4 Depth=1
|
||||
; RV64P670-NEXT: # => This Inner Loop Header: Depth=2
|
||||
; RV64P670-NEXT: vl2r.v v8, (s2)
|
||||
; RV64P670-NEXT: sub s1, s1, t3
|
||||
; RV64P670-NEXT: add s2, s2, t3
|
||||
; RV64P670-NEXT: vl2r.v v10, (s3)
|
||||
; RV64P670-NEXT: add s3, s3, t3
|
||||
; RV64P670-NEXT: vaaddu.vv v8, v8, v10
|
||||
; RV64P670-NEXT: vs2r.v v8, (s4)
|
||||
; RV64P670-NEXT: add s4, s4, t3
|
||||
; RV64P670-NEXT: bnez s1, .LBB0_7
|
||||
; RV64P670-NEXT: # %bb.8: # %middle.block
|
||||
; RV64P670-NEXT: # in Loop: Header=BB0_4 Depth=1
|
||||
; RV64P670-NEXT: beq t6, a6, .LBB0_3
|
||||
; RV64P670-NEXT: .LBB0_9: # %for.body4.us.preheader
|
||||
; RV64P670-NEXT: # in Loop: Header=BB0_4 Depth=1
|
||||
; RV64P670-NEXT: mul s2, a1, t0
|
||||
; RV64P670-NEXT: add s0, a0, a6
|
||||
; RV64P670-NEXT: add s1, t5, t6
|
||||
; RV64P670-NEXT: add s4, a4, t6
|
||||
; RV64P670-NEXT: add t6, t6, a2
|
||||
; RV64P670-NEXT: add s2, s2, s0
|
||||
; RV64P670-NEXT: .LBB0_10: # %for.body4.us
|
||||
; RV64P670-NEXT: # Parent Loop BB0_4 Depth=1
|
||||
; RV64P670-NEXT: # => This Inner Loop Header: Depth=2
|
||||
; RV64P670-NEXT: lbu s3, 0(t6)
|
||||
; RV64P670-NEXT: lbu s0, 0(s4)
|
||||
; RV64P670-NEXT: addi s4, s4, 1
|
||||
; RV64P670-NEXT: addi t6, t6, 1
|
||||
; RV64P670-NEXT: add s0, s0, s3
|
||||
; RV64P670-NEXT: addi s0, s0, 1
|
||||
; RV64P670-NEXT: srli s0, s0, 1
|
||||
; RV64P670-NEXT: sb s0, 0(s1)
|
||||
; RV64P670-NEXT: addi s1, s1, 1
|
||||
; RV64P670-NEXT: bne s1, s2, .LBB0_10
|
||||
; RV64P670-NEXT: j .LBB0_3
|
||||
; RV64P670-NEXT: .LBB0_11:
|
||||
; RV64P670-NEXT: ld s0, 40(sp) # 8-byte Folded Reload
|
||||
; RV64P670-NEXT: ld s1, 32(sp) # 8-byte Folded Reload
|
||||
; RV64P670-NEXT: ld s2, 24(sp) # 8-byte Folded Reload
|
||||
; RV64P670-NEXT: ld s3, 16(sp) # 8-byte Folded Reload
|
||||
; RV64P670-NEXT: ld s4, 8(sp) # 8-byte Folded Reload
|
||||
; RV64P670-NEXT: .cfi_restore s0
|
||||
; RV64P670-NEXT: .cfi_restore s1
|
||||
; RV64P670-NEXT: .cfi_restore s2
|
||||
; RV64P670-NEXT: .cfi_restore s3
|
||||
; RV64P670-NEXT: .cfi_restore s4
|
||||
; RV64P670-NEXT: addi sp, sp, 48
|
||||
; RV64P670-NEXT: .cfi_def_cfa_offset 0
|
||||
; RV64P670-NEXT: .LBB0_12: # %for.cond.cleanup
|
||||
; RV64P670-NEXT: ret
|
||||
;
|
||||
; RV64X60-LABEL: test1:
|
||||
; RV64X60: # %bb.0: # %entry
|
||||
; RV64X60-NEXT: csrwi vxrm, 0
|
||||
; RV64X60-NEXT: blez a7, .LBB0_12
|
||||
; RV64X60-NEXT: # %bb.1: # %for.cond1.preheader.lr.ph
|
||||
; RV64X60-NEXT: blez a6, .LBB0_12
|
||||
; RV64X60-NEXT: # %bb.2: # %for.cond1.preheader.us.preheader
|
||||
; RV64X60-NEXT: addi sp, sp, -48
|
||||
; RV64X60-NEXT: .cfi_def_cfa_offset 48
|
||||
; RV64X60-NEXT: sd s0, 40(sp) # 8-byte Folded Spill
|
||||
; RV64X60-NEXT: sd s1, 32(sp) # 8-byte Folded Spill
|
||||
; RV64X60-NEXT: sd s2, 24(sp) # 8-byte Folded Spill
|
||||
; RV64X60-NEXT: sd s3, 16(sp) # 8-byte Folded Spill
|
||||
; RV64X60-NEXT: sd s4, 8(sp) # 8-byte Folded Spill
|
||||
; RV64X60-NEXT: .cfi_offset s0, -8
|
||||
; RV64X60-NEXT: .cfi_offset s1, -16
|
||||
; RV64X60-NEXT: .cfi_offset s2, -24
|
||||
; RV64X60-NEXT: .cfi_offset s3, -32
|
||||
; RV64X60-NEXT: .cfi_offset s4, -40
|
||||
; RV64X60-NEXT: li t0, 0
|
||||
; RV64X60-NEXT: li t1, 0
|
||||
; RV64X60-NEXT: addi t2, a7, -1
|
||||
; RV64X60-NEXT: add t4, a0, a6
|
||||
; RV64X60-NEXT: add t5, a2, a6
|
||||
; RV64X60-NEXT: add t3, a4, a6
|
||||
; RV64X60-NEXT: zext.w s0, t2
|
||||
; RV64X60-NEXT: mul s1, a1, s0
|
||||
; RV64X60-NEXT: add t4, t4, s1
|
||||
; RV64X60-NEXT: mul s1, a3, s0
|
||||
; RV64X60-NEXT: add t5, t5, s1
|
||||
; RV64X60-NEXT: csrr t2, vlenb
|
||||
; RV64X60-NEXT: mul s1, a5, s0
|
||||
; RV64X60-NEXT: add t3, t3, s1
|
||||
; RV64X60-NEXT: sltu s1, a0, t5
|
||||
; RV64X60-NEXT: sltu s0, a2, t4
|
||||
; RV64X60-NEXT: and t6, s1, s0
|
||||
; RV64X60-NEXT: li t5, 32
|
||||
; RV64X60-NEXT: sltu s1, a0, t3
|
||||
; RV64X60-NEXT: sltu s0, a4, t4
|
||||
; RV64X60-NEXT: and t3, s1, s0
|
||||
; RV64X60-NEXT: or s1, a1, a3
|
||||
; RV64X60-NEXT: slti s1, s1, 0
|
||||
; RV64X60-NEXT: or t4, t6, s1
|
||||
; RV64X60-NEXT: or s0, a1, a5
|
||||
; RV64X60-NEXT: slti s0, s0, 0
|
||||
; RV64X60-NEXT: or s0, t3, s0
|
||||
; RV64X60-NEXT: slli t3, t2, 1
|
||||
; RV64X60-NEXT: maxu s1, t3, t5
|
||||
; RV64X60-NEXT: or s0, t4, s0
|
||||
; RV64X60-NEXT: sltu s1, a6, s1
|
||||
; RV64X60-NEXT: or s0, s0, s1
|
||||
; RV64X60-NEXT: andi t4, s0, 1
|
||||
; RV64X60-NEXT: mv t5, a0
|
||||
; RV64X60-NEXT: j .LBB0_4
|
||||
; RV64X60-NEXT: .LBB0_3: # %for.cond1.for.cond.cleanup3_crit_edge.us
|
||||
; RV64X60-NEXT: # in Loop: Header=BB0_4 Depth=1
|
||||
; RV64X60-NEXT: add t5, t5, a1
|
||||
; RV64X60-NEXT: add a2, a2, a3
|
||||
; RV64X60-NEXT: add a4, a4, a5
|
||||
; RV64X60-NEXT: addiw t1, t1, 1
|
||||
; RV64X60-NEXT: addi t0, t0, 1
|
||||
; RV64X60-NEXT: beq t1, a7, .LBB0_11
|
||||
; RV64X60-NEXT: .LBB0_4: # %for.cond1.preheader.us
|
||||
; RV64X60-NEXT: # =>This Loop Header: Depth=1
|
||||
; RV64X60-NEXT: # Child Loop BB0_7 Depth 2
|
||||
; RV64X60-NEXT: # Child Loop BB0_10 Depth 2
|
||||
; RV64X60-NEXT: beqz t4, .LBB0_6
|
||||
; RV64X60-NEXT: # %bb.5: # in Loop: Header=BB0_4 Depth=1
|
||||
; RV64X60-NEXT: li t6, 0
|
||||
; RV64X60-NEXT: j .LBB0_9
|
||||
; RV64X60-NEXT: .LBB0_6: # %vector.ph
|
||||
; RV64X60-NEXT: # in Loop: Header=BB0_4 Depth=1
|
||||
; RV64X60-NEXT: slli s1, t2, 28
|
||||
; RV64X60-NEXT: sub s1, s1, t3
|
||||
; RV64X60-NEXT: and t6, s1, a6
|
||||
; RV64X60-NEXT: mv s2, a2
|
||||
; RV64X60-NEXT: mv s3, a4
|
||||
; RV64X60-NEXT: mv s4, t5
|
||||
; RV64X60-NEXT: mv s1, t6
|
||||
; RV64X60-NEXT: vsetvli s0, zero, e8, m2, ta, ma
|
||||
; RV64X60-NEXT: .LBB0_7: # %vector.body
|
||||
; RV64X60-NEXT: # Parent Loop BB0_4 Depth=1
|
||||
; RV64X60-NEXT: # => This Inner Loop Header: Depth=2
|
||||
; RV64X60-NEXT: vl2r.v v8, (s2)
|
||||
; RV64X60-NEXT: vl2r.v v10, (s3)
|
||||
; RV64X60-NEXT: sub s1, s1, t3
|
||||
; RV64X60-NEXT: add s3, s3, t3
|
||||
; RV64X60-NEXT: vaaddu.vv v8, v8, v10
|
||||
; RV64X60-NEXT: vs2r.v v8, (s4)
|
||||
; RV64X60-NEXT: add s4, s4, t3
|
||||
; RV64X60-NEXT: add s2, s2, t3
|
||||
; RV64X60-NEXT: bnez s1, .LBB0_7
|
||||
; RV64X60-NEXT: # %bb.8: # %middle.block
|
||||
; RV64X60-NEXT: # in Loop: Header=BB0_4 Depth=1
|
||||
; RV64X60-NEXT: beq t6, a6, .LBB0_3
|
||||
; RV64X60-NEXT: .LBB0_9: # %for.body4.us.preheader
|
||||
; RV64X60-NEXT: # in Loop: Header=BB0_4 Depth=1
|
||||
; RV64X60-NEXT: mul s2, a1, t0
|
||||
; RV64X60-NEXT: add s1, a0, a6
|
||||
; RV64X60-NEXT: add s0, t5, t6
|
||||
; RV64X60-NEXT: add s2, s2, s1
|
||||
; RV64X60-NEXT: add s4, a4, t6
|
||||
; RV64X60-NEXT: add t6, t6, a2
|
||||
; RV64X60-NEXT: .LBB0_10: # %for.body4.us
|
||||
; RV64X60-NEXT: # Parent Loop BB0_4 Depth=1
|
||||
; RV64X60-NEXT: # => This Inner Loop Header: Depth=2
|
||||
; RV64X60-NEXT: lbu s3, 0(t6)
|
||||
; RV64X60-NEXT: lbu s1, 0(s4)
|
||||
; RV64X60-NEXT: add s1, s1, s3
|
||||
; RV64X60-NEXT: addi s1, s1, 1
|
||||
; RV64X60-NEXT: srli s1, s1, 1
|
||||
; RV64X60-NEXT: sb s1, 0(s0)
|
||||
; RV64X60-NEXT: addi s0, s0, 1
|
||||
; RV64X60-NEXT: addi s4, s4, 1
|
||||
; RV64X60-NEXT: addi t6, t6, 1
|
||||
; RV64X60-NEXT: bne s0, s2, .LBB0_10
|
||||
; RV64X60-NEXT: j .LBB0_3
|
||||
; RV64X60-NEXT: .LBB0_11:
|
||||
; RV64X60-NEXT: ld s0, 40(sp) # 8-byte Folded Reload
|
||||
; RV64X60-NEXT: ld s1, 32(sp) # 8-byte Folded Reload
|
||||
; RV64X60-NEXT: ld s2, 24(sp) # 8-byte Folded Reload
|
||||
; RV64X60-NEXT: ld s3, 16(sp) # 8-byte Folded Reload
|
||||
; RV64X60-NEXT: ld s4, 8(sp) # 8-byte Folded Reload
|
||||
; RV64X60-NEXT: .cfi_restore s0
|
||||
; RV64X60-NEXT: .cfi_restore s1
|
||||
; RV64X60-NEXT: .cfi_restore s2
|
||||
; RV64X60-NEXT: .cfi_restore s3
|
||||
; RV64X60-NEXT: .cfi_restore s4
|
||||
; RV64X60-NEXT: addi sp, sp, 48
|
||||
; RV64X60-NEXT: .cfi_def_cfa_offset 0
|
||||
; RV64X60-NEXT: .LBB0_12: # %for.cond.cleanup
|
||||
; RV64X60-NEXT: ret
|
||||
;
|
||||
; RV64-LABEL: test1:
|
||||
; RV64: # %bb.0: # %entry
|
||||
; RV64-NEXT: blez a7, .LBB0_14
|
||||
; RV64-NEXT: # %bb.1: # %for.cond1.preheader.lr.ph
|
||||
; RV64-NEXT: blez a6, .LBB0_14
|
||||
; RV64-NEXT: # %bb.2: # %for.cond1.preheader.us.preheader
|
||||
; RV64-NEXT: addi sp, sp, -48
|
||||
; RV64-NEXT: .cfi_def_cfa_offset 48
|
||||
; RV64-NEXT: sd s0, 40(sp) # 8-byte Folded Spill
|
||||
; RV64-NEXT: sd s1, 32(sp) # 8-byte Folded Spill
|
||||
; RV64-NEXT: sd s2, 24(sp) # 8-byte Folded Spill
|
||||
; RV64-NEXT: sd s3, 16(sp) # 8-byte Folded Spill
|
||||
; RV64-NEXT: sd s4, 8(sp) # 8-byte Folded Spill
|
||||
; RV64-NEXT: .cfi_offset s0, -8
|
||||
; RV64-NEXT: .cfi_offset s1, -16
|
||||
; RV64-NEXT: .cfi_offset s2, -24
|
||||
; RV64-NEXT: .cfi_offset s3, -32
|
||||
; RV64-NEXT: .cfi_offset s4, -40
|
||||
; RV64-NEXT: addi t1, a7, -1
|
||||
; RV64-NEXT: add t5, a0, a6
|
||||
; RV64-NEXT: add s0, a2, a6
|
||||
; RV64-NEXT: add t6, a4, a6
|
||||
; RV64-NEXT: csrr t0, vlenb
|
||||
; RV64-NEXT: li t2, 32
|
||||
; RV64-NEXT: slli t1, t1, 32
|
||||
; RV64-NEXT: srli t3, t1, 32
|
||||
; RV64-NEXT: mul t1, a1, t3
|
||||
; RV64-NEXT: add t5, t5, t1
|
||||
; RV64-NEXT: mul t1, a3, t3
|
||||
; RV64-NEXT: add s0, s0, t1
|
||||
; RV64-NEXT: slli t1, t0, 1
|
||||
; RV64-NEXT: mul t3, a5, t3
|
||||
; RV64-NEXT: add t6, t6, t3
|
||||
; RV64-NEXT: mv t4, t1
|
||||
; RV64-NEXT: bltu t2, t1, .LBB0_4
|
||||
; RV64-NEXT: # %bb.3: # %for.cond1.preheader.us.preheader
|
||||
; RV64-NEXT: li t4, 32
|
||||
; RV64-NEXT: .LBB0_4: # %for.cond1.preheader.us.preheader
|
||||
; RV64-NEXT: li t2, 0
|
||||
; RV64-NEXT: li t3, 0
|
||||
; RV64-NEXT: sltu s0, a0, s0
|
||||
; RV64-NEXT: sltu s1, a2, t5
|
||||
; RV64-NEXT: and s0, s0, s1
|
||||
; RV64-NEXT: sltu t6, a0, t6
|
||||
; RV64-NEXT: sltu t5, a4, t5
|
||||
; RV64-NEXT: and t5, t6, t5
|
||||
; RV64-NEXT: or t6, a1, a3
|
||||
; RV64-NEXT: slti t6, t6, 0
|
||||
; RV64-NEXT: or t6, s0, t6
|
||||
; RV64-NEXT: or s0, a1, a5
|
||||
; RV64-NEXT: slti s0, s0, 0
|
||||
; RV64-NEXT: or t5, t5, s0
|
||||
; RV64-NEXT: or t5, t6, t5
|
||||
; RV64-NEXT: sltu t4, a6, t4
|
||||
; RV64-NEXT: or t4, t4, t5
|
||||
; RV64-NEXT: andi t4, t4, 1
|
||||
; RV64-NEXT: mv t5, a0
|
||||
; RV64-NEXT: j .LBB0_6
|
||||
; RV64-NEXT: .LBB0_5: # %for.cond1.for.cond.cleanup3_crit_edge.us
|
||||
; RV64-NEXT: # in Loop: Header=BB0_6 Depth=1
|
||||
; RV64-NEXT: add t5, t5, a1
|
||||
; RV64-NEXT: add a2, a2, a3
|
||||
; RV64-NEXT: add a4, a4, a5
|
||||
; RV64-NEXT: addiw t3, t3, 1
|
||||
; RV64-NEXT: addi t2, t2, 1
|
||||
; RV64-NEXT: beq t3, a7, .LBB0_13
|
||||
; RV64-NEXT: .LBB0_6: # %for.cond1.preheader.us
|
||||
; RV64-NEXT: # =>This Loop Header: Depth=1
|
||||
; RV64-NEXT: # Child Loop BB0_9 Depth 2
|
||||
; RV64-NEXT: # Child Loop BB0_12 Depth 2
|
||||
; RV64-NEXT: beqz t4, .LBB0_8
|
||||
; RV64-NEXT: # %bb.7: # in Loop: Header=BB0_6 Depth=1
|
||||
; RV64-NEXT: li t6, 0
|
||||
; RV64-NEXT: j .LBB0_11
|
||||
; RV64-NEXT: .LBB0_8: # %vector.ph
|
||||
; RV64-NEXT: # in Loop: Header=BB0_6 Depth=1
|
||||
; RV64-NEXT: slli t6, t0, 28
|
||||
; RV64-NEXT: sub t6, t6, t1
|
||||
; RV64-NEXT: and t6, t6, a6
|
||||
; RV64-NEXT: csrwi vxrm, 0
|
||||
; RV64-NEXT: mv s0, a2
|
||||
; RV64-NEXT: mv s1, a4
|
||||
; RV64-NEXT: mv s2, t5
|
||||
; RV64-NEXT: mv s3, t6
|
||||
; RV64-NEXT: vsetvli s4, zero, e8, m2, ta, ma
|
||||
; RV64-NEXT: .LBB0_9: # %vector.body
|
||||
; RV64-NEXT: # Parent Loop BB0_6 Depth=1
|
||||
; RV64-NEXT: # => This Inner Loop Header: Depth=2
|
||||
; RV64-NEXT: vl2r.v v8, (s0)
|
||||
; RV64-NEXT: vl2r.v v10, (s1)
|
||||
; RV64-NEXT: sub s3, s3, t1
|
||||
; RV64-NEXT: add s1, s1, t1
|
||||
; RV64-NEXT: vaaddu.vv v8, v8, v10
|
||||
; RV64-NEXT: vs2r.v v8, (s2)
|
||||
; RV64-NEXT: add s2, s2, t1
|
||||
; RV64-NEXT: add s0, s0, t1
|
||||
; RV64-NEXT: bnez s3, .LBB0_9
|
||||
; RV64-NEXT: # %bb.10: # %middle.block
|
||||
; RV64-NEXT: # in Loop: Header=BB0_6 Depth=1
|
||||
; RV64-NEXT: beq t6, a6, .LBB0_5
|
||||
; RV64-NEXT: .LBB0_11: # %for.body4.us.preheader
|
||||
; RV64-NEXT: # in Loop: Header=BB0_6 Depth=1
|
||||
; RV64-NEXT: mul s1, a1, t2
|
||||
; RV64-NEXT: add s2, a0, a6
|
||||
; RV64-NEXT: add s0, t5, t6
|
||||
; RV64-NEXT: add s1, s2, s1
|
||||
; RV64-NEXT: add s2, a4, t6
|
||||
; RV64-NEXT: add t6, a2, t6
|
||||
; RV64-NEXT: .LBB0_12: # %for.body4.us
|
||||
; RV64-NEXT: # Parent Loop BB0_6 Depth=1
|
||||
; RV64-NEXT: # => This Inner Loop Header: Depth=2
|
||||
; RV64-NEXT: lbu s3, 0(t6)
|
||||
; RV64-NEXT: lbu s4, 0(s2)
|
||||
; RV64-NEXT: add s3, s3, s4
|
||||
; RV64-NEXT: addi s3, s3, 1
|
||||
; RV64-NEXT: srli s3, s3, 1
|
||||
; RV64-NEXT: sb s3, 0(s0)
|
||||
; RV64-NEXT: addi s0, s0, 1
|
||||
; RV64-NEXT: addi s2, s2, 1
|
||||
; RV64-NEXT: addi t6, t6, 1
|
||||
; RV64-NEXT: bne s0, s1, .LBB0_12
|
||||
; RV64-NEXT: j .LBB0_5
|
||||
; RV64-NEXT: .LBB0_13:
|
||||
; RV64-NEXT: ld s0, 40(sp) # 8-byte Folded Reload
|
||||
; RV64-NEXT: ld s1, 32(sp) # 8-byte Folded Reload
|
||||
; RV64-NEXT: ld s2, 24(sp) # 8-byte Folded Reload
|
||||
; RV64-NEXT: ld s3, 16(sp) # 8-byte Folded Reload
|
||||
; RV64-NEXT: ld s4, 8(sp) # 8-byte Folded Reload
|
||||
; RV64-NEXT: .cfi_restore s0
|
||||
; RV64-NEXT: .cfi_restore s1
|
||||
; RV64-NEXT: .cfi_restore s2
|
||||
; RV64-NEXT: .cfi_restore s3
|
||||
; RV64-NEXT: .cfi_restore s4
|
||||
; RV64-NEXT: addi sp, sp, 48
|
||||
; RV64-NEXT: .cfi_def_cfa_offset 0
|
||||
; RV64-NEXT: .LBB0_14: # %for.cond.cleanup
|
||||
; RV64-NEXT: ret
|
||||
entry:
|
||||
%cmp29 = icmp sgt i32 %i_height, 0
|
||||
br i1 %cmp29, label %for.cond1.preheader.lr.ph, label %for.cond.cleanup
|
||||
|
||||
for.cond1.preheader.lr.ph: ; preds = %entry
|
||||
%cmp227 = icmp sgt i32 %i_width, 0
|
||||
%idx.ext = sext i32 %i_dst_stride to i64
|
||||
%idx.ext12 = sext i32 %i_src1_stride to i64
|
||||
%idx.ext14 = sext i32 %i_src2_stride to i64
|
||||
br i1 %cmp227, label %for.cond1.preheader.us.preheader, label %for.cond.cleanup
|
||||
|
||||
for.cond1.preheader.us.preheader: ; preds = %for.cond1.preheader.lr.ph
|
||||
%wide.trip.count = zext nneg i32 %i_width to i64
|
||||
%0 = add nsw i32 %i_height, -1
|
||||
%1 = zext i32 %0 to i64
|
||||
%2 = mul nsw i64 %idx.ext, %1
|
||||
%3 = getelementptr i8, ptr %dst, i64 %2
|
||||
%scevgep = getelementptr i8, ptr %3, i64 %wide.trip.count
|
||||
%4 = mul nsw i64 %idx.ext12, %1
|
||||
%5 = getelementptr i8, ptr %src1, i64 %4
|
||||
%scevgep36 = getelementptr i8, ptr %5, i64 %wide.trip.count
|
||||
%6 = mul nsw i64 %idx.ext14, %1
|
||||
%7 = getelementptr i8, ptr %src2, i64 %6
|
||||
%scevgep37 = getelementptr i8, ptr %7, i64 %wide.trip.count
|
||||
%8 = tail call i64 @llvm.vscale.i64()
|
||||
%9 = shl nuw nsw i64 %8, 4
|
||||
%10 = tail call i64 @llvm.umax.i64(i64 %9, i64 32)
|
||||
%min.iters.check = icmp ugt i64 %10, %wide.trip.count
|
||||
%bound0 = icmp ult ptr %dst, %scevgep36
|
||||
%bound1 = icmp ult ptr %src1, %scevgep
|
||||
%found.conflict = and i1 %bound0, %bound1
|
||||
%11 = or i32 %i_dst_stride, %i_src1_stride
|
||||
%12 = icmp slt i32 %11, 0
|
||||
%13 = or i1 %found.conflict, %12
|
||||
%bound039 = icmp ult ptr %dst, %scevgep37
|
||||
%bound140 = icmp ult ptr %src2, %scevgep
|
||||
%found.conflict41 = and i1 %bound039, %bound140
|
||||
%14 = or i32 %i_dst_stride, %i_src2_stride
|
||||
%15 = icmp slt i32 %14, 0
|
||||
%16 = or i1 %found.conflict41, %15
|
||||
%conflict.rdx = or i1 %13, %16
|
||||
br label %for.cond1.preheader.us
|
||||
|
||||
for.cond1.preheader.us: ; preds = %for.cond1.preheader.us.preheader, %for.cond1.for.cond.cleanup3_crit_edge.us
|
||||
%y.033.us = phi i32 [ %inc17.us, %for.cond1.for.cond.cleanup3_crit_edge.us ], [ 0, %for.cond1.preheader.us.preheader ]
|
||||
%dst.addr.032.us = phi ptr [ %add.ptr.us, %for.cond1.for.cond.cleanup3_crit_edge.us ], [ %dst, %for.cond1.preheader.us.preheader ]
|
||||
%src1.addr.031.us = phi ptr [ %add.ptr13.us, %for.cond1.for.cond.cleanup3_crit_edge.us ], [ %src1, %for.cond1.preheader.us.preheader ]
|
||||
%src2.addr.030.us = phi ptr [ %add.ptr15.us, %for.cond1.for.cond.cleanup3_crit_edge.us ], [ %src2, %for.cond1.preheader.us.preheader ]
|
||||
%brmerge = select i1 %min.iters.check, i1 true, i1 %conflict.rdx
|
||||
br i1 %brmerge, label %for.body4.us.preheader, label %vector.ph
|
||||
|
||||
vector.ph: ; preds = %for.cond1.preheader.us
|
||||
%17 = tail call i64 @llvm.vscale.i64()
|
||||
%.neg = mul nuw nsw i64 %17, 2147483632
|
||||
%n.vec = and i64 %.neg, %wide.trip.count
|
||||
%18 = tail call i64 @llvm.vscale.i64()
|
||||
%19 = shl nuw nsw i64 %18, 4
|
||||
br label %vector.body
|
||||
|
||||
vector.body: ; preds = %vector.body, %vector.ph
|
||||
%index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
|
||||
%20 = getelementptr inbounds i8, ptr %src1.addr.031.us, i64 %index
|
||||
%wide.load = load <vscale x 16 x i8>, ptr %20, align 1
|
||||
%21 = zext <vscale x 16 x i8> %wide.load to <vscale x 16 x i16>
|
||||
%22 = getelementptr inbounds i8, ptr %src2.addr.030.us, i64 %index
|
||||
%wide.load44 = load <vscale x 16 x i8>, ptr %22, align 1
|
||||
%23 = zext <vscale x 16 x i8> %wide.load44 to <vscale x 16 x i16>
|
||||
%24 = add nuw nsw <vscale x 16 x i16> %21, shufflevector (<vscale x 16 x i16> insertelement (<vscale x 16 x i16> poison, i16 1, i64 0), <vscale x 16 x i16> poison, <vscale x 16 x i32> zeroinitializer)
|
||||
%25 = add nuw nsw <vscale x 16 x i16> %24, %23
|
||||
%26 = lshr <vscale x 16 x i16> %25, shufflevector (<vscale x 16 x i16> insertelement (<vscale x 16 x i16> poison, i16 1, i64 0), <vscale x 16 x i16> poison, <vscale x 16 x i32> zeroinitializer)
|
||||
%27 = trunc <vscale x 16 x i16> %26 to <vscale x 16 x i8>
|
||||
%28 = getelementptr inbounds i8, ptr %dst.addr.032.us, i64 %index
|
||||
store <vscale x 16 x i8> %27, ptr %28, align 1
|
||||
%index.next = add nuw i64 %index, %19
|
||||
%29 = icmp eq i64 %index.next, %n.vec
|
||||
br i1 %29, label %middle.block, label %vector.body
|
||||
|
||||
middle.block: ; preds = %vector.body
|
||||
%cmp.n = icmp eq i64 %n.vec, %wide.trip.count
|
||||
br i1 %cmp.n, label %for.cond1.for.cond.cleanup3_crit_edge.us, label %for.body4.us.preheader
|
||||
|
||||
for.body4.us.preheader: ; preds = %for.cond1.preheader.us, %middle.block
|
||||
%indvars.iv.ph = phi i64 [ 0, %for.cond1.preheader.us ], [ %n.vec, %middle.block ]
|
||||
br label %for.body4.us
|
||||
|
||||
for.body4.us: ; preds = %for.body4.us.preheader, %for.body4.us
|
||||
%indvars.iv = phi i64 [ %indvars.iv.next, %for.body4.us ], [ %indvars.iv.ph, %for.body4.us.preheader ]
|
||||
%arrayidx.us = getelementptr inbounds i8, ptr %src1.addr.031.us, i64 %indvars.iv
|
||||
%30 = load i8, ptr %arrayidx.us, align 1
|
||||
%conv.us = zext i8 %30 to i16
|
||||
%arrayidx6.us = getelementptr inbounds i8, ptr %src2.addr.030.us, i64 %indvars.iv
|
||||
%31 = load i8, ptr %arrayidx6.us, align 1
|
||||
%conv7.us = zext i8 %31 to i16
|
||||
%add.us = add nuw nsw i16 %conv.us, 1
|
||||
%add8.us = add nuw nsw i16 %add.us, %conv7.us
|
||||
%shr.us = lshr i16 %add8.us, 1
|
||||
%conv9.us = trunc nuw i16 %shr.us to i8
|
||||
%arrayidx11.us = getelementptr inbounds i8, ptr %dst.addr.032.us, i64 %indvars.iv
|
||||
store i8 %conv9.us, ptr %arrayidx11.us, align 1
|
||||
%indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
|
||||
%exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
|
||||
br i1 %exitcond.not, label %for.cond1.for.cond.cleanup3_crit_edge.us, label %for.body4.us
|
||||
|
||||
for.cond1.for.cond.cleanup3_crit_edge.us: ; preds = %for.body4.us, %middle.block
|
||||
%add.ptr.us = getelementptr inbounds i8, ptr %dst.addr.032.us, i64 %idx.ext
|
||||
%add.ptr13.us = getelementptr inbounds i8, ptr %src1.addr.031.us, i64 %idx.ext12
|
||||
%add.ptr15.us = getelementptr inbounds i8, ptr %src2.addr.030.us, i64 %idx.ext14
|
||||
%inc17.us = add nuw nsw i32 %y.033.us, 1
|
||||
%exitcond35.not = icmp eq i32 %inc17.us, %i_height
|
||||
br i1 %exitcond35.not, label %for.cond.cleanup, label %for.cond1.preheader.us
|
||||
|
||||
for.cond.cleanup: ; preds = %for.cond1.for.cond.cleanup3_crit_edge.us, %for.cond1.preheader.lr.ph, %entry
|
||||
ret void
|
||||
}