mirror of
https://github.com/llvm/llvm-project.git
synced 2025-04-29 08:26:07 +00:00
[Utils][UnifyLoopExits] Avoid costly updates if nothing changed (#129179)
If the ControlFlowHub did not make any change to the control flow, there is no need to repair SSA, update the loop structure, and verify a bunch of things. This is not completely NFC, though: restoreSSA used to introduce PHI nodes with a single entry, and those are now missing when the CFG is left unchanged. My code went from 400+ seconds to 1 second, since no loop required its exits to be unified, but there were many "complex" loops.
This commit is contained in:
parent
926600a805
commit
992b451f08
@ -110,7 +110,9 @@ struct ControlFlowHub {
|
||||
Branches.emplace_back(BB, Succ0, Succ1);
|
||||
}
|
||||
|
||||
BasicBlock *
|
||||
/// Return the unified loop exit block and a flag indicating if the CFG was
|
||||
/// changed at all.
|
||||
std::pair<BasicBlock *, bool>
|
||||
finalize(DomTreeUpdater *DTU, SmallVectorImpl<BasicBlock *> &GuardBlocks,
|
||||
const StringRef Prefix,
|
||||
std::optional<unsigned> MaxControlFlowBooleans = std::nullopt);
|
||||
|
@ -270,7 +270,7 @@ static void reconnectPhis(BasicBlock *Out, BasicBlock *GuardBlock,
|
||||
}
|
||||
}
|
||||
|
||||
BasicBlock *ControlFlowHub::finalize(
|
||||
std::pair<BasicBlock *, bool> ControlFlowHub::finalize(
|
||||
DomTreeUpdater *DTU, SmallVectorImpl<BasicBlock *> &GuardBlocks,
|
||||
const StringRef Prefix, std::optional<unsigned> MaxControlFlowBooleans) {
|
||||
#ifndef NDEBUG
|
||||
@ -289,7 +289,7 @@ BasicBlock *ControlFlowHub::finalize(
|
||||
}
|
||||
|
||||
if (Outgoing.size() < 2)
|
||||
return Outgoing.front();
|
||||
return {Outgoing.front(), false};
|
||||
|
||||
SmallVector<DominatorTree::UpdateType, 16> Updates;
|
||||
if (DTU) {
|
||||
@ -338,5 +338,5 @@ BasicBlock *ControlFlowHub::finalize(
|
||||
Inst->eraseFromParent();
|
||||
}
|
||||
|
||||
return FirstGuardBlock;
|
||||
return {FirstGuardBlock, true};
|
||||
}
|
||||
|
@ -169,8 +169,12 @@ static bool unifyLoopExits(DominatorTree &DT, LoopInfo &LI, Loop *L) {
|
||||
|
||||
SmallVector<BasicBlock *, 8> GuardBlocks;
|
||||
DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Eager);
|
||||
BasicBlock *LoopExitBlock = CHub.finalize(
|
||||
BasicBlock *LoopExitBlock;
|
||||
bool ChangedCFG;
|
||||
std::tie(LoopExitBlock, ChangedCFG) = CHub.finalize(
|
||||
&DTU, GuardBlocks, "loop.exit", MaxBooleansInControlFlowHub.getValue());
|
||||
if (!ChangedCFG)
|
||||
return false;
|
||||
|
||||
restoreSSA(DT, L, ExitingBlocks, LoopExitBlock);
|
||||
|
||||
|
@ -298,7 +298,7 @@ define void @divergent_i1_icmp_used_outside_loop(i32 %v0, i32 %v1, ptr addrspace
|
||||
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX10-NEXT: s_mov_b32 s5, 0
|
||||
; GFX10-NEXT: ; implicit-def: $sgpr6
|
||||
; GFX10-NEXT: v_mov_b32_e32 v4, s5
|
||||
; GFX10-NEXT: v_mov_b32_e32 v5, s5
|
||||
; GFX10-NEXT: s_branch .LBB4_2
|
||||
; GFX10-NEXT: .LBB4_1: ; %Flow
|
||||
; GFX10-NEXT: ; in Loop: Header=BB4_2 Depth=1
|
||||
@ -312,6 +312,7 @@ define void @divergent_i1_icmp_used_outside_loop(i32 %v0, i32 %v1, ptr addrspace
|
||||
; GFX10-NEXT: s_cbranch_execz .LBB4_6
|
||||
; GFX10-NEXT: .LBB4_2: ; %cond.block.0
|
||||
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX10-NEXT: v_mov_b32_e32 v4, v5
|
||||
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v4
|
||||
; GFX10-NEXT: s_and_saveexec_b32 s7, vcc_lo
|
||||
; GFX10-NEXT: s_cbranch_execz .LBB4_4
|
||||
@ -328,11 +329,12 @@ define void @divergent_i1_icmp_used_outside_loop(i32 %v0, i32 %v1, ptr addrspace
|
||||
; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s7
|
||||
; GFX10-NEXT: v_cmp_ne_u32_e64 s4, v1, v4
|
||||
; GFX10-NEXT: s_mov_b32 s7, -1
|
||||
; GFX10-NEXT: ; implicit-def: $vgpr5
|
||||
; GFX10-NEXT: s_and_saveexec_b32 s8, s4
|
||||
; GFX10-NEXT: s_cbranch_execz .LBB4_1
|
||||
; GFX10-NEXT: ; %bb.5: ; %loop.cond
|
||||
; GFX10-NEXT: ; in Loop: Header=BB4_2 Depth=1
|
||||
; GFX10-NEXT: v_add_nc_u32_e32 v4, 1, v4
|
||||
; GFX10-NEXT: v_add_nc_u32_e32 v5, 1, v4
|
||||
; GFX10-NEXT: s_andn2_b32 s4, -1, exec_lo
|
||||
; GFX10-NEXT: s_and_b32 s7, exec_lo, 0
|
||||
; GFX10-NEXT: s_or_b32 s7, s4, s7
|
||||
|
@ -7646,9 +7646,9 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3)
|
||||
; GFX7-NEXT: s_or_b64 s[8:9], s[0:1], s[8:9]
|
||||
; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9]
|
||||
; GFX7-NEXT: s_cbranch_execnz .LBB28_2
|
||||
; GFX7-NEXT: ; %bb.3: ; %Flow23
|
||||
; GFX7-NEXT: ; %bb.3: ; %Flow22
|
||||
; GFX7-NEXT: s_or_b64 exec, exec, s[8:9]
|
||||
; GFX7-NEXT: .LBB28_4: ; %Flow24
|
||||
; GFX7-NEXT: .LBB28_4: ; %Flow23
|
||||
; GFX7-NEXT: s_or_b64 exec, exec, s[6:7]
|
||||
; GFX7-NEXT: s_mov_b64 s[8:9], exec
|
||||
; GFX7-NEXT: v_readfirstlane_b32 s10, v1
|
||||
@ -7676,7 +7676,7 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3)
|
||||
; GFX7-NEXT: v_mov_b32_e32 v3, v4
|
||||
; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9]
|
||||
; GFX7-NEXT: s_cbranch_execnz .LBB28_6
|
||||
; GFX7-NEXT: .LBB28_7: ; %Flow22
|
||||
; GFX7-NEXT: .LBB28_7: ; %Flow21
|
||||
; GFX7-NEXT: s_or_b64 exec, exec, s[6:7]
|
||||
; GFX7-NEXT: v_cvt_f32_ubyte0_e32 v0, v0
|
||||
; GFX7-NEXT: v_mul_f32_e32 v0, 0x42280000, v0
|
||||
@ -7725,7 +7725,7 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3)
|
||||
; GFX7-NEXT: s_cbranch_execnz .LBB28_11
|
||||
; GFX7-NEXT: ; %bb.12: ; %Flow
|
||||
; GFX7-NEXT: s_or_b64 exec, exec, s[2:3]
|
||||
; GFX7-NEXT: .LBB28_13: ; %Flow20
|
||||
; GFX7-NEXT: .LBB28_13: ; %Flow19
|
||||
; GFX7-NEXT: s_or_b64 exec, exec, s[6:7]
|
||||
; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
||||
; GFX7-NEXT: v_readfirstlane_b32 s4, v2
|
||||
@ -7770,9 +7770,9 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3)
|
||||
; GFX6-NEXT: s_or_b64 s[8:9], s[0:1], s[8:9]
|
||||
; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9]
|
||||
; GFX6-NEXT: s_cbranch_execnz .LBB28_2
|
||||
; GFX6-NEXT: ; %bb.3: ; %Flow21
|
||||
; GFX6-NEXT: ; %bb.3: ; %Flow20
|
||||
; GFX6-NEXT: s_or_b64 exec, exec, s[8:9]
|
||||
; GFX6-NEXT: .LBB28_4: ; %Flow22
|
||||
; GFX6-NEXT: .LBB28_4: ; %Flow21
|
||||
; GFX6-NEXT: s_or_b64 exec, exec, s[6:7]
|
||||
; GFX6-NEXT: s_mov_b64 s[8:9], exec
|
||||
; GFX6-NEXT: v_readfirstlane_b32 s10, v1
|
||||
@ -7800,7 +7800,7 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3)
|
||||
; GFX6-NEXT: v_mov_b32_e32 v3, v4
|
||||
; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9]
|
||||
; GFX6-NEXT: s_cbranch_execnz .LBB28_6
|
||||
; GFX6-NEXT: .LBB28_7: ; %Flow20
|
||||
; GFX6-NEXT: .LBB28_7: ; %Flow19
|
||||
; GFX6-NEXT: s_or_b64 exec, exec, s[6:7]
|
||||
; GFX6-NEXT: v_cvt_f32_ubyte0_e32 v0, v0
|
||||
; GFX6-NEXT: v_mul_f32_e32 v0, 0x42280000, v0
|
||||
@ -7849,7 +7849,7 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3)
|
||||
; GFX6-NEXT: s_cbranch_execnz .LBB28_11
|
||||
; GFX6-NEXT: ; %bb.12: ; %Flow
|
||||
; GFX6-NEXT: s_or_b64 exec, exec, s[2:3]
|
||||
; GFX6-NEXT: .LBB28_13: ; %Flow18
|
||||
; GFX6-NEXT: .LBB28_13: ; %Flow17
|
||||
; GFX6-NEXT: s_or_b64 exec, exec, s[6:7]
|
||||
; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
||||
; GFX6-NEXT: v_readfirstlane_b32 s4, v2
|
||||
@ -8483,9 +8483,9 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs
|
||||
; GFX7-NEXT: s_or_b64 s[8:9], s[0:1], s[8:9]
|
||||
; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9]
|
||||
; GFX7-NEXT: s_cbranch_execnz .LBB29_2
|
||||
; GFX7-NEXT: ; %bb.3: ; %Flow23
|
||||
; GFX7-NEXT: ; %bb.3: ; %Flow22
|
||||
; GFX7-NEXT: s_or_b64 exec, exec, s[8:9]
|
||||
; GFX7-NEXT: .LBB29_4: ; %Flow24
|
||||
; GFX7-NEXT: .LBB29_4: ; %Flow23
|
||||
; GFX7-NEXT: s_or_b64 exec, exec, s[6:7]
|
||||
; GFX7-NEXT: s_mov_b64 s[8:9], exec
|
||||
; GFX7-NEXT: v_readfirstlane_b32 s10, v1
|
||||
@ -8513,7 +8513,7 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs
|
||||
; GFX7-NEXT: v_mov_b32_e32 v3, v4
|
||||
; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9]
|
||||
; GFX7-NEXT: s_cbranch_execnz .LBB29_6
|
||||
; GFX7-NEXT: .LBB29_7: ; %Flow22
|
||||
; GFX7-NEXT: .LBB29_7: ; %Flow21
|
||||
; GFX7-NEXT: s_or_b64 exec, exec, s[6:7]
|
||||
; GFX7-NEXT: v_cvt_f32_ubyte0_e32 v0, v0
|
||||
; GFX7-NEXT: v_mul_f32_e32 v0, 0x42280000, v0
|
||||
@ -8562,7 +8562,7 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs
|
||||
; GFX7-NEXT: s_cbranch_execnz .LBB29_11
|
||||
; GFX7-NEXT: ; %bb.12: ; %Flow
|
||||
; GFX7-NEXT: s_or_b64 exec, exec, s[2:3]
|
||||
; GFX7-NEXT: .LBB29_13: ; %Flow20
|
||||
; GFX7-NEXT: .LBB29_13: ; %Flow19
|
||||
; GFX7-NEXT: s_or_b64 exec, exec, s[6:7]
|
||||
; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
||||
; GFX7-NEXT: v_readfirstlane_b32 s4, v2
|
||||
@ -8607,9 +8607,9 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs
|
||||
; GFX6-NEXT: s_or_b64 s[8:9], s[0:1], s[8:9]
|
||||
; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9]
|
||||
; GFX6-NEXT: s_cbranch_execnz .LBB29_2
|
||||
; GFX6-NEXT: ; %bb.3: ; %Flow21
|
||||
; GFX6-NEXT: ; %bb.3: ; %Flow20
|
||||
; GFX6-NEXT: s_or_b64 exec, exec, s[8:9]
|
||||
; GFX6-NEXT: .LBB29_4: ; %Flow22
|
||||
; GFX6-NEXT: .LBB29_4: ; %Flow21
|
||||
; GFX6-NEXT: s_or_b64 exec, exec, s[6:7]
|
||||
; GFX6-NEXT: s_mov_b64 s[8:9], exec
|
||||
; GFX6-NEXT: v_readfirstlane_b32 s10, v1
|
||||
@ -8637,7 +8637,7 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs
|
||||
; GFX6-NEXT: v_mov_b32_e32 v3, v4
|
||||
; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9]
|
||||
; GFX6-NEXT: s_cbranch_execnz .LBB29_6
|
||||
; GFX6-NEXT: .LBB29_7: ; %Flow20
|
||||
; GFX6-NEXT: .LBB29_7: ; %Flow19
|
||||
; GFX6-NEXT: s_or_b64 exec, exec, s[6:7]
|
||||
; GFX6-NEXT: v_cvt_f32_ubyte0_e32 v0, v0
|
||||
; GFX6-NEXT: v_mul_f32_e32 v0, 0x42280000, v0
|
||||
@ -8686,7 +8686,7 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs
|
||||
; GFX6-NEXT: s_cbranch_execnz .LBB29_11
|
||||
; GFX6-NEXT: ; %bb.12: ; %Flow
|
||||
; GFX6-NEXT: s_or_b64 exec, exec, s[2:3]
|
||||
; GFX6-NEXT: .LBB29_13: ; %Flow18
|
||||
; GFX6-NEXT: .LBB29_13: ; %Flow17
|
||||
; GFX6-NEXT: s_or_b64 exec, exec, s[6:7]
|
||||
; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
||||
; GFX6-NEXT: v_readfirstlane_b32 s4, v2
|
||||
|
@ -39,6 +39,7 @@ define amdgpu_kernel void @matmul_kernel(i32 %a0, i32 %a1) {
|
||||
; GFX942-NEXT: s_cbranch_vccz .LBB0_1
|
||||
; GFX942-NEXT: ; %bb.3:
|
||||
; GFX942-NEXT: ; implicit-def: $sgpr3
|
||||
; GFX942-NEXT: ; implicit-def: $agpr0
|
||||
; GFX942-NEXT: .LBB0_4: ; %common.ret
|
||||
; GFX942-NEXT: s_endpgm
|
||||
;
|
||||
@ -79,6 +80,7 @@ define amdgpu_kernel void @matmul_kernel(i32 %a0, i32 %a1) {
|
||||
; GFX908-NEXT: s_cbranch_vccz .LBB0_1
|
||||
; GFX908-NEXT: ; %bb.3:
|
||||
; GFX908-NEXT: ; implicit-def: $sgpr3
|
||||
; GFX908-NEXT: ; implicit-def: $agpr0
|
||||
; GFX908-NEXT: .LBB0_4: ; %common.ret
|
||||
; GFX908-NEXT: s_endpgm
|
||||
entry:
|
||||
|
Loading…
x
Reference in New Issue
Block a user