Skip to content

Commit

Permalink
[GlobalISel][Localizer] Allow localization of a small number of repea…
Browse files Browse the repository at this point in the history
…ted phi uses.

We previously had a heuristic that if a value V was used multiple times in a
single PHI, then to avoid potentially rematerializing into many predecessors we
bail out. the phi uses only counted as a single use in the shouldLocalize() hook
because it counted the PHI as a single instruction use, not factoring in it may
have many incoming edges.

It turns out this heuristic is slightly too pessiistic, and allowing a small number
of these use uses to be localized can improve code size due to shortening live ranges,
especially if those ranges span a call.

This change results in some improvements in size on CTMark -Os:
Program                                       size.__text
                                              before         after           diff
kimwitu++/kc                                  451676.00      451860.00       0.0%
mafft/pairlocalalign                          241460.00      241540.00       0.0%
tramp3d-v4/tramp3d-v4                         389216.00      389208.00      -0.0%
7zip/7zip-benchmark                           587528.00      587464.00      -0.0%
Bullet/bullet                                 457424.00      457348.00      -0.0%
consumer-typeset/consumer-typeset             405472.00      405376.00      -0.0%
SPASS/SPASS                                   410288.00      410120.00      -0.0%
lencod/lencod                                 426396.00      426108.00      -0.1%
ClamAV/clamscan                               380108.00      379756.00      -0.1%
sqlite3/sqlite3                               283664.00      283372.00      -0.1%
                           Geomean difference                               -0.0%

I experimented with different variations and thresholds. Using 3 instead of 2 resulted in
a further 0.1% improvement on ClamAV but also regressed sqlite3 by the same %.
  • Loading branch information
aemerson committed Jan 10, 2024
1 parent 8f78dd4 commit 329c606
Show file tree
Hide file tree
Showing 11 changed files with 118 additions and 73 deletions.
5 changes: 1 addition & 4 deletions llvm/include/llvm/CodeGen/GlobalISel/Localizer.h
Original file line number Diff line number Diff line change
Expand Up @@ -67,10 +67,7 @@ class Localizer : public MachineFunctionPass {

typedef SmallSetVector<MachineInstr *, 32> LocalizedSetVecT;

/// If \p Op is a phi operand and not unique in that phi, that is,
/// there are other operands in the phi with the same register,
/// return true.
bool isNonUniquePhiValue(MachineOperand &Op) const;
unsigned getNumPhiUses(MachineOperand &Op) const;

/// Do inter-block localization from the entry block.
bool localizeInterBlock(MachineFunction &MF,
Expand Down
46 changes: 26 additions & 20 deletions llvm/lib/CodeGen/GlobalISel/Localizer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -58,18 +58,19 @@ bool Localizer::isLocalUse(MachineOperand &MOUse, const MachineInstr &Def,
return InsertMBB == Def.getParent();
}

bool Localizer::isNonUniquePhiValue(MachineOperand &Op) const {
unsigned Localizer::getNumPhiUses(MachineOperand &Op) const {
MachineInstr *MI = Op.getParent();
if (!MI->isPHI())
return false;
return 0;

Register SrcReg = Op.getReg();
unsigned NumUses = 0;
for (unsigned Idx = 1; Idx < MI->getNumOperands(); Idx += 2) {
auto &MO = MI->getOperand(Idx);
if (&MO != &Op && MO.isReg() && MO.getReg() == SrcReg)
return true;
++NumUses;
}
return false;
return NumUses;
}

bool Localizer::localizeInterBlock(MachineFunction &MF,
Expand Down Expand Up @@ -108,11 +109,12 @@ bool Localizer::localizeInterBlock(MachineFunction &MF,
continue;
}

// If the use is a phi operand that's not unique, don't try to localize.
// If we do, we can cause unnecessary instruction bloat by duplicating
// into each predecessor block, when the existing one is sufficient and
// allows for easier optimization later.
if (isNonUniquePhiValue(MOUse))
// PHIs look like a single user but can use the same register in multiple
// edges, causing remat into each predecessor. Allow this to a certain
// extent.
unsigned NumPhiUses = getNumPhiUses(MOUse);
const unsigned PhiThreshold = 2; // FIXME: Tune this more.
if (NumPhiUses > PhiThreshold)
continue;

LLVM_DEBUG(dbgs() << "Fixing non-local use\n");
Expand Down Expand Up @@ -164,19 +166,23 @@ bool Localizer::localizeIntraBlock(LocalizedSetVecT &LocalizedInstrs) {
if (!UseMI.isPHI())
Users.insert(&UseMI);
}
// If all the users were PHIs then they're not going to be in our block,
// don't try to move this instruction.
if (Users.empty())
continue;

MachineBasicBlock::iterator II(MI);
++II;
while (II != MBB.end() && !Users.count(&*II))
// If all the users were PHIs then they're not going to be in our block, we
// may still benefit from sinking, especially since the value might be live
// across a call.
if (Users.empty()) {
// Make sure we don't sink in between
// two terminator sequences by scanning forward, not backward.
II = MBB.getFirstTerminatorForward();
LLVM_DEBUG(dbgs() << "Only phi users: moving " << *MI << " to the end\n");
} else {
++II;

assert(II != MBB.end() && "Didn't find the user in the MBB");
LLVM_DEBUG(dbgs() << "Intra-block: moving " << *MI << " before " << *II
<< '\n');
while (II != MBB.end() && !Users.count(&*II))
++II;
assert(II != MBB.end() && "Didn't find the user in the MBB");
LLVM_DEBUG(dbgs() << "Intra-block: moving " << *MI << " before " << *II
<< '\n');
}

MI->removeFromParent();
MBB.insert(II, MI);
Expand Down
16 changes: 8 additions & 8 deletions llvm/test/CodeGen/AArch64/GlobalISel/invoke-region.ll
Original file line number Diff line number Diff line change
Expand Up @@ -12,10 +12,10 @@ define i1 @test_lpad_phi_widen_into_pred() personality ptr @__gxx_personality_v0
; CHECK: bb.1 (%ir-block.0):
; CHECK-NEXT: successors: %bb.3(0x40000000), %bb.2(0x40000000)
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 11
; CHECK-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @global_var
; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 42
; CHECK-NEXT: G_STORE [[C1]](s32), [[GV]](p0) :: (store (s32) into @global_var)
; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 42
; CHECK-NEXT: G_STORE [[C]](s32), [[GV]](p0) :: (store (s32) into @global_var)
; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 11
; CHECK-NEXT: [[C2:%[0-9]+]]:_(s16) = G_CONSTANT i16 1
; CHECK-NEXT: G_INVOKE_REGION_START
; CHECK-NEXT: EH_LABEL <mcsymbol >
Expand All @@ -29,7 +29,7 @@ define i1 @test_lpad_phi_widen_into_pred() personality ptr @__gxx_personality_v0
; CHECK-NEXT: successors: %bb.3(0x80000000)
; CHECK-NEXT: liveins: $x0, $x1
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[PHI:%[0-9]+]]:_(s32) = G_PHI [[C]](s32), %bb.1
; CHECK-NEXT: [[PHI:%[0-9]+]]:_(s32) = G_PHI [[C1]](s32), %bb.1
; CHECK-NEXT: EH_LABEL <mcsymbol >
; CHECK-NEXT: [[GV1:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @global_var
; CHECK-NEXT: G_STORE [[PHI]](s32), [[GV1]](p0) :: (store (s32) into @global_var)
Expand Down Expand Up @@ -67,12 +67,12 @@ define i1 @test_lpad_phi_widen_into_pred_ext(ptr %ptr) personality ptr @__gxx_pe
; CHECK-NEXT: liveins: $x0
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $x0
; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 11
; CHECK-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @global_var
; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 42
; CHECK-NEXT: G_STORE [[C1]](s32), [[GV]](p0) :: (store (s32) into @global_var)
; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 42
; CHECK-NEXT: G_STORE [[C]](s32), [[GV]](p0) :: (store (s32) into @global_var)
; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(s8) = G_LOAD [[COPY]](p0) :: (load (s8) from %ir.ptr)
; CHECK-NEXT: [[ASSERT_ZEXT:%[0-9]+]]:_(s8) = G_ASSERT_ZEXT [[LOAD]], 1
; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 11
; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:_(s16) = G_ANYEXT [[ASSERT_ZEXT]](s8)
; CHECK-NEXT: G_INVOKE_REGION_START
; CHECK-NEXT: EH_LABEL <mcsymbol >
Expand All @@ -86,7 +86,7 @@ define i1 @test_lpad_phi_widen_into_pred_ext(ptr %ptr) personality ptr @__gxx_pe
; CHECK-NEXT: successors: %bb.3(0x80000000)
; CHECK-NEXT: liveins: $x0, $x1
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[PHI:%[0-9]+]]:_(s32) = G_PHI [[C]](s32), %bb.1
; CHECK-NEXT: [[PHI:%[0-9]+]]:_(s32) = G_PHI [[C1]](s32), %bb.1
; CHECK-NEXT: EH_LABEL <mcsymbol >
; CHECK-NEXT: [[GV1:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @global_var
; CHECK-NEXT: G_STORE [[PHI]](s32), [[GV1]](p0) :: (store (s32) into @global_var)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -46,16 +46,16 @@ define i32 @test(i32 %a, i1 %c) {
; PRESELECTION-NEXT: {{ $}}
; PRESELECTION-NEXT: [[COPY:%[0-9]+]]:gpr(s32) = COPY $w0
; PRESELECTION-NEXT: [[COPY1:%[0-9]+]]:gpr(s32) = COPY $w1
; PRESELECTION-NEXT: [[C:%[0-9]+]]:gpr(s32) = G_CONSTANT i32 0
; PRESELECTION-NEXT: [[C1:%[0-9]+]]:gpr(s32) = G_CONSTANT i32 100000
; PRESELECTION-NEXT: [[CONSTANT_FOLD_BARRIER:%[0-9]+]]:gpr(s32) = G_CONSTANT_FOLD_BARRIER [[C1]]
; PRESELECTION-NEXT: [[C:%[0-9]+]]:gpr(s32) = G_CONSTANT i32 100000
; PRESELECTION-NEXT: [[CONSTANT_FOLD_BARRIER:%[0-9]+]]:gpr(s32) = G_CONSTANT_FOLD_BARRIER [[C]]
; PRESELECTION-NEXT: [[C1:%[0-9]+]]:gpr(s32) = G_CONSTANT i32 0
; PRESELECTION-NEXT: [[C2:%[0-9]+]]:gpr(s32) = G_CONSTANT i32 1
; PRESELECTION-NEXT: [[AND:%[0-9]+]]:gpr(s32) = G_AND [[COPY1]], [[C2]]
; PRESELECTION-NEXT: G_BRCOND [[AND]](s32), %bb.3
; PRESELECTION-NEXT: G_BR %bb.2
; PRESELECTION-NEXT: {{ $}}
; PRESELECTION-NEXT: bb.2.common.ret:
; PRESELECTION-NEXT: [[PHI:%[0-9]+]]:gpr(s32) = G_PHI %7(s32), %bb.3, [[C]](s32), %bb.1
; PRESELECTION-NEXT: [[PHI:%[0-9]+]]:gpr(s32) = G_PHI %7(s32), %bb.3, [[C1]](s32), %bb.1
; PRESELECTION-NEXT: $w0 = COPY [[PHI]](s32)
; PRESELECTION-NEXT: RET_ReallyLR implicit $w0
; PRESELECTION-NEXT: {{ $}}
Expand All @@ -75,8 +75,8 @@ define i32 @test(i32 %a, i1 %c) {
; POSTSELECTION-NEXT: {{ $}}
; POSTSELECTION-NEXT: [[COPY:%[0-9]+]]:gpr32 = COPY $w0
; POSTSELECTION-NEXT: [[COPY1:%[0-9]+]]:gpr32 = COPY $w1
; POSTSELECTION-NEXT: [[COPY2:%[0-9]+]]:gpr32 = COPY $wzr
; POSTSELECTION-NEXT: [[MOVi32imm:%[0-9]+]]:gpr32 = MOVi32imm 100000
; POSTSELECTION-NEXT: [[COPY2:%[0-9]+]]:gpr32 = COPY $wzr
; POSTSELECTION-NEXT: TBNZW [[COPY1]], 0, %bb.3
; POSTSELECTION-NEXT: B %bb.2
; POSTSELECTION-NEXT: {{ $}}
Expand Down
56 changes: 49 additions & 7 deletions llvm/test/CodeGen/AArch64/GlobalISel/localizer.mir
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,9 @@

define void @test_inttoptr() { ret void }
define void @many_local_use_intra_block() { ret void }
define void @non_local_phi_use_nonunique() { ret void }
define void @non_local_phi_single_use() { ret void }
define void @non_local_phi_three_uses() { ret void }

...

---
Expand Down Expand Up @@ -285,8 +287,8 @@ body: |
; CHECK: bb.1:
; CHECK: successors: %bb.1(0x80000000)
; CHECK: [[PHI:%[0-9]+]]:fpr(s32) = PHI [[FADD]](s32), %bb.0, %4(s32), %bb.1
; CHECK: [[C1:%[0-9]+]]:fpr(s32) = G_FCONSTANT float 1.000000e+00
; CHECK: [[FADD1:%[0-9]+]]:fpr(s32) = G_FADD [[PHI]], [[FADD]]
; CHECK: [[C1:%[0-9]+]]:fpr(s32) = G_FCONSTANT float 1.000000e+00
; CHECK: G_BR %bb.1
; Existing registers should be left untouched
Expand Down Expand Up @@ -566,12 +568,12 @@ body: |
...

---
name: non_local_phi_use_nonunique
name: non_local_phi_single_use
legalized: true
regBankSelected: true
tracksRegLiveness: true
body: |
; CHECK-LABEL: name: non_local_phi_use_nonunique
; CHECK-LABEL: name: non_local_phi_single_use
; CHECK: bb.0:
; CHECK: successors: %bb.1(0x40000000), %bb.2(0x40000000)
; CHECK: [[C:%[0-9]+]]:gpr(s32) = G_CONSTANT i32 1
Expand All @@ -582,12 +584,12 @@ body: |
; CHECK: G_BR %bb.2
; CHECK: bb.1:
; CHECK: successors: %bb.2(0x80000000)
; CHECK: [[C2:%[0-9]+]]:gpr(s32) = G_CONSTANT i32 1
; CHECK: bb.2:
; CHECK: [[PHI:%[0-9]+]]:gpr(s32) = G_PHI [[C]](s32), %bb.1
; CHECK: [[PHI:%[0-9]+]]:gpr(s32) = G_PHI [[C2]](s32), %bb.1
; CHECK: [[ADD1:%[0-9]+]]:gpr(s32) = G_ADD [[PHI]], [[PHI]]
; Don't localize the 1 into bb.1, because there are multiple edges
; using that register.
; Localize the 1 into bb.1, since the number of uses is under the threshold.
bb.0:
successors: %bb.1, %bb.2
Expand All @@ -606,3 +608,43 @@ body: |
%3:gpr(s32) = G_PHI %0(s32), %bb.1, %0(s32), %bb.0
%2:gpr(s32) = G_ADD %3, %3
...
---
name: non_local_phi_three_uses
legalized: true
regBankSelected: true
tracksRegLiveness: true
body: |
; CHECK-LABEL: name: non_local_phi_three_uses
; CHECK: bb.0:
; CHECK: successors: %bb.1(0x40000000), %bb.2(0x40000000)
; CHECK: [[C:%[0-9]+]]:gpr(s32) = G_CONSTANT i32 1
; CHECK: [[ADD:%[0-9]+]]:gpr(s32) = G_ADD [[C]], [[C]]
; CHECK: %cmp:gpr(s32) = G_ICMP intpred(eq), [[ADD]](s32), [[C]]
; CHECK: %cond:gpr(s1) = G_TRUNC %cmp(s32)
; CHECK: G_BRCOND %cond(s1), %bb.1
; CHECK: G_BR %bb.2
; CHECK: bb.1:
; CHECK: successors: %bb.2(0x80000000)
; CHECK: bb.2:
; CHECK: [[PHI:%[0-9]+]]:gpr(s32) = G_PHI [[C]](s32), %bb.1
; CHECK: [[ADD1:%[0-9]+]]:gpr(s32) = G_ADD [[PHI]], [[PHI]]
; Don't localize the 1 into bb.1, above the thresold of uses in the phi.
bb.0:
successors: %bb.1, %bb.2
%0:gpr(s32) = G_CONSTANT i32 1
%1:gpr(s32) = G_ADD %0, %0
%cmp:gpr(s32) = G_ICMP intpred(eq), %1(s32), %0
%cond:gpr(s1) = G_TRUNC %cmp(s32)
G_BRCOND %cond(s1), %bb.1
G_BR %bb.2
bb.1:
successors: %bb.2
bb.2:
%3:gpr(s32) = G_PHI %0(s32), %bb.1, %0(s32), %bb.0, %0(s32), %bb.0, %0(s32), %bb.0
%2:gpr(s32) = G_ADD %3, %3
...
Original file line number Diff line number Diff line change
Expand Up @@ -230,32 +230,32 @@ define amdgpu_cs void @single_lane_execution_attribute(i32 inreg %.userdata0, <3
; GFX10-NEXT: s_or_b64 s[2:3], s[4:5], s[2:3]
; GFX10-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x0
; GFX10-NEXT: s_mov_b32 s2, 1
; GFX10-NEXT: v_mbcnt_hi_u32_b32 v2, -1, v1
; GFX10-NEXT: v_lshlrev_b32_e32 v1, 2, v2
; GFX10-NEXT: v_and_b32_e32 v3, 1, v2
; GFX10-NEXT: v_mbcnt_hi_u32_b32 v1, -1, v1
; GFX10-NEXT: v_lshlrev_b32_e32 v2, 2, v1
; GFX10-NEXT: v_and_b32_e32 v3, 1, v1
; GFX10-NEXT: v_xor_b32_e32 v3, 1, v3
; GFX10-NEXT: v_and_b32_e32 v3, 1, v3
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: buffer_load_dword v1, v1, s[4:7], 0 offen
; GFX10-NEXT: buffer_load_dword v2, v2, s[4:7], 0 offen
; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v3
; GFX10-NEXT: ; implicit-def: $vgpr3
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 0, v1
; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 0, v2
; GFX10-NEXT: s_cbranch_vccnz .LBB4_4
; GFX10-NEXT: ; %bb.1: ; %.preheader.preheader
; GFX10-NEXT: v_mov_b32_e32 v4, s12
; GFX10-NEXT: v_mov_b32_e32 v3, s12
; GFX10-NEXT: v_mov_b32_e32 v4, s12
; GFX10-NEXT: .LBB4_2: ; %.preheader
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: buffer_load_dword v5, v4, s[4:7], 0 offen
; GFX10-NEXT: v_add_nc_u32_e32 v2, -1, v2
; GFX10-NEXT: v_add_nc_u32_e32 v4, 4, v4
; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2
; GFX10-NEXT: buffer_load_dword v5, v3, s[4:7], 0 offen
; GFX10-NEXT: v_add_nc_u32_e32 v1, -1, v1
; GFX10-NEXT: v_add_nc_u32_e32 v3, 4, v3
; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v1
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_add_nc_u32_e32 v3, v5, v3
; GFX10-NEXT: v_add_nc_u32_e32 v4, v5, v4
; GFX10-NEXT: s_cbranch_vccnz .LBB4_2
; GFX10-NEXT: ; %bb.3: ; %.preheader._crit_edge
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v1
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2
; GFX10-NEXT: s_or_b32 s2, s0, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e64 v3, 0, 1, s2
; GFX10-NEXT: s_mov_b32 s2, 0
Expand Down
10 changes: 5 additions & 5 deletions llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.set.inactive.ll
Original file line number Diff line number Diff line change
Expand Up @@ -48,24 +48,24 @@ define amdgpu_kernel void @set_inactive_scc(ptr addrspace(1) %out, i32 %in, <4 x
; GCN-NEXT: s_buffer_load_dword s2, s[4:7], 0x0
; GCN-NEXT: s_load_dword s3, s[0:1], 0x2c
; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GCN-NEXT: s_mov_b32 s4, 1
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_cmp_lg_u32 s2, 56
; GCN-NEXT: s_cselect_b32 s2, 1, 0
; GCN-NEXT: s_cselect_b32 s4, 1, 0
; GCN-NEXT: v_mov_b32_e32 v0, s3
; GCN-NEXT: s_not_b64 exec, exec
; GCN-NEXT: v_mov_b32_e32 v0, 42
; GCN-NEXT: s_not_b64 exec, exec
; GCN-NEXT: s_cmp_lg_u32 s2, 0
; GCN-NEXT: s_mov_b32 s2, 1
; GCN-NEXT: s_cmp_lg_u32 s4, 0
; GCN-NEXT: s_cbranch_scc0 .LBB2_2
; GCN-NEXT: ; %bb.1: ; %.one
; GCN-NEXT: v_add_u32_e32 v1, vcc, 1, v0
; GCN-NEXT: s_mov_b32 s2, -1
; GCN-NEXT: s_mov_b32 s3, 0xf000
; GCN-NEXT: s_mov_b32 s4, 0
; GCN-NEXT: buffer_store_dword v1, off, s[0:3], 0
; GCN-NEXT: s_mov_b32 s2, 0
; GCN-NEXT: .LBB2_2: ; %Flow
; GCN-NEXT: s_xor_b32 s2, s4, 1
; GCN-NEXT: s_xor_b32 s2, s2, 1
; GCN-NEXT: s_and_b32 s2, s2, 1
; GCN-NEXT: s_cmp_lg_u32 s2, 0
; GCN-NEXT: s_cbranch_scc1 .LBB2_4
Expand Down
2 changes: 1 addition & 1 deletion llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.memmove.ll
Original file line number Diff line number Diff line change
Expand Up @@ -36,11 +36,11 @@ define amdgpu_cs void @memmove_p1i8(ptr addrspace(1) %dst, ptr addrspace(1) %src
; LOOP-NEXT: s_andn2_saveexec_b64 s[0:1], s[4:5]
; LOOP-NEXT: s_cbranch_execz .LBB0_6
; LOOP-NEXT: ; %bb.4: ; %copy_backwards
; LOOP-NEXT: s_mov_b32 s0, -4
; LOOP-NEXT: v_add_i32_e32 v0, vcc, 3, v0
; LOOP-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; LOOP-NEXT: v_add_i32_e32 v2, vcc, 3, v2
; LOOP-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; LOOP-NEXT: s_mov_b32 s0, -4
; LOOP-NEXT: s_mov_b32 s6, 0
; LOOP-NEXT: s_mov_b32 s7, 0xf000
; LOOP-NEXT: s_mov_b64 s[4:5], 0
Expand Down
2 changes: 1 addition & 1 deletion llvm/test/CodeGen/AMDGPU/GlobalISel/localizer.ll
Original file line number Diff line number Diff line change
Expand Up @@ -31,9 +31,9 @@ define amdgpu_kernel void @localize_constants(i1 %cond) {
; GFX9-NEXT: global_store_dword v[0:1], v0, off
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v0, 0x7b
; GFX9-NEXT: s_mov_b32 s0, 0
; GFX9-NEXT: global_store_dword v[0:1], v0, off
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_mov_b32 s0, 0
; GFX9-NEXT: .LBB0_2: ; %Flow
; GFX9-NEXT: s_xor_b32 s0, s0, 1
; GFX9-NEXT: s_and_b32 s0, s0, 1
Expand Down
10 changes: 5 additions & 5 deletions llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i64.ll
Original file line number Diff line number Diff line change
Expand Up @@ -186,12 +186,12 @@ declare i32 @llvm.amdgcn.readfirstlane(i32)
define amdgpu_ps i64 @s_udiv_i64(i64 inreg %num, i64 inreg %den) {
; CHECK-LABEL: s_udiv_i64:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_or_b64 s[4:5], s[0:1], s[2:3]
; CHECK-NEXT: s_mov_b32 s6, 0
; CHECK-NEXT: s_mov_b32 s7, -1
; CHECK-NEXT: s_and_b64 s[4:5], s[4:5], s[6:7]
; CHECK-NEXT: v_cmp_ne_u64_e64 vcc, s[4:5], 0
; CHECK-NEXT: s_mov_b32 s4, 1
; CHECK-NEXT: s_or_b64 s[6:7], s[0:1], s[2:3]
; CHECK-NEXT: s_mov_b32 s8, 0
; CHECK-NEXT: s_mov_b32 s9, -1
; CHECK-NEXT: s_and_b64 s[6:7], s[6:7], s[8:9]
; CHECK-NEXT: v_cmp_ne_u64_e64 vcc, s[6:7], 0
; CHECK-NEXT: v_cvt_f32_u32_e32 v2, s2
; CHECK-NEXT: s_cbranch_vccz .LBB1_2
; CHECK-NEXT: ; %bb.1:
Expand Down
10 changes: 5 additions & 5 deletions llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll
Original file line number Diff line number Diff line change
Expand Up @@ -183,12 +183,12 @@ declare i32 @llvm.amdgcn.readfirstlane(i32)
define amdgpu_ps i64 @s_urem_i64(i64 inreg %num, i64 inreg %den) {
; CHECK-LABEL: s_urem_i64:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_or_b64 s[4:5], s[0:1], s[2:3]
; CHECK-NEXT: s_mov_b32 s6, 0
; CHECK-NEXT: s_mov_b32 s7, -1
; CHECK-NEXT: s_and_b64 s[4:5], s[4:5], s[6:7]
; CHECK-NEXT: v_cmp_ne_u64_e64 vcc, s[4:5], 0
; CHECK-NEXT: s_mov_b32 s4, 1
; CHECK-NEXT: s_or_b64 s[6:7], s[0:1], s[2:3]
; CHECK-NEXT: s_mov_b32 s8, 0
; CHECK-NEXT: s_mov_b32 s9, -1
; CHECK-NEXT: s_and_b64 s[6:7], s[6:7], s[8:9]
; CHECK-NEXT: v_cmp_ne_u64_e64 vcc, s[6:7], 0
; CHECK-NEXT: v_cvt_f32_u32_e32 v2, s2
; CHECK-NEXT: s_cbranch_vccz .LBB1_2
; CHECK-NEXT: ; %bb.1:
Expand Down

0 comments on commit 329c606

Please sign in to comment.