Skip to content

[AIE2P] Add SingletonCopyEdges dag mutator in pre-scheduler #469

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 1 commit into
base: aie-public
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
42 changes: 42 additions & 0 deletions llvm/lib/Target/AIE/AIEBaseSubtarget.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -251,6 +251,47 @@ class BiasDepth : public ScheduleDAGMutation {
};
};

class SingletonCopyEdges : public ScheduleDAGMutation {
void apply(ScheduleDAGInstrs *DAG) override {
const MachineRegisterInfo &MRI = DAG->MRI;
MachineBasicBlock *MBB = DAG->getBB();

for (SUnit &SU : DAG->SUnits) {
MachineInstr &MI = *SU.getInstr();
if (!MI.isCopy())
continue;

auto MO = MI.getOperand(1);
if (!(MO.isReg() && MO.getReg().isPhysical()))
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

nit:

if (!MO.isReg() || !MO.getReg().isPhysical())
       continue;

continue;

auto PhysReg = MO.getReg();
if (!MBB->isLiveIn(PhysReg))
continue;

SDep SingletonCopyEdge(&SU, SDep::Artificial);
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What about moving this into the scope of the if below?


for (SUnit &SU2 : DAG->SUnits) {
MachineInstr &MI = *SU2.getInstr();

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Are we creating an Edge from SU to itself?

if (any_of(MI.defs(), [=, &MRI](const MachineOperand &MO) {
if (!MO.isReg() || MO.getReg().isPhysical())
return false;
auto RC = MRI.getRegClass(MO.getReg());
// a def with a singleton register class containing PhysReg from
// the Copy
if (RC->getNumRegs() == 1 && RC->contains(PhysReg))
return true;
return false;
})) {
SingletonCopyEdge.setLatency(0);
SU2.addPred(SingletonCopyEdge, /*Required=*/true);
}
}
}
};
};

class RegionEndEdges : public ScheduleDAGMutation {
void removeExitSUPreds(ScheduleDAGInstrs *DAG) {
SUnit &ExitSU = DAG->ExitSU;
Expand Down Expand Up @@ -856,6 +897,7 @@ AIEBaseSubtarget::getPreRAMutationsImpl(const Triple &TT) {
Mutations.emplace_back(std::make_unique<PropagateIncomingLatencies>());
if (EnableStrongCopyEdges)
Mutations.emplace_back(std::make_unique<EnforceCopyEdges>());
Mutations.emplace_back(std::make_unique<SingletonCopyEdges>());
return Mutations;
}

Expand Down
202 changes: 174 additions & 28 deletions llvm/test/CodeGen/AIE/aie2p/ldst-fifo-stores.ll
Original file line number Diff line number Diff line change
Expand Up @@ -158,15 +158,15 @@ define dso_local void @_Z26test_fifo_st_flush_2d_byteRPDv64_DB8_R12fifo_state_ti
; CHECK-LABEL: _Z26test_fifo_st_flush_2d_byteRPDv64_DB8_R12fifo_state_tiiRii:
; CHECK: .p2align 4
; CHECK-NEXT: // %bb.0: // %entry
; CHECK-NEXT: lda p4, [p0, #0]; nopx
; CHECK-NEXT: lda dc0, [p2, #0]; mov dj0, #128
; CHECK-NEXT: lda r26, [p1, dj0]
; CHECK-NEXT: lda p2, [p0, #0]; nopb ; nopx ; mov dj0, #128
; CHECK-NEXT: lda r26, [p1, dj0]; mov p3, p2
; CHECK-NEXT: lda dc0, [p3, #0]
; CHECK-NEXT: nop
; CHECK-NEXT: vlda sfl, [p1, #0]
; CHECK-NEXT: vlda sfh, [p1, #64]
; CHECK-NEXT: movs p3, p2
; CHECK-NEXT: movs m0, r0; mov dn0, r1
; CHECK-NEXT: movs dj0, r2; mov p2, p4
; CHECK-NEXT: nop
; CHECK-NEXT: mov m0, r0
; CHECK-NEXT: movs dn0, r1; mov dj0, r2
; CHECK-NEXT: vst.flush.512.conv.2d [p2, sf, r26, d0]
; CHECK-NEXT: nop
; CHECK-NEXT: nop
Expand Down Expand Up @@ -205,17 +205,17 @@ define dso_local void @_Z26test_fifo_st_flush_3d_byteRPDv64_DB8_R12fifo_state_ti
; CHECK-LABEL: _Z26test_fifo_st_flush_3d_byteRPDv64_DB8_R12fifo_state_tiiRiiiS5_i:
; CHECK: .p2align 4
; CHECK-NEXT: // %bb.0: // %entry
; CHECK-NEXT: vlda sfl, [p1, #0]; nopxm
; CHECK-NEXT: lda p5, [p0, #0]
; CHECK-NEXT: lda dc0, [p2, #0]
; CHECK-NEXT: vlda sfl, [p1, #0]; nopb ; nopxm ; nops
; CHECK-NEXT: lda p2, [p0, #0]
; CHECK-NEXT: lda dc4, [p3, #0]; mov dj0, #128
; CHECK-NEXT: lda r26, [p1, dj0]
; CHECK-NEXT: lda r26, [p1, dj0]; mov p4, p2
; CHECK-NEXT: lda dc0, [p4, #0]
; CHECK-NEXT: vlda sfh, [p1, #64]
; CHECK-NEXT: nop
; CHECK-NEXT: movs p4, p2
; CHECK-NEXT: movs m0, r0; mov dn0, r1
; CHECK-NEXT: movs dn4, r3; mov dj4, r4
; CHECK-NEXT: movs dj0, r2; mov p2, p5
; CHECK-NEXT: nop
; CHECK-NEXT: mov m0, r0
; CHECK-NEXT: movs dn0, r1; mov dj4, r4
; CHECK-NEXT: movs dn4, r3; mov dj0, r2
; CHECK-NEXT: vst.flush.512.3d [p2, sf, r26, d0]
; CHECK-NEXT: nop
; CHECK-NEXT: st dc0, [p4, #0]
Expand Down Expand Up @@ -335,16 +335,16 @@ define dso_local void @_Z31test_fifo_st_flush_conv_2d_byteRPDv64_DB8_R12fifo_sta
; CHECK-LABEL: _Z31test_fifo_st_flush_conv_2d_byteRPDv64_DB8_R12fifo_state_tiiRii:
; CHECK: .p2align 4
; CHECK-NEXT: // %bb.0: // %entry
; CHECK-NEXT: vlda sfl, [p1, #0]; nopb ; nopx
; CHECK-NEXT: lda p4, [p0, #0]
; CHECK-NEXT: lda dc0, [p2, #0]; mov dj0, #128
; CHECK-NEXT: lda r26, [p1, dj0]
; CHECK-NEXT: vlda sfl, [p1, #0]; nopb ; nopxm
; CHECK-NEXT: lda p2, [p0, #0]; mov dj0, #128
; CHECK-NEXT: lda r26, [p1, dj0]; mov p3, p2
; CHECK-NEXT: lda dc0, [p3, #0]
; CHECK-NEXT: vlda sfh, [p1, #64]
; CHECK-NEXT: nop
; CHECK-NEXT: nop
; CHECK-NEXT: movs p3, p2
; CHECK-NEXT: movs m0, r0; mov dn0, r1
; CHECK-NEXT: movs dj0, r2; mov p2, p4
; CHECK-NEXT: nop
; CHECK-NEXT: mov m0, r0
; CHECK-NEXT: movs dn0, r1; mov dj0, r2
; CHECK-NEXT: vst.flush.512.2d [p2, sf, r26, d0]
; CHECK-NEXT: nop
; CHECK-NEXT: st dc0, [p3, #0]; ret lr
Expand Down Expand Up @@ -381,16 +381,16 @@ define dso_local void @_Z31test_fifo_st_flush_conv_3d_byteRPDv64_DB8_R12fifo_sta
; CHECK-LABEL: _Z31test_fifo_st_flush_conv_3d_byteRPDv64_DB8_R12fifo_state_tiiRiiiS5_i:
; CHECK: .p2align 4
; CHECK-NEXT: // %bb.0: // %entry
; CHECK-NEXT: lda p5, [p0, #0]; nopb ; nopx
; CHECK-NEXT: lda dc0, [p2, #0]
; CHECK-NEXT: lda p2, [p0, #0]; nopb ; nopxm
; CHECK-NEXT: lda dc4, [p3, #0]; mov dj0, #128
; CHECK-NEXT: lda r26, [p1, dj0]
; CHECK-NEXT: lda r26, [p1, dj0]; mov p4, p2
; CHECK-NEXT: lda dc0, [p4, #0]
; CHECK-NEXT: nop
; CHECK-NEXT: vlda sfl, [p1, #0]
; CHECK-NEXT: vlda sfh, [p1, #64]; movs p4, p2
; CHECK-NEXT: movs m0, r0; mov dn0, r1
; CHECK-NEXT: movs dn4, r3; mov dj4, r4
; CHECK-NEXT: movs dj0, r2; mov p2, p5
; CHECK-NEXT: vlda sfh, [p1, #64]
; CHECK-NEXT: mov m0, r0
; CHECK-NEXT: movs dn0, r1; mov dj4, r4
; CHECK-NEXT: movs dn4, r3; mov dj0, r2
; CHECK-NEXT: vst.flush.512.conv.3d [p2, sf, r26, d0]
; CHECK-NEXT: nop
; CHECK-NEXT: nop
Expand Down Expand Up @@ -753,6 +753,152 @@ entry:
ret void
}

; Two chained FIFO store flushes on the same state:
;   1. @llvm.aie2p.fifo.st.flush on the pointer loaded from %p and the
;      <32 x i32> / i32 pair loaded from %s and %s+128; all three results are
;      stored back.
;   2. @llvm.aie2p.fifo.st.flush.2d.conv on the pointer returned by step 1
;      (%4), the reloaded <32 x i32> / i32 pair, and the 2D parameters
;      (%off, %size1, the value loaded from %count1, %inc1), each truncated to
;      i20. The updated count is zero-extended and stored back to %count1.
; The assertions below are the expected llc output for this function.
define dso_local void @_Z26test_fifo_st_flush_2d_byteRrP23v128bfp16ebs8_unalignedR12fifo_state_tiiRii(ptr nocapture nonnull align 4 dereferenceable(4) %p, ptr nocapture nonnull align 64 dereferenceable(256) %s, i32 noundef %off, i32 noundef %size1, ptr nocapture nonnull align 4 dereferenceable(4) %count1, i32 noundef %inc1) local_unnamed_addr #1 {
; CHECK-LABEL: _Z26test_fifo_st_flush_2d_byteRrP23v128bfp16ebs8_unalignedR12fifo_state_tiiRii:
; CHECK: .p2align 4
; CHECK-NEXT: // %bb.0: // %entry
; CHECK-NEXT: vlda sfl, [p1, #0]; nopb ; nopxm ; nops
; CHECK-NEXT: lda p2, [p0, #0]; mov dj0, #128
; CHECK-NEXT: lda r26, [p1, dj0]
; CHECK-NEXT: vlda sfh, [p1, #64]
; CHECK-NEXT: nop
; CHECK-NEXT: nop
; CHECK-NEXT: nop
; CHECK-NEXT: movs p3, p2
; CHECK-NEXT: nop
; CHECK-NEXT: vst.flush.512 [p2, sf, r26]
; CHECK-NEXT: nop
; CHECK-NEXT: st r26, [p1, dj0]
; CHECK-NEXT: vst sfl, [p1, #0]
; CHECK-NEXT: vst sfh, [p1, #64]
; CHECK-NEXT: st p2, [p0, #0]
; CHECK-NEXT: lda r26, [p1, dj0]
; CHECK-NEXT: lda dc0, [p3, #0]
; CHECK-NEXT: nop
; CHECK-NEXT: vlda sfl, [p1, #0]
; CHECK-NEXT: vlda sfh, [p1, #64]
; CHECK-NEXT: nop
; CHECK-NEXT: mov m0, r0
; CHECK-NEXT: movs dn0, r1; mov dj0, r2
; CHECK-NEXT: vst.flush.512.conv.2d [p2, sf, r26, d0]
; CHECK-NEXT: nop
; CHECK-NEXT: nop
; CHECK-NEXT: nop
; CHECK-NEXT: st dc0, [p3, #0]; ret lr
; CHECK-NEXT: vst sfh, [p1, #64] // Delay Slot 5
; CHECK-NEXT: vst sfl, [p1], #128 // Delay Slot 4
; CHECK-NEXT: st r26, [p1, #0] // Delay Slot 3
; CHECK-NEXT: st p2, [p0, #0] // Delay Slot 2
; CHECK-NEXT: nop // Delay Slot 1
entry:
; %pos1.i.i addresses the i32 located 128 bytes into %s.
%pos1.i.i = getelementptr inbounds i8, ptr %s, i20 128
; Step 1: plain flush, results written back to %s, %s+128 and %p.
%0 = load ptr, ptr %p, align 4, !tbaa !7
%1 = load <32 x i32>, ptr %s, align 64
%2 = load i32, ptr %pos1.i.i, align 64
%3 = tail call { ptr, <32 x i32>, i32 } @llvm.aie2p.fifo.st.flush.p0.p0(ptr %0, <32 x i32> %1, i32 %2)
%4 = extractvalue { ptr, <32 x i32>, i32 } %3, 0
%5 = extractvalue { ptr, <32 x i32>, i32 } %3, 1
%6 = extractvalue { ptr, <32 x i32>, i32 } %3, 2
store <32 x i32> %5, ptr %s, align 128
store i32 %6, ptr %pos1.i.i, align 64
store ptr %4, ptr %p, align 4
; Step 2: 2D conv flush, using the pointer %4 returned by step 1.
%7 = load <32 x i32>, ptr %s, align 64
%8 = load i32, ptr %pos1.i.i, align 64
%9 = trunc i32 %off to i20
%10 = trunc i32 %size1 to i20
%11 = load i32, ptr %count1, align 4
%12 = trunc i32 %11 to i20
%13 = trunc i32 %inc1 to i20
%14 = tail call { ptr, <32 x i32>, i32, i20 } @llvm.aie2p.fifo.st.flush.2d.conv.p0.p0(ptr %4, <32 x i32> %7, i32 %8, i20 %9, i20 %10, i20 %12, i20 %13)
%15 = extractvalue { ptr, <32 x i32>, i32, i20 } %14, 0
%16 = extractvalue { ptr, <32 x i32>, i32, i20 } %14, 1
%17 = extractvalue { ptr, <32 x i32>, i32, i20 } %14, 2
%18 = extractvalue { ptr, <32 x i32>, i32, i20 } %14, 3
%19 = zext i20 %18 to i32
store i32 %19, ptr %count1, align 4
store <32 x i32> %16, ptr %s, align 128
store i32 %17, ptr %pos1.i.i, align 64
store ptr %15, ptr %p, align 4
ret void
}

; Two chained FIFO store flushes on the same state:
;   1. @llvm.aie2p.fifo.st.flush on the pointer loaded from %p and the
;      <32 x i32> / i32 pair loaded from %s and %s+128; all three results are
;      stored back.
;   2. @llvm.aie2p.fifo.st.flush.3d on the pointer returned by step 1 (%4),
;      the reloaded <32 x i32> / i32 pair, and the 3D parameters (%off,
;      %size1, the value loaded from %count1, %inc1, %size2, the value loaded
;      from %count2, %inc2), each truncated to i20. Both updated counts are
;      zero-extended and stored back to %count1 / %count2.
; The assertions below are the expected llc output for this function.
define dso_local void @_Z26test_fifo_st_flush_3d_byteRrP23v128bfp16ebs8_unalignedR12fifo_state_tiiRiiiS5_i(ptr nocapture nonnull align 4 dereferenceable(4) %p, ptr nocapture nonnull align 64 dereferenceable(256) %s, i32 noundef %off, i32 noundef %size1, ptr nocapture nonnull align 4 dereferenceable(4) %count1, i32 noundef %inc1, i32 noundef %size2, ptr nocapture nonnull align 4 dereferenceable(4) %count2, i32 noundef %inc2) local_unnamed_addr #1 {
; CHECK-LABEL: _Z26test_fifo_st_flush_3d_byteRrP23v128bfp16ebs8_unalignedR12fifo_state_tiiRiiiS5_i:
; CHECK: .p2align 4
; CHECK-NEXT: // %bb.0: // %entry
; CHECK-NEXT: vlda sfl, [p1, #0]; nopb ; nops ; nopxm ; nopv
; CHECK-NEXT: lda p2, [p0, #0]; mov dj0, #128
; CHECK-NEXT: lda r26, [p1, dj0]; nopx
; CHECK-NEXT: vlda sfh, [p1, #64]
; CHECK-NEXT: nop
; CHECK-NEXT: nop
; CHECK-NEXT: nop
; CHECK-NEXT: movs p4, p2
; CHECK-NEXT: nop
; CHECK-NEXT: vst.flush.512 [p2, sf, r26]
; CHECK-NEXT: nop
; CHECK-NEXT: st r26, [p1, dj0]
; CHECK-NEXT: vst sfl, [p1, #0]
; CHECK-NEXT: vst sfh, [p1, #64]
; CHECK-NEXT: st p2, [p0, #0]
; CHECK-NEXT: lda r26, [p1, dj0]
; CHECK-NEXT: vlda sfl, [p1, #0]
; CHECK-NEXT: lda dc0, [p4, #0]
; CHECK-NEXT: lda dc4, [p3, #0]
; CHECK-NEXT: vlda sfh, [p1, #64]
; CHECK-NEXT: nop
; CHECK-NEXT: nop
; CHECK-NEXT: mov m0, r0
; CHECK-NEXT: movs dn0, r1; mov dj4, r4
; CHECK-NEXT: movs dn4, r3; mov dj0, r2
; CHECK-NEXT: vst.flush.512.3d [p2, sf, r26, d0]
; CHECK-NEXT: nop
; CHECK-NEXT: st dc0, [p4, #0]
; CHECK-NEXT: st dc4, [p3, #0]; ret lr
; CHECK-NEXT: vst sfh, [p1, #64] // Delay Slot 5
; CHECK-NEXT: vst sfl, [p1], #128 // Delay Slot 4
; CHECK-NEXT: st r26, [p1, #0] // Delay Slot 3
; CHECK-NEXT: st p2, [p0, #0] // Delay Slot 2
; CHECK-NEXT: nop // Delay Slot 1
entry:
; %pos1.i.i addresses the i32 located 128 bytes into %s.
%pos1.i.i = getelementptr inbounds i8, ptr %s, i20 128
; Step 1: plain flush, results written back to %s, %s+128 and %p.
%0 = load ptr, ptr %p, align 4, !tbaa !7
%1 = load <32 x i32>, ptr %s, align 64
%2 = load i32, ptr %pos1.i.i, align 64
%3 = tail call { ptr, <32 x i32>, i32 } @llvm.aie2p.fifo.st.flush.p0.p0(ptr %0, <32 x i32> %1, i32 %2)
%4 = extractvalue { ptr, <32 x i32>, i32 } %3, 0
%5 = extractvalue { ptr, <32 x i32>, i32 } %3, 1
%6 = extractvalue { ptr, <32 x i32>, i32 } %3, 2
store <32 x i32> %5, ptr %s, align 128
store i32 %6, ptr %pos1.i.i, align 64
store ptr %4, ptr %p, align 4
; Step 2: 3D flush, using the pointer %4 returned by step 1.
%7 = load <32 x i32>, ptr %s, align 64
%8 = load i32, ptr %pos1.i.i, align 64
%9 = trunc i32 %off to i20
%10 = trunc i32 %size1 to i20
%11 = load i32, ptr %count1, align 4
%12 = trunc i32 %11 to i20
%13 = trunc i32 %inc1 to i20
%14 = trunc i32 %size2 to i20
%15 = load i32, ptr %count2, align 4
%16 = trunc i32 %15 to i20
%17 = trunc i32 %inc2 to i20
%18 = tail call { ptr, <32 x i32>, i32, i20, i20 } @llvm.aie2p.fifo.st.flush.3d.p0.p0(ptr %4, <32 x i32> %7, i32 %8, i20 %9, i20 %10, i20 %12, i20 %13, i20 %14, i20 %16, i20 %17)
%19 = extractvalue { ptr, <32 x i32>, i32, i20, i20 } %18, 0
%20 = extractvalue { ptr, <32 x i32>, i32, i20, i20 } %18, 1
%21 = extractvalue { ptr, <32 x i32>, i32, i20, i20 } %18, 2
%22 = extractvalue { ptr, <32 x i32>, i32, i20, i20 } %18, 3
%23 = zext i20 %22 to i32
%24 = extractvalue { ptr, <32 x i32>, i32, i20, i20 } %18, 4
%25 = zext i20 %24 to i32
store i32 %23, ptr %count1, align 4
store i32 %25, ptr %count2, align 4
store <32 x i32> %20, ptr %s, align 128
store i32 %21, ptr %pos1.i.i, align 64
store ptr %19, ptr %p, align 4
ret void
}

; Function Attrs: nofree nounwind memory(inaccessiblemem: read)
declare <32 x i8> @llvm.aie2p.pack.I512.I8.I16(<32 x i16>, i32) #3

Expand Down