Skip to content

Avoid broadcast/extract when implementing memset #416

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 2 commits into
base: aie-public
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
22 changes: 1 addition & 21 deletions llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -8556,19 +8556,12 @@ LegalizerHelper::lowerMemset(MachineInstr &MI, Register Dst, Register Val,
bool IsVolatile) {
auto &MF = *MI.getParent()->getParent();
const auto &TLI = *MF.getSubtarget().getTargetLowering();
auto &DL = MF.getDataLayout();
LLVMContext &C = MF.getFunction().getContext();

assert(KnownLen != 0 && "Have a zero length memset length!");

bool DstAlignCanChange = false;
MachineFrameInfo &MFI = MF.getFrameInfo();
const bool DstAlignCanChange = false;
bool OptSize = shouldLowerMemFuncForSize(MF);

MachineInstr *FIDef = getOpcodeDef(TargetOpcode::G_FRAME_INDEX, Dst, MRI);
if (FIDef && !MFI.isFixedObjectIndex(FIDef->getOperand(1).getIndex()))
DstAlignCanChange = true;

unsigned Limit = TLI.getMaxStoresPerMemset(OptSize);
std::vector<LLT> MemOps;

Expand All @@ -8587,19 +8580,6 @@ LegalizerHelper::lowerMemset(MachineInstr &MI, Register Dst, Register Val,
MF.getFunction().getAttributes(), TLI))
return UnableToLegalize;

if (DstAlignCanChange) {
// Get an estimate of the type from the LLT.
Type *IRTy = getTypeForLLT(MemOps[0], C);
Align NewAlign = DL.getABITypeAlign(IRTy);
if (NewAlign > Alignment) {
Alignment = NewAlign;
unsigned FI = FIDef->getOperand(1).getIndex();
// Give the stack frame object a larger alignment if needed.
if (MFI.getObjectAlign(FI) < Alignment)
MFI.setObjectAlignment(FI, Alignment);
}
}
Copy link
Collaborator Author

@martien-de-jong martien-de-jong Apr 3, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes. blunt fix. We cut away an optimization and don't meet it in PR testing.
However, this is not the way. Instead I propose we change the lowering code to get the appropriate alignment in the MMO's of the store instructions that are generated. We need to have a less lazy constructor of that derived MMO. I think that should be perfectly acceptable to upstream.


MachineIRBuilder MIB(MI);
// Find the largest store and generate the bit pattern for it.
LLT LargestTy = MemOps[0];
Expand Down
138 changes: 138 additions & 0 deletions llvm/test/CodeGen/AIE/aie2p/GlobalIsel/legalize-memset.mir
Original file line number Diff line number Diff line change
@@ -0,0 +1,138 @@
# NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
# This file is licensed under the Apache License v2.0 with LLVM Exceptions.
# See https://llvm.org/LICENSE.txt for license information.
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
#
# (c) Copyright 2025 Advanced Micro Devices, Inc. or its affiliates

# RUN: llc --start-after=irtranslator %s -o - | FileCheck %s

--- |
target datalayout = "e-m:e-p:20:32-i1:8:32-i8:8:32-i16:16:32-i32:32:32-f32:32:32-i64:32-f64:32-a:0:32-n32"
target triple = "aie2p"

%struct.S = type { [14 x i32] }

define dso_local void @f() local_unnamed_addr #0 {
; CHECK-LABEL: f:
; CHECK: .p2align 4
; CHECK-NEXT: // %bb.0: // %entry
; CHECK-NEXT: mova m0, #-60; nopb ; nops ; paddxm [sp], #64; nopv
; CHECK-NEXT: nopx ; mov p1, sp
; CHECK-NEXT: mova m0, #4; paddb [p1], m0; movx r0, #0; st lr, [sp, #-64] // 4-byte Folded Spill
; CHECK-NEXT: padda [p1], m0; st r0, [p1, #0]; mov p0, p1
; CHECK-NEXT: st r0, [p1], #4
; CHECK-NEXT: st r0, [p1], #4
; CHECK-NEXT: st r0, [p1], #4; jl #g
; CHECK-NEXT: st r0, [p1], #4 // Delay Slot 5
; CHECK-NEXT: st r0, [p1], #4 // Delay Slot 4
; CHECK-NEXT: st r0, [p1], #4 // Delay Slot 3
; CHECK-NEXT: st r0, [p1, #0] // Delay Slot 2
; CHECK-NEXT: nop // Delay Slot 1
; CHECK-NEXT: lda lr, [sp, #-64]; nopb ; nops ; nopxm ; nopv // 4-byte Folded Reload
; CHECK-NEXT: nopx
; CHECK-NEXT: nop
; CHECK-NEXT: nop
; CHECK-NEXT: nop
; CHECK-NEXT: nop
; CHECK-NEXT: nop
; CHECK-NEXT: ret lr
; CHECK-NEXT: nop // Delay Slot 5
; CHECK-NEXT: nop // Delay Slot 4
; CHECK-NEXT: nop // Delay Slot 3
; CHECK-NEXT: paddxm [sp], #-64 // Delay Slot 2
; CHECK-NEXT: nop // Delay Slot 1
entry:
%s = alloca %struct.S, align 4
call void @llvm.lifetime.start.p0(i64 56, ptr nonnull %s) #4
call void @llvm.memset.p0.i64(ptr noundef nonnull align 4 dereferenceable(56) %s, i8 0, i64 56, i1 false)
call void @g(ptr nonnull %s) #4
call void @llvm.lifetime.end.p0(i64 56, ptr nonnull %s) #4
ret void
}

declare void @llvm.lifetime.start.p0(i64 immarg, ptr nocapture) #1
declare dso_local void @g(ptr) local_unnamed_addr #2
declare void @llvm.lifetime.end.p0(i64 immarg, ptr nocapture) #1
declare void @llvm.memset.p0.i64(ptr nocapture writeonly, i8, i64, i1 immarg) #3

attributes #0 = { nounwind "no-trapping-math"="true" "stack-protector-buffer-size"="8" }
attributes #1 = { nocallback nofree nosync nounwind willreturn memory(argmem: readwrite) }
attributes #2 = { "no-trapping-math"="true" "stack-protector-buffer-size"="8" }
attributes #3 = { nocallback nofree nounwind willreturn memory(argmem: write) }
attributes #4 = { nounwind }

...
---
name: f
alignment: 16
exposesReturnsTwice: false
legalized: false
regBankSelected: false
selected: false
failedISel: false
tracksRegLiveness: true
hasWinCFI: false
callsEHReturn: false
callsUnwindInit: false
hasEHCatchret: false
hasEHScopes: false
hasEHFunclets: false
isOutlined: false
debugInstrRef: false
failsVerification: false
tracksDebugUserValues: false
registers:
- { id: 0, class: _, preferred-register: '' }
- { id: 1, class: _, preferred-register: '' }
- { id: 2, class: _, preferred-register: '' }
- { id: 3, class: _, preferred-register: '' }
liveins: []
frameInfo:
isFrameAddressTaken: false
isReturnAddressTaken: false
hasStackMap: false
hasPatchPoint: false
stackSize: 0
offsetAdjustment: 0
maxAlignment: 4
adjustsStack: false
hasCalls: false
stackProtector: ''
functionContext: ''
maxCallFrameSize: 4294967295
cvBytesOfCalleeSavedRegisters: 0
hasOpaqueSPAdjustment: false
hasVAStart: false
hasMustTailInVarArgFunc: false
hasTailCall: false
localFrameSize: 0
savePoint: ''
restorePoint: ''
fixedStack: []
stack:
- { id: 0, name: s, type: default, offset: 0, size: 56, alignment: 4,
stack-id: default, callee-saved-register: '', callee-saved-restored: true,
debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
entry_values: []
callSites: []
debugValueSubstitutions: []
constants: []
machineFunctionInfo:
varArgsFrameIndex: 0
body: |
bb.1.entry:
%1:_(s8) = G_CONSTANT i8 0
%2:_(s64) = G_CONSTANT i64 32
%0:_(p0) = G_FRAME_INDEX %stack.0.s
LIFETIME_START %stack.0.s
%3:_(s20) = G_TRUNC %2(s64)
G_MEMSET %0(p0), %1(s8), %3(s20), 0 :: (store (s8) into %ir.s, align 4)
ADJCALLSTACKUP 0, 0, implicit-def $sp, implicit $sp
$p0 = COPY %0(p0)
PseudoJL @g, csr_aie2p, implicit-def $lr, implicit $p0
ADJCALLSTACKDOWN 0, 0, implicit-def $sp, implicit $sp
LIFETIME_END %stack.0.s
PseudoRET implicit $lr

...