Skip to content

Commit f74e9f8

Browse files
authored
[Aarch64] Materialize immediates with 64-bit ORR + EOR if shorter (#68287)
A number of useful constants can be encoded with a 64-bit ORR followed by a 64-bit EOR, including all remaining repeated byte patterns, some useful repeated 16-bit patterns, and some irregular masks. This patch prioritizes that encoding over three- or four-instruction encodings. Encoding with MOV + MOVK or ORR + MOVK is still preferred, for fast literal generation and readability respectively.

The method devises three candidate values, and checks whether both Candidate and (Imm ^ Candidate) are valid logical immediates. If so, Imm is materialized with:

```
ORR Xd, XZR, #(Imm ^ Candidate)
EOR Xd, Xd, #(Candidate)
```

The method has been exhaustively tested to ensure it can solve all possible values (excluding 0, ~0, and plain logical immediates, which are handled earlier).
1 parent a2b8c49 commit f74e9f8

File tree

3 files changed

+221
-0
lines changed

3 files changed

+221
-0
lines changed

llvm/lib/Target/AArch64/AArch64ExpandImm.cpp

+103
Original file line numberDiff line numberDiff line change
@@ -362,6 +362,105 @@ static bool tryAndOfLogicalImmediates(uint64_t UImm,
362362
return false;
363363
}
364364

365+
// Check whether the constant can be represented by exclusive-or of two 64-bit
366+
// logical immediates. If so, materialize it with an ORR instruction followed
367+
// by an EOR instruction.
368+
//
369+
// This encoding allows all remaining repeated byte patterns, and many repeated
370+
// 16-bit values, to be encoded without needing four instructions. It can also
371+
// represent some irregular bitmasks (although those would mostly only need
372+
// three instructions otherwise).
//
// Imm is the 64-bit value to materialize. On success two entries are appended
// to Insn -- an ORRXri carrying the encoding of SmallImm, then an EORXri
// carrying the encoding of BigImm (= Imm ^ SmallImm) -- and true is returned.
// On failure Insn is left untouched and false is returned.
373+
static bool tryEorOfLogicalImmediates(uint64_t Imm,
374+
SmallVectorImpl<ImmInsnModel> &Insn) {
375+
// Determine the larger repetition size of the two possible logical
376+
// immediates, by finding the repetition size of Imm.
377+
unsigned BigSize = 64;
378+
379+
// Try widths 32, 16, 8, 4, 2 in turn, comparing the lowest BigSize-bit field
// of Imm with the field directly above it. If every width matches, BigSize
// ends up as 2.
do {
380+
BigSize /= 2;
381+
uint64_t Mask = (1ULL << BigSize) - 1;
382+
383+
if ((Imm & Mask) != ((Imm >> BigSize) & Mask)) {
384+
// The two fields differ at this width, so the previous (doubled) width is
// the repetition size.
BigSize *= 2;
385+
break;
386+
}
387+
} while (BigSize > 2);
388+
389+
// Mask selecting the low BigSize bits, i.e. one repetition of Imm.
uint64_t BigMask = ((uint64_t)-1LL) >> (64 - BigSize);
390+
391+
// Find the last bit of each run of ones, circularly. For runs which wrap
392+
// around from bit 0 to bit 63, this is the bit before the most-significant
393+
// zero, otherwise it is the least-significant bit in the run of ones.
394+
uint64_t RunStarts = Imm & ~rotl<uint64_t>(Imm, 1);
395+
396+
// Find the smaller repetition size of the two possible logical immediates by
397+
// counting the number of runs of one-bits within the BigSize-bit value. Both
398+
// sizes may be the same. The EOR may add one or subtract one from the
399+
// power-of-two count that can be represented by a logical immediate, or it
400+
// may be left unchanged.
401+
int RunsPerBigChunk = popcount(RunStarts & BigMask);
402+
403+
// Indexed by RunsPerBigChunk. Each entry is the right-shift applied to
// BigSize to obtain SmallSize; -1 marks run counts that are rejected.
// NOTE(review): the index must stay below 32. A 64-bit value can only have 32
// runs if it is one of the alternating-bit patterns, for which BigSize is 2
// and the masked count is at most 1 -- confirm.
static const int8_t BigToSmallSizeTable[32] = {
404+
-1, -1, 0, 1, 2, 2, -1, 3, 3, 3, -1, -1, -1, -1, -1, 4,
405+
4, 4, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 5,
406+
};
407+
408+
int BigToSmallShift = BigToSmallSizeTable[RunsPerBigChunk];
409+
410+
// Early-exit if the big chunk couldn't be a power-of-two number of runs
411+
// EORed with another single run.
412+
if (BigToSmallShift == -1)
413+
return false;
414+
415+
unsigned SmallSize = BigSize >> BigToSmallShift;
416+
417+
// 64-bit values with a bit set every (1 << index) bits.
418+
static const uint64_t RepeatedOnesTable[] = {
419+
0xffffffffffffffff, 0x5555555555555555, 0x1111111111111111,
420+
0x0101010101010101, 0x0001000100010001, 0x0000000100000001,
421+
0x0000000000000001,
422+
};
423+
424+
// This RepeatedOnesTable lookup is a faster implementation of the division
425+
// 0xffffffffffffffff / ((1 << SmallSize) - 1), and can be thought of as
426+
// dividing the 64-bit value into fields of width SmallSize, and placing a
427+
// one in the least significant bit of each field.
428+
uint64_t SmallOnes = RepeatedOnesTable[countr_zero(SmallSize)];
429+
430+
// Now we try to find the number of ones in each of the smaller repetitions,
431+
// by looking at runs of ones in Imm. This can take three attempts, as the
432+
// EOR may have changed the length of the first two runs we find.
433+
434+
// Rotate a run of ones so we can count the number of trailing set bits.
435+
int Rotation = countr_zero(RunStarts);
436+
uint64_t RotatedImm = rotr<uint64_t>(Imm, Rotation);
437+
for (int Attempt = 0; Attempt < 3; ++Attempt) {
438+
unsigned RunLength = countr_one(RotatedImm);
439+
440+
// Construct candidate values BigImm and SmallImm, such that if these two
441+
// values are encodable, we have a solution. (SmallImm is constructed to be
442+
// encodable, but this isn't guaranteed when RunLength >= SmallSize)
443+
// (SmallOnes << RunLength) - SmallOnes places RunLength consecutive ones at
// the bottom of every SmallSize-bit field; the rotl moves them back to the
// position of the run being examined.
uint64_t SmallImm =
444+
rotl<uint64_t>((SmallOnes << RunLength) - SmallOnes, Rotation);
445+
uint64_t BigImm = Imm ^ SmallImm;
446+
447+
uint64_t BigEncoding = 0;
448+
uint64_t SmallEncoding = 0;
449+
if (AArch64_AM::processLogicalImmediate(BigImm, 64, BigEncoding) &&
450+
AArch64_AM::processLogicalImmediate(SmallImm, 64, SmallEncoding)) {
451+
// Both halves are encodable: emit ORR with SmallImm, then EOR with BigImm.
Insn.push_back({AArch64::ORRXri, 0, SmallEncoding});
452+
Insn.push_back({AArch64::EORXri, 1, BigEncoding});
453+
return true;
454+
}
455+
456+
// Rotate to the next run of ones
457+
// Bit 0 of the rotated RunStarts is the run just tried; "& ~1" masks it off
// so countr_zero yields the distance to the following run start.
Rotation += countr_zero(rotr<uint64_t>(RunStarts, Rotation) & ~1);
458+
RotatedImm = rotr<uint64_t>(Imm, Rotation);
459+
}
460+
461+
return false;
462+
}
463+
365464
/// \brief Expand a MOVi32imm or MOVi64imm pseudo instruction to a
366465
/// MOVZ or MOVN of width BitSize followed by up to 3 MOVK instructions.
367466
static inline void expandMOVImmSimple(uint64_t Imm, unsigned BitSize,
@@ -503,6 +602,10 @@ void AArch64_IMM::expandMOVImm(uint64_t Imm, unsigned BitSize,
503602
if (tryAndOfLogicalImmediates(Imm, Insn))
504603
return;
505604

605+
// Attempt to use a sequence of ORR-immediate followed by EOR-immediate.
606+
if (tryEorOfLogicalImmediates(UImm, Insn))
607+
return;
608+
506609
// FIXME: Add more two-instruction sequences.
507610

508611
// Three instruction sequences.

llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp

+1
Original file line numberDiff line numberDiff line change
@@ -171,6 +171,7 @@ bool AArch64ExpandPseudo::expandMOVImm(MachineBasicBlock &MBB,
171171
}
172172
break;
173173
case AArch64::ANDXri:
174+
case AArch64::EORXri:
174175
if (I->Op1 == 0) {
175176
MIBS.push_back(BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(I->Opcode))
176177
.add(MI.getOperand(0))

llvm/test/CodeGen/AArch64/arm64-movi.ll

+117
Original file line numberDiff line numberDiff line change
@@ -432,3 +432,120 @@ define i64 @orr_64_orr_8() nounwind {
432432
; CHECK-NEXT: ret
433433
ret i64 -5764607889538110806
434434
}
435+
436+
; Returns 0x6555655565556555; the expected sequence is a mov of
; 0x5555555555555555 (2-bit pattern) then an eor with 0x3000300030003000
; (16-bit pattern).
define i64 @orr_2_eor_16() nounwind {
437+
; CHECK-LABEL: orr_2_eor_16:
438+
; CHECK: // %bb.0:
439+
; CHECK-NEXT: mov x0, #6148914691236517205
440+
; CHECK-NEXT: eor x0, x0, #0x3000300030003000
441+
; CHECK-NEXT: ret
442+
ret i64 7301853788297848149
443+
}
444+
445+
; Returns 0x554aaa95554aaa95; the expected sequence is a mov of
; 0x5555555555555555 (2-bit pattern) then an eor with 0x001fffc0001fffc0
; (32-bit pattern).
define i64 @orr_2_eor_32() nounwind {
446+
; CHECK-LABEL: orr_2_eor_32:
447+
; CHECK: // %bb.0:
448+
; CHECK-NEXT: mov x0, #6148914691236517205
449+
; CHECK-NEXT: eor x0, x0, #0x1fffc0001fffc0
450+
; CHECK-NEXT: ret
451+
ret i64 6145912199858268821
452+
}
453+
454+
; Returns 0x5554aaaaaaaaa955; the expected sequence is a mov of
; 0x5555555555555555 (2-bit pattern) then an eor with 0x0001fffffffffc00
; (a single 64-bit run).
define i64 @orr_2_eor_64() nounwind {
455+
; CHECK-LABEL: orr_2_eor_64:
456+
; CHECK: // %bb.0:
457+
; CHECK-NEXT: mov x0, #6148914691236517205
458+
; CHECK-NEXT: eor x0, x0, #0x1fffffffffc00
459+
; CHECK-NEXT: ret
460+
ret i64 6148727041252043093
461+
}
462+
463+
; Returns 0xadadadadadadadad; the expected sequence is a mov of
; 0x2222222222222222 (4-bit pattern) then an eor with 0x8f8f8f8f8f8f8f8f
; (8-bit pattern).
define i64 @orr_4_eor_8() nounwind {
464+
; CHECK-LABEL: orr_4_eor_8:
465+
; CHECK: // %bb.0:
466+
; CHECK-NEXT: mov x0, #2459565876494606882
467+
; CHECK-NEXT: eor x0, x0, #0x8f8f8f8f8f8f8f8f
468+
; CHECK-NEXT: ret
469+
ret i64 12514849900987264429
470+
}
471+
472+
; Returns 0xb44bb44bb44bb44b; the expected sequence is a mov of
; 0x4444444444444444 (4-bit pattern) then an eor with 0xf00ff00ff00ff00f
; (16-bit pattern).
define i64 @orr_4_eor_16() nounwind {
473+
; CHECK-LABEL: orr_4_eor_16:
474+
; CHECK: // %bb.0:
475+
; CHECK-NEXT: mov x0, #4919131752989213764
476+
; CHECK-NEXT: eor x0, x0, #0xf00ff00ff00ff00f
477+
; CHECK-NEXT: ret
478+
ret i64 12991675787320734795
479+
}
480+
481+
; Returns 0x5bbc44445bbc4444; the expected sequence is a mov of
; 0x4444444444444444 (4-bit pattern) then an eor with 0x1ff800001ff80000
; (32-bit pattern).
define i64 @orr_4_eor_32() nounwind {
482+
; CHECK-LABEL: orr_4_eor_32:
483+
; CHECK: // %bb.0:
484+
; CHECK-NEXT: mov x0, #4919131752989213764
485+
; CHECK-NEXT: eor x0, x0, #0x1ff800001ff80000
486+
; CHECK-NEXT: ret
487+
ret i64 6610233413460575300
488+
}
489+
490+
; Returns 0x11111eee91111111; the expected sequence is a mov of
; 0x1111111111111111 (4-bit pattern) then an eor with 0x00000fff80000000
; (a single 64-bit run).
define i64 @orr_4_eor_64() nounwind {
491+
; CHECK-LABEL: orr_4_eor_64:
492+
; CHECK: // %bb.0:
493+
; CHECK-NEXT: mov x0, #1229782938247303441
494+
; CHECK-NEXT: eor x0, x0, #0xfff80000000
495+
; CHECK-NEXT: ret
496+
ret i64 1229798183233720593
497+
}
498+
499+
; Returns 0x2fb02fb02fb02fb0; the expected sequence is a mov of
; 0x3030303030303030 (8-bit pattern) then an eor with 0x1f801f801f801f80
; (16-bit pattern).
define i64 @orr_8_eor_16() nounwind {
500+
; CHECK-LABEL: orr_8_eor_16:
501+
; CHECK: // %bb.0:
502+
; CHECK-NEXT: mov x0, #3472328296227680304
503+
; CHECK-NEXT: eor x0, x0, #0x1f801f801f801f80
504+
; CHECK-NEXT: ret
505+
ret i64 3436298949444513712
506+
}
507+
508+
; Returns 0xefef9011efef9011; the expected sequence is a mov of
; 0x1010101010101010 (8-bit pattern) then an eor with 0xffff8001ffff8001
; (32-bit pattern).
define i64 @orr_8_eor_32() nounwind {
509+
; CHECK-LABEL: orr_8_eor_32:
510+
; CHECK: // %bb.0:
511+
; CHECK-NEXT: mov x0, #1157442765409226768
512+
; CHECK-NEXT: eor x0, x0, #0xffff8001ffff8001
513+
; CHECK-NEXT: ret
514+
ret i64 17289195901212921873
515+
}
516+
517+
; Returns 0x300fcfcfcfc03030; the expected sequence is a mov of
; 0x3030303030303030 (8-bit pattern) then an eor with 0x003ffffffff00000
; (a single 64-bit run).
define i64 @orr_8_eor_64() nounwind {
518+
; CHECK-LABEL: orr_8_eor_64:
519+
; CHECK: // %bb.0:
520+
; CHECK-NEXT: mov x0, #3472328296227680304
521+
; CHECK-NEXT: eor x0, x0, #0x3ffffffff00000
522+
; CHECK-NEXT: ret
523+
ret i64 3463215129921859632
524+
}
525+
526+
; Returns 0xf01f0fe1f01f0fe1; the expected sequence is a mov of
; 0x0fe00fe00fe00fe0 (16-bit pattern) then an eor with 0xffff0001ffff0001
; (32-bit pattern).
define i64 @orr_16_eor_32() nounwind {
527+
; CHECK-LABEL: orr_16_eor_32:
528+
; CHECK: // %bb.0:
529+
; CHECK-NEXT: mov x0, #1143931760365539296
530+
; CHECK-NEXT: eor x0, x0, #0xffff0001ffff0001
531+
; CHECK-NEXT: ret
532+
ret i64 17302565756451360737
533+
}
534+
535+
; Returns 0x7fe07fe07fef9fe0; the expected sequence is a mov of
; 0x7fe07fe07fe07fe0 (16-bit pattern) then an eor with 0x00000000000fe000
; (a single 64-bit run).
define i64 @orr_16_eor_64() nounwind {
536+
; CHECK-LABEL: orr_16_eor_64:
537+
; CHECK: // %bb.0:
538+
; CHECK-NEXT: mov x0, #9214505439794855904
539+
; CHECK-NEXT: eor x0, x0, #0xfe000
540+
; CHECK-NEXT: ret
541+
ret i64 9214505439795847136
542+
}
543+
544+
; Returns 0xffff80f0003fff0f; the expected sequence is a mov of
; 0x000000f0000000f0 (32-bit pattern) then an eor with 0xffff8000003fffff
; (a single 64-bit run that wraps around bit 63).
define i64 @orr_32_eor_64() nounwind {
545+
; CHECK-LABEL: orr_32_eor_64:
546+
; CHECK: // %bb.0:
547+
; CHECK-NEXT: mov x0, #1030792151280
548+
; CHECK-NEXT: eor x0, x0, #0xffff8000003fffff
549+
; CHECK-NEXT: ret
550+
ret i64 18446604367017541391
551+
}

0 commit comments

Comments
 (0)