
Commit 3bbe27e

sstrickl authored and Commit Queue committed
[vm/compiler] Perform word-sized copies in MemoryMove when possible.
When copying elements that are smaller than words on non-X86 architectures,
only copy element by element until the remaining elements can be copied in
word-sized chunks, and then do so.

TEST=vm/cc/IRTest_Memory, co19{,_2}/LibTest/typed_data, lib{,_2}/typed_data,
corelib{,_2}/list_test

Issue: #42072
Cq-Include-Trybots: luci.dart.try:vm-aot-linux-debug-simarm_x64-try,vm-aot-linux-debug-simriscv64-try,vm-linux-debug-simriscv64-try,vm-mac-debug-arm64-try,vm-aot-linux-release-simarm64-try,vm-aot-linux-release-simarm_x64-try,vm-aot-mac-release-arm64-try,vm-ffi-qemu-linux-release-riscv64-try,vm-ffi-qemu-linux-release-arm-try,vm-linux-release-simarm-try,vm-linux-release-simarm64-try,vm-mac-release-arm64-try,vm-aot-android-release-arm64c-try,vm-ffi-android-debug-arm64c-try
Change-Id: I61eab310b92a6bc5ebd88fa63d562103d887cb74
Reviewed-on: https://dart-review.googlesource.com/c/sdk/+/324280
Reviewed-by: Alexander Markov <[email protected]>
Commit-Queue: Tess Strickland <[email protected]>
Reviewed-by: Ryan Macnak <[email protected]>
1 parent 622f407 commit 3bbe27e
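
As a reading aid for the diff below, here is a minimal host-side C++ sketch of the copying strategy the commit message describes (purely illustrative; WordWiseCopy is a hypothetical stand-in, not the VM helper, which instead tests individual bits of the length register, and this sketch only shows the forward-copy case):

#include <cstddef>
#include <cstdint>
#include <cstring>

// Copy sub-word elements one at a time only until the remaining byte count
// is a multiple of the word size, then copy the rest word by word.
void WordWiseCopy(uint8_t* dest, const uint8_t* src, size_t length,
                  size_t element_size) {
  size_t bytes = length * element_size;
  // Peel off single elements while the remainder is not word-sized; for a
  // power-of-two element size this runs at most (word size / element size) - 1
  // times, which is what CopyUpToWordMultiple unrolls into bit tests.
  while (bytes % sizeof(uintptr_t) != 0) {
    memcpy(dest, src, element_size);
    dest += element_size;
    src += element_size;
    bytes -= element_size;
  }
  // The rest is a whole number of words (the main copy loop in the backends).
  for (; bytes != 0; bytes -= sizeof(uintptr_t)) {
    memcpy(dest, src, sizeof(uintptr_t));
    dest += sizeof(uintptr_t);
    src += sizeof(uintptr_t);
  }
}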

File tree

7 files changed: +371 -142 lines changed


runtime/vm/compiler/assembler/assembler_arm.cc

Lines changed: 40 additions & 14 deletions
@@ -2821,16 +2821,29 @@ Address Assembler::PrepareLargeLoadOffset(const Address& address,
   if (address.kind() != Address::Immediate) {
     return address;
   }
-  Register base = address.base();
   int32_t offset = address.offset();
   int32_t offset_mask = 0;
-  if (!Address::CanHoldLoadOffset(size, offset, &offset_mask)) {
-    ASSERT(base != IP);
-    AddImmediate(IP, base, offset & ~offset_mask, cond);
-    base = IP;
-    offset = offset & offset_mask;
+  if (Address::CanHoldLoadOffset(size, offset, &offset_mask)) {
+    return address;
   }
-  return Address(base, offset);
+  auto mode = address.mode();
+  // If the retrieved offset is negative, then the U bit was flipped during
+  // encoding, so re-flip it.
+  if (offset < 0) {
+    mode = static_cast<Address::Mode>(mode ^ U);
+  }
+  // If writing back post-indexing, we can't separate the instruction into
+  // two parts and the offset must fit.
+  ASSERT((mode | U) != Address::PostIndex);
+  // If we're writing back pre-indexing, we must add directly to the base,
+  // otherwise we use TMP.
+  Register base = address.base();
+  ASSERT(base != TMP || address.has_writeback());
+  Register temp = address.has_writeback() ? base : TMP;
+  AddImmediate(temp, base, offset & ~offset_mask, cond);
+  base = temp;
+  offset = offset & offset_mask;
+  return Address(base, offset, mode);
 }
 
 Address Assembler::PrepareLargeStoreOffset(const Address& address,
@@ -2840,16 +2853,29 @@ Address Assembler::PrepareLargeStoreOffset(const Address& address,
   if (address.kind() != Address::Immediate) {
     return address;
   }
-  Register base = address.base();
   int32_t offset = address.offset();
   int32_t offset_mask = 0;
-  if (!Address::CanHoldStoreOffset(size, offset, &offset_mask)) {
-    ASSERT(base != IP);
-    AddImmediate(IP, base, offset & ~offset_mask, cond);
-    base = IP;
-    offset = offset & offset_mask;
+  if (Address::CanHoldStoreOffset(size, offset, &offset_mask)) {
+    return address;
   }
-  return Address(base, offset);
+  auto mode = address.mode();
+  // If the retrieved offset is negative, then the U bit was flipped during
+  // encoding, so re-flip it.
+  if (offset < 0) {
+    mode = static_cast<Address::Mode>(mode ^ U);
+  }
+  // If writing back post-indexing, we can't separate the instruction into
+  // two parts and the offset must fit.
+  ASSERT((mode | U) != Address::PostIndex);
+  // If we're writing back pre-indexing, we must add directly to the base,
+  // otherwise we use TMP.
+  Register base = address.base();
+  ASSERT(base != TMP || address.has_writeback());
+  Register temp = address.has_writeback() ? base : TMP;
+  AddImmediate(temp, base, offset & ~offset_mask, cond);
+  base = temp;
+  offset = offset & offset_mask;
+  return Address(base, offset, mode);
 }
 
 void Assembler::LoadFromOffset(Register reg,
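
A quick sanity check on the offset-splitting arithmetic in PrepareLargeLoadOffset/PrepareLargeStoreOffset above, as a host-side sketch (the struct and function names here are hypothetical; only the `offset & ~offset_mask` / `offset & offset_mask` split comes from the diff):

#include <cassert>
#include <cstdint>

// When an offset does not fit the instruction's immediate field, the high
// part is folded into the base register (AddImmediate into a temp, or into
// the base itself when writing back) and only the low part is encoded.
struct SplitOffset {
  int32_t add_to_base;  // becomes AddImmediate(temp, base, add_to_base)
  int32_t residual;     // becomes the instruction's immediate offset
};

SplitOffset SplitLargeOffset(int32_t offset, int32_t offset_mask) {
  SplitOffset split;
  split.add_to_base = offset & ~offset_mask;
  split.residual = offset & offset_mask;
  // The two parts occupy disjoint bits, so they always sum back to offset.
  assert(split.add_to_base + split.residual == offset);
  return split;
}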

runtime/vm/compiler/assembler/assembler_arm.h

Lines changed: 4 additions & 1 deletion
@@ -257,6 +257,9 @@ class Address : public ValueObject {
     kind_ = Immediate;
     base_ = rn;
     offset_ = offset;
+    // If the offset can't be encoded in fewer bits, then it'll conflict with
+    // the encoding of the mode and we won't be able to retrieve it later.
+    ASSERT(Utils::MagnitudeIsUint(kOpcodeShift, offset));
     if (offset < 0) {
       encoding_ = (am ^ (1 << kUShift)) | -offset;  // Flip U to adjust sign.
     } else {
@@ -310,7 +313,7 @@ class Address : public ValueObject {
               : kNoRegister;
   }
 
-  Mode mode() const { return static_cast<Mode>(encoding() & kModeMask); }
+  Mode mode() const { return static_cast<Mode>(encoding_ & kModeMask); }
 
   bool has_writeback() const {
     return (mode() == PreIndex) || (mode() == PostIndex) ||
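
The new ASSERT in the Address constructor guards the packing that the mode()/encoding_ fix relies on; a rough sketch of the constraint follows (the bit position used below is an assumption for illustration only, the real check uses kOpcodeShift):

#include <cstdint>

// The Address encoding packs the addressing-mode bits above the offset
// magnitude in a single 32-bit word, so an offset whose magnitude reaches
// those bits could not be recovered by mode() later.
constexpr int kAssumedModeShift = 21;  // illustrative value only

bool OffsetMagnitudeFits(int32_t offset) {
  const int64_t magnitude = offset < 0 ? -static_cast<int64_t>(offset)
                                       : static_cast<int64_t>(offset);
  return magnitude < (int64_t{1} << kAssumedModeShift);
}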

runtime/vm/compiler/backend/il.cc

Lines changed: 2 additions & 0 deletions
@@ -6775,6 +6775,7 @@ void MemoryCopyInstr::EmitNativeCode(FlowGraphCompiler* compiler) {
     // the loop case.
     __ BranchIf(UNSIGNED_LESS_EQUAL, &copy_forwards,
                 compiler::Assembler::kNearJump);
+    __ Comment("Copying backwards");
     if (constant_length) {
       EmitUnrolledCopy(compiler, dest_reg, src_reg, num_elements,
                        /*reversed=*/true);
@@ -6783,6 +6784,7 @@ void MemoryCopyInstr::EmitNativeCode(FlowGraphCompiler* compiler) {
                    &copy_forwards);
     }
     __ Jump(&done, compiler::Assembler::kNearJump);
+    __ Comment("Copying forwards");
   }
   __ Bind(&copy_forwards);
   if (constant_length) {

runtime/vm/compiler/backend/il_arm.cc

Lines changed: 86 additions & 26 deletions
@@ -243,33 +243,88 @@ void MemoryCopyInstr::PrepareLengthRegForLoop(FlowGraphCompiler* compiler,
   __ BranchIfZero(length_reg, done);
 }
 
+static compiler::OperandSize OperandSizeFor(intptr_t bytes) {
+  ASSERT(Utils::IsPowerOfTwo(bytes));
+  switch (bytes) {
+    case 1:
+      return compiler::kUnsignedByte;
+    case 2:
+      return compiler::kUnsignedTwoBytes;
+    case 4:
+      return compiler::kUnsignedFourBytes;
+    case 8:
+      return compiler::kEightBytes;
+    default:
+      UNREACHABLE();
+      return compiler::kEightBytes;
+  }
+}
+
+static void CopyUpToWordMultiple(FlowGraphCompiler* compiler,
+                                 Register dest_reg,
+                                 Register src_reg,
+                                 Register length_reg,
+                                 intptr_t element_size,
+                                 bool unboxed_inputs,
+                                 bool reversed,
+                                 compiler::Label* done) {
+  ASSERT(Utils::IsPowerOfTwo(element_size));
+  if (element_size >= compiler::target::kWordSize) return;
+
+  const intptr_t base_shift = (unboxed_inputs ? 0 : kSmiTagShift) -
+                              Utils::ShiftForPowerOfTwo(element_size);
+  auto const mode =
+      reversed ? compiler::Address::NegPreIndex : compiler::Address::PostIndex;
+  intptr_t tested_bits = 0;
+
+  __ Comment("Copying until region is a multiple of word size");
+
+  for (intptr_t bit = compiler::target::kWordSizeLog2 - 1; bit >= 0; bit--) {
+    const intptr_t bytes = 1 << bit;
+    if (element_size > bytes) continue;
+    auto const sz = OperandSizeFor(bytes);
+    const intptr_t tested_bit = bit + base_shift;
+    tested_bits |= (1 << tested_bit);
+    __ tst(length_reg, compiler::Operand(1 << tested_bit));
+    __ LoadFromOffset(TMP, compiler::Address(src_reg, bytes, mode), sz,
+                      NOT_ZERO);
+    __ StoreToOffset(TMP, compiler::Address(dest_reg, bytes, mode), sz,
+                     NOT_ZERO);
+  }
+
+  __ bics(length_reg, length_reg, compiler::Operand(tested_bits));
+  __ b(done, ZERO);
+}
+
 void MemoryCopyInstr::EmitLoopCopy(FlowGraphCompiler* compiler,
                                    Register dest_reg,
                                    Register src_reg,
                                    Register length_reg,
                                    compiler::Label* done,
                                    compiler::Label* copy_forwards) {
-  const intptr_t loop_subtract = unboxed_inputs() ? 1 : Smi::RawValue(1);
-  auto load_mode = compiler::Address::PostIndex;
-  auto load_multiple_mode = BlockAddressMode::IA_W;
-  if (copy_forwards != nullptr) {
-    // When reversed, start the src and dest registers with the end addresses
-    // and apply the negated offset prior to indexing.
-    load_mode = compiler::Address::NegPreIndex;
-    load_multiple_mode = BlockAddressMode::DB_W;
+  const bool reversed = copy_forwards != nullptr;
+  if (reversed) {
     // Verify that the overlap actually exists by checking to see if
     // dest_start < src_end.
     const intptr_t shift = Utils::ShiftForPowerOfTwo(element_size_) -
                            (unboxed_inputs() ? 0 : kSmiTagShift);
     if (shift < 0) {
-      __ add(TMP, src_reg, compiler::Operand(length_reg, ASR, -shift));
+      __ add(src_reg, src_reg, compiler::Operand(length_reg, ASR, -shift));
     } else {
-      __ add(TMP, src_reg, compiler::Operand(length_reg, LSL, shift));
+      __ add(src_reg, src_reg, compiler::Operand(length_reg, LSL, shift));
     }
-    __ CompareRegisters(dest_reg, TMP);
-    __ BranchIf(UNSIGNED_GREATER_EQUAL, copy_forwards);
-    // There is overlap, so mov TMP to src_reg and adjust dest_reg now.
-    __ MoveRegister(src_reg, TMP);
+    __ CompareRegisters(dest_reg, src_reg);
+    // If dest_reg >= src_reg, then set src_reg back to the start of the source
+    // region before branching to the forwards-copying loop.
+    if (shift < 0) {
+      __ sub(src_reg, src_reg, compiler::Operand(length_reg, ASR, -shift),
+             UNSIGNED_GREATER_EQUAL);
+    } else {
+      __ sub(src_reg, src_reg, compiler::Operand(length_reg, LSL, shift),
+             UNSIGNED_GREATER_EQUAL);
+    }
+    __ b(copy_forwards, UNSIGNED_GREATER_EQUAL);
+    // There is overlap, so adjust dest_reg now.
     if (shift < 0) {
       __ add(dest_reg, dest_reg, compiler::Operand(length_reg, ASR, -shift));
     } else {
@@ -279,29 +334,34 @@ void MemoryCopyInstr::EmitLoopCopy(FlowGraphCompiler* compiler,
   // We can use TMP for all instructions below because element_size_ is
   // guaranteed to fit in the offset portion of the instruction in the
   // non-LDM/STM cases.
-  compiler::Address src_address =
-      compiler::Address(src_reg, element_size_, load_mode);
-  compiler::Address dest_address =
-      compiler::Address(dest_reg, element_size_, load_mode);
+  CopyUpToWordMultiple(compiler, dest_reg, src_reg, length_reg, element_size_,
+                       unboxed_inputs_, reversed, done);
+  // When reversed, the src and dest registers have been adjusted to start at
+  // the end addresses, so apply the negated offset prior to indexing.
+  const auto load_mode =
+      reversed ? compiler::Address::NegPreIndex : compiler::Address::PostIndex;
+  const auto load_multiple_mode =
+      reversed ? BlockAddressMode::DB_W : BlockAddressMode::IA_W;
+  // The size of the uncopied region is a multiple of the word size, so now we
+  // copy the rest by word (unless the element size is larger).
+  const intptr_t loop_subtract =
+      Utils::Maximum<intptr_t>(1, compiler::target::kWordSize / element_size_)
+      << (unboxed_inputs_ ? 0 : kSmiTagShift);
   // Used only for LDM/STM below.
   RegList temp_regs = (1 << TMP);
   for (intptr_t i = 0; i < locs()->temp_count(); i++) {
     temp_regs |= 1 << locs()->temp(i).reg();
   }
+  __ Comment("Copying by multiples of word size");
   compiler::Label loop;
   __ Bind(&loop);
   switch (element_size_) {
+    // Fall through for the sizes smaller than compiler::target::kWordSize.
    case 1:
-      __ ldrb(TMP, src_address);
-      __ strb(TMP, dest_address);
-      break;
    case 2:
-      __ ldrh(TMP, src_address);
-      __ strh(TMP, dest_address);
-      break;
    case 4:
-      __ ldr(TMP, src_address);
-      __ str(TMP, dest_address);
+      __ ldr(TMP, compiler::Address(src_reg, 4, load_mode));
+      __ str(TMP, compiler::Address(dest_reg, 4, load_mode));
       break;
    case 8:
      COMPILE_ASSERT(8 == kMaxMemoryCopyElementSize);
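
To make the bit-testing in the new ARM CopyUpToWordMultiple concrete, here is a host-side sketch of the forward-copy case for 1-byte elements on a 32-bit target with a Smi-tagged length (the function is hypothetical and assumes kSmiTagShift == 1; the emitted code does the same work with tst/bics and conditional loads and stores):

#include <cstdint>
#include <cstring>

// With element_size == 1 and a tagged length, base_shift == 1, so the
// emitted code tests length bit 2 (copy 2 bytes) and length bit 1 (copy 1
// byte), then clears those bits with bics and exits early if nothing is left.
void CopyUpToWordMultipleSketch(uint8_t*& dest, const uint8_t*& src,
                                uint32_t& tagged_length) {
  const uint32_t elements = tagged_length >> 1;  // untag the Smi length
  if (elements & 2) {  // tst length, #(1 << 2) on the tagged value
    memcpy(dest, src, 2);
    dest += 2;
    src += 2;
  }
  if (elements & 1) {  // tst length, #(1 << 1) on the tagged value
    memcpy(dest, src, 1);
    dest += 1;
    src += 1;
  }
  // bics length, length, #0b110: the remaining element count is now a
  // multiple of 4, so the main loop can copy one word per iteration.
  tagged_length &= ~UINT32_C(6);
}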

runtime/vm/compiler/backend/il_arm64.cc

Lines changed: 80 additions & 23 deletions
@@ -175,22 +175,71 @@ void MemoryCopyInstr::PrepareLengthRegForLoop(FlowGraphCompiler* compiler,
   __ BranchIfZero(length_reg, done);
 }
 
+static compiler::OperandSize OperandSizeFor(intptr_t bytes) {
+  ASSERT(Utils::IsPowerOfTwo(bytes));
+  switch (bytes) {
+    case 1:
+      return compiler::kUnsignedByte;
+    case 2:
+      return compiler::kUnsignedTwoBytes;
+    case 4:
+      return compiler::kUnsignedFourBytes;
+    case 8:
+      return compiler::kEightBytes;
+    default:
+      UNREACHABLE();
+      return compiler::kEightBytes;
+  }
+}
+
+static void CopyUpToWordMultiple(FlowGraphCompiler* compiler,
+                                 Register dest_reg,
+                                 Register src_reg,
+                                 Register length_reg,
+                                 intptr_t element_size,
+                                 bool unboxed_inputs,
+                                 bool reversed,
+                                 compiler::Label* done) {
+  ASSERT(Utils::IsPowerOfTwo(element_size));
+  if (element_size >= compiler::target::kWordSize) return;
+
+  const intptr_t base_shift = (unboxed_inputs ? 0 : kSmiTagShift) -
+                              Utils::ShiftForPowerOfTwo(element_size);
+  const intptr_t offset_sign = reversed ? -1 : 1;
+  auto const mode =
+      reversed ? compiler::Address::PreIndex : compiler::Address::PostIndex;
+  intptr_t tested_bits = 0;
+
+  __ Comment("Copying until region is a multiple of word size");
+
+  for (intptr_t bit = compiler::target::kWordSizeLog2 - 1; bit >= 0; bit--) {
+    const intptr_t bytes = 1 << bit;
+    if (element_size > bytes) continue;
+    auto const sz = OperandSizeFor(bytes);
+    const intptr_t tested_bit = bit + base_shift;
+    tested_bits |= (1 << tested_bit);
+    const intptr_t offset = offset_sign * bytes;
+    compiler::Label skip_copy;
+    __ tbz(&skip_copy, length_reg, tested_bit);
+    __ ldr(TMP, compiler::Address(src_reg, offset, mode), sz);
+    __ str(TMP, compiler::Address(dest_reg, offset, mode), sz);
+    __ Bind(&skip_copy);
+  }
+
+  ASSERT(tested_bits != 0);
+  __ andis(length_reg, length_reg, compiler::Immediate(~tested_bits),
+           compiler::kObjectBytes);
+  __ b(done, ZERO);
+}
+
 void MemoryCopyInstr::EmitLoopCopy(FlowGraphCompiler* compiler,
                                    Register dest_reg,
                                    Register src_reg,
                                    Register length_reg,
                                    compiler::Label* done,
                                    compiler::Label* copy_forwards) {
-  const intptr_t loop_subtract = unboxed_inputs() ? 1 : Smi::RawValue(1);
-  intptr_t offset = element_size_;
-  auto mode = element_size_ == 16 ? compiler::Address::PairPostIndex
-                                  : compiler::Address::PostIndex;
-  if (copy_forwards != nullptr) {
-    // When reversed, start the src and dest registers with the end addresses
-    // and apply the negated offset prior to indexing.
-    offset = -element_size_;
-    mode = element_size_ == 16 ? compiler::Address::PairPreIndex
-                               : compiler::Address::PreIndex;
+  const bool reversed = copy_forwards != nullptr;
+  if (reversed) {
     // Verify that the overlap actually exists by checking to see if
     // dest_start < src_end.
     if (!unboxed_inputs()) {
@@ -213,24 +262,33 @@ void MemoryCopyInstr::EmitLoopCopy(FlowGraphCompiler* compiler,
       __ add(dest_reg, dest_reg, compiler::Operand(length_reg, LSL, shift));
     }
   }
-
-  compiler::Address src_address = compiler::Address(src_reg, offset, mode);
-  compiler::Address dest_address = compiler::Address(dest_reg, offset, mode);
+  CopyUpToWordMultiple(compiler, dest_reg, src_reg, length_reg, element_size_,
+                       unboxed_inputs_, reversed, done);
+  // When reversed, the src and dest registers are adjusted to start with the
+  // end addresses, so apply the negated offset prior to indexing.
+  const intptr_t offset =
+      (reversed ? -1 : 1) *
+      Utils::Maximum<intptr_t>(compiler::target::kWordSize, element_size_);
+  const auto mode = element_size_ == 16
+                        ? (reversed ? compiler::Address::PairPreIndex
+                                    : compiler::Address::PairPostIndex)
+                        : (reversed ? compiler::Address::PreIndex
+                                    : compiler::Address::PostIndex);
+  // The size of the uncopied region is a multiple of the word size, so now we
+  // copy the rest by word (unless the element size is larger).
+  const intptr_t loop_subtract =
+      Utils::Maximum<intptr_t>(1, compiler::target::kWordSize / element_size_)
+      << (unboxed_inputs_ ? 0 : kSmiTagShift);
+  const auto src_address = compiler::Address(src_reg, offset, mode);
+  const auto dest_address = compiler::Address(dest_reg, offset, mode);
+  __ Comment("Copying by multiples of word size");
   compiler::Label loop;
   __ Bind(&loop);
   switch (element_size_) {
+    // Fall through for the sizes smaller than compiler::target::kWordSize.
    case 1:
-      __ ldr(TMP, src_address, compiler::kUnsignedByte);
-      __ str(TMP, dest_address, compiler::kUnsignedByte);
-      break;
    case 2:
-      __ ldr(TMP, src_address, compiler::kUnsignedTwoBytes);
-      __ str(TMP, dest_address, compiler::kUnsignedTwoBytes);
-      break;
    case 4:
-      __ ldr(TMP, src_address, compiler::kUnsignedFourBytes);
-      __ str(TMP, dest_address, compiler::kUnsignedFourBytes);
-      break;
    case 8:
      __ ldr(TMP, src_address, compiler::kEightBytes);
      __ str(TMP, dest_address, compiler::kEightBytes);
@@ -243,7 +301,6 @@ void MemoryCopyInstr::EmitLoopCopy(FlowGraphCompiler* compiler,
       UNREACHABLE();
       break;
   }
-
   __ subs(length_reg, length_reg, compiler::Operand(loop_subtract),
           compiler::kObjectBytes);
   __ b(&loop, NOT_ZERO);
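
Finally, the loop_subtract expression shared by the ARM and ARM64 diffs can be checked with a small host-side sketch (the function name and the kWordSize/kSmiTagShift values below are assumptions for a 64-bit target, not the VM's constants):

#include <algorithm>
#include <cstdint>

constexpr intptr_t kWordSize = 8;     // assumed 64-bit target
constexpr intptr_t kSmiTagShift = 1;  // assumed Smi tag width

// Each main-loop iteration now copies max(word size, element size) bytes,
// so the (possibly Smi-tagged) length must drop by the number of elements
// consumed per iteration rather than by one.
intptr_t LoopSubtract(intptr_t element_size, bool unboxed_inputs) {
  const intptr_t elements_per_iteration =
      std::max<intptr_t>(1, kWordSize / element_size);
  return elements_per_iteration << (unboxed_inputs ? 0 : kSmiTagShift);
}

// Examples: LoopSubtract(1, true) == 8, LoopSubtract(1, false) == 16,
// LoopSubtract(16, false) == 2 (one 16-byte element per iteration, tagged).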
