Skip to content

Commit 5363868

Browse files
Nicoshevmeta-codesync[bot]
authored andcommitted
Improve aarch64 crc32c on short-input
Summary: We are introducing a specialized crc32c routine for small inputs Similar to D95478675 but for crc32c Before: crc32c_8 2.78ns 359.63M crc32c_16 2.74ns 364.88M crc32c_32 2.75ns 363.37M crc32c_64 3.34ns 299.11M crc32c_128 5.77ns 173.31M crc32c_256 10.93ns 91.49M crc32c_384 15.80ns 63.31M crc32c_512 20.65ns 48.42M crc32c_1024 40.15ns 24.91M After: crc32c_8 2.65ns 377.68M crc32c_16 2.72ns 366.98M crc32c_32 2.72ns 367.79M crc32c_64 3.32ns 300.90M crc32c_128 4.53ns 220.59M crc32c_256 9.37ns 106.76M crc32c_384 14.20ns 70.43M crc32c_512 18.53ns 53.96M crc32c_1024 34.27ns 29.18M Reviewed By: yfeldblum Differential Revision: D95633651 fbshipit-source-id: 4b783de6478b0ec4069eff0055607e07559221f1
1 parent fe5a289 commit 5363868

3 files changed

Lines changed: 81 additions & 3 deletions

File tree

folly/external/fast-crc32/neon_eor3_crc32c_v8s2x4e_s2x1.cpp

Lines changed: 73 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,10 @@ CRC_EXPORT uint32_t neon_eor3_crc32c_v8s2x4e_s2x1(const uint8_t*, size_t, uint32
2525
abort(); // not implemented on this platform
2626
}
2727

28+
CRC_EXPORT uint32_t neon_eor3_crc32c_small(const uint8_t*, size_t, uint32_t) {
29+
abort(); // not implemented on this platform
30+
}
31+
2832
CRC_EXPORT bool has_neon_eor3_crc32c_v8s2x4e_s2x1() {
2933
return false;
3034
}
@@ -85,6 +89,75 @@ CRC_EXPORT bool has_neon_eor3_crc32c_v8s2x4e_s2x1() {
8589
caps.aarch64_crc32() && caps.aarch64_sha3();
8690
}
8791

92+
// Mix v2s1x2 and s1x2
93+
CRC_EXPORT uint32_t neon_eor3_crc32c_small(const uint8_t* buf, size_t len, uint32_t crc0) {
94+
for (; len && ((uintptr_t)buf & 7); --len) {
95+
crc0 = __crc32cb(crc0, *buf++);
96+
}
97+
if (len > 384) {
98+
if (((uintptr_t)buf & 8) && len >= 8) {
99+
crc0 = __crc32cd(crc0, *(const uint64_t*)buf);
100+
buf += 8;
101+
len -= 8;
102+
}
103+
size_t blk = (len - 0) / 48;
104+
size_t klen = blk * 16;
105+
const uint8_t* buf2 = buf + klen;
106+
uint64x2_t vc0;
107+
uint64_t vc;
108+
/* First vector chunk. */
109+
uint64x2_t x0 = vld1q_u64((const uint64_t*)buf2), y0;
110+
uint64x2_t x1 = vld1q_u64((const uint64_t*)(buf2 + 16)), y1;
111+
uint64x2_t k;
112+
{ static const uint64_t CRC_ALIGN(16) k_[] = {0x3da6d0cb, 0xba4fc28e}; k = vld1q_u64(k_); }
113+
buf2 += 32;
114+
len -= 48;
115+
/* Main loop. */
116+
while (len >= 48) {
117+
y0 = clmul_lo(x0, k), x0 = clmul_hi(x0, k);
118+
y1 = clmul_lo(x1, k), x1 = clmul_hi(x1, k);
119+
x0 = veor3q_u64(x0, y0, vld1q_u64((const uint64_t*)buf2));
120+
x1 = veor3q_u64(x1, y1, vld1q_u64((const uint64_t*)(buf2 + 16)));
121+
crc0 = __crc32cd(crc0, *(const uint64_t*)buf);
122+
crc0 = __crc32cd(crc0, *(const uint64_t*)(buf + 8));
123+
buf += 16;
124+
buf2 += 32;
125+
len -= 48;
126+
}
127+
/* Reduce x0 ... x1 to just x0. */
128+
{ static const uint64_t CRC_ALIGN(16) k_[] = {0xf20c0dfe, 0x493c7d27}; k = vld1q_u64(k_); }
129+
y0 = clmul_lo(x0, k), x0 = clmul_hi(x0, k);
130+
x0 = veor3q_u64(x0, y0, x1);
131+
/* Final scalar chunk. */
132+
crc0 = __crc32cd(crc0, *(const uint64_t*)buf);
133+
crc0 = __crc32cd(crc0, *(const uint64_t*)(buf + 8));
134+
vc0 = crc_shift(crc0, 0 + blk * 32);
135+
vc = vgetq_lane_u64(vc0, 0);
136+
/* Reduce 128 bits to 32 bits, and multiply by x^32. */
137+
crc0 = __crc32cd(0, vgetq_lane_u64(x0, 0));
138+
crc0 = __crc32cd(crc0, vc ^ vgetq_lane_u64(x0, 1));
139+
buf = buf2;
140+
}
141+
if (len >= 16) {
142+
/* Main loop. */
143+
do {
144+
crc0 = __crc32cd(crc0, *(const uint64_t*)buf);
145+
crc0 = __crc32cd(crc0, *(const uint64_t*)(buf + 8));
146+
buf += 16;
147+
len -= 16;
148+
} while (len >= 16);
149+
}
150+
if (len >= 8) {
151+
crc0 = __crc32cd(crc0, *(const uint64_t*)buf);
152+
len -= 8;
153+
buf += 8;
154+
}
155+
for (; len; --len) {
156+
crc0 = __crc32cb(crc0, *buf++);
157+
}
158+
return crc0;
159+
}
160+
88161
CRC_EXPORT uint32_t neon_eor3_crc32c_v8s2x4e_s2x1(const uint8_t* buf, size_t len, uint32_t crc0) {
89162
for (; len && ((uintptr_t)buf & 7); --len) {
90163
crc0 = __crc32cb(crc0, *buf++);

folly/external/fast-crc32/neon_eor3_crc32c_v8s2x4e_s2x1.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,5 +4,6 @@
44

55
namespace folly::detail {
66
uint32_t neon_eor3_crc32c_v8s2x4e_s2x1(const uint8_t* buf, size_t len, uint32_t crc0);
7+
uint32_t neon_eor3_crc32c_small(const uint8_t* buf, size_t len, uint32_t crc0);
78
bool has_neon_eor3_crc32c_v8s2x4e_s2x1();
89
}

folly/hash/Checksum.cpp

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -221,9 +221,13 @@ uint32_t crc32c(const uint8_t* data, size_t nbytes, uint32_t startingChecksum) {
221221
#endif
222222

223223
#if FOLLY_AARCH64
224-
if (nbytes >= 2048 && detail::crc32c_hw_supported_neon_eor3_sha3()) {
225-
return detail::neon_eor3_crc32c_v8s2x4e_s2x1(
226-
data, nbytes, startingChecksum);
224+
if (detail::crc32c_hw_supported_neon_eor3_sha3()) {
225+
if (nbytes < 1536) {
226+
return detail::neon_eor3_crc32c_small(data, nbytes, startingChecksum);
227+
} else {
228+
return detail::neon_eor3_crc32c_v8s2x4e_s2x1(
229+
data, nbytes, startingChecksum);
230+
}
227231
}
228232

229233
if (nbytes >= 4096 && detail::crc32c_hw_supported_neon()) {

0 commit comments

Comments
 (0)