-
Notifications
You must be signed in to change notification settings - Fork 14.9k
Closed
Labels
Description
https://godbolt.org/z/a1PczEM8a
When a select chooses between a value and that value minus a non-constant operand, we fold the select into an AND of the mask with the subtrahend, followed by the subtraction:
#include <x86intrin.h>
// Reproducer (non-constant subtrahend).
// For each 32-bit lane: yields a - b where x > y (signed compare), else a.
// _mm_cmpgt_epi32 produces an all-ones / all-zeros mask per lane, which
// _mm_blendv_epi8 uses to pick between `a` and `a - b`.
auto masked_select(__m128i a, __m128i b, __m128i x, __m128i y) {
return _mm_blendv_epi8(a, _mm_sub_epi32(a, b), _mm_cmpgt_epi32(x,y));
}
masked_select(long long __vector(2), long long __vector(2), long long __vector(2), long long __vector(2)): # @masked_select(long long __vector(2), long long __vector(2), long long __vector(2), long long __vector(2))
pcmpgtd %xmm3, %xmm2
pand %xmm1, %xmm2
psubd %xmm2, %xmm0
retq
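But when the subtrahend is a constant, this fold does not happen. On x86 this can result in a BLENDV instruction, which is never faster than an AND. Note that in the output below the subtraction of 24 appears as an add of the negated constant (4294967272 = 2^32 - 24):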
#include <x86intrin.h>
// Reproducer (constant subtrahend).
// For each 32-bit lane: yields a - 24 where x > y (signed compare), else a.
auto masked_select_const(__m128i a, __m128i x, __m128i y) {
// Broadcast the constant 24 into all four 32-bit lanes.
__m128i b = _mm_set1_epi32(24);
// The compare result serves as the per-lane blend mask between `a` and `a - b`.
return _mm_blendv_epi8(a, _mm_sub_epi32(a, b), _mm_cmpgt_epi32(x,y));
}
masked_select_const(long long __vector(2), long long __vector(2), long long __vector(2)): # @masked_select_const(long long __vector(2), long long __vector(2), long long __vector(2))
movdqa %xmm0, %xmm3
movdqa .LCPI3_0(%rip), %xmm4 # xmm4 = [4294967272,4294967272,4294967272,4294967272]
paddd %xmm0, %xmm4
pcmpgtd %xmm2, %xmm1
movdqa %xmm1, %xmm0
blendvps %xmm0, %xmm4, %xmm3
movaps %xmm3, %xmm0
retq