diff --git a/site/source/docs/porting/simd.rst b/site/source/docs/porting/simd.rst
index 510654b215f48..c986a7f3efe60 100644
--- a/site/source/docs/porting/simd.rst
+++ b/site/source/docs/porting/simd.rst
@@ -500,7 +500,7 @@ The following table highlights the availability and expected performance of diff
    * - _mm_loadu_si32
      - ❌ emulated with wasm_i32x4_make
    * - _mm_madd_epi16
-     - ❌ scalarized
+     - ✅ wasm_dot_s_i32x4_i16x8
    * - _mm_maskmoveu_si128
      - ❌ scalarized
    * - _mm_max_epi16
diff --git a/system/include/compat/emmintrin.h b/system/include/compat/emmintrin.h
index d0cef4a4a7f7d..d94d6c4594adb 100644
--- a/system/include/compat/emmintrin.h
+++ b/system/include/compat/emmintrin.h
@@ -669,20 +669,7 @@ _mm_avg_epu16(__m128i __a, __m128i __b)
 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
 _mm_madd_epi16(__m128i __a, __m128i __b)
 {
-  // TODO: optimize
-  union {
-    signed short x[8];
-    __m128i m;
-  } src, src2;
-  union {
-    signed int x[4];
-    __m128i m;
-  } dst;
-  src.m = __a;
-  src2.m = __b;
-  for(int i = 0; i < 4; ++i)
-    dst.x[i] = src.x[i*2] * src2.x[i*2] + src.x[i*2+1] * src2.x[i*2+1];
-  return dst.m;
+  return (__m128i)__builtin_wasm_dot_s_i32x4_i16x8((__i16x8)__a, (__i16x8)__b);
 }
 
 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))