x86: merge SSE2 and AVX2 helpers

sekrit-twc · sekrit-twc · commit 7240030a6fb7 · 2025-02-13T20:15:55.000-08:00
diff --git a/Makefile.am b/Makefile.am
@@ -175,7 +175,6 @@ libzimg_internal_la_SOURCES += \
 	src/zimg/common/x86/avx512_util.h \
 	src/zimg/common/x86/cpuinfo_x86.cpp \
 	src/zimg/common/x86/cpuinfo_x86.h \
-	src/zimg/common/x86/sse2_util.h \
 	src/zimg/common/x86/x86util.cpp \
 	src/zimg/common/x86/x86util.h \
 	src/zimg/depth/x86/depth_convert_x86.cpp \
diff --git a/_msvc/zimg/zimg.vcxproj b/_msvc/zimg/zimg.vcxproj
@@ -252,7 +252,6 @@
     <ClInclude Include="..\..\src\zimg\common\x86\avx2_util.h" />
     <ClInclude Include="..\..\src\zimg\common\x86\avx512_util.h" />
     <ClInclude Include="..\..\src\zimg\common\x86\cpuinfo_x86.h" />
-    <ClInclude Include="..\..\src\zimg\common\x86\sse2_util.h" />
     <ClInclude Include="..\..\src\zimg\common\x86\x86util.h" />
     <ClInclude Include="..\..\src\zimg\common\zassert.h" />
     <ClInclude Include="..\..\src\zimg\depth\arm\depth_convert_arm.h" />
diff --git a/src/zimg/common/x86/avx2_util.h b/src/zimg/common/x86/avx2_util.h
@@ -59,6 +59,76 @@ static inline FORCE_INLINE void mm256_exchange_lanes_si128(__m256i &row0, __m256
 } // namespace _avx2
 
 
+// Store from [x] into [dst] the 8-bit elements with index less than [idx]. Unselected bytes are rewritten with their old values.
+static inline FORCE_INLINE void mm_store_idxlo_epi8(__m128i *dst, __m128i x, unsigned idx)
+{
+	__m128i orig = _mm_load_si128(dst); // Aligned load: [dst] must be 16-byte aligned.
+	__m128i mask = _mm_load_si128((const __m128i *)(&xmm_mask_table[idx])); // Presumably row [idx] has its first [idx] bytes set -- confirm against the table definition.
+
+	x = _mm_blendv_epi8(orig, x, mask); // Take [x] where the mask byte's top bit is set, otherwise keep [orig].
+	_mm_store_si128(dst, x);
+}
+
+// Store from [x] into [dst] the 8-bit elements with index greater than or equal to [idx]. Unselected bytes are rewritten with their old values.
+static inline FORCE_INLINE void mm_store_idxhi_epi8(__m128i *dst, __m128i x, unsigned idx)
+{
+	__m128i orig = _mm_load_si128(dst); // Aligned load: [dst] must be 16-byte aligned.
+	__m128i mask = _mm_load_si128((const __m128i *)(&xmm_mask_table[idx])); // 128-bit mask table, the same row mm_store_idxlo_epi8 reads.
+
+	x = _mm_blendv_epi8(x, orig, mask); // Operands swapped relative to idxlo: masked bytes keep [orig], the rest take [x].
+	_mm_store_si128(dst, x);
+}
+
+// Store from [x] into [dst] the 16-bit elements with index less than [idx].
+static inline FORCE_INLINE void mm_store_idxlo_epi16(__m128i *dst, __m128i x, unsigned idx)
+{
+	mm_store_idxlo_epi8(dst, x, idx * 2); // Each 16-bit element spans two bytes, so the element index doubles into a byte index.
+}
+
+// Store from [x] into [dst] the 16-bit elements with index greater than or equal to [idx].
+static inline FORCE_INLINE void mm_store_idxhi_epi16(__m128i *dst, __m128i x, unsigned idx)
+{
+	mm_store_idxhi_epi8(dst, x, idx * 2); // Each 16-bit element spans two bytes, so the element index doubles into a byte index.
+}
+
+// Store from [x] into [dst] the 32-bit elements with index less than [idx].
+static inline FORCE_INLINE void mm_store_idxlo_ps(float *dst, __m128 x, unsigned idx)
+{
+	__m128i mask = _mm_load_si128((const __m128i *)(&xmm_mask_table[idx * 4])); // Floats are four bytes wide, so the byte-mask row is [idx] * 4.
+	_mm_maskstore_ps(dst, mask, x); // Writes only the elements whose mask element has its top bit set.
+}
+
+// Store from [x] into [dst] the 32-bit elements with index greater than or equal to [idx].
+static inline FORCE_INLINE void mm_store_idxhi_ps(float *dst, __m128 x, unsigned idx)
+{
+	__m128i mask = _mm_load_si128((const __m128i *)(&xmm_mask_table[idx * 4])); // Same row mm_store_idxlo_ps uses: byte index is [idx] * 4.
+	mask = _mm_castps_si128(_mm_xor_ps(_mm_castsi128_ps(mask), _mm_castsi128_ps(_mm_set1_epi32(-1)))); // XOR with all-ones flips every bit, inverting the selection.
+	_mm_maskstore_ps(dst, mask, x); // Writes only the elements whose (inverted) mask element has its top bit set.
+}
+
+// Stores the eight 16-bit elements of [x] into [dst0]-[dst7], element N going to [dstN].
+static inline FORCE_INLINE void mm_scatter_epi16(uint16_t *dst0, uint16_t *dst1, uint16_t *dst2, uint16_t *dst3,
+                                                 uint16_t *dst4, uint16_t *dst5, uint16_t *dst6, uint16_t *dst7, __m128i x)
+{
+	*dst0 = _mm_extract_epi16(x, 0); // The extracted value is narrowed to uint16_t by the assignment.
+	*dst1 = _mm_extract_epi16(x, 1);
+	*dst2 = _mm_extract_epi16(x, 2);
+	*dst3 = _mm_extract_epi16(x, 3);
+	*dst4 = _mm_extract_epi16(x, 4);
+	*dst5 = _mm_extract_epi16(x, 5);
+	*dst6 = _mm_extract_epi16(x, 6);
+	*dst7 = _mm_extract_epi16(x, 7);
+}
+
+// Stores the four 32-bit elements of [x] into [dst0]-[dst3], element N going to [dstN].
+static inline FORCE_INLINE void mm_scatter_ps(float *dst0, float *dst1, float *dst2, float *dst3, __m128 x)
+{
+	_mm_store_ss(dst0, x); // Element 0 is stored directly as a float.
+	*(uint32_t *)dst1 = _mm_extract_ps(x, 1); // _mm_extract_ps yields the element's raw bits as an int, hence the integer store.
+	*(uint32_t *)dst2 = _mm_extract_ps(x, 2); // NOTE(review): writes float storage through a uint32_t * -- confirm the build tolerates this aliasing.
+	*(uint32_t *)dst3 = _mm_extract_ps(x, 3);
+}
+
 // Store from [x] into [dst] the 8-bit elements with index less than [idx].
 static inline FORCE_INLINE void mm256_store_idxlo_epi8(__m256i *dst, __m256i x, unsigned idx)
 {
@@ -106,6 +176,41 @@ static inline FORCE_INLINE void mm256_store_idxhi_ps(float *dst, __m256 x, unsig
 	_mm256_maskstore_ps(dst, mask, x);
 }
 
+// Transpose in-place the 8x8 matrix of 16-bit elements stored in [row0]-[row7].
+static inline FORCE_INLINE void mm_transpose8_epi16(__m128i &row0, __m128i &row1, __m128i &row2, __m128i &row3,
+                                                    __m128i &row4, __m128i &row5, __m128i &row6, __m128i &row7)
+{
+	__m128i t0, t1, t2, t3, t4, t5, t6, t7;
+	__m128i tt0, tt1, tt2, tt3, tt4, tt5, tt6, tt7;
+	// Stage 1: interleave 16-bit elements of adjacent row pairs (t0-t3 from the low halves, t4-t7 from the high halves).
+	t0 = _mm_unpacklo_epi16(row0, row1);
+	t1 = _mm_unpacklo_epi16(row2, row3);
+	t2 = _mm_unpacklo_epi16(row4, row5);
+	t3 = _mm_unpacklo_epi16(row6, row7);
+	t4 = _mm_unpackhi_epi16(row0, row1);
+	t5 = _mm_unpackhi_epi16(row2, row3);
+	t6 = _mm_unpackhi_epi16(row4, row5);
+	t7 = _mm_unpackhi_epi16(row6, row7);
+	// Stage 2: interleave 32-bit pairs so each 64-bit half holds one column's entries from four consecutive rows.
+	tt0 = _mm_unpacklo_epi32(t0, t1);
+	tt1 = _mm_unpackhi_epi32(t0, t1);
+	tt2 = _mm_unpacklo_epi32(t2, t3);
+	tt3 = _mm_unpackhi_epi32(t2, t3);
+	tt4 = _mm_unpacklo_epi32(t4, t5);
+	tt5 = _mm_unpackhi_epi32(t4, t5);
+	tt6 = _mm_unpacklo_epi32(t6, t7);
+	tt7 = _mm_unpackhi_epi32(t6, t7);
+	// Stage 3: join the rows 0-3 half with the rows 4-7 half of each column to form the transposed rows.
+	row0 = _mm_unpacklo_epi64(tt0, tt2);
+	row1 = _mm_unpackhi_epi64(tt0, tt2);
+	row2 = _mm_unpacklo_epi64(tt1, tt3);
+	row3 = _mm_unpackhi_epi64(tt1, tt3);
+	row4 = _mm_unpacklo_epi64(tt4, tt6);
+	row5 = _mm_unpackhi_epi64(tt4, tt6);
+	row6 = _mm_unpacklo_epi64(tt5, tt7);
+	row7 = _mm_unpackhi_epi64(tt5, tt7);
+}
+
 // Transpose in-place the 16x16 matrix stored in [row0]-[row15].
 static inline FORCE_INLINE void mm256_transpose16_epi16(__m256i &row0, __m256i &row1, __m256i &row2, __m256i &row3,
                                                         __m256i &row4, __m256i &row5, __m256i &row6, __m256i &row7,
diff --git a/src/zimg/common/x86/sse2_util.h b/src/zimg/common/x86/sse2_util.h
diff --git a/src/zimg/depth/x86/depth_convert_avx2.cpp b/src/zimg/depth/x86/depth_convert_avx2.cpp
@@ -7,7 +7,6 @@
 #include "common/ccdep.h"
 #include "depth_convert_x86.h"
 
-#include "common/x86/sse2_util.h"
 #include "common/x86/avx2_util.h"
 
 namespace zimg::depth {
diff --git a/src/zimg/depth/x86/dither_avx2.cpp b/src/zimg/depth/x86/dither_avx2.cpp
@@ -5,7 +5,6 @@
 #include "common/align.h"
 #include "dither_x86.h"
 
-#include "common/x86/sse2_util.h"
 #include "common/x86/avx2_util.h"
 
 namespace zimg::depth {
diff --git a/src/zimg/resize/x86/resize_impl_avx2.cpp b/src/zimg/resize/x86/resize_impl_avx2.cpp
@@ -19,7 +19,6 @@
 #include "resize/resize_impl.h"
 #include "resize_impl_x86.h"
 
-#include "common/x86/sse2_util.h"
 #include "common/x86/avx2_util.h"
 
 namespace zimg::resize {
diff --git a/src/zimg/resize/x86/resize_impl_avx512.cpp b/src/zimg/resize/x86/resize_impl_avx512.cpp
@@ -17,7 +17,6 @@
 #include "resize/resize_impl.h"
 #include "resize_impl_x86.h"
 
-#include "common/x86/sse2_util.h"
 #include "common/x86/avx2_util.h"
 #include "common/x86/avx512_util.h"
 
diff --git a/src/zimg/resize/x86/resize_impl_avx512_common.h b/src/zimg/resize/x86/resize_impl_avx512_common.h
@@ -15,7 +15,7 @@
 #include "graph/filter_base.h"
 #include "resize/resize_impl.h"
 
-#include "common/x86/sse2_util.h"
+#include "common/x86/avx2_util.h"
 #include "common/x86/avx512_util.h"
 
 namespace zimg::resize {
diff --git a/src/zimg/unresize/x86/unresize_impl_avx2.cpp b/src/zimg/unresize/x86/unresize_impl_avx2.cpp
@@ -13,7 +13,6 @@
 #include "unresize_impl_x86.h"
 
 #include "common/x86/avx2_util.h"
-#include "common/x86/sse2_util.h"
 
 namespace zimg::unresize {