Skip to content

Commit 36f39e5

Browse files
author
sekrit-twc
committed
x86: merge SSE2 and AVX2 helpers
1 parent aa0f697 commit 36f39e5

File tree

10 files changed

+106
-172
lines changed

10 files changed

+106
-172
lines changed

Makefile.am

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -175,7 +175,6 @@ libzimg_internal_la_SOURCES += \
175175
src/zimg/common/x86/avx512_util.h \
176176
src/zimg/common/x86/cpuinfo_x86.cpp \
177177
src/zimg/common/x86/cpuinfo_x86.h \
178-
src/zimg/common/x86/sse2_util.h \
179178
src/zimg/common/x86/x86util.cpp \
180179
src/zimg/common/x86/x86util.h \
181180
src/zimg/depth/x86/depth_convert_x86.cpp \

_msvc/zimg/zimg.vcxproj

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -252,7 +252,6 @@
252252
<ClInclude Include="..\..\src\zimg\common\x86\avx2_util.h" />
253253
<ClInclude Include="..\..\src\zimg\common\x86\avx512_util.h" />
254254
<ClInclude Include="..\..\src\zimg\common\x86\cpuinfo_x86.h" />
255-
<ClInclude Include="..\..\src\zimg\common\x86\sse2_util.h" />
256255
<ClInclude Include="..\..\src\zimg\common\x86\x86util.h" />
257256
<ClInclude Include="..\..\src\zimg\common\zassert.h" />
258257
<ClInclude Include="..\..\src\zimg\depth\arm\depth_convert_arm.h" />

src/zimg/common/x86/avx2_util.h

Lines changed: 105 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -59,6 +59,76 @@ static inline FORCE_INLINE void mm256_exchange_lanes_si128(__m256i &row0, __m256
5959
} // namespace _avx2
6060

6161

62+
// Store from [x] into [dst] the 8-bit elements with index less than [idx].
63+
static inline FORCE_INLINE void mm_store_idxlo_epi8(__m128i *dst, __m128i x, unsigned idx)
64+
{
65+
__m128i orig = _mm_load_si128(dst);
66+
__m128i mask = _mm_load_si128((const __m128i *)(&xmm_mask_table[idx]));
67+
68+
x = _mm_blendv_epi8(orig, x, mask);
69+
_mm_store_si128(dst, x);
70+
}
71+
72+
// Store from [x] into [dst] the 8-bit elements with index greater than or equal to [idx].
73+
static inline FORCE_INLINE void mm_store_idxhi_epi8(__m128i *dst, __m128i x, unsigned idx)
74+
{
75+
__m128i orig = _mm_load_si128(dst);
76+
__m128i mask = _mm_load_si128((const __m128i *)(&ymm_mask_table[idx]));
77+
78+
x = _mm_blendv_epi8(x, orig, mask);
79+
_mm_store_si128(dst, x);
80+
}
81+
82+
// Store from [x] into [dst] the 16-bit elements with index less than [idx].
83+
static inline FORCE_INLINE void mm_store_idxlo_epi16(__m128i *dst, __m128i x, unsigned idx)
84+
{
85+
mm_store_idxlo_epi8(dst, x, idx * 2);
86+
}
87+
88+
// Store from [x] into [dst] the 16-bit elements with index greater than or equal to [idx].
89+
static inline FORCE_INLINE void mm_store_idxhi_epi16(__m128i *dst, __m128i x, unsigned idx)
90+
{
91+
mm_store_idxhi_epi8(dst, x, idx * 2);
92+
}
93+
94+
// Store from [x] into [dst] the 32-bit elements with index less than [idx].
95+
static inline FORCE_INLINE void mm_store_idxlo_ps(float *dst, __m128 x, unsigned idx)
96+
{
97+
__m128i mask = _mm_load_si128((const __m128i *)(&xmm_mask_table[idx * 4]));
98+
_mm_maskstore_ps(dst, mask, x);
99+
}
100+
101+
// Store from [x] into [dst] the 32-bit elements with index greater than or equal to [idx]
102+
static inline FORCE_INLINE void mm_store_idxhi_ps(float *dst, __m128 x, unsigned idx)
103+
{
104+
__m128i mask = _mm_load_si128((const __m128i *)(&xmm_mask_table[idx * 4]));
105+
mask = _mm_castps_si128(_mm_xor_ps(_mm_castsi128_ps(mask), _mm_castsi128_ps(_mm_set1_epi32(-1))));
106+
_mm_maskstore_ps(dst, mask, x);
107+
}
108+
109+
// Stores the elements of [x] into [dst0]-[dst7].
110+
static inline FORCE_INLINE void mm_scatter_epi16(uint16_t *dst0, uint16_t *dst1, uint16_t *dst2, uint16_t *dst3,
111+
uint16_t *dst4, uint16_t *dst5, uint16_t *dst6, uint16_t *dst7, __m128i x)
112+
{
113+
*dst0 = _mm_extract_epi16(x, 0);
114+
*dst1 = _mm_extract_epi16(x, 1);
115+
*dst2 = _mm_extract_epi16(x, 2);
116+
*dst3 = _mm_extract_epi16(x, 3);
117+
*dst4 = _mm_extract_epi16(x, 4);
118+
*dst5 = _mm_extract_epi16(x, 5);
119+
*dst6 = _mm_extract_epi16(x, 6);
120+
*dst7 = _mm_extract_epi16(x, 7);
121+
}
122+
123+
// Stores the elements of [x] into [dst0]-[dst3].
124+
static inline FORCE_INLINE void mm_scatter_ps(float *dst0, float *dst1, float *dst2, float *dst3, __m128 x)
125+
{
126+
_mm_store_ss(dst0, x);
127+
*(uint32_t *)dst1 = _mm_extract_ps(x, 1);
128+
*(uint32_t *)dst2 = _mm_extract_ps(x, 2);
129+
*(uint32_t *)dst3 = _mm_extract_ps(x, 3);
130+
}
131+
62132
// Store from [x] into [dst] the 8-bit elements with index less than [idx].
63133
static inline FORCE_INLINE void mm256_store_idxlo_epi8(__m256i *dst, __m256i x, unsigned idx)
64134
{
@@ -106,6 +176,41 @@ static inline FORCE_INLINE void mm256_store_idxhi_ps(float *dst, __m256 x, unsig
106176
_mm256_maskstore_ps(dst, mask, x);
107177
}
108178

179+
// Transpose in-place the 8x8 matrix stored in [row0]-[row7].
180+
static inline FORCE_INLINE void mm_transpose8_epi16(__m128i &row0, __m128i &row1, __m128i &row2, __m128i &row3,
181+
__m128i &row4, __m128i &row5, __m128i &row6, __m128i &row7)
182+
{
183+
__m128i t0, t1, t2, t3, t4, t5, t6, t7;
184+
__m128i tt0, tt1, tt2, tt3, tt4, tt5, tt6, tt7;
185+
186+
t0 = _mm_unpacklo_epi16(row0, row1);
187+
t1 = _mm_unpacklo_epi16(row2, row3);
188+
t2 = _mm_unpacklo_epi16(row4, row5);
189+
t3 = _mm_unpacklo_epi16(row6, row7);
190+
t4 = _mm_unpackhi_epi16(row0, row1);
191+
t5 = _mm_unpackhi_epi16(row2, row3);
192+
t6 = _mm_unpackhi_epi16(row4, row5);
193+
t7 = _mm_unpackhi_epi16(row6, row7);
194+
195+
tt0 = _mm_unpacklo_epi32(t0, t1);
196+
tt1 = _mm_unpackhi_epi32(t0, t1);
197+
tt2 = _mm_unpacklo_epi32(t2, t3);
198+
tt3 = _mm_unpackhi_epi32(t2, t3);
199+
tt4 = _mm_unpacklo_epi32(t4, t5);
200+
tt5 = _mm_unpackhi_epi32(t4, t5);
201+
tt6 = _mm_unpacklo_epi32(t6, t7);
202+
tt7 = _mm_unpackhi_epi32(t6, t7);
203+
204+
row0 = _mm_unpacklo_epi64(tt0, tt2);
205+
row1 = _mm_unpackhi_epi64(tt0, tt2);
206+
row2 = _mm_unpacklo_epi64(tt1, tt3);
207+
row3 = _mm_unpackhi_epi64(tt1, tt3);
208+
row4 = _mm_unpacklo_epi64(tt4, tt6);
209+
row5 = _mm_unpackhi_epi64(tt4, tt6);
210+
row6 = _mm_unpacklo_epi64(tt5, tt7);
211+
row7 = _mm_unpackhi_epi64(tt5, tt7);
212+
}
213+
109214
// Transpose in-place the 16x16 matrix stored in [row0]-[row15].
110215
static inline FORCE_INLINE void mm256_transpose16_epi16(__m256i &row0, __m256i &row1, __m256i &row2, __m256i &row3,
111216
__m256i &row4, __m256i &row5, __m256i &row6, __m256i &row7,

src/zimg/common/x86/sse2_util.h

Lines changed: 0 additions & 164 deletions
This file was deleted.

src/zimg/depth/x86/depth_convert_avx2.cpp

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,6 @@
77
#include "common/ccdep.h"
88
#include "depth_convert_x86.h"
99

10-
#include "common/x86/sse2_util.h"
1110
#include "common/x86/avx2_util.h"
1211

1312
namespace zimg::depth {

src/zimg/depth/x86/dither_avx2.cpp

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,6 @@
55
#include "common/align.h"
66
#include "dither_x86.h"
77

8-
#include "common/x86/sse2_util.h"
98
#include "common/x86/avx2_util.h"
109

1110
namespace zimg::depth {

src/zimg/resize/x86/resize_impl_avx2.cpp

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,6 @@
1919
#include "resize/resize_impl.h"
2020
#include "resize_impl_x86.h"
2121

22-
#include "common/x86/sse2_util.h"
2322
#include "common/x86/avx2_util.h"
2423

2524
namespace zimg::resize {

src/zimg/resize/x86/resize_impl_avx512.cpp

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,6 @@
1717
#include "resize/resize_impl.h"
1818
#include "resize_impl_x86.h"
1919

20-
#include "common/x86/sse2_util.h"
2120
#include "common/x86/avx2_util.h"
2221
#include "common/x86/avx512_util.h"
2322

src/zimg/resize/x86/resize_impl_avx512_common.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@
1515
#include "graph/filter_base.h"
1616
#include "resize/resize_impl.h"
1717

18-
#include "common/x86/sse2_util.h"
18+
#include "common/x86/avx2_util.h"
1919
#include "common/x86/avx512_util.h"
2020

2121
namespace zimg::resize {

src/zimg/unresize/x86/unresize_impl_avx2.cpp

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,6 @@
1313
#include "unresize_impl_x86.h"
1414

1515
#include "common/x86/avx2_util.h"
16-
#include "common/x86/sse2_util.h"
1716

1817
namespace zimg::unresize {
1918

0 commit comments

Comments
 (0)