@@ -59,6 +59,76 @@ static inline FORCE_INLINE void mm256_exchange_lanes_si128(__m256i &row0, __m256
5959} // namespace _avx2
6060
6161
62+ // Store from [x] into [dst] the 8-bit elements with index less than [idx].
63+ static inline FORCE_INLINE void mm_store_idxlo_epi8 (__m128i *dst, __m128i x, unsigned idx)
64+ {
65+ __m128i orig = _mm_load_si128 (dst);
66+ __m128i mask = _mm_load_si128 ((const __m128i *)(&xmm_mask_table[idx]));
67+
68+ x = _mm_blendv_epi8 (orig, x, mask);
69+ _mm_store_si128 (dst, x);
70+ }
71+
72+ // Store from [x] into [dst] the 8-bit elements with index greater than or equal to [idx].
73+ static inline FORCE_INLINE void mm_store_idxhi_epi8 (__m128i *dst, __m128i x, unsigned idx)
74+ {
75+ __m128i orig = _mm_load_si128 (dst);
76+ __m128i mask = _mm_load_si128 ((const __m128i *)(&ymm_mask_table[idx]));
77+
78+ x = _mm_blendv_epi8 (x, orig, mask);
79+ _mm_store_si128 (dst, x);
80+ }
81+
82+ // Store from [x] into [dst] the 16-bit elements with index less than [idx].
83+ static inline FORCE_INLINE void mm_store_idxlo_epi16 (__m128i *dst, __m128i x, unsigned idx)
84+ {
85+ mm_store_idxlo_epi8 (dst, x, idx * 2 );
86+ }
87+
88+ // Store from [x] into [dst] the 16-bit elements with index greater than or equal to [idx].
89+ static inline FORCE_INLINE void mm_store_idxhi_epi16 (__m128i *dst, __m128i x, unsigned idx)
90+ {
91+ mm_store_idxhi_epi8 (dst, x, idx * 2 );
92+ }
93+
94+ // Store from [x] into [dst] the 32-bit elements with index less than [idx].
95+ static inline FORCE_INLINE void mm_store_idxlo_ps (float *dst, __m128 x, unsigned idx)
96+ {
97+ __m128i mask = _mm_load_si128 ((const __m128i *)(&xmm_mask_table[idx * 4 ]));
98+ _mm_maskstore_ps (dst, mask, x);
99+ }
100+
101+ // Store from [x] into [dst] the 32-bit elements with index greater than or equal to [idx]
102+ static inline FORCE_INLINE void mm_store_idxhi_ps (float *dst, __m128 x, unsigned idx)
103+ {
104+ __m128i mask = _mm_load_si128 ((const __m128i *)(&xmm_mask_table[idx * 4 ]));
105+ mask = _mm_castps_si128 (_mm_xor_ps (_mm_castsi128_ps (mask), _mm_castsi128_ps (_mm_set1_epi32 (-1 ))));
106+ _mm_maskstore_ps (dst, mask, x);
107+ }
108+
109+ // Stores the elements of [x] into [dst0]-[dst7].
110+ static inline FORCE_INLINE void mm_scatter_epi16 (uint16_t *dst0, uint16_t *dst1, uint16_t *dst2, uint16_t *dst3,
111+ uint16_t *dst4, uint16_t *dst5, uint16_t *dst6, uint16_t *dst7, __m128i x)
112+ {
113+ *dst0 = _mm_extract_epi16 (x, 0 );
114+ *dst1 = _mm_extract_epi16 (x, 1 );
115+ *dst2 = _mm_extract_epi16 (x, 2 );
116+ *dst3 = _mm_extract_epi16 (x, 3 );
117+ *dst4 = _mm_extract_epi16 (x, 4 );
118+ *dst5 = _mm_extract_epi16 (x, 5 );
119+ *dst6 = _mm_extract_epi16 (x, 6 );
120+ *dst7 = _mm_extract_epi16 (x, 7 );
121+ }
122+
123+ // Stores the elements of [x] into [dst0]-[dst3].
124+ static inline FORCE_INLINE void mm_scatter_ps (float *dst0, float *dst1, float *dst2, float *dst3, __m128 x)
125+ {
126+ _mm_store_ss (dst0, x);
127+ *(uint32_t *)dst1 = _mm_extract_ps (x, 1 );
128+ *(uint32_t *)dst2 = _mm_extract_ps (x, 2 );
129+ *(uint32_t *)dst3 = _mm_extract_ps (x, 3 );
130+ }
131+
62132// Store from [x] into [dst] the 8-bit elements with index less than [idx].
63133static inline FORCE_INLINE void mm256_store_idxlo_epi8 (__m256i *dst, __m256i x, unsigned idx)
64134{
@@ -106,6 +176,41 @@ static inline FORCE_INLINE void mm256_store_idxhi_ps(float *dst, __m256 x, unsig
106176 _mm256_maskstore_ps (dst, mask, x);
107177}
108178
179+ // Transpose in-place the 8x8 matrix stored in [row0]-[row7].
180+ static inline FORCE_INLINE void mm_transpose8_epi16 (__m128i &row0, __m128i &row1, __m128i &row2, __m128i &row3,
181+ __m128i &row4, __m128i &row5, __m128i &row6, __m128i &row7)
182+ {
183+ __m128i t0, t1, t2, t3, t4, t5, t6, t7;
184+ __m128i tt0, tt1, tt2, tt3, tt4, tt5, tt6, tt7;
185+
186+ t0 = _mm_unpacklo_epi16 (row0, row1);
187+ t1 = _mm_unpacklo_epi16 (row2, row3);
188+ t2 = _mm_unpacklo_epi16 (row4, row5);
189+ t3 = _mm_unpacklo_epi16 (row6, row7);
190+ t4 = _mm_unpackhi_epi16 (row0, row1);
191+ t5 = _mm_unpackhi_epi16 (row2, row3);
192+ t6 = _mm_unpackhi_epi16 (row4, row5);
193+ t7 = _mm_unpackhi_epi16 (row6, row7);
194+
195+ tt0 = _mm_unpacklo_epi32 (t0, t1);
196+ tt1 = _mm_unpackhi_epi32 (t0, t1);
197+ tt2 = _mm_unpacklo_epi32 (t2, t3);
198+ tt3 = _mm_unpackhi_epi32 (t2, t3);
199+ tt4 = _mm_unpacklo_epi32 (t4, t5);
200+ tt5 = _mm_unpackhi_epi32 (t4, t5);
201+ tt6 = _mm_unpacklo_epi32 (t6, t7);
202+ tt7 = _mm_unpackhi_epi32 (t6, t7);
203+
204+ row0 = _mm_unpacklo_epi64 (tt0, tt2);
205+ row1 = _mm_unpackhi_epi64 (tt0, tt2);
206+ row2 = _mm_unpacklo_epi64 (tt1, tt3);
207+ row3 = _mm_unpackhi_epi64 (tt1, tt3);
208+ row4 = _mm_unpacklo_epi64 (tt4, tt6);
209+ row5 = _mm_unpackhi_epi64 (tt4, tt6);
210+ row6 = _mm_unpacklo_epi64 (tt5, tt7);
211+ row7 = _mm_unpackhi_epi64 (tt5, tt7);
212+ }
213+
109214// Transpose in-place the 16x16 matrix stored in [row0]-[row15].
110215static inline FORCE_INLINE void mm256_transpose16_epi16 (__m256i &row0, __m256i &row1, __m256i &row2, __m256i &row3,
111216 __m256i &row4, __m256i &row5, __m256i &row6, __m256i &row7,
0 commit comments