4
4
// This source code is a part of Nightingales.
5
5
//
6
6
#![ allow( unused_imports) ]
7
+ use packed_simd:: { self as simd, Cast } ;
7
8
#[ cfg( target_arch = "x86" ) ]
8
9
use std:: arch:: x86 as vendor;
9
10
#[ cfg( target_arch = "x86_64" ) ]
10
11
use std:: arch:: x86_64 as vendor;
11
- use std:: simd :: { self , IntoBits } ;
12
+ use std:: mem :: transmute ;
12
13
use { intrin, simd16} ;
13
14
use { ScalarMode , SimdMode } ;
14
15
@@ -21,7 +22,8 @@ pub trait MapU8x4InplaceKernel {
21
22
pub trait MapU8x4InplaceKernelExt : MapU8x4InplaceKernel {
22
23
/// Run a mapping kernel on a given slice.
23
24
fn dispatch ( & self , slice : & mut [ u8 ] ) {
24
- let _ = self . dispatch_simd16_masked ( slice) || self . dispatch_simd16_unaligned ( slice)
25
+ let _ = self . dispatch_simd16_masked ( slice)
26
+ || self . dispatch_simd16_unaligned ( slice)
25
27
|| self . dispatch_scalar ( slice) ;
26
28
}
27
29
@@ -74,22 +76,22 @@ pub trait MapU8x4InplaceKernelExt: MapU8x4InplaceKernel {
74
76
while i < end_addr {
75
77
let indices1 = indices0 + simd:: i32x8:: splat ( 16 ) ;
76
78
77
- let indices0b: simd:: u32x8 = indices0. into_bits ( ) ;
78
- let indices1b: simd:: u32x8 = indices1. into_bits ( ) ;
79
+ let indices0b: simd:: u32x8 = indices0. cast ( ) ;
80
+ let indices1b: simd:: u32x8 = indices1. cast ( ) ;
79
81
80
- let indices0c: simd:: i32x8 = ( indices0b >> 1u32 ) . into_bits ( ) ;
81
- let indices1c: simd:: i32x8 = ( indices1b >> 1u32 ) . into_bits ( ) ;
82
+ let indices0c: simd:: i32x8 = ( indices0b >> 1u32 ) . cast ( ) ;
83
+ let indices1c: simd:: i32x8 = ( indices1b >> 1u32 ) . cast ( ) ;
82
84
83
- let mask0 = indices0c. lt ( bounds) . into_bits ( ) ;
84
- let mask1 = indices1c. lt ( bounds) . into_bits ( ) ;
85
+ let mask0 = transmute ( indices0c. lt ( bounds) ) ;
86
+ let mask1 = transmute ( indices1c. lt ( bounds) ) ;
85
87
86
- let a0 = vendor:: _mm256_maskload_epi32 ( i as * const _ , mask0) . into_bits ( ) ;
87
- let a1 = vendor:: _mm256_maskload_epi32 ( ( i + 32 ) as * const _ , mask1) . into_bits ( ) ;
88
+ let a0 = vendor:: _mm256_maskload_epi32 ( i as * const _ , mask0) ;
89
+ let a1 = vendor:: _mm256_maskload_epi32 ( ( i + 32 ) as * const _ , mask1) ;
88
90
89
91
let c = self . dispatch_simd16_m256 ( [ a0, a1] ) ;
90
92
91
- vendor:: _mm256_maskstore_epi32 ( i as * mut _ , mask0, c[ 0 ] . into_bits ( ) ) ;
92
- vendor:: _mm256_maskstore_epi32 ( ( i + 32 ) as * mut _ , mask1, c[ 1 ] . into_bits ( ) ) ;
93
+ vendor:: _mm256_maskstore_epi32 ( i as * mut _ , mask0, c[ 0 ] ) ;
94
+ vendor:: _mm256_maskstore_epi32 ( ( i + 32 ) as * mut _ , mask1, c[ 1 ] ) ;
93
95
94
96
indices0 += simd:: i32x8:: splat ( 32 ) ;
95
97
i += 64 ;
@@ -111,13 +113,18 @@ pub trait MapU8x4InplaceKernelExt: MapU8x4InplaceKernel {
111
113
let mut i = 0 ;
112
114
while i + 63 < slice. len ( ) {
113
115
unsafe {
114
- let a0 = vendor:: _mm_loadu_si128 ( p as * const vendor:: __m128i ) . into_bits ( ) ;
115
- let a1 =
116
- vendor:: _mm_loadu_si128 ( p. offset ( 16 ) as * const vendor:: __m128i ) . into_bits ( ) ;
117
- let a2 =
118
- vendor:: _mm_loadu_si128 ( p. offset ( 32 ) as * const vendor:: __m128i ) . into_bits ( ) ;
119
- let a3 =
120
- vendor:: _mm_loadu_si128 ( p. offset ( 48 ) as * const vendor:: __m128i ) . into_bits ( ) ;
116
+ let a0 = transmute ( vendor:: _mm_loadu_si128 (
117
+ p. offset ( 0 ) as * const vendor:: __m128i
118
+ ) ) ;
119
+ let a1 = transmute ( vendor:: _mm_loadu_si128 (
120
+ p. offset ( 16 ) as * const vendor:: __m128i
121
+ ) ) ;
122
+ let a2 = transmute ( vendor:: _mm_loadu_si128 (
123
+ p. offset ( 32 ) as * const vendor:: __m128i
124
+ ) ) ;
125
+ let a3 = transmute ( vendor:: _mm_loadu_si128 (
126
+ p. offset ( 48 ) as * const vendor:: __m128i
127
+ ) ) ;
121
128
122
129
let f = self . dispatch_simd16_m128 ( [ a0, a1, a2, a3] ) ;
123
130
@@ -142,9 +149,8 @@ pub trait MapU8x4InplaceKernelExt: MapU8x4InplaceKernel {
142
149
let mut i = 0 ;
143
150
while i + 63 < slice. len ( ) {
144
151
unsafe {
145
- let a0 = vendor:: _mm256_loadu_si256 ( p as * const vendor:: __m256i ) . into_bits ( ) ;
146
- let a1 =
147
- vendor:: _mm256_loadu_si256 ( p. offset ( 32 ) as * const vendor:: __m256i ) . into_bits ( ) ;
152
+ let a0 = vendor:: _mm256_loadu_si256 ( p as * const vendor:: __m256i ) ;
153
+ let a1 = vendor:: _mm256_loadu_si256 ( p. offset ( 32 ) as * const vendor:: __m256i ) ;
148
154
149
155
let f = self . dispatch_simd16_m256 ( [ a0, a1] ) ;
150
156
@@ -174,55 +180,55 @@ pub trait MapU8x4InplaceKernelExt: MapU8x4InplaceKernel {
174
180
let a0 = a[ 0 ] ; // hgfedcba 3210
175
181
let a1 = a[ 1 ] ; // ponmlkji 3210
176
182
177
- let transpose4x4 = simd:: u8x32:: new (
183
+ let transpose4x4 = transmute ( simd:: u8x32:: new (
178
184
0 , 4 , 8 , 12 , 1 , 5 , 9 , 13 , 2 , 6 , 10 , 14 , 3 , 7 , 11 , 15 , 0 , 4 , 8 , 12 , 1 , 5 , 9 , 13 , 2 , 6 ,
179
185
10 , 14 , 3 , 7 , 11 , 15 ,
180
- ) . into_bits ( ) ;
186
+ ) ) ;
181
187
182
- let b0 = vendor:: _mm256_shuffle_epi8 ( a0. into_bits ( ) , transpose4x4) . into_bits ( ) ; // 3210 hgfe / 3210 dcba
183
- let b1 = vendor:: _mm256_shuffle_epi8 ( a1. into_bits ( ) , transpose4x4) . into_bits ( ) ; // 3210 ponm / 3210 lkji
188
+ let b0 = vendor:: _mm256_shuffle_epi8 ( a0, transpose4x4) ; // 3210 hgfe / 3210 dcba
189
+ let b1 = vendor:: _mm256_shuffle_epi8 ( a1, transpose4x4) ; // 3210 ponm / 3210 lkji
184
190
185
- let transpose4x2 = simd:: u32x8:: new ( 0 , 4 , 1 , 5 , 2 , 6 , 3 , 7 ) . into_bits ( ) ;
191
+ let transpose4x2 = transmute ( simd:: u32x8:: new ( 0 , 4 , 1 , 5 , 2 , 6 , 3 , 7 ) ) ;
186
192
187
- let c0 = vendor:: _mm256_permutevar8x32_epi32 ( b0, transpose4x2) . into_bits ( ) ; // 3210 hgfedcba
188
- let c1 = vendor:: _mm256_permutevar8x32_epi32 ( b1, transpose4x2) . into_bits ( ) ; // 3210 ponmlkji
193
+ let c0 = vendor:: _mm256_permutevar8x32_epi32 ( b0, transpose4x2) ; // 3210 hgfedcba
194
+ let c1 = vendor:: _mm256_permutevar8x32_epi32 ( b1, transpose4x2) ; // 3210 ponmlkji
189
195
190
- let d0 = vendor:: _mm256_unpacklo_epi64 ( c0, c1) . into_bits ( ) ; // 20 ponmlkjihgfedcba
191
- let d1 = vendor:: _mm256_unpackhi_epi64 ( c0, c1) . into_bits ( ) ; // 31 ponmlkjihgfedcba
196
+ let d0 = vendor:: _mm256_unpacklo_epi64 ( c0, c1) ; // 20 ponmlkjihgfedcba
197
+ let d1 = vendor:: _mm256_unpackhi_epi64 ( c0, c1) ; // 31 ponmlkjihgfedcba
192
198
193
- let e0 = vendor:: _mm256_extractf128_si256 ( d0, 0 ) . into_bits ( ) ; // 0 ponmlkjihgfedcba
194
- let e1 = vendor:: _mm256_extractf128_si256 ( d1, 0 ) . into_bits ( ) ; // 1 ponmlkjihgfedcba
195
- let e2 = vendor:: _mm256_extractf128_si256 ( d0, 1 ) . into_bits ( ) ; // 2 ponmlkjihgfedcba
196
- let e3 = vendor:: _mm256_extractf128_si256 ( d1, 1 ) . into_bits ( ) ; // 3 ponmlkjihgfedcba
199
+ let e0 = vendor:: _mm256_extractf128_si256 ( d0, 0 ) ; // 0 ponmlkjihgfedcba
200
+ let e1 = vendor:: _mm256_extractf128_si256 ( d1, 0 ) ; // 1 ponmlkjihgfedcba
201
+ let e2 = vendor:: _mm256_extractf128_si256 ( d0, 1 ) ; // 2 ponmlkjihgfedcba
202
+ let e3 = vendor:: _mm256_extractf128_si256 ( d1, 1 ) ; // 3 ponmlkjihgfedcba
197
203
198
204
let f = self . apply :: < simd16:: Simd16Mode > ( [
199
- simd16:: Simd16U8 ( e0 ) ,
200
- simd16:: Simd16U8 ( e1 ) ,
201
- simd16:: Simd16U8 ( e2 ) ,
202
- simd16:: Simd16U8 ( e3 ) ,
205
+ simd16:: Simd16U8 ( transmute ( e0 ) ) ,
206
+ simd16:: Simd16U8 ( transmute ( e1 ) ) ,
207
+ simd16:: Simd16U8 ( transmute ( e2 ) ) ,
208
+ simd16:: Simd16U8 ( transmute ( e3 ) ) ,
203
209
] ) ;
204
210
205
- let f0 = f[ 0 ] . 0 . into_bits ( ) ;
206
- let f1 = f[ 1 ] . 0 . into_bits ( ) ;
207
- let f2 = f[ 2 ] . 0 . into_bits ( ) ;
208
- let f3 = f[ 3 ] . 0 . into_bits ( ) ;
211
+ let f0 = transmute ( f[ 0 ] . 0 ) ;
212
+ let f1 = transmute ( f[ 1 ] . 0 ) ;
213
+ let f2 = transmute ( f[ 2 ] . 0 ) ;
214
+ let f3 = transmute ( f[ 3 ] . 0 ) ;
209
215
210
- let g0 = vendor:: _mm256_set_m128i ( f1, f0) . into_bits ( ) ; // 10 ponmlkjihgfedcba
211
- let g1 = vendor:: _mm256_set_m128i ( f3, f2) . into_bits ( ) ; // 32 ponmlkjihgfedcba
216
+ let g0 = vendor:: _mm256_set_m128i ( f1, f0) ; // 10 ponmlkjihgfedcba
217
+ let g1 = vendor:: _mm256_set_m128i ( f3, f2) ; // 32 ponmlkjihgfedcba
212
218
213
- let h0 = vendor:: _mm256_permute4x64_epi64 ( g0, 0b11_01_10_00 ) . into_bits ( ) ; // 10 ponmlkji / 10 hgfedcba
214
- let h1 = vendor:: _mm256_permute4x64_epi64 ( g1, 0b11_01_10_00 ) . into_bits ( ) ; // 32 ponmlkji / 32 hgfedcba
219
+ let h0 = vendor:: _mm256_permute4x64_epi64 ( g0, 0b11_01_10_00 ) ; // 10 ponmlkji / 10 hgfedcba
220
+ let h1 = vendor:: _mm256_permute4x64_epi64 ( g1, 0b11_01_10_00 ) ; // 32 ponmlkji / 32 hgfedcba
215
221
216
- let i0 = intrin:: mm256_permute2x128_si256 ( h0, h1, 0b0010_0000 ) . into_bits ( ) ; // 3210 hgfedcba
217
- let i1 = intrin:: mm256_permute2x128_si256 ( h0, h1, 0b0011_0001 ) . into_bits ( ) ; // 3210 ponmlkji
222
+ let i0 = intrin:: mm256_permute2x128_si256 ( h0, h1, 0b0010_0000 ) ; // 3210 hgfedcba
223
+ let i1 = intrin:: mm256_permute2x128_si256 ( h0, h1, 0b0011_0001 ) ; // 3210 ponmlkji
218
224
219
- let transpose2x4 = simd:: u32x8:: new ( 0 , 2 , 4 , 6 , 1 , 3 , 5 , 7 ) . into_bits ( ) ;
225
+ let transpose2x4 = transmute ( simd:: u32x8:: new ( 0 , 2 , 4 , 6 , 1 , 3 , 5 , 7 ) ) ;
220
226
221
- let j0 = vendor:: _mm256_permutevar8x32_epi32 ( i0, transpose2x4) . into_bits ( ) ; // 3210 hgfe / 3210 dcba
222
- let j1 = vendor:: _mm256_permutevar8x32_epi32 ( i1, transpose2x4) . into_bits ( ) ; // 3210 ponm / 3210 lkji
227
+ let j0 = vendor:: _mm256_permutevar8x32_epi32 ( i0, transpose2x4) ; // 3210 hgfe / 3210 dcba
228
+ let j1 = vendor:: _mm256_permutevar8x32_epi32 ( i1, transpose2x4) ; // 3210 ponm / 3210 lkji
223
229
224
- let k0 = vendor:: _mm256_shuffle_epi8 ( j0, transpose4x4) . into_bits ( ) ; // hgfedcba 3210
225
- let k1 = vendor:: _mm256_shuffle_epi8 ( j1, transpose4x4) . into_bits ( ) ; // ponmlkji 3210
230
+ let k0 = vendor:: _mm256_shuffle_epi8 ( j0, transpose4x4) ; // hgfedcba 3210
231
+ let k1 = vendor:: _mm256_shuffle_epi8 ( j1, transpose4x4) ; // ponmlkji 3210
226
232
227
233
[ k0, k1]
228
234
}
@@ -232,68 +238,68 @@ pub trait MapU8x4InplaceKernelExt: MapU8x4InplaceKernel {
232
238
#[ inline( always) ]
233
239
unsafe fn dispatch_simd16_m128 ( & self , x : [ vendor:: __m128i ; 4 ] ) -> [ vendor:: __m128i ; 4 ] {
234
240
// [ 3d 2d 1d 0d ] [ 3c 2c 1c 0c ] [ 3b 2b 1b 0b ] [ 3a 2a 1a 0a ]
235
- let a0 = x[ 0 ] . into_bits ( ) ; // dcba 3210
236
- let a1 = x[ 1 ] . into_bits ( ) ; // hgfe 3210
237
- let a2 = x[ 2 ] . into_bits ( ) ; // lkji 3210
238
- let a3 = x[ 3 ] . into_bits ( ) ; // ponm 3210
241
+ let a0 = x[ 0 ] ; // dcba 3210
242
+ let a1 = x[ 1 ] ; // hgfe 3210
243
+ let a2 = x[ 2 ] ; // lkji 3210
244
+ let a3 = x[ 3 ] ; // ponm 3210
239
245
240
246
// [ 3f 3b ] [ 2f 2b ] [ 1f 1b ] [ 0f 0b ] [ 3e 3a ] [ 2e 2a ] [ 1e 1a ] [ 0e 0a ]
241
- let b0 = vendor:: _mm_unpacklo_epi8 ( a0, a1) . into_bits ( ) ; // 3210 fb / 3210 ea
242
- let b1 = vendor:: _mm_unpackhi_epi8 ( a0, a1) . into_bits ( ) ; // 3210 hd / 3210 gc
243
- let b2 = vendor:: _mm_unpacklo_epi8 ( a2, a3) . into_bits ( ) ; // 3210 nj / 3210 mi
244
- let b3 = vendor:: _mm_unpackhi_epi8 ( a2, a3) . into_bits ( ) ; // 3210 pl / 3210 ok
247
+ let b0 = vendor:: _mm_unpacklo_epi8 ( a0, a1) ; // 3210 fb / 3210 ea
248
+ let b1 = vendor:: _mm_unpackhi_epi8 ( a0, a1) ; // 3210 hd / 3210 gc
249
+ let b2 = vendor:: _mm_unpacklo_epi8 ( a2, a3) ; // 3210 nj / 3210 mi
250
+ let b3 = vendor:: _mm_unpackhi_epi8 ( a2, a3) ; // 3210 pl / 3210 ok
245
251
246
252
// [ 3g 3c 3e 3a ] [ 2g 2c 2e 2a ] [ 1g 1c 1e 1a ] [ 0g 0c 0e 0a ]
247
- let c0 = vendor:: _mm_unpacklo_epi16 ( b0, b1) . into_bits ( ) ; // 3210 gcea
248
- let c1 = vendor:: _mm_unpackhi_epi16 ( b0, b1) . into_bits ( ) ; // 3210 hdfb
249
- let c2 = vendor:: _mm_unpacklo_epi16 ( b2, b3) . into_bits ( ) ; // 3210 okmi
250
- let c3 = vendor:: _mm_unpackhi_epi16 ( b2, b3) . into_bits ( ) ; // 3210 plnj
253
+ let c0 = vendor:: _mm_unpacklo_epi16 ( b0, b1) ; // 3210 gcea
254
+ let c1 = vendor:: _mm_unpackhi_epi16 ( b0, b1) ; // 3210 hdfb
255
+ let c2 = vendor:: _mm_unpacklo_epi16 ( b2, b3) ; // 3210 okmi
256
+ let c3 = vendor:: _mm_unpackhi_epi16 ( b2, b3) ; // 3210 plnj
251
257
252
258
// [ 1h 1d 1f 1b 1g 1c 1e 1a ] [ 0h 0d 0f 0b 0g 0c 0e 0a ]
253
- let d0 = vendor:: _mm_unpacklo_epi32 ( c0, c1) . into_bits ( ) ; // 10 hdfbgcea
254
- let d1 = vendor:: _mm_unpackhi_epi32 ( c0, c1) . into_bits ( ) ; // 32 hdfbgcea
255
- let d2 = vendor:: _mm_unpacklo_epi32 ( c2, c3) . into_bits ( ) ; // 10 plnjokmi
256
- let d3 = vendor:: _mm_unpackhi_epi32 ( c2, c3) . into_bits ( ) ; // 32 plnjokmi
259
+ let d0 = vendor:: _mm_unpacklo_epi32 ( c0, c1) ; // 10 hdfbgcea
260
+ let d1 = vendor:: _mm_unpackhi_epi32 ( c0, c1) ; // 32 hdfbgcea
261
+ let d2 = vendor:: _mm_unpacklo_epi32 ( c2, c3) ; // 10 plnjokmi
262
+ let d3 = vendor:: _mm_unpackhi_epi32 ( c2, c3) ; // 32 plnjokmi
257
263
258
264
// [ 0p 0l 0n 0j 0o 0k 0m 0i 0h 0d 0f 0b 0g 0c 0e 0a ]
259
- let e0 = vendor:: _mm_unpacklo_epi64 ( d0, d2) . into_bits ( ) ; // 0 plnjokmihdfbgcea
260
- let e1 = vendor:: _mm_unpackhi_epi64 ( d0, d2) . into_bits ( ) ; // 1 plnjokmihdfbgcea
261
- let e2 = vendor:: _mm_unpacklo_epi64 ( d1, d3) . into_bits ( ) ; // 2 plnjokmihdfbgcea
262
- let e3 = vendor:: _mm_unpackhi_epi64 ( d1, d3) . into_bits ( ) ; // 3 plnjokmihdfbgcea
265
+ let e0 = vendor:: _mm_unpacklo_epi64 ( d0, d2) ; // 0 plnjokmihdfbgcea
266
+ let e1 = vendor:: _mm_unpackhi_epi64 ( d0, d2) ; // 1 plnjokmihdfbgcea
267
+ let e2 = vendor:: _mm_unpacklo_epi64 ( d1, d3) ; // 2 plnjokmihdfbgcea
268
+ let e3 = vendor:: _mm_unpackhi_epi64 ( d1, d3) ; // 3 plnjokmihdfbgcea
263
269
264
270
let f = self . apply :: < simd16:: Simd16Mode > ( [
265
- simd16:: Simd16U8 ( e0 ) ,
266
- simd16:: Simd16U8 ( e1 ) ,
267
- simd16:: Simd16U8 ( e2 ) ,
268
- simd16:: Simd16U8 ( e3 ) ,
271
+ simd16:: Simd16U8 ( transmute ( e0 ) ) ,
272
+ simd16:: Simd16U8 ( transmute ( e1 ) ) ,
273
+ simd16:: Simd16U8 ( transmute ( e2 ) ) ,
274
+ simd16:: Simd16U8 ( transmute ( e3 ) ) ,
269
275
] ) ;
270
276
271
- let f0 = f[ 0 ] . 0 . into_bits ( ) ;
272
- let f1 = f[ 1 ] . 0 . into_bits ( ) ;
273
- let f2 = f[ 2 ] . 0 . into_bits ( ) ;
274
- let f3 = f[ 3 ] . 0 . into_bits ( ) ;
277
+ let f0 = transmute ( f[ 0 ] . 0 ) ;
278
+ let f1 = transmute ( f[ 1 ] . 0 ) ;
279
+ let f2 = transmute ( f[ 2 ] . 0 ) ;
280
+ let f3 = transmute ( f[ 3 ] . 0 ) ;
275
281
276
282
// [ 1h 0h ] [ 1d 0d ] [ 1f 0f ] [ 1b 0b ] [ 1g 0g ] [ 1c 0c ] [ 1e 0e ] [ 1a 0a ]
277
- let g0 = vendor:: _mm_unpacklo_epi8 ( f0, f1) . into_bits ( ) ; // hdfbgcea 10
278
- let g1 = vendor:: _mm_unpackhi_epi8 ( f0, f1) . into_bits ( ) ; // plnjokmi 10
279
- let g2 = vendor:: _mm_unpacklo_epi8 ( f2, f3) . into_bits ( ) ; // hdfbgcea 32
280
- let g3 = vendor:: _mm_unpackhi_epi8 ( f2, f3) . into_bits ( ) ; // plnjokmi 32
283
+ let g0 = vendor:: _mm_unpacklo_epi8 ( f0, f1) ; // hdfbgcea 10
284
+ let g1 = vendor:: _mm_unpackhi_epi8 ( f0, f1) ; // plnjokmi 10
285
+ let g2 = vendor:: _mm_unpacklo_epi8 ( f2, f3) ; // hdfbgcea 32
286
+ let g3 = vendor:: _mm_unpackhi_epi8 ( f2, f3) ; // plnjokmi 32
281
287
282
288
// [ 3g 2g 1g 0g ] [ 3c 2c 1c 0c ] [ 3e 2e 1e 0e ] [ 3a 2a 1a 0a ]
283
- let h0 = vendor:: _mm_unpacklo_epi16 ( g0, g2) . into_bits ( ) ; // gcea 3210
284
- let h1 = vendor:: _mm_unpackhi_epi16 ( g0, g2) . into_bits ( ) ; // hdfb 3210
285
- let h2 = vendor:: _mm_unpacklo_epi16 ( g1, g3) . into_bits ( ) ; // okmi 3210
286
- let h3 = vendor:: _mm_unpackhi_epi16 ( g1, g3) . into_bits ( ) ; // plnj 3210
287
-
288
- let i0 = vendor:: _mm_unpacklo_epi32 ( h0, h1) . into_bits ( ) ; // feba 3210
289
- let i1 = vendor:: _mm_unpackhi_epi32 ( h0, h1) . into_bits ( ) ; // hgdc 3210
290
- let i2 = vendor:: _mm_unpacklo_epi32 ( h2, h3) . into_bits ( ) ; // nmji 3210
291
- let i3 = vendor:: _mm_unpackhi_epi32 ( h2, h3) . into_bits ( ) ; // polk 3210
292
-
293
- let j0 = vendor:: _mm_unpacklo_epi64 ( i0, i1) . into_bits ( ) ; // dcba 3210
294
- let j1 = vendor:: _mm_unpackhi_epi64 ( i0, i1) . into_bits ( ) ; // hgfe 3210
295
- let j2 = vendor:: _mm_unpacklo_epi64 ( i2, i3) . into_bits ( ) ; // lkji 3210
296
- let j3 = vendor:: _mm_unpackhi_epi64 ( i2, i3) . into_bits ( ) ; // ponm 3210
289
+ let h0 = vendor:: _mm_unpacklo_epi16 ( g0, g2) ; // gcea 3210
290
+ let h1 = vendor:: _mm_unpackhi_epi16 ( g0, g2) ; // hdfb 3210
291
+ let h2 = vendor:: _mm_unpacklo_epi16 ( g1, g3) ; // okmi 3210
292
+ let h3 = vendor:: _mm_unpackhi_epi16 ( g1, g3) ; // plnj 3210
293
+
294
+ let i0 = vendor:: _mm_unpacklo_epi32 ( h0, h1) ; // feba 3210
295
+ let i1 = vendor:: _mm_unpackhi_epi32 ( h0, h1) ; // hgdc 3210
296
+ let i2 = vendor:: _mm_unpacklo_epi32 ( h2, h3) ; // nmji 3210
297
+ let i3 = vendor:: _mm_unpackhi_epi32 ( h2, h3) ; // polk 3210
298
+
299
+ let j0 = vendor:: _mm_unpacklo_epi64 ( i0, i1) ; // dcba 3210
300
+ let j1 = vendor:: _mm_unpackhi_epi64 ( i0, i1) ; // hgfe 3210
301
+ let j2 = vendor:: _mm_unpacklo_epi64 ( i2, i3) ; // lkji 3210
302
+ let j3 = vendor:: _mm_unpackhi_epi64 ( i2, i3) ; // ponm 3210
297
303
298
304
[ j0, j1, j2, j3]
299
305
}
0 commit comments