@@ -114,6 +114,24 @@ pub unsafe fn _mm512_maskz_madd52lo_epu64(
114
114
simd_select_bitmask ( k, vpmadd52luq_512 ( a, b, c) , _mm512_setzero_si512 ( ) )
115
115
}
116
116
117
+ /// Multiply packed unsigned 52-bit integers in each 64-bit element of
118
+ /// `b` and `c` to form a 104-bit intermediate result. Add the high 52-bit
119
+ /// unsigned integer from the intermediate result with the
120
+ /// corresponding unsigned 64-bit integer in `a`, and store the
121
+ /// results in `dst`.
122
+ ///
123
+ /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_madd52hi_avx_epu64)
124
+ #[ inline]
125
+ #[ target_feature( enable = "avxifma" ) ]
126
+ #[ unstable( feature = "stdarch_x86_avx512" , issue = "111137" ) ]
127
+ #[ cfg_attr(
128
+ all( test, any( target_os = "linux" , target_env = "msvc" ) ) ,
129
+ assert_instr( vpmadd52huq)
130
+ ) ]
131
+ pub unsafe fn _mm256_madd52hi_avx_epu64 ( a : __m256i , b : __m256i , c : __m256i ) -> __m256i {
132
+ vpmadd52huq_256 ( a, b, c)
133
+ }
134
+
117
135
/// Multiply packed unsigned 52-bit integers in each 64-bit element of
118
136
/// `b` and `c` to form a 104-bit intermediate result. Add the high 52-bit
119
137
/// unsigned integer from the intermediate result with the
@@ -169,6 +187,24 @@ pub unsafe fn _mm256_maskz_madd52hi_epu64(
169
187
simd_select_bitmask ( k, vpmadd52huq_256 ( a, b, c) , _mm256_setzero_si256 ( ) )
170
188
}
171
189
190
+ /// Multiply packed unsigned 52-bit integers in each 64-bit element of
191
+ /// `b` and `c` to form a 104-bit intermediate result. Add the low 52-bit
192
+ /// unsigned integer from the intermediate result with the
193
+ /// corresponding unsigned 64-bit integer in `a`, and store the
194
+ /// results in `dst`.
195
+ ///
196
+ /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_madd52lo_avx_epu64)
197
+ #[ inline]
198
+ #[ target_feature( enable = "avxifma" ) ]
199
+ #[ unstable( feature = "stdarch_x86_avx512" , issue = "111137" ) ]
200
+ #[ cfg_attr(
201
+ all( test, any( target_os = "linux" , target_env = "msvc" ) ) ,
202
+ assert_instr( vpmadd52luq)
203
+ ) ]
204
+ pub unsafe fn _mm256_madd52lo_avx_epu64 ( a : __m256i , b : __m256i , c : __m256i ) -> __m256i {
205
+ vpmadd52luq_256 ( a, b, c)
206
+ }
207
+
172
208
/// Multiply packed unsigned 52-bit integers in each 64-bit element of
173
209
/// `b` and `c` to form a 104-bit intermediate result. Add the low 52-bit
174
210
/// unsigned integer from the intermediate result with the
@@ -224,6 +260,24 @@ pub unsafe fn _mm256_maskz_madd52lo_epu64(
224
260
simd_select_bitmask ( k, vpmadd52luq_256 ( a, b, c) , _mm256_setzero_si256 ( ) )
225
261
}
226
262
263
+ /// Multiply packed unsigned 52-bit integers in each 64-bit element of
264
+ /// `b` and `c` to form a 104-bit intermediate result. Add the high 52-bit
265
+ /// unsigned integer from the intermediate result with the
266
+ /// corresponding unsigned 64-bit integer in `a`, and store the
267
+ /// results in `dst`.
268
+ ///
269
+ /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_madd52hi_avx_epu64)
270
+ #[ inline]
271
+ #[ target_feature( enable = "avxifma" ) ]
272
+ #[ unstable( feature = "stdarch_x86_avx512" , issue = "111137" ) ]
273
+ #[ cfg_attr(
274
+ all( test, any( target_os = "linux" , target_env = "msvc" ) ) ,
275
+ assert_instr( vpmadd52huq)
276
+ ) ]
277
+ pub unsafe fn _mm_madd52hi_avx_epu64 ( a : __m128i , b : __m128i , c : __m128i ) -> __m128i {
278
+ vpmadd52huq_128 ( a, b, c)
279
+ }
280
+
227
281
/// Multiply packed unsigned 52-bit integers in each 64-bit element of
228
282
/// `b` and `c` to form a 104-bit intermediate result. Add the high 52-bit
229
283
/// unsigned integer from the intermediate result with the
@@ -269,6 +323,24 @@ pub unsafe fn _mm_maskz_madd52hi_epu64(k: __mmask8, a: __m128i, b: __m128i, c: _
269
323
simd_select_bitmask ( k, vpmadd52huq_128 ( a, b, c) , _mm_setzero_si128 ( ) )
270
324
}
271
325
326
+ /// Multiply packed unsigned 52-bit integers in each 64-bit element of
327
+ /// `b` and `c` to form a 104-bit intermediate result. Add the low 52-bit
328
+ /// unsigned integer from the intermediate result with the
329
+ /// corresponding unsigned 64-bit integer in `a`, and store the
330
+ /// results in `dst`.
331
+ ///
332
+ /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_madd52lo_avx_epu64)
333
+ #[ inline]
334
+ #[ target_feature( enable = "avxifma" ) ]
335
+ #[ unstable( feature = "stdarch_x86_avx512" , issue = "111137" ) ]
336
+ #[ cfg_attr(
337
+ all( test, any( target_os = "linux" , target_env = "msvc" ) ) ,
338
+ assert_instr( vpmadd52luq)
339
+ ) ]
340
+ pub unsafe fn _mm_madd52lo_avx_epu64 ( a : __m128i , b : __m128i , c : __m128i ) -> __m128i {
341
+ vpmadd52luq_128 ( a, b, c)
342
+ }
343
+
272
344
/// Multiply packed unsigned 52-bit integers in each 64-bit element of
273
345
/// `b` and `c` to form a 104-bit intermediate result. Add the low 52-bit
274
346
/// unsigned integer from the intermediate result with the
@@ -427,6 +499,20 @@ mod tests {
427
499
assert_eq_m512i ( expected, actual) ;
428
500
}
429
501
502
+ #[ simd_test( enable = "avxifma" ) ]
503
+ unsafe fn test_mm256_madd52hi_avx_epu64 ( ) {
504
+ let a = _mm256_set1_epi64x ( 10 << 40 ) ;
505
+ let b = _mm256_set1_epi64x ( ( 11 << 40 ) + 4 ) ;
506
+ let c = _mm256_set1_epi64x ( ( 12 << 40 ) + 3 ) ;
507
+
508
+ let actual = _mm256_madd52hi_avx_epu64 ( a, b, c) ;
509
+
510
+ // (10 << 40) + ((((11 << 40) + 4) * ((12 << 40) + 3)) >> 52)
511
+ let expected = _mm256_set1_epi64x ( 11030549757952 ) ;
512
+
513
+ assert_eq_m256i ( expected, actual) ;
514
+ }
515
+
430
516
#[ simd_test( enable = "avx512ifma,avx512vl" ) ]
431
517
unsafe fn test_mm256_madd52hi_epu64 ( ) {
432
518
let a = _mm256_set1_epi64x ( 10 << 40 ) ;
@@ -471,6 +557,20 @@ mod tests {
471
557
assert_eq_m256i ( expected, actual) ;
472
558
}
473
559
560
+ #[ simd_test( enable = "avxifma" ) ]
561
+ unsafe fn test_mm256_madd52lo_avx_epu64 ( ) {
562
+ let a = _mm256_set1_epi64x ( 10 << 40 ) ;
563
+ let b = _mm256_set1_epi64x ( ( 11 << 40 ) + 4 ) ;
564
+ let c = _mm256_set1_epi64x ( ( 12 << 40 ) + 3 ) ;
565
+
566
+ let actual = _mm256_madd52lo_avx_epu64 ( a, b, c) ;
567
+
568
+ // (10 << 40) + ((((11 << 40) + 4) * ((12 << 40) + 3)) % (1 << 52))
569
+ let expected = _mm256_set1_epi64x ( 100055558127628 ) ;
570
+
571
+ assert_eq_m256i ( expected, actual) ;
572
+ }
573
+
474
574
#[ simd_test( enable = "avx512ifma,avx512vl" ) ]
475
575
unsafe fn test_mm256_madd52lo_epu64 ( ) {
476
576
let a = _mm256_set1_epi64x ( 10 << 40 ) ;
@@ -515,6 +615,20 @@ mod tests {
515
615
assert_eq_m256i ( expected, actual) ;
516
616
}
517
617
618
+ #[ simd_test( enable = "avxifma" ) ]
619
+ unsafe fn test_mm_madd52hi_avx_epu64 ( ) {
620
+ let a = _mm_set1_epi64x ( 10 << 40 ) ;
621
+ let b = _mm_set1_epi64x ( ( 11 << 40 ) + 4 ) ;
622
+ let c = _mm_set1_epi64x ( ( 12 << 40 ) + 3 ) ;
623
+
624
+ let actual = _mm_madd52hi_avx_epu64 ( a, b, c) ;
625
+
626
+ // (10 << 40) + ((((11 << 40) + 4) * ((12 << 40) + 3)) >> 52)
627
+ let expected = _mm_set1_epi64x ( 11030549757952 ) ;
628
+
629
+ assert_eq_m128i ( expected, actual) ;
630
+ }
631
+
518
632
#[ simd_test( enable = "avx512ifma,avx512vl" ) ]
519
633
unsafe fn test_mm_madd52hi_epu64 ( ) {
520
634
let a = _mm_set1_epi64x ( 10 << 40 ) ;
@@ -559,6 +673,20 @@ mod tests {
559
673
assert_eq_m128i ( expected, actual) ;
560
674
}
561
675
676
+ #[ simd_test( enable = "avxifma" ) ]
677
+ unsafe fn test_mm_madd52lo_avx_epu64 ( ) {
678
+ let a = _mm_set1_epi64x ( 10 << 40 ) ;
679
+ let b = _mm_set1_epi64x ( ( 11 << 40 ) + 4 ) ;
680
+ let c = _mm_set1_epi64x ( ( 12 << 40 ) + 3 ) ;
681
+
682
+ let actual = _mm_madd52lo_avx_epu64 ( a, b, c) ;
683
+
684
+ // (10 << 40) + ((((11 << 40) + 4) * ((12 << 40) + 3)) % (1 << 52))
685
+ let expected = _mm_set1_epi64x ( 100055558127628 ) ;
686
+
687
+ assert_eq_m128i ( expected, actual) ;
688
+ }
689
+
562
690
#[ simd_test( enable = "avx512ifma,avx512vl" ) ]
563
691
unsafe fn test_mm_madd52lo_epu64 ( ) {
564
692
let a = _mm_set1_epi64x ( 10 << 40 ) ;
0 commit comments