@@ -401,23 +401,23 @@ class alignas(16) GSVector2i
401401 template <int i>
402402 ALWAYS_INLINE GSVector2i sll () const
403403 {
404- return GSVector2i (vreinterpret_s32_s8 (vext_s8 (vdup_n_s8 (0 ), vreinterpret_s8_s32 (v2s), 16 - i)));
404+ return GSVector2i (vreinterpret_s32_s8 (vext_s8 (vdup_n_s8 (0 ), vreinterpret_s8_s32 (v2s), 8 - i)));
405405 }
406406
407407 template <int i>
408408 ALWAYS_INLINE GSVector2i sll16 () const
409409 {
410- return GSVector2i (vreinterpret_s32_s16 ( vshl_n_s16 ( vreinterpret_s16_s32 (v2s), i)));
410+ return GSVector2i (vreinterpret_s32_u16 ( vshl_n_u16 ( vreinterpret_u16_s32 (v2s), i)));
411411 }
412412
413413 ALWAYS_INLINE GSVector2i sll16 (s32 i) const
414414 {
415- return GSVector2i (vreinterpret_s32_s16 ( vshl_s16 ( vreinterpret_s16_s32 (v2s), vdup_n_s16 (i))));
415+ return GSVector2i (vreinterpret_s32_u16 ( vshl_u16 ( vreinterpret_u16_s32 (v2s), vdup_n_s16 (i))));
416416 }
417417
418418 ALWAYS_INLINE GSVector2i sllv16 (const GSVector2i& v) const
419419 {
420- return GSVector2i (vreinterpret_s32_s16 ( vshl_s16 ( vreinterpret_s16_s32 (v2s), vreinterpret_s16_s32 (v.v2s ))));
420+ return GSVector2i (vreinterpret_s32_u16 ( vshl_u16 ( vreinterpret_u16_s32 (v2s), vreinterpret_s16_s32 (v.v2s ))));
421421 }
422422
423423 template <int i>
@@ -459,9 +459,15 @@ class alignas(16) GSVector2i
459459 return GSVector2i (vshl_n_s32 (v2s, i));
460460 }
461461
462- ALWAYS_INLINE GSVector2i sll32 (s32 i) const { return GSVector2i (vshl_s32 (v2s, vdup_n_s32 (i))); }
462+ ALWAYS_INLINE GSVector2i sll32 (s32 i) const
463+ {
464+ return GSVector2i (vreinterpret_s32_u32 (vshl_u32 (vreinterpret_u32_s32 (v2s), vdup_n_s32 (i))));
465+ }
463466
464- ALWAYS_INLINE GSVector2i sllv32 (const GSVector2i& v) const { return GSVector2i (vshl_s32 (v2s, v.v2s )); }
467+ ALWAYS_INLINE GSVector2i sllv32 (const GSVector2i& v) const
468+ {
469+ return GSVector2i (vreinterpret_s32_u32 (vshl_u32 (vreinterpret_u32_s32 (v2s), v.v2s )));
470+ }
465471
466472 template <int i>
467473 ALWAYS_INLINE GSVector2i srl32 () const
@@ -553,6 +559,16 @@ class alignas(16) GSVector2i
553559 return GSVector2i (vreinterpret_s32_u16 (vqsub_u16 (vreinterpret_u16_s32 (v2s), vreinterpret_u16_s32 (v.v2s ))));
554560 }
555561
562+ ALWAYS_INLINE GSVector2i avg8 (const GSVector2i& v) const
563+ {
564+ return GSVector2i (vreinterpret_s32_u8 (vrhadd_u8 (vreinterpret_u8_s32 (v2s), vreinterpret_u8_s32 (v.v2s ))));
565+ }
566+
567+ ALWAYS_INLINE GSVector2i avg16 (const GSVector2i& v) const
568+ {
569+ return GSVector2i (vreinterpret_s32_u16 (vrhadd_u16 (vreinterpret_u16_s32 (v2s), vreinterpret_u16_s32 (v.v2s ))));
570+ }
571+
556572 ALWAYS_INLINE GSVector2i mul16l (const GSVector2i& v) const
557573 {
558574 return GSVector2i (vreinterpret_s32_s16 (vmul_s16 (vreinterpret_s16_s32 (v2s), vreinterpret_s16_s32 (v.v2s ))));
@@ -707,7 +723,7 @@ class alignas(16) GSVector2i
707723 return GSVector2i (vset_lane_s32 (val, vdup_n_s32 (0 ), 0 ));
708724 }
709725
710- ALWAYS_INLINE static GSVector2i zext32 (s32 v) { return GSVector2i (vset_lane_s32 (v, vdup_n_s32 (0 ), 0 )); }
726+ ALWAYS_INLINE static GSVector2i set32 (s32 v) { return GSVector2i (vset_lane_s32 (v, vdup_n_s32 (0 ), 0 )); }
711727
712728 template <bool aligned>
713729 ALWAYS_INLINE static GSVector2i load (const void * p)
@@ -886,7 +902,7 @@ class alignas(16) GSVector2
886902 template <int mask>
887903 ALWAYS_INLINE GSVector2 blend32 (const GSVector2& a) const
888904 {
889- return GSVector2 (__builtin_shufflevector (v2s, a.v2s , (mask & 1 ) ? 4 : 0 , (mask & 2 ) ? 5 : 1 ));
905+ return GSVector2 (__builtin_shufflevector (v2s, a.v2s , (mask & 1 ) ? 2 : 0 , (mask & 2 ) ? 3 : 1 ));
890906 }
891907
892908 ALWAYS_INLINE GSVector2 blend32 (const GSVector2& a, const GSVector2& mask) const
@@ -1324,15 +1340,16 @@ class alignas(16) GSVector4i
13241340 ALWAYS_INLINE GSVector4i madd_s16 (const GSVector4i& v) const
13251341 {
13261342#ifdef CPU_ARCH_ARM64
1327- const int32x4_t acc =
1328- vmlal_s16 (vdupq_n_s32 (0 ), vget_low_s16 (vreinterpretq_s16_s32 (v4s)), vget_low_s16 (vreinterpretq_s16_s32 (v.v4s )));
1329- return GSVector4i (vmlal_high_s16 (acc, vreinterpretq_s16_s32 (v4s), vreinterpretq_s16_s32 (v.v4s )));
1343+ const int32x4_t low =
1344+ vmull_s16 (vget_low_s16 (vreinterpretq_s16_s32 (v4s)), vget_low_s16 (vreinterpretq_s16_s32 (v.v4s )));
1345+ const int32x4_t high = vmull_high_s16 (vreinterpretq_s16_s32 (v4s), vreinterpretq_s16_s32 (v.v4s ));
1346+ return GSVector4i (vpaddq_s32 (low, high));
13301347#else
13311348 // borrowed from sse2neon
13321349 const int32x4_t low =
1333- vmlal_s16 ( vdupq_n_s32 ( 0 ), vget_low_s16 (vreinterpretq_s16_s32 (v4s)), vget_low_s16 (vreinterpretq_s16_s32 (v.v4s )));
1350+ vmull_s16 ( vget_low_s16 (vreinterpretq_s16_s32 (v4s)), vget_low_s16 (vreinterpretq_s16_s32 (v.v4s )));
13341351 const int32x4_t high =
1335- vmlal_s16 ( vdupq_n_s32 ( 0 ), vget_high_s16 (vreinterpretq_s16_s32 (v4s)), vget_high_s16 (vreinterpretq_s16_s32 (v.v4s )));
1352+ vmull_s16 ( vget_high_s16 (vreinterpretq_s16_s32 (v4s)), vget_high_s16 (vreinterpretq_s16_s32 (v.v4s )));
13361353 return GSVector4i (vcombine_s32 (vpadd_s32 (vget_low_s32 (low), vget_high_s32 (low)),
13371354 vpadd_s32 (vget_low_s32 (high), vget_high_s32 (high))));
13381355#endif
@@ -1756,17 +1773,17 @@ class alignas(16) GSVector4i
17561773 template <int i>
17571774 ALWAYS_INLINE GSVector4i sll16 () const
17581775 {
1759- return GSVector4i (vreinterpretq_s32_s16 ( vshlq_n_s16 ( vreinterpretq_s16_s32 (v4s), i)));
1776+ return GSVector4i (vreinterpretq_s32_u16 ( vshlq_n_u16 ( vreinterpretq_u16_s32 (v4s), i)));
17601777 }
17611778
17621779 ALWAYS_INLINE GSVector4i sll16 (s32 i) const
17631780 {
1764- return GSVector4i (vreinterpretq_s32_s16 ( vshlq_s16 ( vreinterpretq_s16_s32 (v4s), vdupq_n_s16 (i))));
1781+ return GSVector4i (vreinterpretq_s32_u16 ( vshlq_u16 ( vreinterpretq_u16_s32 (v4s), vdupq_n_s16 (i))));
17651782 }
17661783
17671784 ALWAYS_INLINE GSVector4i sllv16 (const GSVector4i& v) const
17681785 {
1769- return GSVector4i (vreinterpretq_s32_s16 ( vshlq_s16 ( vreinterpretq_s16_s32 (v4s), vreinterpretq_s16_s32 (v.v4s ))));
1786+ return GSVector4i (vreinterpretq_s32_u16 ( vshlq_u16 ( vreinterpretq_u16_s32 (v4s), vreinterpretq_u16_s32 (v.v4s ))));
17701787 }
17711788
17721789 template <int i>
@@ -1783,7 +1800,7 @@ class alignas(16) GSVector4i
17831800 ALWAYS_INLINE GSVector4i srlv16 (const GSVector4i& v) const
17841801 {
17851802 return GSVector4i (
1786- vreinterpretq_s32_s16 ( vshlq_s16 ( vreinterpretq_s16_s32 (v4s), vnegq_s16 (vreinterpretq_s16_s32 (v.v4s )))));
1803+ vreinterpretq_s32_u16 ( vshlq_u16 ( vreinterpretq_u16_s32 (v4s), vnegq_s16 (vreinterpretq_s16_s32 (v.v4s )))));
17871804 }
17881805
17891806 template <int i>
@@ -1810,9 +1827,15 @@ class alignas(16) GSVector4i
18101827 return GSVector4i (vshlq_n_s32 (v4s, i));
18111828 }
18121829
1813- ALWAYS_INLINE GSVector4i sll32 (s32 i) const { return GSVector4i (vshlq_s32 (v4s, vdupq_n_s32 (i))); }
1830+ ALWAYS_INLINE GSVector4i sll32 (s32 i) const
1831+ {
1832+ return GSVector4i (vreinterpretq_s32_u32 (vshlq_u32 (vreinterpretq_u32_s32 (v4s), vdupq_n_s32 (i))));
1833+ }
18141834
1815- ALWAYS_INLINE GSVector4i sllv32 (const GSVector4i& v) const { return GSVector4i (vshlq_s32 (v4s, v.v4s )); }
1835+ ALWAYS_INLINE GSVector4i sllv32 (const GSVector4i& v) const
1836+ {
1837+ return GSVector4i (vreinterpretq_s32_u32 (vshlq_u32 (vreinterpretq_u32_s32 (v4s), v.v4s )));
1838+ }
18161839
18171840 template <int i>
18181841 ALWAYS_INLINE GSVector4i srl32 () const
@@ -1843,17 +1866,17 @@ class alignas(16) GSVector4i
18431866 template <int i>
18441867 ALWAYS_INLINE GSVector4i sll64 () const
18451868 {
1846- return GSVector4i (vreinterpretq_s32_s64 ( vshlq_n_s64 ( vreinterpretq_s64_s32 (v4s), i)));
1869+ return GSVector4i (vreinterpretq_s32_u64 ( vshlq_n_u64 ( vreinterpretq_u64_s32 (v4s), i)));
18471870 }
18481871
18491872 ALWAYS_INLINE GSVector4i sll64 (s32 i) const
18501873 {
1851- return GSVector4i (vreinterpretq_s32_s64 ( vshlq_s64 ( vreinterpretq_s64_s32 (v4s), vdupq_n_s64 (i))));
1874+ return GSVector4i (vreinterpretq_s32_u64 ( vshlq_u64 ( vreinterpretq_u64_s32 (v4s), vdupq_n_s64 (i))));
18521875 }
18531876
18541877 ALWAYS_INLINE GSVector4i sllv64 (const GSVector4i& v) const
18551878 {
1856- return GSVector4i (vreinterpretq_s32_s64 ( vshlq_s64 ( vreinterpretq_s64_s32 (v4s), vreinterpretq_s64_s32 (v.v4s ))));
1879+ return GSVector4i (vreinterpretq_s32_u64 ( vshlq_u64 ( vreinterpretq_u64_s32 (v4s), vreinterpretq_s64_s32 (v.v4s ))));
18571880 }
18581881
18591882 template <int i>
@@ -2771,7 +2794,7 @@ class alignas(16) GSVector4
27712794
27722795 ALWAYS_INLINE GSVector4 h2l (const GSVector4& a) const
27732796 {
2774- return GSVector4 (vcombine_f32 (vget_high_f32 (v4s), vget_high_f32 (a. v4s )));
2797+ return GSVector4 (vcombine_f32 (vget_high_f32 (a. v4s ), vget_high_f32 (v4s)));
27752798 }
27762799
27772800 ALWAYS_INLINE GSVector4 andnot (const GSVector4& v) const
@@ -3163,7 +3186,7 @@ class alignas(16) GSVector4
31633186 ALWAYS_INLINE GSVector4 lt64 (const GSVector4& v) const
31643187 {
31653188#ifdef CPU_ARCH_ARM64
3166- return GSVector4 (vreinterpretq_f32_u64 (vcgtq_f64 (vreinterpretq_f64_f32 (v4s), vreinterpretq_f64_f32 (v.v4s ))));
3189+ return GSVector4 (vreinterpretq_f32_u64 (vcltq_f64 (vreinterpretq_f64_f32 (v4s), vreinterpretq_f64_f32 (v.v4s ))));
31673190#else
31683191 GSVector4 ret;
31693192 ret.U64 [0 ] = (F64[0 ] < v.F64 [0 ]) ? 0xFFFFFFFFFFFFFFFFULL : 0 ;
@@ -3230,7 +3253,7 @@ class alignas(16) GSVector4
32303253 ALWAYS_INLINE GSVector4 sqr64 () const
32313254 {
32323255#ifdef CPU_ARCH_ARM64
3233- return GSVector4 (vreinterpretq_f32_f64 (vsqrtq_f64 ( vreinterpretq_f64_f32 (v4s))));
3256+ return GSVector4 (vreinterpretq_f32_f64 (vmulq_f64 ( vreinterpretq_f64_f32 (v4s), vreinterpretq_f64_f32 (v4s))));
32343257#else
32353258 return GSVector4::f64 (F64[0 ] * F64[0 ], F64[1 ] * F64[1 ]);
32363259#endif
0 commit comments