Skip to content

Commit 351e787

Browse files
committed
Common: Fix a bunch of errors in ARM vector wrapper
The tests pass on ARM64 now, at least.
1 parent 3fc563e commit 351e787

File tree

2 files changed

+59
-40
lines changed

2 files changed

+59
-40
lines changed

src/common-tests/gsvector_tests.cpp

Lines changed: 11 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -194,24 +194,12 @@ TEST(GSVector2iTest, UnpackOperations)
194194
EXPECT_EQ(upl8_result.U8[5], 0);
195195
EXPECT_EQ(upl8_result.U8[6], 0);
196196
EXPECT_EQ(upl8_result.U8[7], 0);
197-
EXPECT_EQ(upl8_result.U8[8], 0x56);
198-
EXPECT_EQ(upl8_result.U8[9], 0);
199-
EXPECT_EQ(upl8_result.U8[10], 0);
200-
EXPECT_EQ(upl8_result.U8[11], 0);
201-
EXPECT_EQ(upl8_result.U8[12], 0x78);
202-
EXPECT_EQ(upl8_result.U8[13], 0);
203-
EXPECT_EQ(upl8_result.U8[14], 0);
204-
EXPECT_EQ(upl8_result.U8[15], 0);
205197

206198
auto upl16_result = v1.upl16();
207199
EXPECT_EQ(upl16_result.U16[0], 0x12);
208200
EXPECT_EQ(upl16_result.U16[1], 0);
209201
EXPECT_EQ(upl16_result.U16[2], 0x34);
210202
EXPECT_EQ(upl16_result.U16[3], 0);
211-
EXPECT_EQ(upl16_result.U16[4], 0x56);
212-
EXPECT_EQ(upl16_result.U16[5], 0);
213-
EXPECT_EQ(upl16_result.U16[6], 0x78);
214-
EXPECT_EQ(upl16_result.U16[7], 0);
215203
}
216204

217205
TEST(GSVector2iTest, TypeConversions)
@@ -806,20 +794,28 @@ TEST(GSVector4iTest, Shift64BitOperations)
806794
#ifdef GSVECTOR_HAS_SRLV
807795
TEST(GSVector4iTest, VariableShifts)
808796
{
809-
GSVector4i v1(0x1000, 0x2000, 0x4000, 0x8000);
810-
GSVector4i shift_amounts(1, 2, 3, 4);
797+
GSVector4i v1(0x1000, 0x2000, 0x4000, 0x8000, 0x1000, 0x2000, 0x4000, 0x8000);
798+
GSVector4i shift_amounts(1, 2, 3, 4, 1, 2, 3, 4);
811799

812800
auto sllv16_result = v1.sllv16(shift_amounts);
813801
EXPECT_EQ(sllv16_result.U16[0], 0x2000); // 0x1000 << 1
814802
EXPECT_EQ(sllv16_result.U16[1], 0x8000); // 0x2000 << 2
815803
EXPECT_EQ(sllv16_result.U16[2], 0x0000); // 0x4000 << 3 (overflow)
816804
EXPECT_EQ(sllv16_result.U16[3], 0x0000); // 0x8000 << 4 (overflow)
805+
EXPECT_EQ(sllv16_result.U16[4], 0x2000); // 0x1000 << 1
806+
EXPECT_EQ(sllv16_result.U16[5], 0x8000); // 0x2000 << 2
807+
EXPECT_EQ(sllv16_result.U16[6], 0x0000); // 0x4000 << 3 (overflow)
808+
EXPECT_EQ(sllv16_result.U16[7], 0x0000); // 0x8000 << 4 (overflow)
817809

818810
auto srlv16_result = v1.srlv16(shift_amounts);
819811
EXPECT_EQ(srlv16_result.U16[0], 0x0800); // 0x1000 >> 1
820812
EXPECT_EQ(srlv16_result.U16[1], 0x0800); // 0x2000 >> 2
821813
EXPECT_EQ(srlv16_result.U16[2], 0x0800); // 0x4000 >> 3
822814
EXPECT_EQ(srlv16_result.U16[3], 0x0800); // 0x8000 >> 4
815+
EXPECT_EQ(srlv16_result.U16[4], 0x0800); // 0x1000 >> 1
816+
EXPECT_EQ(srlv16_result.U16[5], 0x0800); // 0x2000 >> 2
817+
EXPECT_EQ(srlv16_result.U16[6], 0x0800); // 0x4000 >> 3
818+
EXPECT_EQ(srlv16_result.U16[7], 0x0800); // 0x8000 >> 4
823819
}
824820
#endif
825821

@@ -1512,4 +1508,4 @@ TEST(GSVectorTest, Runion_IsCommutative)
15121508
GSVector4 result2 = rect2.runion(rect1);
15131509

15141510
EXPECT_TRUE(result1.eq(result2));
1515-
}
1511+
}

src/common/gsvector_neon.h

Lines changed: 48 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -401,23 +401,23 @@ class alignas(16) GSVector2i
401401
template<int i>
402402
ALWAYS_INLINE GSVector2i sll() const
403403
{
404-
return GSVector2i(vreinterpret_s32_s8(vext_s8(vdup_n_s8(0), vreinterpret_s8_s32(v2s), 16 - i)));
404+
return GSVector2i(vreinterpret_s32_s8(vext_s8(vdup_n_s8(0), vreinterpret_s8_s32(v2s), 8 - i)));
405405
}
406406

407407
template<int i>
408408
ALWAYS_INLINE GSVector2i sll16() const
409409
{
410-
return GSVector2i(vreinterpret_s32_s16(vshl_n_s16(vreinterpret_s16_s32(v2s), i)));
410+
return GSVector2i(vreinterpret_s32_u16(vshl_n_u16(vreinterpret_u16_s32(v2s), i)));
411411
}
412412

413413
ALWAYS_INLINE GSVector2i sll16(s32 i) const
414414
{
415-
return GSVector2i(vreinterpret_s32_s16(vshl_s16(vreinterpret_s16_s32(v2s), vdup_n_s16(i))));
415+
return GSVector2i(vreinterpret_s32_u16(vshl_u16(vreinterpret_u16_s32(v2s), vdup_n_s16(i))));
416416
}
417417

418418
ALWAYS_INLINE GSVector2i sllv16(const GSVector2i& v) const
419419
{
420-
return GSVector2i(vreinterpret_s32_s16(vshl_s16(vreinterpret_s16_s32(v2s), vreinterpret_s16_s32(v.v2s))));
420+
return GSVector2i(vreinterpret_s32_u16(vshl_u16(vreinterpret_u16_s32(v2s), vreinterpret_s16_s32(v.v2s))));
421421
}
422422

423423
template<int i>
@@ -459,9 +459,15 @@ class alignas(16) GSVector2i
459459
return GSVector2i(vshl_n_s32(v2s, i));
460460
}
461461

462-
ALWAYS_INLINE GSVector2i sll32(s32 i) const { return GSVector2i(vshl_s32(v2s, vdup_n_s32(i))); }
462+
ALWAYS_INLINE GSVector2i sll32(s32 i) const
463+
{
464+
return GSVector2i(vreinterpret_s32_u32(vshl_u32(vreinterpret_u32_s32(v2s), vdup_n_s32(i))));
465+
}
463466

464-
ALWAYS_INLINE GSVector2i sllv32(const GSVector2i& v) const { return GSVector2i(vshl_s32(v2s, v.v2s)); }
467+
ALWAYS_INLINE GSVector2i sllv32(const GSVector2i& v) const
468+
{
469+
return GSVector2i(vreinterpret_s32_u32(vshl_u32(vreinterpret_u32_s32(v2s), v.v2s)));
470+
}
465471

466472
template<int i>
467473
ALWAYS_INLINE GSVector2i srl32() const
@@ -553,6 +559,16 @@ class alignas(16) GSVector2i
553559
return GSVector2i(vreinterpret_s32_u16(vqsub_u16(vreinterpret_u16_s32(v2s), vreinterpret_u16_s32(v.v2s))));
554560
}
555561

562+
ALWAYS_INLINE GSVector2i avg8(const GSVector2i& v) const
563+
{
564+
return GSVector2i(vreinterpret_s32_u8(vrhadd_u8(vreinterpret_u8_s32(v2s), vreinterpret_u8_s32(v.v2s))));
565+
}
566+
567+
ALWAYS_INLINE GSVector2i avg16(const GSVector2i& v) const
568+
{
569+
return GSVector2i(vreinterpret_s32_u16(vrhadd_u16(vreinterpret_u16_s32(v2s), vreinterpret_u16_s32(v.v2s))));
570+
}
571+
556572
ALWAYS_INLINE GSVector2i mul16l(const GSVector2i& v) const
557573
{
558574
return GSVector2i(vreinterpret_s32_s16(vmul_s16(vreinterpret_s16_s32(v2s), vreinterpret_s16_s32(v.v2s))));
@@ -707,7 +723,7 @@ class alignas(16) GSVector2i
707723
return GSVector2i(vset_lane_s32(val, vdup_n_s32(0), 0));
708724
}
709725

710-
ALWAYS_INLINE static GSVector2i zext32(s32 v) { return GSVector2i(vset_lane_s32(v, vdup_n_s32(0), 0)); }
726+
ALWAYS_INLINE static GSVector2i set32(s32 v) { return GSVector2i(vset_lane_s32(v, vdup_n_s32(0), 0)); }
711727

712728
template<bool aligned>
713729
ALWAYS_INLINE static GSVector2i load(const void* p)
@@ -886,7 +902,7 @@ class alignas(16) GSVector2
886902
template<int mask>
887903
ALWAYS_INLINE GSVector2 blend32(const GSVector2& a) const
888904
{
889-
return GSVector2(__builtin_shufflevector(v2s, a.v2s, (mask & 1) ? 4 : 0, (mask & 2) ? 5 : 1));
905+
return GSVector2(__builtin_shufflevector(v2s, a.v2s, (mask & 1) ? 2 : 0, (mask & 2) ? 3 : 1));
890906
}
891907

892908
ALWAYS_INLINE GSVector2 blend32(const GSVector2& a, const GSVector2& mask) const
@@ -1324,15 +1340,16 @@ class alignas(16) GSVector4i
13241340
ALWAYS_INLINE GSVector4i madd_s16(const GSVector4i& v) const
13251341
{
13261342
#ifdef CPU_ARCH_ARM64
1327-
const int32x4_t acc =
1328-
vmlal_s16(vdupq_n_s32(0), vget_low_s16(vreinterpretq_s16_s32(v4s)), vget_low_s16(vreinterpretq_s16_s32(v.v4s)));
1329-
return GSVector4i(vmlal_high_s16(acc, vreinterpretq_s16_s32(v4s), vreinterpretq_s16_s32(v.v4s)));
1343+
const int32x4_t low =
1344+
vmull_s16(vget_low_s16(vreinterpretq_s16_s32(v4s)), vget_low_s16(vreinterpretq_s16_s32(v.v4s)));
1345+
const int32x4_t high = vmull_high_s16(vreinterpretq_s16_s32(v4s), vreinterpretq_s16_s32(v.v4s));
1346+
return GSVector4i(vpaddq_s32(low, high));
13301347
#else
13311348
// borrowed from sse2neon
13321349
const int32x4_t low =
1333-
vmlal_s16(vdupq_n_s32(0), vget_low_s16(vreinterpretq_s16_s32(v4s)), vget_low_s16(vreinterpretq_s16_s32(v.v4s)));
1350+
vmull_s16(vget_low_s16(vreinterpretq_s16_s32(v4s)), vget_low_s16(vreinterpretq_s16_s32(v.v4s)));
13341351
const int32x4_t high =
1335-
vmlal_s16(vdupq_n_s32(0), vget_high_s16(vreinterpretq_s16_s32(v4s)), vget_high_s16(vreinterpretq_s16_s32(v.v4s)));
1352+
vmull_s16(vget_high_s16(vreinterpretq_s16_s32(v4s)), vget_high_s16(vreinterpretq_s16_s32(v.v4s)));
13361353
return GSVector4i(vcombine_s32(vpadd_s32(vget_low_s32(low), vget_high_s32(low)),
13371354
vpadd_s32(vget_low_s32(high), vget_high_s32(high))));
13381355
#endif
@@ -1756,17 +1773,17 @@ class alignas(16) GSVector4i
17561773
template<int i>
17571774
ALWAYS_INLINE GSVector4i sll16() const
17581775
{
1759-
return GSVector4i(vreinterpretq_s32_s16(vshlq_n_s16(vreinterpretq_s16_s32(v4s), i)));
1776+
return GSVector4i(vreinterpretq_s32_u16(vshlq_n_u16(vreinterpretq_u16_s32(v4s), i)));
17601777
}
17611778

17621779
ALWAYS_INLINE GSVector4i sll16(s32 i) const
17631780
{
1764-
return GSVector4i(vreinterpretq_s32_s16(vshlq_s16(vreinterpretq_s16_s32(v4s), vdupq_n_s16(i))));
1781+
return GSVector4i(vreinterpretq_s32_u16(vshlq_u16(vreinterpretq_u16_s32(v4s), vdupq_n_s16(i))));
17651782
}
17661783

17671784
// Per-lane variable left shift on the 16-bit lanes: each lane of this vector is shifted by the
// count held in the matching 16-bit lane of v.
// vshlq_u16 takes the data as uint16x8_t but the shift counts as a *signed* int16x8_t, so the
// count vector is reinterpreted as s16 (same bits; avoids relying on lax vector conversions and
// matches the other variable-shift wrappers in this header).
ALWAYS_INLINE GSVector4i sllv16(const GSVector4i& v) const
17681785
{
1769-
return GSVector4i(vreinterpretq_s32_s16(vshlq_s16(vreinterpretq_s16_s32(v4s), vreinterpretq_s16_s32(v.v4s))));
1786+
return GSVector4i(vreinterpretq_s32_u16(vshlq_u16(vreinterpretq_u16_s32(v4s), vreinterpretq_s16_s32(v.v4s))));
17701787
}
17711788

17721789
template<int i>
@@ -1783,7 +1800,7 @@ class alignas(16) GSVector4i
17831800
ALWAYS_INLINE GSVector4i srlv16(const GSVector4i& v) const
17841801
{
17851802
return GSVector4i(
1786-
vreinterpretq_s32_s16(vshlq_s16(vreinterpretq_s16_s32(v4s), vnegq_s16(vreinterpretq_s16_s32(v.v4s)))));
1803+
vreinterpretq_s32_u16(vshlq_u16(vreinterpretq_u16_s32(v4s), vnegq_s16(vreinterpretq_s16_s32(v.v4s)))));
17871804
}
17881805

17891806
template<int i>
@@ -1810,9 +1827,15 @@ class alignas(16) GSVector4i
18101827
return GSVector4i(vshlq_n_s32(v4s, i));
18111828
}
18121829

1813-
ALWAYS_INLINE GSVector4i sll32(s32 i) const { return GSVector4i(vshlq_s32(v4s, vdupq_n_s32(i))); }
1830+
ALWAYS_INLINE GSVector4i sll32(s32 i) const
1831+
{
1832+
return GSVector4i(vreinterpretq_s32_u32(vshlq_u32(vreinterpretq_u32_s32(v4s), vdupq_n_s32(i))));
1833+
}
18141834

1815-
ALWAYS_INLINE GSVector4i sllv32(const GSVector4i& v) const { return GSVector4i(vshlq_s32(v4s, v.v4s)); }
1835+
ALWAYS_INLINE GSVector4i sllv32(const GSVector4i& v) const
1836+
{
1837+
return GSVector4i(vreinterpretq_s32_u32(vshlq_u32(vreinterpretq_u32_s32(v4s), v.v4s)));
1838+
}
18161839

18171840
template<int i>
18181841
ALWAYS_INLINE GSVector4i srl32() const
@@ -1843,17 +1866,17 @@ class alignas(16) GSVector4i
18431866
template<int i>
18441867
ALWAYS_INLINE GSVector4i sll64() const
18451868
{
1846-
return GSVector4i(vreinterpretq_s32_s64(vshlq_n_s64(vreinterpretq_s64_s32(v4s), i)));
1869+
return GSVector4i(vreinterpretq_s32_u64(vshlq_n_u64(vreinterpretq_u64_s32(v4s), i)));
18471870
}
18481871

18491872
ALWAYS_INLINE GSVector4i sll64(s32 i) const
18501873
{
1851-
return GSVector4i(vreinterpretq_s32_s64(vshlq_s64(vreinterpretq_s64_s32(v4s), vdupq_n_s64(i))));
1874+
return GSVector4i(vreinterpretq_s32_u64(vshlq_u64(vreinterpretq_u64_s32(v4s), vdupq_n_s64(i))));
18521875
}
18531876

18541877
ALWAYS_INLINE GSVector4i sllv64(const GSVector4i& v) const
18551878
{
1856-
return GSVector4i(vreinterpretq_s32_s64(vshlq_s64(vreinterpretq_s64_s32(v4s), vreinterpretq_s64_s32(v.v4s))));
1879+
return GSVector4i(vreinterpretq_s32_u64(vshlq_u64(vreinterpretq_u64_s32(v4s), vreinterpretq_s64_s32(v.v4s))));
18571880
}
18581881

18591882
template<int i>
@@ -2771,7 +2794,7 @@ class alignas(16) GSVector4
27712794

27722795
ALWAYS_INLINE GSVector4 h2l(const GSVector4& a) const
27732796
{
2774-
return GSVector4(vcombine_f32(vget_high_f32(v4s), vget_high_f32(a.v4s)));
2797+
return GSVector4(vcombine_f32(vget_high_f32(a.v4s), vget_high_f32(v4s)));
27752798
}
27762799

27772800
ALWAYS_INLINE GSVector4 andnot(const GSVector4& v) const
@@ -3163,7 +3186,7 @@ class alignas(16) GSVector4
31633186
ALWAYS_INLINE GSVector4 lt64(const GSVector4& v) const
31643187
{
31653188
#ifdef CPU_ARCH_ARM64
3166-
return GSVector4(vreinterpretq_f32_u64(vcgtq_f64(vreinterpretq_f64_f32(v4s), vreinterpretq_f64_f32(v.v4s))));
3189+
return GSVector4(vreinterpretq_f32_u64(vcltq_f64(vreinterpretq_f64_f32(v4s), vreinterpretq_f64_f32(v.v4s))));
31673190
#else
31683191
GSVector4 ret;
31693192
ret.U64[0] = (F64[0] < v.F64[0]) ? 0xFFFFFFFFFFFFFFFFULL : 0;
@@ -3230,7 +3253,7 @@ class alignas(16) GSVector4
32303253
ALWAYS_INLINE GSVector4 sqr64() const
32313254
{
32323255
#ifdef CPU_ARCH_ARM64
3233-
return GSVector4(vreinterpretq_f32_f64(vsqrtq_f64(vreinterpretq_f64_f32(v4s))));
3256+
return GSVector4(vreinterpretq_f32_f64(vmulq_f64(vreinterpretq_f64_f32(v4s), vreinterpretq_f64_f32(v4s))));
32343257
#else
32353258
return GSVector4::f64(F64[0] * F64[0], F64[1] * F64[1]);
32363259
#endif

0 commit comments

Comments
 (0)