Skip to content

Commit d3d128c

Browse files
authored
Merge pull request #628 from tigercosmos/neon_simd_1113
add the memory alignment check for NEON SIMD
2 parents 7bd9cd9 + ea7115c commit d3d128c

File tree

3 files changed

+323
-7
lines changed

3 files changed

+323
-7
lines changed

cpp/modmesh/simd/neon/neon.hpp

Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -59,6 +59,11 @@ const T * check_between(T const * start, T const * end, T const & min_val, T con
5959
using cmpvec_t = type::vector_t<uint64_t>;
6060
constexpr size_t N_lane = type::vector_lane<T>;
6161

62+
#ifndef NDEBUG
63+
constexpr size_t alignment = get_recommended_alignment();
64+
detail::check_alignment(start, alignment, "check_between start");
65+
#endif
66+
6267
vec_t max_vec = vdupq(max_val);
6368
vec_t min_vec = vdupq(min_val);
6469
vec_t data_vec = {};
@@ -117,6 +122,14 @@ void add(T * dest, T const * dest_end, T const * src1, T const * src2)
117122
{
118123
using vec_t = type::vector_t<T>;
119124
constexpr size_t N_lane = type::vector_lane<T>;
125+
126+
#ifndef NDEBUG
127+
constexpr size_t alignment = get_recommended_alignment();
128+
detail::check_alignment(dest, alignment, "add dest");
129+
detail::check_alignment(src1, alignment, "add src1");
130+
detail::check_alignment(src2, alignment, "add src2");
131+
#endif
132+
120133
vec_t src1_vec;
121134
vec_t src2_vec;
122135
vec_t res_vec;
@@ -146,6 +159,14 @@ void sub(T * dest, T const * dest_end, T const * src1, T const * src2)
146159
{
147160
using vec_t = type::vector_t<T>;
148161
constexpr size_t N_lane = type::vector_lane<T>;
162+
163+
#ifndef NDEBUG
164+
constexpr size_t alignment = get_recommended_alignment();
165+
detail::check_alignment(dest, alignment, "sub dest");
166+
detail::check_alignment(src1, alignment, "sub src1");
167+
detail::check_alignment(src2, alignment, "sub src2");
168+
#endif
169+
149170
vec_t src1_vec;
150171
vec_t src2_vec;
151172
vec_t res_vec;
@@ -175,6 +196,14 @@ void mul(T * dest, T const * dest_end, T const * src1, T const * src2)
175196
{
176197
using vec_t = type::vector_t<T>;
177198
constexpr size_t N_lane = type::vector_lane<T>;
199+
200+
#ifndef NDEBUG
201+
constexpr size_t alignment = get_recommended_alignment();
202+
detail::check_alignment(dest, alignment, "mul dest");
203+
detail::check_alignment(src1, alignment, "mul src1");
204+
detail::check_alignment(src2, alignment, "mul src2");
205+
#endif
206+
178207
vec_t src1_vec;
179208
vec_t src2_vec;
180209
vec_t res_vec;
@@ -204,6 +233,14 @@ void div(T * dest, T const * dest_end, T const * src1, T const * src2)
204233
{
205234
using vec_t = type::vector_t<T>;
206235
constexpr size_t N_lane = type::vector_lane<T>;
236+
237+
#ifndef NDEBUG
238+
constexpr size_t alignment = get_recommended_alignment();
239+
detail::check_alignment(dest, alignment, "div dest");
240+
detail::check_alignment(src1, alignment, "div src1");
241+
detail::check_alignment(src2, alignment, "div src2");
242+
#endif
243+
207244
vec_t src1_vec;
208245
vec_t src2_vec;
209246
vec_t res_vec;

cpp/modmesh/simd/simd.hpp

Lines changed: 43 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -28,8 +28,8 @@
2828
* POSSIBILITY OF SUCH DAMAGE.
2929
*/
3030

31-
#include <modmesh/simd/simd_support.hpp>
3231
#include <modmesh/simd/simd_generic.hpp>
32+
#include <modmesh/simd/simd_support.hpp>
3333

3434
#include <modmesh/simd/neon/neon.hpp>
3535

@@ -39,6 +39,48 @@ namespace modmesh
3939
namespace simd
4040
{
4141

42+
namespace detail
43+
{
44+
#ifndef NDEBUG
45+
// Tell whether `pointer` lies on an `alignment`-byte boundary.
//
// pointer:   address to inspect; it is only converted to an integer and is
//            never dereferenced.
// alignment: required alignment in bytes. 0 means "no alignment requirement",
//            which is what get_recommended_alignment() reports when no SIMD
//            instruction set is detected.
// returns:   true when the address is a multiple of `alignment` or when
//            `alignment` is 0; false otherwise.
template <typename T>
bool is_aligned(T const * pointer, size_t alignment)
{
    // Taking a remainder by zero is undefined behaviour, so answer the
    // "no requirement" case before reaching the modulo.
    if (alignment == 0)
    {
        return true;
    }
    return (reinterpret_cast<std::uintptr_t>(pointer) % alignment) == 0;
}
50+
51+
// Print a warning on stderr when `pointer` does not satisfy
// `required_alignment` (in bytes); stay silent otherwise.
//
// pointer:            address to validate.
// required_alignment: alignment, in bytes, the address is checked against.
// name:               label that identifies the pointer in the warning text.
template <typename T>
void check_alignment(T const * pointer, size_t required_alignment, const char * name)
{
    // Nothing to report for a properly aligned address.
    if (is_aligned(pointer, required_alignment))
    {
        return;
    }

    // %p expects a void pointer, so convert explicitly before printing.
    void const * const address = static_cast<const void *>(pointer);
    std::fprintf(stderr,
                 "Warning: %s pointer %p is not aligned to %zu bytes. "
                 "SIMD performance may be degraded.\n",
                 name,
                 address,
                 required_alignment);
}
64+
#endif
65+
66+
// Report the memory alignment, in bytes, recommended for SIMD operations.
// The value is selected at compile time from the instruction-set macros
// predefined by the compiler; 0 is reported when none of them is defined.
inline constexpr size_t get_recommended_alignment()
{
#if defined(__aarch64__) || defined(__arm__)
    constexpr size_t recommended = 16; // ARM targets
#elif defined(__AVX512F__)
    constexpr size_t recommended = 64;
#elif defined(__AVX__) || defined(__AVX2__)
    constexpr size_t recommended = 32;
#elif defined(__SSE__) || defined(__SSE2__) || defined(__SSE3__) || defined(__SSSE3__) || defined(__SSE4_1__) || defined(__SSE4_2__)
    constexpr size_t recommended = 16;
#else
    constexpr size_t recommended = 0; // no SIMD instruction set detected
#endif
    return recommended;
}
81+
82+
} // namespace detail
83+
4284
// Check if each element from start to end (excluded end) is within the range [min_val, max_val)
4385
template <typename T>
4486
const T * check_between(T const * start, T const * end, T const & min_val, T const & max_val)

tests/test_buffer.py

Lines changed: 243 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1220,6 +1220,249 @@ def test_alignment_with_different_shapes(self):
12201220
self.assertEqual(64, array3d.alignment)
12211221
self.assertEqual((2, 4, 4), array3d.shape)
12221222

1223+
def test_alignment_with_simd_operations(self):
    # Run the element-wise SIMD arithmetic on 1D arrays created with each
    # requested alignment and compare every element with the scalar result.
    length = 16

    def verify(outcome, formula, first=0):
        # Every result array is expected to report an alignment of 0.
        self.assertEqual(0, outcome.alignment)
        for pos in range(first, length):
            self.assertAlmostEqual(formula(pos), outcome[pos])

    for nbytes in (16, 32, 64):
        lhs = modmesh.SimpleArrayFloat64((length,), nbytes)
        rhs = modmesh.SimpleArrayFloat64((length,), nbytes)

        self.assertEqual(nbytes, lhs.alignment)
        self.assertEqual(nbytes, rhs.alignment)

        for pos in range(length):
            lhs[pos] = pos * 2.0
            rhs[pos] = pos * 3.0

        verify(lhs.add_simd(rhs), lambda n: n * 2.0 + n * 3.0)
        verify(lhs.sub_simd(rhs), lambda n: n * 2.0 - n * 3.0)
        verify(lhs.mul_simd(rhs), lambda n: n * 2.0 * n * 3.0)
        # Element 0 would be 0.0 / 0.0, so the division check starts at 1.
        verify(rhs.div_simd(lhs), lambda n: n * 3.0 / (n * 2.0), first=1)
1261+
1262+
def test_alignment_with_simd_operations_multidimensional(self):
    # Same arithmetic checks as the 1D test, applied to a (4, 8) and a
    # (2, 4, 4) array for every requested alignment.

    def unravel(flat, shape):
        # Convert a row-major flat position into its index tuple.
        index = []
        for extent in reversed(shape):
            flat, offset = divmod(flat, extent)
            index.append(offset)
        return tuple(reversed(index))

    def exercise(shape, nbytes):
        count = 1
        for extent in shape:
            count *= extent
        # (index tuple, flat position) pairs in row-major order; the flat
        # position doubles as the seed value stored in each element.
        cells = [(unravel(flat, shape), flat) for flat in range(count)]

        lhs = modmesh.SimpleArrayFloat64(shape, nbytes)
        rhs = modmesh.SimpleArrayFloat64(shape, nbytes)

        self.assertEqual(nbytes, lhs.alignment)
        self.assertEqual(nbytes, rhs.alignment)
        self.assertEqual(shape, lhs.shape)
        self.assertEqual(shape, rhs.shape)

        for index, value in cells:
            lhs[index] = value * 2.0
            rhs[index] = value * 3.0

        def verify(outcome, formula, skip_zero=False):
            # Result arrays are expected to report alignment 0 and to
            # keep the operand shape.
            self.assertEqual(0, outcome.alignment)
            self.assertEqual(shape, outcome.shape)
            for index, value in cells:
                if skip_zero and value == 0:
                    continue
                self.assertAlmostEqual(formula(value), outcome[index])

        verify(lhs.add_simd(rhs), lambda v: v * 2.0 + v * 3.0)
        verify(lhs.sub_simd(rhs), lambda v: v * 2.0 - v * 3.0)
        verify(lhs.mul_simd(rhs), lambda v: v * 2.0 * v * 3.0)
        # The first element would be 0.0 / 0.0, so it is not compared.
        verify(rhs.div_simd(lhs), lambda v: v * 3.0 / (v * 2.0),
               skip_zero=True)

    for nbytes in (16, 32, 64):
        exercise((4, 8), nbytes)
        exercise((2, 4, 4), nbytes)
1374+
1375+
def test_alignment_size_validation_multidimensional(self):
    # Each (shape, alignment) pair below must be rejected with a
    # ValueError whose message names the requested alignment.
    msg16 = "ConcreteBuffer::allocate: size .* must be a multiple of alignment 16"  # noqa E501
    msg32 = "ConcreteBuffer::allocate: size .* must be a multiple of alignment 32"  # noqa E501
    msg64 = "ConcreteBuffer::allocate: size .* must be a multiple of alignment 64"  # noqa E501

    rejected = (
        ((1, 3), 16, msg16),
        ((1, 3), 32, msg32),
        ((3, 3), 64, msg64),
        ((1, 1, 1), 16, msg16),
        ((1, 1, 1), 32, msg32),
        ((2, 3, 5), 64, msg64),
    )

    for shape, nbytes, pattern in rejected:
        with self.assertRaisesRegex(ValueError, pattern):
            modmesh.SimpleArrayFloat64(shape, nbytes)
1411+
1412+
def test_alignment_with_unaligned_rows(self):
    # 2D case: the buffer is aligned to 16 bytes, but a row of three
    # float64 values does not end on a 16-byte boundary.
    cells_2d = [((i, j), i * 3 + j) for i in range(2) for j in range(3)]

    lhs_2d = modmesh.SimpleArrayFloat64((2, 3), 16)
    rhs_2d = modmesh.SimpleArrayFloat64((2, 3), 16)

    self.assertEqual(16, lhs_2d.alignment)
    self.assertEqual(16, rhs_2d.alignment)
    self.assertEqual((2, 3), lhs_2d.shape)

    for index, value in cells_2d:
        lhs_2d[index] = value * 2.0
        rhs_2d[index] = value * 3.0

    # The SIMD operations have to cope with the unaligned row stride and
    # still produce the same numbers as scalar arithmetic.
    total_2d = lhs_2d.add_simd(rhs_2d)
    delta_2d = lhs_2d.sub_simd(rhs_2d)
    product_2d = lhs_2d.mul_simd(rhs_2d)

    for index, value in cells_2d:
        self.assertAlmostEqual(value * 5.0, total_2d[index])
        self.assertAlmostEqual(value * -1.0, delta_2d[index])
        self.assertAlmostEqual(value * value * 6.0, product_2d[index])

    # 3D case: same checks with 32-byte alignment, where the innermost
    # dimension is unaligned.
    cells_3d = [((i, j, k), i * 4 + j * 2 + k)
                for i in range(2)
                for j in range(2)
                for k in range(2)]

    lhs_3d = modmesh.SimpleArrayFloat64((2, 2, 2), 32)
    rhs_3d = modmesh.SimpleArrayFloat64((2, 2, 2), 32)

    self.assertEqual(32, lhs_3d.alignment)
    self.assertEqual(32, rhs_3d.alignment)

    for index, value in cells_3d:
        lhs_3d[index] = value * 2.0
        rhs_3d[index] = value * 3.0

    total_3d = lhs_3d.add_simd(rhs_3d)
    delta_3d = lhs_3d.sub_simd(rhs_3d)
    product_3d = lhs_3d.mul_simd(rhs_3d)

    for index, value in cells_3d:
        self.assertAlmostEqual(value * 5.0, total_3d[index])
        self.assertAlmostEqual(value * -1.0, delta_3d[index])
        self.assertAlmostEqual(value * value * 6.0, product_3d[index])
1465+
12231466

12241467
class SimpleArrayCalculatorsTC(unittest.TestCase):
12251468

@@ -3154,12 +3397,6 @@ def test_alignment_with_push_back(self):
31543397
# Verify alignment is maintained
31553398
self.assertEqual(16, ct.alignment)
31563399

3157-
def test_alignment_with_simd_operations(self):
3158-
# TODO: implement tests for SIMD operations if applicable.
3159-
# It requires more effort on `as_array` and ConcreteBuffer side.
3160-
# See more: https://github.com/solvcon/modmesh/issues/620
3161-
pass
3162-
31633400
def test_alignment_preserved_in_as_array(self):
31643401
ct = modmesh.SimpleCollectorFloat64(16, 32)
31653402
self.assertEqual(32, ct.alignment)

0 commit comments

Comments
 (0)