From aa4586be15d3ef62434bd28fee827a03bd1efedd Mon Sep 17 00:00:00 2001 From: "Sarnie, Nick" Date: Thu, 6 Apr 2023 10:03:07 -0400 Subject: [PATCH] [SYCL][ESIMD] Add function to get reference to underlying data This is required for inline assembly. Signed-off-by: Sarnie, Nick --- .../sycl_ext_intel_esimd.md | 31 +++++ .../ext/intel/esimd/detail/simd_obj_impl.hpp | 9 ++ sycl/test-e2e/ESIMD/InlineAsm/asm_glb.cpp | 108 ++++++++++++++++ .../ESIMD/InlineAsm/asm_simd_mask.cpp | 119 ++++++++++++++++++ .../ESIMD/InlineAsm/asm_simd_view.cpp | 119 ++++++++++++++++++ sycl/test-e2e/ESIMD/InlineAsm/asm_vadd.cpp | 106 ++++++++++++++++ sycl/test/esimd/simd_inline_asm.cpp | 30 +++++ 7 files changed, 522 insertions(+) create mode 100644 sycl/test-e2e/ESIMD/InlineAsm/asm_glb.cpp create mode 100644 sycl/test-e2e/ESIMD/InlineAsm/asm_simd_mask.cpp create mode 100644 sycl/test-e2e/ESIMD/InlineAsm/asm_simd_view.cpp create mode 100644 sycl/test-e2e/ESIMD/InlineAsm/asm_vadd.cpp create mode 100644 sycl/test/esimd/simd_inline_asm.cpp diff --git a/sycl/doc/extensions/experimental/sycl_ext_intel_esimd/sycl_ext_intel_esimd.md b/sycl/doc/extensions/experimental/sycl_ext_intel_esimd/sycl_ext_intel_esimd.md index d2a1d24387849..fc09dd6c61552 100644 --- a/sycl/doc/extensions/experimental/sycl_ext_intel_esimd/sycl_ext_intel_esimd.md +++ b/sycl/doc/extensions/experimental/sycl_ext_intel_esimd/sycl_ext_intel_esimd.md @@ -815,6 +815,37 @@ simd __regcall SCALE(simd v); The parameter and the return type in the ABI form will be `<8 x float>`.
+### Inline assembly +Inline assembly is supported with ESIMD classes `simd`, `simd_mask` and `simd_view`. `simd_view` only supports read operations. +In order the access the raw underlying vector required for inline assembly, the `data` function can be used for read-only access and +the `data_ref` function can be used for write access. The `data_ref` function only exists for `simd` and `simd_mask`, and should only be used in inline assembly. +If the `simd` or `simd_mask` object is a private global variable, the `commit` function must be called after any write in inline assembly. + +Example of inline GEN assembly: +```cpp +simd va; +simd vb; +simd vc; + +__asm__("add (M1, 16) %0 %1 %2" + : "=rw"(vc.data_ref()) + : "rw"(va.data()), "rw"(vb.data())); +``` + +Example of inline GEN assembly writing to a private global variable: +```cpp +ESIMD_PRIVATE ESIMD_REGISTER(0) simd vc; + +void calledFromKernel() { + simd va; + simd vb; + __asm__("add (M1, 16) %0 %1 %2" + : "=rw"(vc.data_ref()) + : "rw"(va.data()), "rw"(vb.data())); + vc.commit(); +} +``` + ## Examples ### Vector addition (USM) ```cpp diff --git a/sycl/include/sycl/ext/intel/esimd/detail/simd_obj_impl.hpp b/sycl/include/sycl/ext/intel/esimd/detail/simd_obj_impl.hpp index 52b86dc2f61dc..acb078b69b3a1 100644 --- a/sycl/include/sycl/ext/intel/esimd/detail/simd_obj_impl.hpp +++ b/sycl/include/sycl/ext/intel/esimd/detail/simd_obj_impl.hpp @@ -354,6 +354,15 @@ class simd_obj_impl { #endif } + /// @return A reference to the value of the + /// underlying raw vector. Intended for use + /// with l-value contexts in inline assembly. + raw_vector_type &data_ref() { return M_data; } + + /// Commit the current stored underlying raw vector to memory. + /// This is required when using inline assembly with private global variables. + void commit() { __esimd_vstore(&M_data, M_data); } + /// @return Newly constructed (from the underlying data) object of the Derived /// type. Derived read() const { return Derived{data()}; } diff --git a/sycl/test-e2e/ESIMD/InlineAsm/asm_glb.cpp b/sycl/test-e2e/ESIMD/InlineAsm/asm_glb.cpp new file mode 100644 index 0000000000000..8b01128c3744c --- /dev/null +++ b/sycl/test-e2e/ESIMD/InlineAsm/asm_glb.cpp @@ -0,0 +1,108 @@ +//==---------------- asm_glb.cpp - DPC++ ESIMD on-device test -------------==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// REQUIRES: gpu +// UNSUPPORTED: cuda || hip +// RUN: %clangxx -fsycl %s -o %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out + +#include "../esimd_test_utils.hpp" + +#include +#include +#include + +using namespace sycl; +using namespace sycl::ext::intel::esimd; + +ESIMD_PRIVATE ESIMD_REGISTER(0) simd va; +ESIMD_PRIVATE ESIMD_REGISTER(0) simd vc; + +int main(void) { + constexpr unsigned Size = 1024 * 128; + constexpr unsigned VL = 16; + + float *A = new float[Size]; + float *B = new float[Size]; + float *C = new float[Size]; + + for (unsigned i = 0; i < Size; ++i) { + A[i] = B[i] = i; + C[i] = 0.0f; + } + + try { + buffer bufa(A, range<1>(Size)); + buffer bufb(B, range<1>(Size)); + buffer bufc(C, range<1>(Size)); + + // We need that many workgroups + range<1> GlobalRange{Size / VL}; + + // We need that many threads in each group + range<1> LocalRange{1}; + + queue q(esimd_test::ESIMDSelector, esimd_test::createExceptionHandler()); + + auto dev = q.get_device(); + std::cout << "Running on " << dev.get_info() << "\n"; + + auto e = q.submit([&](handler &cgh) { + auto PA = bufa.get_access(cgh); + auto PB = bufb.get_access(cgh); + auto PC = bufc.get_access(cgh); + cgh.parallel_for( + GlobalRange * LocalRange, [=](id<1> i) SYCL_ESIMD_KERNEL { + using namespace sycl::ext::intel::esimd; + unsigned int offset = i * VL * sizeof(float); + va.copy_from(PA, offset); + simd vb; + vb.copy_from(PB, offset); +#ifdef __SYCL_DEVICE_ONLY__ + __asm__("add (M1, 16) %0 %1 %2" + : "=rw"(vc.data_ref()) + : "rw"(va.data()), "rw"(vb.data())); + vc.commit(); +#else + vc = va+vb; +#endif + vc.copy_to(PC, offset); + }); + }); + e.wait(); + } catch (sycl::exception const &e) { + std::cout << "SYCL exception caught: " << e.what() << '\n'; + + delete[] A; + delete[] B; + delete[] C; + return 1; + } + + int err_cnt = 0; + + for (unsigned i = 0; i < Size; ++i) { + if (A[i] + B[i] != C[i]) { + if (++err_cnt < 10) { + std::cout << "failed at index " << i << ", " << C[i] << " != " << A[i] + << " + " << B[i] << "\n"; + } + } + } + if (err_cnt > 0) { + std::cout << " pass rate: " + << ((float)(Size - err_cnt) / (float)Size) * 100.0f << "% (" + << (Size - err_cnt) << "/" << Size << ")\n"; + } + + delete[] A; + delete[] B; + delete[] C; + + std::cout << (err_cnt > 0 ? "FAILED\n" : "Passed\n"); + return err_cnt > 0 ? 1 : 0; +} diff --git a/sycl/test-e2e/ESIMD/InlineAsm/asm_simd_mask.cpp b/sycl/test-e2e/ESIMD/InlineAsm/asm_simd_mask.cpp new file mode 100644 index 0000000000000..f80ca880bb86a --- /dev/null +++ b/sycl/test-e2e/ESIMD/InlineAsm/asm_simd_mask.cpp @@ -0,0 +1,119 @@ +//==---------------- asm_simd_mask.cpp - DPC++ ESIMD on-device test +//-------------==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// REQUIRES: gpu +// UNSUPPORTED: cuda || hip +// RUN: %clangxx -fsycl %s -o %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out + +#include "../esimd_test_utils.hpp" + +#include +#include +#include + +using namespace sycl; + +int main(void) { + constexpr unsigned Size = 1024 * 128; + constexpr unsigned VL = 8; + + float *A = new float[Size]; + float *B = new float[Size]; + float *C = new float[Size]; + + for (unsigned i = 0; i < Size; ++i) { + A[i] = B[i] = i; + C[i] = 0.0f; + } + + try { + buffer bufa(A, range<1>(Size)); + buffer bufb(B, range<1>(Size)); + buffer bufc(C, range<1>(Size)); + + // We need that many workgroups + range<1> GlobalRange{Size / VL}; + + // We need that many threads in each group + range<1> LocalRange{1}; + + queue q(esimd_test::ESIMDSelector, esimd_test::createExceptionHandler()); + + auto dev = q.get_device(); + std::cout << "Running on " << dev.get_info() << "\n"; + + auto e = q.submit([&](handler &cgh) { + auto PA = bufa.get_access(cgh); + auto PB = bufb.get_access(cgh); + auto PC = bufc.get_access(cgh); + cgh.parallel_for( + GlobalRange * LocalRange, [=](id<1> i) SYCL_ESIMD_KERNEL { + using namespace sycl::ext::intel::esimd; + unsigned int offset = i * VL * sizeof(float); + simd va; + va.copy_from(PA, offset); + simd vb; + vb.copy_from(PB, offset); + simd vc; +#ifdef __SYCL_DEVICE_ONLY__ + simd_mask m; + __asm__("mov (M1, 8) %0 0x1010101:v" : "=rw"(m.data_ref())); + __asm__("{\n" + ".decl P1 v_type=P num_elts=8\n" + "mov (M1, 8) %0 0x1:ud\n" + "setp (M1, 8) P1 %3\n" + "(P1) add (M1, 8) %0 %1 %2\n" + "}" + : "=rw"(vc.data_ref()) + : "rw"(va.data()), "rw"(vb.data()), "rw"(m.data())); +#else + simd_mask m({1,0,1,0,1,0,1,0}); + vc = va+vb; + vc.merge(1, !m); +#endif + vc.copy_to(PC, offset); + }); + }); + e.wait(); + } catch (sycl::exception const &e) { + std::cout << "SYCL exception caught: " << e.what() << '\n'; + + delete[] A; + delete[] B; + delete[] C; + return 1; + } + + int err_cnt = 0; + + for (unsigned i = 0; i < Size; ++i) { + if ((i % 2 == 0) && (A[i] + B[i] != C[i])) { + if (++err_cnt < 10) { + std::cout << "failed at index " << i << ", " << C[i] << " != " << A[i] + << " + " << B[i] << "\n"; + } + } else if ((i % 2 == 1) && (C[i] != 1)) { + if (++err_cnt < 10) { + std::cout << "failed at index " << i << ", " << C[i] << " != 1\n"; + } + } + } + if (err_cnt > 0) { + std::cout << " pass rate: " + << ((float)(Size - err_cnt) / (float)Size) * 100.0f << "% (" + << (Size - err_cnt) << "/" << Size << ")\n"; + } + + delete[] A; + delete[] B; + delete[] C; + + std::cout << (err_cnt > 0 ? "FAILED\n" : "Passed\n"); + return err_cnt > 0 ? 1 : 0; +} diff --git a/sycl/test-e2e/ESIMD/InlineAsm/asm_simd_view.cpp b/sycl/test-e2e/ESIMD/InlineAsm/asm_simd_view.cpp new file mode 100644 index 0000000000000..6f536f58359aa --- /dev/null +++ b/sycl/test-e2e/ESIMD/InlineAsm/asm_simd_view.cpp @@ -0,0 +1,119 @@ +//==---------------- asm_simd_view.cpp - DPC++ ESIMD on-device test +//-------------==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// REQUIRES: gpu +// UNSUPPORTED: cuda || hip +// RUN: %clangxx -fsycl %s -o %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out + +#include "../esimd_test_utils.hpp" + +#include +#include +#include + +using namespace sycl; + +int main(void) { + constexpr unsigned Size = 1024 * 128; + constexpr unsigned VL = 16; + + float *A = new float[Size]; + float *B = new float[Size]; + float *C = new float[Size]; + + for (unsigned i = 0; i < Size; ++i) { + A[i] = B[i] = i; + C[i] = 0.0f; + } + + try { + buffer bufa(A, range<1>(Size)); + buffer bufb(B, range<1>(Size)); + buffer bufc(C, range<1>(Size)); + + // We need that many workgroups + range<1> GlobalRange{Size / VL}; + + // We need that many threads in each group + range<1> LocalRange{1}; + + queue q(esimd_test::ESIMDSelector, esimd_test::createExceptionHandler()); + + auto dev = q.get_device(); + std::cout << "Running on " << dev.get_info() << "\n"; + + auto e = q.submit([&](handler &cgh) { + auto PA = bufa.get_access(cgh); + auto PB = bufb.get_access(cgh); + auto PC = bufc.get_access(cgh); + cgh.parallel_for( + GlobalRange * LocalRange, [=](id<1> i) SYCL_ESIMD_KERNEL { + using namespace sycl::ext::intel::esimd; + unsigned int offset = i * VL * sizeof(float); + simd va; + va.copy_from(PA, offset); + simd vb; + vb.copy_from(PB, offset); +#ifdef __SYCL_DEVICE_ONLY__ + auto va_half1 = va.select(); + auto va_half2 = va.select(VL / 2); + auto vb_half1 = vb.select(); + auto vb_half2 = vb.select(VL / 2); + simd out1; + simd out2; + // simd_view is not supported in l-value context in inline asm, so + // use simd to store the result + __asm__("add (M1, 8) %0 %1 %2" + : "=rw"(out1.data_ref()) + : "rw"(va_half1.data()), "rw"(vb_half1.data())); + __asm__("add (M1, 8) %0 %1 %2" + : "=rw"(out2.data_ref()) + : "rw"(va_half2.data()), "rw"(vb_half2.data())); + out1.copy_to(PC, offset); + out2.copy_to(PC, offset + ((VL / 2) * sizeof(float))); +#else + simd vc; + vc = va+vb; + vc.copy_to(PC, offset); +#endif + }); + }); + e.wait(); + } catch (sycl::exception const &e) { + std::cout << "SYCL exception caught: " << e.what() << '\n'; + + delete[] A; + delete[] B; + delete[] C; + return 1; + } + + int err_cnt = 0; + + for (unsigned i = 0; i < Size; ++i) { + if (A[i] + B[i] != C[i]) { + if (++err_cnt < 10) { + std::cout << "failed at index " << i << ", " << C[i] << " != " << A[i] + << " + " << B[i] << "\n"; + } + } + } + if (err_cnt > 0) { + std::cout << " pass rate: " + << ((float)(Size - err_cnt) / (float)Size) * 100.0f << "% (" + << (Size - err_cnt) << "/" << Size << ")\n"; + } + + delete[] A; + delete[] B; + delete[] C; + + std::cout << (err_cnt > 0 ? "FAILED\n" : "Passed\n"); + return err_cnt > 0 ? 1 : 0; +} diff --git a/sycl/test-e2e/ESIMD/InlineAsm/asm_vadd.cpp b/sycl/test-e2e/ESIMD/InlineAsm/asm_vadd.cpp new file mode 100644 index 0000000000000..1516a1d4d1017 --- /dev/null +++ b/sycl/test-e2e/ESIMD/InlineAsm/asm_vadd.cpp @@ -0,0 +1,106 @@ +//==---------------- asm_vadd.cpp - DPC++ ESIMD on-device test +//-------------==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// REQUIRES: gpu +// UNSUPPORTED: cuda || hip +// RUN: %clangxx -fsycl %s -o %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out + +#include "../esimd_test_utils.hpp" + +#include +#include +#include + +using namespace sycl; + +int main(void) { + constexpr unsigned Size = 1024 * 128; + constexpr unsigned VL = 16; + + float *A = new float[Size]; + float *B = new float[Size]; + float *C = new float[Size]; + + for (unsigned i = 0; i < Size; ++i) { + A[i] = B[i] = i; + C[i] = 0.0f; + } + + try { + buffer bufa(A, range<1>(Size)); + buffer bufb(B, range<1>(Size)); + buffer bufc(C, range<1>(Size)); + + // We need that many workgroups + range<1> GlobalRange{Size / VL}; + + // We need that many threads in each group + range<1> LocalRange{1}; + + queue q(esimd_test::ESIMDSelector, esimd_test::createExceptionHandler()); + + auto dev = q.get_device(); + std::cout << "Running on " << dev.get_info() << "\n"; + + auto e = q.submit([&](handler &cgh) { + auto PA = bufa.get_access(cgh); + auto PB = bufb.get_access(cgh); + auto PC = bufc.get_access(cgh); + cgh.parallel_for( + GlobalRange * LocalRange, [=](id<1> i) SYCL_ESIMD_KERNEL { + using namespace sycl::ext::intel::esimd; + unsigned int offset = i * VL * sizeof(float); + simd va; + va.copy_from(PA, offset); + simd vb; + vb.copy_from(PB, offset); + simd vc; +#ifdef __SYCL_DEVICE_ONLY__ + __asm__("add (M1, 16) %0 %1 %2" + : "=rw"(vc.data_ref()) + : "rw"(va.data()), "rw"(vb.data())); +#else + vc = va+vb; +#endif + vc.copy_to(PC, offset); + }); + }); + e.wait(); + } catch (sycl::exception const &e) { + std::cout << "SYCL exception caught: " << e.what() << '\n'; + + delete[] A; + delete[] B; + delete[] C; + return 1; + } + + int err_cnt = 0; + + for (unsigned i = 0; i < Size; ++i) { + if (A[i] + B[i] != C[i]) { + if (++err_cnt < 10) { + std::cout << "failed at index " << i << ", " << C[i] << " != " << A[i] + << " + " << B[i] << "\n"; + } + } + } + if (err_cnt > 0) { + std::cout << " pass rate: " + << ((float)(Size - err_cnt) / (float)Size) * 100.0f << "% (" + << (Size - err_cnt) << "/" << Size << ")\n"; + } + + delete[] A; + delete[] B; + delete[] C; + + std::cout << (err_cnt > 0 ? "FAILED\n" : "Passed\n"); + return err_cnt > 0 ? 1 : 0; +} diff --git a/sycl/test/esimd/simd_inline_asm.cpp b/sycl/test/esimd/simd_inline_asm.cpp new file mode 100644 index 0000000000000..190a4ab2f2d69 --- /dev/null +++ b/sycl/test/esimd/simd_inline_asm.cpp @@ -0,0 +1,30 @@ +// RUN: %clangxx -fsycl -fsycl-device-only -fsyntax-only -Xclang -verify %s +// Verify simd_view passed to inline asm in l-value context errors, and simd and +// simd_mask work. +#include + +using namespace sycl::ext::intel::esimd; + +void test_error() SYCL_ESIMD_FUNCTION { + simd s; + simd_mask<16> mask; + auto view = s.select<8, 1>(); + // expected-error@+1 {{invalid lvalue in asm output}} + __asm__("%0" : "=rw"(view.data())); + + // expected-error@+1 {{no member named 'data_ref'}} + __asm__("%0" : "=rw"(view.data_ref())); + + // expected-error@+1 {{invalid lvalue in asm output}} + __asm__("%0" : "=rw"(s.data())); + + __asm__("%0" : "=rw"(s.data_ref())); + + // expected-error@+1 {{invalid lvalue in asm output}} + __asm__("%0" : "=rw"(mask.data())); + + __asm__("%0" : "=rw"(mask.data_ref())); + + s.commit(); + mask.commit(); +}