From aa4586be15d3ef62434bd28fee827a03bd1efedd Mon Sep 17 00:00:00 2001
From: "Sarnie, Nick" <nick.sarnie@intel.com>
Date: Thu, 6 Apr 2023 10:03:07 -0400
Subject: [PATCH] [SYCL][ESIMD] Add function to get reference to underlying
 data

This is required for inline assembly.

Signed-off-by: Sarnie, Nick <nick.sarnie@intel.com>
---
 .../sycl_ext_intel_esimd.md                   |  31 +++++
 .../ext/intel/esimd/detail/simd_obj_impl.hpp  |   9 ++
 sycl/test-e2e/ESIMD/InlineAsm/asm_glb.cpp     | 108 ++++++++++++++++
 .../ESIMD/InlineAsm/asm_simd_mask.cpp         | 119 ++++++++++++++++++
 .../ESIMD/InlineAsm/asm_simd_view.cpp         | 119 ++++++++++++++++++
 sycl/test-e2e/ESIMD/InlineAsm/asm_vadd.cpp    | 106 ++++++++++++++++
 sycl/test/esimd/simd_inline_asm.cpp           |  30 +++++
 7 files changed, 522 insertions(+)
 create mode 100644 sycl/test-e2e/ESIMD/InlineAsm/asm_glb.cpp
 create mode 100644 sycl/test-e2e/ESIMD/InlineAsm/asm_simd_mask.cpp
 create mode 100644 sycl/test-e2e/ESIMD/InlineAsm/asm_simd_view.cpp
 create mode 100644 sycl/test-e2e/ESIMD/InlineAsm/asm_vadd.cpp
 create mode 100644 sycl/test/esimd/simd_inline_asm.cpp
diff --git a/sycl/doc/extensions/experimental/sycl_ext_intel_esimd/sycl_ext_intel_esimd.md b/sycl/doc/extensions/experimental/sycl_ext_intel_esimd/sycl_ext_intel_esimd.md
index d2a1d24387849..fc09dd6c61552 100644
--- a/sycl/doc/extensions/experimental/sycl_ext_intel_esimd/sycl_ext_intel_esimd.md
+++ b/sycl/doc/extensions/experimental/sycl_ext_intel_esimd/sycl_ext_intel_esimd.md
@@ -815,6 +815,37 @@ simd<float, 8> __regcall SCALE(simd<float, 8> v);
 The parameter and the return type in the ABI form will be `<8 x float>`.
 <br>
 
+### Inline assembly
+Inline assembly is supported with ESIMD classes `simd`, `simd_mask` and `simd_view`. `simd_view` only supports read operations.
+In order the access the raw underlying vector required for inline assembly, the `data` function can be used for read-only access and
+the `data_ref` function can be used for write access. The `data_ref` function only exists for `simd` and `simd_mask`, and should only be used in inline assembly.
+If the `simd` or `simd_mask` object is a private global variable, the `commit` function must be called after any write in inline assembly.
+
+Example of inline GEN assembly:
+```cpp
+simd<float, 16> va;
+simd<float, 16> vb;
+simd<float, 16> vc;
+
+__asm__("add (M1, 16) %0 %1 %2"
+                    : "=rw"(vc.data_ref())
+                    : "rw"(va.data()), "rw"(vb.data()));
+```
+
+Example of inline GEN assembly writing to a private global variable:
+```cpp
+ESIMD_PRIVATE ESIMD_REGISTER(0) simd<float, 16> vc;
+
+void calledFromKernel() {
+  simd<float, 16> va;
+  simd<float, 16> vb;
+  __asm__("add (M1, 16) %0 %1 %2"
+                    : "=rw"(vc.data_ref())
+                    : "rw"(va.data()), "rw"(vb.data()));
+  vc.commit();
+}
+```
+
 ## Examples
 ### Vector addition (USM)
 ```cpp
diff --git a/sycl/include/sycl/ext/intel/esimd/detail/simd_obj_impl.hpp b/sycl/include/sycl/ext/intel/esimd/detail/simd_obj_impl.hpp
index 52b86dc2f61dc..acb078b69b3a1 100644
--- a/sycl/include/sycl/ext/intel/esimd/detail/simd_obj_impl.hpp
+++ b/sycl/include/sycl/ext/intel/esimd/detail/simd_obj_impl.hpp
@@ -354,6 +354,15 @@ class simd_obj_impl {
 #endif
   }
 
+  /// @return A reference to the value of the
+  /// underlying raw vector. Intended for use
+  /// with l-value contexts in inline assembly.
+  raw_vector_type &data_ref() { return M_data; }
+
+  /// Commit the current stored underlying raw vector to memory.
+  /// This is required when using inline assembly with private global variables.
+  void commit() { __esimd_vstore<RawTy, N>(&M_data, M_data); }
+
   /// @return Newly constructed (from the underlying data) object of the Derived
   /// type.
   Derived read() const { return Derived{data()}; }
diff --git a/sycl/test-e2e/ESIMD/InlineAsm/asm_glb.cpp b/sycl/test-e2e/ESIMD/InlineAsm/asm_glb.cpp
new file mode 100644
index 0000000000000..8b01128c3744c
--- /dev/null
+++ b/sycl/test-e2e/ESIMD/InlineAsm/asm_glb.cpp
@@ -0,0 +1,108 @@
+//==---------------- asm_glb.cpp  - DPC++ ESIMD on-device test -------------==//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+// REQUIRES: gpu
+// UNSUPPORTED: cuda || hip
+// RUN: %clangxx -fsycl %s -o %t.out
+// RUN: %GPU_RUN_PLACEHOLDER %t.out
+
+#include "../esimd_test_utils.hpp"
+
+#include <iostream>
+#include <sycl/ext/intel/esimd.hpp>
+#include <sycl/sycl.hpp>
+
+using namespace sycl;
+using namespace sycl::ext::intel::esimd;
+
+ESIMD_PRIVATE ESIMD_REGISTER(0) simd<float, 16> va;
+ESIMD_PRIVATE ESIMD_REGISTER(0) simd<float, 16> vc;
+
+int main(void) {
+  constexpr unsigned Size = 1024 * 128;
+  constexpr unsigned VL = 16;
+
+  float *A = new float[Size];
+  float *B = new float[Size];
+  float *C = new float[Size];
+
+  for (unsigned i = 0; i < Size; ++i) {
+    A[i] = B[i] = i;
+    C[i] = 0.0f;
+  }
+
+  try {
+    buffer<float, 1> bufa(A, range<1>(Size));
+    buffer<float, 1> bufb(B, range<1>(Size));
+    buffer<float, 1> bufc(C, range<1>(Size));
+
+    // We need that many workgroups
+    range<1> GlobalRange{Size / VL};
+
+    // We need that many threads in each group
+    range<1> LocalRange{1};
+
+    queue q(esimd_test::ESIMDSelector, esimd_test::createExceptionHandler());
+
+    auto dev = q.get_device();
+    std::cout << "Running on " << dev.get_info<info::device::name>() << "\n";
+
+    auto e = q.submit([&](handler &cgh) {
+      auto PA = bufa.get_access<access::mode::read>(cgh);
+      auto PB = bufb.get_access<access::mode::read>(cgh);
+      auto PC = bufc.get_access<access::mode::write>(cgh);
+      cgh.parallel_for<class Test>(
+          GlobalRange * LocalRange, [=](id<1> i) SYCL_ESIMD_KERNEL {
+            using namespace sycl::ext::intel::esimd;
+            unsigned int offset = i * VL * sizeof(float);
+            va.copy_from(PA, offset);
+            simd<float, VL> vb;
+            vb.copy_from(PB, offset);
+#ifdef __SYCL_DEVICE_ONLY__
+            __asm__("add (M1, 16) %0 %1 %2"
+                    : "=rw"(vc.data_ref())
+                    : "rw"(va.data()), "rw"(vb.data()));
+            vc.commit();
+#else
+                    vc = va+vb;
+#endif
+            vc.copy_to(PC, offset);
+          });
+    });
+    e.wait();
+  } catch (sycl::exception const &e) {
+    std::cout << "SYCL exception caught: " << e.what() << '\n';
+
+    delete[] A;
+    delete[] B;
+    delete[] C;
+    return 1;
+  }
+
+  int err_cnt = 0;
+
+  for (unsigned i = 0; i < Size; ++i) {
+    if (A[i] + B[i] != C[i]) {
+      if (++err_cnt < 10) {
+        std::cout << "failed at index " << i << ", " << C[i] << " != " << A[i]
+                  << " + " << B[i] << "\n";
+      }
+    }
+  }
+  if (err_cnt > 0) {
+    std::cout << "  pass rate: "
+              << ((float)(Size - err_cnt) / (float)Size) * 100.0f << "% ("
+              << (Size - err_cnt) << "/" << Size << ")\n";
+  }
+
+  delete[] A;
+  delete[] B;
+  delete[] C;
+
+  std::cout << (err_cnt > 0 ? "FAILED\n" : "Passed\n");
+  return err_cnt > 0 ? 1 : 0;
+}
diff --git a/sycl/test-e2e/ESIMD/InlineAsm/asm_simd_mask.cpp b/sycl/test-e2e/ESIMD/InlineAsm/asm_simd_mask.cpp
new file mode 100644
index 0000000000000..f80ca880bb86a
--- /dev/null
+++ b/sycl/test-e2e/ESIMD/InlineAsm/asm_simd_mask.cpp
@@ -0,0 +1,119 @@
+//==---------------- asm_simd_mask.cpp  - DPC++ ESIMD on-device test
+//-------------==//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+// REQUIRES: gpu
+// UNSUPPORTED: cuda || hip
+// RUN: %clangxx -fsycl %s -o %t.out
+// RUN: %GPU_RUN_PLACEHOLDER %t.out
+
+#include "../esimd_test_utils.hpp"
+
+#include <iostream>
+#include <sycl/ext/intel/esimd.hpp>
+#include <sycl/sycl.hpp>
+
+using namespace sycl;
+
+int main(void) {
+  constexpr unsigned Size = 1024 * 128;
+  constexpr unsigned VL = 8;
+
+  float *A = new float[Size];
+  float *B = new float[Size];
+  float *C = new float[Size];
+
+  for (unsigned i = 0; i < Size; ++i) {
+    A[i] = B[i] = i;
+    C[i] = 0.0f;
+  }
+
+  try {
+    buffer<float, 1> bufa(A, range<1>(Size));
+    buffer<float, 1> bufb(B, range<1>(Size));
+    buffer<float, 1> bufc(C, range<1>(Size));
+
+    // We need that many workgroups
+    range<1> GlobalRange{Size / VL};
+
+    // We need that many threads in each group
+    range<1> LocalRange{1};
+
+    queue q(esimd_test::ESIMDSelector, esimd_test::createExceptionHandler());
+
+    auto dev = q.get_device();
+    std::cout << "Running on " << dev.get_info<info::device::name>() << "\n";
+
+    auto e = q.submit([&](handler &cgh) {
+      auto PA = bufa.get_access<access::mode::read>(cgh);
+      auto PB = bufb.get_access<access::mode::read>(cgh);
+      auto PC = bufc.get_access<access::mode::write>(cgh);
+      cgh.parallel_for<class Test>(
+          GlobalRange * LocalRange, [=](id<1> i) SYCL_ESIMD_KERNEL {
+            using namespace sycl::ext::intel::esimd;
+            unsigned int offset = i * VL * sizeof(float);
+            simd<float, VL> va;
+            va.copy_from(PA, offset);
+            simd<float, VL> vb;
+            vb.copy_from(PB, offset);
+            simd<float, VL> vc;
+#ifdef __SYCL_DEVICE_ONLY__
+            simd_mask<VL> m;
+            __asm__("mov (M1, 8) %0 0x1010101:v" : "=rw"(m.data_ref()));
+            __asm__("{\n"
+                    ".decl P1 v_type=P num_elts=8\n"
+                    "mov (M1, 8) %0 0x1:ud\n"
+                    "setp (M1, 8) P1 %3\n"
+                    "(P1) add (M1, 8) %0 %1 %2\n"
+                    "}"
+                    : "=rw"(vc.data_ref())
+                    : "rw"(va.data()), "rw"(vb.data()), "rw"(m.data()));
+#else
+            simd_mask<VL> m({1,0,1,0,1,0,1,0});
+            vc = va+vb;
+            vc.merge(1, !m);
+#endif
+            vc.copy_to(PC, offset);
+          });
+    });
+    e.wait();
+  } catch (sycl::exception const &e) {
+    std::cout << "SYCL exception caught: " << e.what() << '\n';
+
+    delete[] A;
+    delete[] B;
+    delete[] C;
+    return 1;
+  }
+
+  int err_cnt = 0;
+
+  for (unsigned i = 0; i < Size; ++i) {
+    if ((i % 2 == 0) && (A[i] + B[i] != C[i])) {
+      if (++err_cnt < 10) {
+        std::cout << "failed at index " << i << ", " << C[i] << " != " << A[i]
+                  << " + " << B[i] << "\n";
+      }
+    } else if ((i % 2 == 1) && (C[i] != 1)) {
+      if (++err_cnt < 10) {
+        std::cout << "failed at index " << i << ", " << C[i] << " != 1\n";
+      }
+    }
+  }
+  if (err_cnt > 0) {
+    std::cout << "  pass rate: "
+              << ((float)(Size - err_cnt) / (float)Size) * 100.0f << "% ("
+              << (Size - err_cnt) << "/" << Size << ")\n";
+  }
+
+  delete[] A;
+  delete[] B;
+  delete[] C;
+
+  std::cout << (err_cnt > 0 ? "FAILED\n" : "Passed\n");
+  return err_cnt > 0 ? 1 : 0;
+}
diff --git a/sycl/test-e2e/ESIMD/InlineAsm/asm_simd_view.cpp b/sycl/test-e2e/ESIMD/InlineAsm/asm_simd_view.cpp
new file mode 100644
index 0000000000000..6f536f58359aa
--- /dev/null
+++ b/sycl/test-e2e/ESIMD/InlineAsm/asm_simd_view.cpp
@@ -0,0 +1,119 @@
+//==---------------- asm_simd_view.cpp  - DPC++ ESIMD on-device test
+//-------------==//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+// REQUIRES: gpu
+// UNSUPPORTED: cuda || hip
+// RUN: %clangxx -fsycl %s -o %t.out
+// RUN: %GPU_RUN_PLACEHOLDER %t.out
+
+#include "../esimd_test_utils.hpp"
+
+#include <iostream>
+#include <sycl/ext/intel/esimd.hpp>
+#include <sycl/sycl.hpp>
+
+using namespace sycl;
+
+int main(void) {
+  constexpr unsigned Size = 1024 * 128;
+  constexpr unsigned VL = 16;
+
+  float *A = new float[Size];
+  float *B = new float[Size];
+  float *C = new float[Size];
+
+  for (unsigned i = 0; i < Size; ++i) {
+    A[i] = B[i] = i;
+    C[i] = 0.0f;
+  }
+
+  try {
+    buffer<float, 1> bufa(A, range<1>(Size));
+    buffer<float, 1> bufb(B, range<1>(Size));
+    buffer<float, 1> bufc(C, range<1>(Size));
+
+    // We need that many workgroups
+    range<1> GlobalRange{Size / VL};
+
+    // We need that many threads in each group
+    range<1> LocalRange{1};
+
+    queue q(esimd_test::ESIMDSelector, esimd_test::createExceptionHandler());
+
+    auto dev = q.get_device();
+    std::cout << "Running on " << dev.get_info<info::device::name>() << "\n";
+
+    auto e = q.submit([&](handler &cgh) {
+      auto PA = bufa.get_access<access::mode::read>(cgh);
+      auto PB = bufb.get_access<access::mode::read>(cgh);
+      auto PC = bufc.get_access<access::mode::write>(cgh);
+      cgh.parallel_for<class Test>(
+          GlobalRange * LocalRange, [=](id<1> i) SYCL_ESIMD_KERNEL {
+            using namespace sycl::ext::intel::esimd;
+            unsigned int offset = i * VL * sizeof(float);
+            simd<float, VL> va;
+            va.copy_from(PA, offset);
+            simd<float, VL> vb;
+            vb.copy_from(PB, offset);
+#ifdef __SYCL_DEVICE_ONLY__
+            auto va_half1 = va.select<VL / 2, 1>();
+            auto va_half2 = va.select<VL / 2, 1>(VL / 2);
+            auto vb_half1 = vb.select<VL / 2, 1>();
+            auto vb_half2 = vb.select<VL / 2, 1>(VL / 2);
+            simd<float, VL / 2> out1;
+            simd<float, VL / 2> out2;
+            // simd_view is not supported in l-value context in inline asm, so
+            // use simd to store the result
+            __asm__("add (M1, 8) %0 %1 %2"
+                    : "=rw"(out1.data_ref())
+                    : "rw"(va_half1.data()), "rw"(vb_half1.data()));
+            __asm__("add (M1, 8) %0 %1 %2"
+                    : "=rw"(out2.data_ref())
+                    : "rw"(va_half2.data()), "rw"(vb_half2.data()));
+            out1.copy_to(PC, offset);
+            out2.copy_to(PC, offset + ((VL / 2) * sizeof(float)));
+#else
+             simd<float, VL> vc;
+             vc = va+vb;
+             vc.copy_to(PC, offset);
+#endif
+          });
+    });
+    e.wait();
+  } catch (sycl::exception const &e) {
+    std::cout << "SYCL exception caught: " << e.what() << '\n';
+
+    delete[] A;
+    delete[] B;
+    delete[] C;
+    return 1;
+  }
+
+  int err_cnt = 0;
+
+  for (unsigned i = 0; i < Size; ++i) {
+    if (A[i] + B[i] != C[i]) {
+      if (++err_cnt < 10) {
+        std::cout << "failed at index " << i << ", " << C[i] << " != " << A[i]
+                  << " + " << B[i] << "\n";
+      }
+    }
+  }
+  if (err_cnt > 0) {
+    std::cout << "  pass rate: "
+              << ((float)(Size - err_cnt) / (float)Size) * 100.0f << "% ("
+              << (Size - err_cnt) << "/" << Size << ")\n";
+  }
+
+  delete[] A;
+  delete[] B;
+  delete[] C;
+
+  std::cout << (err_cnt > 0 ? "FAILED\n" : "Passed\n");
+  return err_cnt > 0 ? 1 : 0;
+}
diff --git a/sycl/test-e2e/ESIMD/InlineAsm/asm_vadd.cpp b/sycl/test-e2e/ESIMD/InlineAsm/asm_vadd.cpp
new file mode 100644
index 0000000000000..1516a1d4d1017
--- /dev/null
+++ b/sycl/test-e2e/ESIMD/InlineAsm/asm_vadd.cpp
@@ -0,0 +1,106 @@
+//==---------------- asm_vadd.cpp  - DPC++ ESIMD on-device test
+//-------------==//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+// REQUIRES: gpu
+// UNSUPPORTED: cuda || hip
+// RUN: %clangxx -fsycl %s -o %t.out
+// RUN: %GPU_RUN_PLACEHOLDER %t.out
+
+#include "../esimd_test_utils.hpp"
+
+#include <iostream>
+#include <sycl/ext/intel/esimd.hpp>
+#include <sycl/sycl.hpp>
+
+using namespace sycl;
+
+int main(void) {
+  constexpr unsigned Size = 1024 * 128;
+  constexpr unsigned VL = 16;
+
+  float *A = new float[Size];
+  float *B = new float[Size];
+  float *C = new float[Size];
+
+  for (unsigned i = 0; i < Size; ++i) {
+    A[i] = B[i] = i;
+    C[i] = 0.0f;
+  }
+
+  try {
+    buffer<float, 1> bufa(A, range<1>(Size));
+    buffer<float, 1> bufb(B, range<1>(Size));
+    buffer<float, 1> bufc(C, range<1>(Size));
+
+    // We need that many workgroups
+    range<1> GlobalRange{Size / VL};
+
+    // We need that many threads in each group
+    range<1> LocalRange{1};
+
+    queue q(esimd_test::ESIMDSelector, esimd_test::createExceptionHandler());
+
+    auto dev = q.get_device();
+    std::cout << "Running on " << dev.get_info<info::device::name>() << "\n";
+
+    auto e = q.submit([&](handler &cgh) {
+      auto PA = bufa.get_access<access::mode::read>(cgh);
+      auto PB = bufb.get_access<access::mode::read>(cgh);
+      auto PC = bufc.get_access<access::mode::write>(cgh);
+      cgh.parallel_for<class Test>(
+          GlobalRange * LocalRange, [=](id<1> i) SYCL_ESIMD_KERNEL {
+            using namespace sycl::ext::intel::esimd;
+            unsigned int offset = i * VL * sizeof(float);
+            simd<float, VL> va;
+            va.copy_from(PA, offset);
+            simd<float, VL> vb;
+            vb.copy_from(PB, offset);
+            simd<float, VL> vc;
+#ifdef __SYCL_DEVICE_ONLY__
+            __asm__("add (M1, 16) %0 %1 %2"
+                    : "=rw"(vc.data_ref())
+                    : "rw"(va.data()), "rw"(vb.data()));
+#else
+                    vc = va+vb;
+#endif
+            vc.copy_to(PC, offset);
+          });
+    });
+    e.wait();
+  } catch (sycl::exception const &e) {
+    std::cout << "SYCL exception caught: " << e.what() << '\n';
+
+    delete[] A;
+    delete[] B;
+    delete[] C;
+    return 1;
+  }
+
+  int err_cnt = 0;
+
+  for (unsigned i = 0; i < Size; ++i) {
+    if (A[i] + B[i] != C[i]) {
+      if (++err_cnt < 10) {
+        std::cout << "failed at index " << i << ", " << C[i] << " != " << A[i]
+                  << " + " << B[i] << "\n";
+      }
+    }
+  }
+  if (err_cnt > 0) {
+    std::cout << "  pass rate: "
+              << ((float)(Size - err_cnt) / (float)Size) * 100.0f << "% ("
+              << (Size - err_cnt) << "/" << Size << ")\n";
+  }
+
+  delete[] A;
+  delete[] B;
+  delete[] C;
+
+  std::cout << (err_cnt > 0 ? "FAILED\n" : "Passed\n");
+  return err_cnt > 0 ? 1 : 0;
+}
diff --git a/sycl/test/esimd/simd_inline_asm.cpp b/sycl/test/esimd/simd_inline_asm.cpp
new file mode 100644
index 0000000000000..190a4ab2f2d69
--- /dev/null
+++ b/sycl/test/esimd/simd_inline_asm.cpp
@@ -0,0 +1,30 @@
+// RUN: %clangxx -fsycl -fsycl-device-only -fsyntax-only -Xclang -verify %s
+// Verify simd_view passed to inline asm in l-value context errors, and simd and
+// simd_mask work.
+#include <sycl/ext/intel/esimd.hpp>
+
+using namespace sycl::ext::intel::esimd;
+
+void test_error() SYCL_ESIMD_FUNCTION {
+  simd<float, 16> s;
+  simd_mask<16> mask;
+  auto view = s.select<8, 1>();
+  // expected-error@+1 {{invalid lvalue in asm output}}
+  __asm__("%0" : "=rw"(view.data()));
+
+  // expected-error@+1 {{no member named 'data_ref'}}
+  __asm__("%0" : "=rw"(view.data_ref()));
+
+  // expected-error@+1 {{invalid lvalue in asm output}}
+  __asm__("%0" : "=rw"(s.data()));
+
+  __asm__("%0" : "=rw"(s.data_ref()));
+
+  // expected-error@+1 {{invalid lvalue in asm output}}
+  __asm__("%0" : "=rw"(mask.data()));
+
+  __asm__("%0" : "=rw"(mask.data_ref()));
+
+  s.commit();
+  mask.commit();
+}