Skip to content

[SYCL][ESIMD] Add function to get reference to underlying data #8725

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Apr 7, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -815,6 +815,37 @@ simd<float, 8> __regcall SCALE(simd<float, 8> v);
The parameter and the return type in the ABI form will be `<8 x float>`.
<br>

### Inline assembly
Inline assembly is supported with ESIMD classes `simd`, `simd_mask` and `simd_view`. `simd_view` only supports read operations.
In order the access the raw underlying vector required for inline assembly, the `data` function can be used for read-only access and
the `data_ref` function can be used for write access. The `data_ref` function only exists for `simd` and `simd_mask`, and should only be used in inline assembly.
If the `simd` or `simd_mask` object is a private global variable, the `commit` function must be called after any write in inline assembly.

Example of inline GEN assembly:
```cpp
simd<float, 16> va;
simd<float, 16> vb;
simd<float, 16> vc;

__asm__("add (M1, 16) %0 %1 %2"
: "=rw"(vc.data_ref())
: "rw"(va.data()), "rw"(vb.data()));
```

Example of inline GEN assembly writing to a private global variable:
```cpp
ESIMD_PRIVATE ESIMD_REGISTER(0) simd<float, 16> vc;

void calledFromKernel() {
simd<float, 16> va;
simd<float, 16> vb;
__asm__("add (M1, 16) %0 %1 %2"
: "=rw"(vc.data_ref())
: "rw"(va.data()), "rw"(vb.data()));
vc.commit();
}
```

## Examples
### Vector addition (USM)
```cpp
Expand Down
9 changes: 9 additions & 0 deletions sycl/include/sycl/ext/intel/esimd/detail/simd_obj_impl.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -354,6 +354,15 @@ class simd_obj_impl {
#endif
}

/// @return A reference to the value of the
/// underlying raw vector. Intended for use
/// with l-value contexts in inline assembly.
raw_vector_type &data_ref() { return M_data; }

/// Commit the current stored underlying raw vector to memory.
/// This is required when using inline assembly with private global variables.
void commit() { __esimd_vstore<RawTy, N>(&M_data, M_data); }

/// @return Newly constructed (from the underlying data) object of the Derived
/// type.
Derived read() const { return Derived{data()}; }
Expand Down
108 changes: 108 additions & 0 deletions sycl/test-e2e/ESIMD/InlineAsm/asm_glb.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,108 @@
//==---------------- asm_glb.cpp - DPC++ ESIMD on-device test -------------==//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
// REQUIRES: gpu
// UNSUPPORTED: cuda || hip
// RUN: %clangxx -fsycl %s -o %t.out
// RUN: %GPU_RUN_PLACEHOLDER %t.out

#include "../esimd_test_utils.hpp"

#include <iostream>
#include <sycl/ext/intel/esimd.hpp>
#include <sycl/sycl.hpp>

using namespace sycl;
using namespace sycl::ext::intel::esimd;

ESIMD_PRIVATE ESIMD_REGISTER(0) simd<float, 16> va;
ESIMD_PRIVATE ESIMD_REGISTER(0) simd<float, 16> vc;

int main(void) {
constexpr unsigned Size = 1024 * 128;
constexpr unsigned VL = 16;

float *A = new float[Size];
float *B = new float[Size];
float *C = new float[Size];

for (unsigned i = 0; i < Size; ++i) {
A[i] = B[i] = i;
C[i] = 0.0f;
}

try {
buffer<float, 1> bufa(A, range<1>(Size));
buffer<float, 1> bufb(B, range<1>(Size));
buffer<float, 1> bufc(C, range<1>(Size));

// We need that many workgroups
range<1> GlobalRange{Size / VL};

// We need that many threads in each group
range<1> LocalRange{1};

queue q(esimd_test::ESIMDSelector, esimd_test::createExceptionHandler());

auto dev = q.get_device();
std::cout << "Running on " << dev.get_info<info::device::name>() << "\n";

auto e = q.submit([&](handler &cgh) {
auto PA = bufa.get_access<access::mode::read>(cgh);
auto PB = bufb.get_access<access::mode::read>(cgh);
auto PC = bufc.get_access<access::mode::write>(cgh);
cgh.parallel_for<class Test>(
GlobalRange * LocalRange, [=](id<1> i) SYCL_ESIMD_KERNEL {
using namespace sycl::ext::intel::esimd;
unsigned int offset = i * VL * sizeof(float);
va.copy_from(PA, offset);
simd<float, VL> vb;
vb.copy_from(PB, offset);
#ifdef __SYCL_DEVICE_ONLY__
__asm__("add (M1, 16) %0 %1 %2"
: "=rw"(vc.data_ref())
: "rw"(va.data()), "rw"(vb.data()));
vc.commit();
#else
vc = va+vb;
#endif
vc.copy_to(PC, offset);
});
});
e.wait();
} catch (sycl::exception const &e) {
std::cout << "SYCL exception caught: " << e.what() << '\n';

delete[] A;
delete[] B;
delete[] C;
return 1;
}

int err_cnt = 0;

for (unsigned i = 0; i < Size; ++i) {
if (A[i] + B[i] != C[i]) {
if (++err_cnt < 10) {
std::cout << "failed at index " << i << ", " << C[i] << " != " << A[i]
<< " + " << B[i] << "\n";
}
}
}
if (err_cnt > 0) {
std::cout << " pass rate: "
<< ((float)(Size - err_cnt) / (float)Size) * 100.0f << "% ("
<< (Size - err_cnt) << "/" << Size << ")\n";
}

delete[] A;
delete[] B;
delete[] C;

std::cout << (err_cnt > 0 ? "FAILED\n" : "Passed\n");
return err_cnt > 0 ? 1 : 0;
}
119 changes: 119 additions & 0 deletions sycl/test-e2e/ESIMD/InlineAsm/asm_simd_mask.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,119 @@
//==---------------- asm_simd_mask.cpp - DPC++ ESIMD on-device test
//-------------==//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
// REQUIRES: gpu
// UNSUPPORTED: cuda || hip
// RUN: %clangxx -fsycl %s -o %t.out
// RUN: %GPU_RUN_PLACEHOLDER %t.out

#include "../esimd_test_utils.hpp"

#include <iostream>
#include <sycl/ext/intel/esimd.hpp>
#include <sycl/sycl.hpp>

using namespace sycl;

int main(void) {
constexpr unsigned Size = 1024 * 128;
constexpr unsigned VL = 8;

float *A = new float[Size];
float *B = new float[Size];
float *C = new float[Size];

for (unsigned i = 0; i < Size; ++i) {
A[i] = B[i] = i;
C[i] = 0.0f;
}

try {
buffer<float, 1> bufa(A, range<1>(Size));
buffer<float, 1> bufb(B, range<1>(Size));
buffer<float, 1> bufc(C, range<1>(Size));

// We need that many workgroups
range<1> GlobalRange{Size / VL};

// We need that many threads in each group
range<1> LocalRange{1};

queue q(esimd_test::ESIMDSelector, esimd_test::createExceptionHandler());

auto dev = q.get_device();
std::cout << "Running on " << dev.get_info<info::device::name>() << "\n";

auto e = q.submit([&](handler &cgh) {
auto PA = bufa.get_access<access::mode::read>(cgh);
auto PB = bufb.get_access<access::mode::read>(cgh);
auto PC = bufc.get_access<access::mode::write>(cgh);
cgh.parallel_for<class Test>(
GlobalRange * LocalRange, [=](id<1> i) SYCL_ESIMD_KERNEL {
using namespace sycl::ext::intel::esimd;
unsigned int offset = i * VL * sizeof(float);
simd<float, VL> va;
va.copy_from(PA, offset);
simd<float, VL> vb;
vb.copy_from(PB, offset);
simd<float, VL> vc;
#ifdef __SYCL_DEVICE_ONLY__
simd_mask<VL> m;
__asm__("mov (M1, 8) %0 0x1010101:v" : "=rw"(m.data_ref()));
__asm__("{\n"
".decl P1 v_type=P num_elts=8\n"
"mov (M1, 8) %0 0x1:ud\n"
"setp (M1, 8) P1 %3\n"
"(P1) add (M1, 8) %0 %1 %2\n"
"}"
: "=rw"(vc.data_ref())
: "rw"(va.data()), "rw"(vb.data()), "rw"(m.data()));
#else
simd_mask<VL> m({1,0,1,0,1,0,1,0});
vc = va+vb;
vc.merge(1, !m);
#endif
vc.copy_to(PC, offset);
});
});
e.wait();
} catch (sycl::exception const &e) {
std::cout << "SYCL exception caught: " << e.what() << '\n';

delete[] A;
delete[] B;
delete[] C;
return 1;
}

int err_cnt = 0;

for (unsigned i = 0; i < Size; ++i) {
if ((i % 2 == 0) && (A[i] + B[i] != C[i])) {
if (++err_cnt < 10) {
std::cout << "failed at index " << i << ", " << C[i] << " != " << A[i]
<< " + " << B[i] << "\n";
}
} else if ((i % 2 == 1) && (C[i] != 1)) {
if (++err_cnt < 10) {
std::cout << "failed at index " << i << ", " << C[i] << " != 1\n";
}
}
}
if (err_cnt > 0) {
std::cout << " pass rate: "
<< ((float)(Size - err_cnt) / (float)Size) * 100.0f << "% ("
<< (Size - err_cnt) << "/" << Size << ")\n";
}

delete[] A;
delete[] B;
delete[] C;

std::cout << (err_cnt > 0 ? "FAILED\n" : "Passed\n");
return err_cnt > 0 ? 1 : 0;
}
Loading