Conversation
Which exact instruction (modifiers) are you expecting? This is just LLVM not handling these orderings:

source_filename = "text"
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64"
target triple = "nvptx64-nvidia-cuda"
; Relaxed (monotonic) atomic add of 1 to an i32 in global memory (addrspace(1)).
define void @kernel_monotonic(i32 addrspace(1)* %ptr) {
entry:
%0 = atomicrmw add i32 addrspace(1)* %ptr, i32 1 monotonic
ret void
}
; Sequentially-consistent atomic add of 1 to an i32 in global memory.
; Fixed: the original text read "i32 1seq_cst" (missing space before the
; ordering), which is not valid LLVM IR syntax.
define void @kernel_seq_cst(i32 addrspace(1)* %ptr) {
entry:
%0 = atomicrmw add i32 addrspace(1)* %ptr, i32 1 seq_cst
ret void
}

At the same time, NVVM (NVIDIA's closed-source back-end) doesn't seem to handle these either:

#include <nvvm.h>
#include <stdio.h>
#include <stdlib.h>
#include <assert.h>
#define check(ans) { _check((ans), __FILE__, __LINE__); }
/*
 * Report an NVVM error with its source location and terminate.
 *
 * code: result returned by an nvvm* call; anything other than NVVM_SUCCESS
 *       is treated as fatal.
 * file/line: call site, supplied by the check() macro.
 */
void _check(nvvmResult code, const char *file, int line)
{
    if (code != NVVM_SUCCESS)
    {
        fprintf(stderr, "NVVM error: %s at %s:%d\n",
                nvvmGetErrorString(code), file, line);
        /* Fixed: the original `if (abort) exit(code);` tested the address of
         * the abort() function, which is always non-null — the condition was
         * a no-op. Exit unconditionally on failure. */
        exit(code);
    }
}
/*
 * Read LLVM IR from /tmp/test.ll, compile it with libNVVM, and print the
 * resulting PTX to stdout. On compilation failure, print the program log
 * to stderr and exit non-zero.
 */
int main(void) {
    FILE *f = fopen("/tmp/test.ll", "rb");
    if (!f) {
        perror("fopen /tmp/test.ll");
        return EXIT_FAILURE;
    }
    fseek(f, 0, SEEK_END);
    long input_size = ftell(f);
    fseek(f, 0, SEEK_SET);

    char *input = malloc(input_size);
    if (!input || fread(input, 1, input_size, f) != (size_t)input_size) {
        fprintf(stderr, "failed to read /tmp/test.ll\n");
        fclose(f);
        return EXIT_FAILURE;
    }
    fclose(f);

    nvvmProgram program;
    check(nvvmCreateProgram(&program));
    check(nvvmAddModuleToProgram(program, input, input_size, "main"));

    if (nvvmCompileProgram(program, 0, NULL) != NVVM_SUCCESS) {
        size_t log_size;
        check(nvvmGetProgramLogSize(program, &log_size));
        char *log = malloc(log_size);
        if (!log) return EXIT_FAILURE;
        check(nvvmGetProgramLog(program, log));
        fprintf(stderr, "Compilation failed: %s\n", log);
        return EXIT_FAILURE;
    }

    size_t result_size;
    check(nvvmGetCompiledResultSize(program, &result_size));
    char *result = malloc(result_size);
    if (!result) return EXIT_FAILURE;
    check(nvvmGetCompiledResult(program, result));
    /* Fixed: the original `fprintf(stdout, result, "%s\n", result)` passed
     * the compiled PTX as the FORMAT string — a format-string bug that would
     * misbehave on any '%' in the output. */
    fprintf(stdout, "%s\n", result);

    check(nvvmDestroyProgram(&program));
    free(result);
    free(input);
    return EXIT_SUCCESS;
}
I only tried it on godbolt https://godbolt.org/z/o6areY84z but NVCC with libcu++ compiles.
5d585c4 to
c850163
Compare
I tried if a simplistic approach can solve #1353. This patch simply lets users specify atomic ordering with
`Val(:monotonic)` etc. passed to `CUDA.atomic_*`. It seems to generate the correct LLVM IR. But the orderings are not reflected in the PTX (and the run-time).
prints
and
prints
i.e.,
`atomic_add!` with both `Val(:monotonic)` and `Val(:sequentially_consistent)` are compiled down to `atom.global.add.u64`. Note that `@device_code_llvm` prints the expected LLVM IR (i.e., `%2 = atomicrmw add i64 addrspace(1)* %1, i64 1 monotonic` for `Val(:monotonic)` and `%2 = atomicrmw add i64 addrspace(1)* %1, i64 1 seq_cst` for `Val(:sequentially_consistent)`). A similar program on shared memory also shows that both `Val(:monotonic)` and `Val(:sequentially_consistent)` generate the same instruction (`atom.shared.add.u64`). I'm not sure how to properly generate more accurate orderings as libcu++ does. Do we need to generate more ptx-specific LLVM IR?