diff --git a/flang/include/flang/Optimizer/Transforms/CUFGPUToLLVMConversion.h b/flang/include/flang/Optimizer/Transforms/CUFGPUToLLVMConversion.h
new file mode 100644
index 0000000000000..7d76c1f4e5218
--- /dev/null
+++ b/flang/include/flang/Optimizer/Transforms/CUFGPUToLLVMConversion.h
@@ -0,0 +1,28 @@
+//===------- Optimizer/Transforms/CUFGPUToLLVMConversion.h ------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef FORTRAN_OPTIMIZER_TRANSFORMS_CUFGPUTOLLVMCONVERSION_H_
+#define FORTRAN_OPTIMIZER_TRANSFORMS_CUFGPUTOLLVMCONVERSION_H_
+
+#include "mlir/Pass/Pass.h"
+#include "mlir/Pass/PassRegistry.h"
+#include "mlir/Transforms/DialectConversion.h"
+
+namespace fir {
+class LLVMTypeConverter;
+}
+
+namespace cuf {
+
+void populateCUFGPUToLLVMConversionPatterns(
+    const fir::LLVMTypeConverter &converter, mlir::RewritePatternSet &patterns,
+    mlir::PatternBenefit benefit = 1);
+
+} // namespace cuf
+
+#endif // FORTRAN_OPTIMIZER_TRANSFORMS_CUFGPUTOLLVMCONVERSION_H_
diff --git a/flang/include/flang/Optimizer/Transforms/Passes.h b/flang/include/flang/Optimizer/Transforms/Passes.h
index 5d3067aa35981..e8f0a8444a31a 100644
--- a/flang/include/flang/Optimizer/Transforms/Passes.h
+++ b/flang/include/flang/Optimizer/Transforms/Passes.h
@@ -41,6 +41,7 @@ namespace fir {
 #define GEN_PASS_DECL_CFGCONVERSION
 #define GEN_PASS_DECL_CUFADDCONSTRUCTOR
 #define GEN_PASS_DECL_CUFDEVICEGLOBAL
+#define GEN_PASS_DECL_CUFGPUTOLLVMCONVERSION
 #define GEN_PASS_DECL_CUFOPCONVERSION
 #define GEN_PASS_DECL_EXTERNALNAMECONVERSION
 #define GEN_PASS_DECL_MEMREFDATAFLOWOPT
diff --git a/flang/include/flang/Optimizer/Transforms/Passes.td b/flang/include/flang/Optimizer/Transforms/Passes.td
index 2efa543ca0714..a41f0f348f27a 100644
--- a/flang/include/flang/Optimizer/Transforms/Passes.td
+++ b/flang/include/flang/Optimizer/Transforms/Passes.td
@@ -443,4 +443,11 @@ def CUFAddConstructor : Pass<"cuf-add-constructor", "mlir::ModuleOp"> {
   ];
 }
 
+def CUFGPUToLLVMConversion : Pass<"cuf-gpu-convert-to-llvm", "mlir::ModuleOp"> {
+  let summary = "Convert some GPU operations lowered from CUF to runtime calls";
+  let dependentDialects = [
+    "mlir::LLVM::LLVMDialect"
+  ];
+}
+
 #endif // FLANG_OPTIMIZER_TRANSFORMS_PASSES
diff --git a/flang/lib/Optimizer/Transforms/CMakeLists.txt b/flang/lib/Optimizer/Transforms/CMakeLists.txt
index 8f4f731e00922..d20d3bc4108ce 100644
--- a/flang/lib/Optimizer/Transforms/CMakeLists.txt
+++ b/flang/lib/Optimizer/Transforms/CMakeLists.txt
@@ -12,6 +12,7 @@ add_flang_library(FIRTransforms
   CUFAddConstructor.cpp
   CUFDeviceGlobal.cpp
   CUFOpConversion.cpp
+  CUFGPUToLLVMConversion.cpp
   ArrayValueCopy.cpp
   ExternalNameConversion.cpp
   MemoryUtils.cpp
diff --git a/flang/lib/Optimizer/Transforms/CUFGPUToLLVMConversion.cpp b/flang/lib/Optimizer/Transforms/CUFGPUToLLVMConversion.cpp
new file mode 100644
index 0000000000000..5645ce6e6858c
--- /dev/null
+++ b/flang/lib/Optimizer/Transforms/CUFGPUToLLVMConversion.cpp
@@ -0,0 +1,180 @@
+//===-- CUFGPUToLLVMConversion.cpp ----------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "flang/Optimizer/Transforms/CUFGPUToLLVMConversion.h"
+#include "flang/Common/Fortran.h"
+#include "flang/Optimizer/CodeGen/TypeConverter.h"
+#include "flang/Optimizer/Support/DataLayout.h"
+#include "flang/Runtime/CUDA/common.h"
+#include "mlir/Conversion/LLVMCommon/Pattern.h"
+#include "mlir/Dialect/GPU/IR/GPUDialect.h"
+#include "mlir/Pass/Pass.h"
+#include "mlir/Transforms/DialectConversion.h"
+#include "mlir/Transforms/GreedyPatternRewriteDriver.h"
+#include "llvm/Support/FormatVariadic.h"
+
+namespace fir {
+#define GEN_PASS_DEF_CUFGPUTOLLVMCONVERSION
+#include "flang/Optimizer/Transforms/Passes.h.inc"
+} // namespace fir
+
+using namespace fir;
+using namespace mlir;
+using namespace Fortran::runtime;
+
+namespace {
+
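+// Pack the kernel launch operands for the runtime call: each operand is
+// stored into a field of a stack-allocated struct, and an array holding one
+// pointer per field is returned to be passed as the kernel argument list.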
+static mlir::Value createKernelArgArray(mlir::Location loc,
+                                        mlir::ValueRange operands,
+                                        mlir::PatternRewriter &rewriter) {
+
+  auto *ctx = rewriter.getContext();
+  llvm::SmallVector<mlir::Type> structTypes(operands.size(), nullptr);
+
+  for (auto [i, arg] : llvm::enumerate(operands))
+    structTypes[i] = arg.getType();
+
+  auto structTy = mlir::LLVM::LLVMStructType::getLiteral(ctx, structTypes);
+  auto ptrTy = mlir::LLVM::LLVMPointerType::get(rewriter.getContext());
+  mlir::Type i32Ty = rewriter.getI32Type();
+  auto one = rewriter.create<mlir::LLVM::ConstantOp>(
+      loc, i32Ty, rewriter.getIntegerAttr(i32Ty, 1));
+  mlir::Value argStruct =
+      rewriter.create<mlir::LLVM::AllocaOp>(loc, ptrTy, structTy, one);
+  auto size = rewriter.create<mlir::LLVM::ConstantOp>(
+      loc, i32Ty, rewriter.getIntegerAttr(i32Ty, structTypes.size()));
+  mlir::Value argArray =
+      rewriter.create<mlir::LLVM::AllocaOp>(loc, ptrTy, ptrTy, size);
+
+  for (auto [i, arg] : llvm::enumerate(operands)) {
+    auto indice = rewriter.create<mlir::LLVM::ConstantOp>(
+        loc, i32Ty, rewriter.getIntegerAttr(i32Ty, i));
+    mlir::Value structMember = rewriter.create<mlir::LLVM::GEPOp>(
+        loc, ptrTy, structTy, argStruct, mlir::ArrayRef<mlir::Value>({indice}));
+    rewriter.create<mlir::LLVM::StoreOp>(loc, arg, structMember);
+    mlir::Value arrayMember = rewriter.create<mlir::LLVM::GEPOp>(
+        loc, ptrTy, structTy, argArray, mlir::ArrayRef<mlir::Value>({indice}));
+    rewriter.create<mlir::LLVM::StoreOp>(loc, structMember, arrayMember);
+  }
+  return argArray;
+}
+
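+/// Rewrite gpu.launch_func into a call to the CUFLaunchKernel runtime entry
+/// point, declaring it in the module when it is not present yet. Launches
+/// that carry a cluster size are not converted.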
+struct GPULaunchKernelConversion
+    : public mlir::ConvertOpToLLVMPattern<mlir::gpu::LaunchFuncOp> {
+  explicit GPULaunchKernelConversion(
+      const fir::LLVMTypeConverter &typeConverter, mlir::PatternBenefit benefit)
+      : mlir::ConvertOpToLLVMPattern<mlir::gpu::LaunchFuncOp>(typeConverter,
+                                                              benefit) {}
+
+  using OpAdaptor = typename mlir::gpu::LaunchFuncOp::Adaptor;
+
+  mlir::LogicalResult
+  matchAndRewrite(mlir::gpu::LaunchFuncOp op, OpAdaptor adaptor,
+                  mlir::ConversionPatternRewriter &rewriter) const override {
+
+    if (op.hasClusterSize()) {
+      return mlir::failure();
+    }
+
+    mlir::Location loc = op.getLoc();
+    auto *ctx = rewriter.getContext();
+    mlir::ModuleOp mod = op->getParentOfType<mlir::ModuleOp>();
+    mlir::Value dynamicMemorySize = op.getDynamicSharedMemorySize();
+    mlir::Type i32Ty = rewriter.getI32Type();
+    if (!dynamicMemorySize)
+      dynamicMemorySize = rewriter.create<mlir::LLVM::ConstantOp>(
+          loc, i32Ty, rewriter.getIntegerAttr(i32Ty, 0));
+
+    mlir::Value kernelArgs =
+        createKernelArgArray(loc, adaptor.getKernelOperands(), rewriter);
+
+    auto ptrTy = mlir::LLVM::LLVMPointerType::get(rewriter.getContext());
+    auto kernel = mod.lookupSymbol<mlir::LLVM::LLVMFuncOp>(op.getKernelName());
+    mlir::Value kernelPtr;
+    if (!kernel) {
+      auto funcOp = mod.lookupSymbol<mlir::func::FuncOp>(op.getKernelName());
+      if (!funcOp)
+        return mlir::failure();
+      kernelPtr = rewriter.create<mlir::LLVM::AddressOfOp>(loc, ptrTy,
+                                                           funcOp.getName());
+    } else {
+      kernelPtr = rewriter.create<mlir::LLVM::AddressOfOp>(loc, ptrTy,
+                                                           kernel.getName());
+    }
+
+    auto funcOp = mod.lookupSymbol<mlir::LLVM::LLVMFuncOp>(
+        RTNAME_STRING(CUFLaunchKernel));
+
+    auto llvmIntPtrType = mlir::IntegerType::get(
+        ctx, this->getTypeConverter()->getPointerBitwidth(0));
+    auto voidTy = mlir::LLVM::LLVMVoidType::get(ctx);
+    auto funcTy = mlir::LLVM::LLVMFunctionType::get(
+        voidTy,
+        {ptrTy, llvmIntPtrType, llvmIntPtrType, llvmIntPtrType, llvmIntPtrType,
+         llvmIntPtrType, llvmIntPtrType, i32Ty, ptrTy, ptrTy},
+        /*isVarArg=*/false);
+
+    auto cufLaunchKernel = mlir::SymbolRefAttr::get(
+        mod.getContext(), RTNAME_STRING(CUFLaunchKernel));
+    if (!funcOp) {
+      mlir::OpBuilder::InsertionGuard insertGuard(rewriter);
+      rewriter.setInsertionPointToStart(mod.getBody());
+      auto launchKernelFuncOp = rewriter.create<mlir::LLVM::LLVMFuncOp>(
+          loc, RTNAME_STRING(CUFLaunchKernel), funcTy);
+      launchKernelFuncOp.setVisibility(mlir::SymbolTable::Visibility::Private);
+    }
+
+    mlir::Value nullPtr = rewriter.create<mlir::LLVM::ZeroOp>(loc, ptrTy);
+
+    rewriter.replaceOpWithNewOp<mlir::LLVM::CallOp>(
+        op, funcTy, cufLaunchKernel,
+        mlir::ValueRange{kernelPtr, adaptor.getGridSizeX(),
+                         adaptor.getGridSizeY(), adaptor.getGridSizeZ(),
+                         adaptor.getBlockSizeX(), adaptor.getBlockSizeY(),
+                         adaptor.getBlockSizeZ(), dynamicMemorySize, kernelArgs,
+                         nullPtr});
+
+    return mlir::success();
+  }
+};
+
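+/// Pass that applies the conversion above on a module: gpu.launch_func is
+/// marked illegal and lowered through the FIR LLVM type converter built from
+/// the module's data layout.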
+class CUFGPUToLLVMConversion
+    : public fir::impl::CUFGPUToLLVMConversionBase<CUFGPUToLLVMConversion> {
+public:
+  void runOnOperation() override {
+    auto *ctx = &getContext();
+    mlir::RewritePatternSet patterns(ctx);
+    mlir::ConversionTarget target(*ctx);
+
+    mlir::Operation *op = getOperation();
+    mlir::ModuleOp module = mlir::dyn_cast<mlir::ModuleOp>(op);
+    if (!module)
+      return signalPassFailure();
+
+    std::optional<mlir::DataLayout> dl =
+        fir::support::getOrSetDataLayout(module, /*allowDefaultLayout=*/false);
+    fir::LLVMTypeConverter typeConverter(module, /*applyTBAA=*/false,
+                                         /*forceUnifiedTBAATree=*/false, *dl);
+    cuf::populateCUFGPUToLLVMConversionPatterns(typeConverter, patterns);
+    target.addIllegalOp<mlir::gpu::LaunchFuncOp>();
+    target.addLegalDialect<mlir::LLVM::LLVMDialect>();
+    if (mlir::failed(mlir::applyPartialConversion(getOperation(), target,
+                                                  std::move(patterns)))) {
+      mlir::emitError(mlir::UnknownLoc::get(ctx),
+                      "error in CUF GPU op conversion\n");
+      signalPassFailure();
+    }
+  }
+};
+} // namespace
+
+void cuf::populateCUFGPUToLLVMConversionPatterns(
+    const fir::LLVMTypeConverter &converter, mlir::RewritePatternSet &patterns,
+    mlir::PatternBenefit benefit) {
+  patterns.add<GPULaunchKernelConversion>(converter, benefit);
+}
diff --git a/flang/lib/Optimizer/Transforms/CUFOpConversion.cpp b/flang/lib/Optimizer/Transforms/CUFOpConversion.cpp
index 069d88e0afca4..9c2b882c7f46f 100644
--- a/flang/lib/Optimizer/Transforms/CUFOpConversion.cpp
+++ b/flang/lib/Optimizer/Transforms/CUFOpConversion.cpp
@@ -20,6 +20,7 @@
 #include "flang/Runtime/CUDA/descriptor.h"
 #include "flang/Runtime/CUDA/memory.h"
 #include "flang/Runtime/allocatable.h"
+#include "mlir/Conversion/LLVMCommon/Pattern.h"
 #include "mlir/Dialect/GPU/IR/GPUDialect.h"
 #include "mlir/Pass/Pass.h"
 #include "mlir/Transforms/DialectConversion.h"
diff --git a/flang/runtime/CUDA/registration.cpp b/flang/runtime/CUDA/registration.cpp
index 22d43a7dc57a3..20d274c4d8d1c 100644
--- a/flang/runtime/CUDA/registration.cpp
+++ b/flang/runtime/CUDA/registration.cpp
@@ -7,6 +7,8 @@
 //===----------------------------------------------------------------------===//
 
 #include "flang/Runtime/CUDA/registration.h"
+#include "../terminator.h"
+#include "flang/Runtime/CUDA/common.h"
 
 #include "cuda_runtime.h"
 
@@ -31,5 +33,7 @@ void RTDEF(CUFRegisterFunction)(
   __cudaRegisterFunction(module, fctSym, fctName, fctName, -1, (uint3 *)0,
       (uint3 *)0, (dim3 *)0, (dim3 *)0, (int *)0);
 }
-}
+
+} // extern "C"
+
 } // namespace Fortran::runtime::cuda
diff --git a/flang/test/Fir/CUDA/cuda-gpu-launch-func.mlir b/flang/test/Fir/CUDA/cuda-gpu-launch-func.mlir
new file mode 100644
index 0000000000000..f10bd82f978dc
--- /dev/null
+++ b/flang/test/Fir/CUDA/cuda-gpu-launch-func.mlir
@@ -0,0 +1,104 @@
+// RUN: fir-opt --cuf-gpu-convert-to-llvm %s | FileCheck %s
+
+module attributes {dlti.dl_spec = #dlti.dl_spec<#dlti.dl_entry<f80, dense<128> : vector<2xi64>>, #dlti.dl_entry<!llvm.ptr<272>, dense<64> : vector<4xi64>>, #dlti.dl_entry<!llvm.ptr<270>, dense<32> : vector<4xi64>>, #dlti.dl_entry<!llvm.ptr<271>, dense<32> : vector<4xi64>>, #dlti.dl_entry<i64, dense<64> : vector<2xi64>>, #dlti.dl_entry<i128, dense<128> : vector<2xi64>>, #dlti.dl_entry<!llvm.ptr, dense<64> : vector<4xi64>>, #dlti.dl_entry<i1, dense<8> : vector<2xi64>>, #dlti.dl_entry<i8, dense<8> : vector<2xi64>>, #dlti.dl_entry<i16, dense<16> : vector<2xi64>>, #dlti.dl_entry<i32, dense<32> : vector<2xi64>>, #dlti.dl_entry<f16, dense<16> : vector<2xi64>>, #dlti.dl_entry<f64, dense<64> : vector<2xi64>>, #dlti.dl_entry<f128, dense<128> : vector<2xi64>>, #dlti.dl_entry<"dlti.endianness", "little">, #dlti.dl_entry<"dlti.stack_alignment", 128 : i64>>, fir.defaultkind = "a1c4d8i4l4r4", fir.kindmap = "", gpu.container_module, llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", llvm.ident = "flang version 20.0.0 (git@github.com:clementval/llvm-project.git ddcfd4d2dc17bf66cee8c3ef6284118684a2b0e6)", llvm.target_triple = "x86_64-unknown-linux-gnu"} {
+  llvm.func @_QMmod1Phost_sub() {
+    %0 = llvm.mlir.constant(1 : i32) : i32
+    %1 = llvm.alloca %0 x !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)> {alignment = 8 : i64} : (i32) -> !llvm.ptr
+    %2 = llvm.mlir.constant(40 : i64) : i64
+    %3 = llvm.mlir.constant(16 : i32) : i32
+    %4 = llvm.mlir.constant(25 : i32) : i32
+    %5 = llvm.mlir.constant(21 : i32) : i32
+    %6 = llvm.mlir.constant(17 : i32) : i32
+    %7 = llvm.mlir.constant(1 : index) : i64
+    %8 = llvm.mlir.constant(27 : i32) : i32
+    %9 = llvm.mlir.constant(6 : i32) : i32
+    %10 = llvm.mlir.constant(1 : i32) : i32
+    %11 = llvm.mlir.constant(0 : i32) : i32
+    %12 = llvm.mlir.constant(10 : index) : i64
+    %13 = llvm.mlir.addressof @_QQclX91d13f6e74caa2f03965d7a7c6a8fdd5 : !llvm.ptr
+    %14 = llvm.call @_FortranACUFMemAlloc(%2, %11, %13, %6) : (i64, i32, !llvm.ptr, i32) -> !llvm.ptr
+    %15 = llvm.mlir.constant(10 : index) : i64
+    %16 = llvm.mlir.constant(1 : index) : i64
+    %17 = llvm.alloca %15 x i32 : (i64) -> !llvm.ptr
+    %18 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)>
+    %19 = llvm.insertvalue %17, %18[0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)>
+    %20 = llvm.insertvalue %17, %19[1] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)>
+    %21 = llvm.mlir.constant(0 : index) : i64
+    %22 = llvm.insertvalue %21, %20[2] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)>
+    %23 = llvm.insertvalue %15, %22[3, 0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)>
+    %24 = llvm.insertvalue %16, %23[4, 0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)>
+    %25 = llvm.extractvalue %24[1] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)>
+    %26 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)>
+    %27 = llvm.insertvalue %25, %26[0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)>
+    %28 = llvm.insertvalue %25, %27[1] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)>
+    %29 = llvm.mlir.constant(0 : index) : i64
+    %30 = llvm.insertvalue %29, %28[2] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)>
+    %31 = llvm.mlir.constant(10 : index) : i64
+    %32 = llvm.insertvalue %31, %30[3, 0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)>
+    %33 = llvm.mlir.constant(1 : index) : i64
+    %34 = llvm.insertvalue %33, %32[4, 0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)>
+    %35 = llvm.mlir.constant(1 : index) : i64
+    %36 = llvm.mlir.constant(11 : index) : i64
+    %37 = llvm.mlir.constant(1 : index) : i64
+    llvm.br ^bb1(%35 : i64)
+  ^bb1(%38: i64):  // 2 preds: ^bb0, ^bb2
+    %39 = llvm.icmp "slt" %38, %36 : i64
+    llvm.cond_br %39, ^bb2, ^bb3
+  ^bb2:  // pred: ^bb1
+    %40 = llvm.mlir.constant(-1 : index) : i64
+    %41 = llvm.add %38, %40 : i64
+    %42 = llvm.extractvalue %34[1] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)>
+    %43 = llvm.getelementptr %42[%41] : (!llvm.ptr, i64) -> !llvm.ptr, i32
+    llvm.store %11, %43 : i32, !llvm.ptr
+    %44 = llvm.add %38, %37 : i64
+    llvm.br ^bb1(%44 : i64)
+  ^bb3:  // pred: ^bb1
+    %45 = llvm.call @_FortranACUFDataTransferPtrPtr(%14, %25, %2, %11, %13, %5) : (!llvm.ptr, !llvm.ptr, i64, i32, !llvm.ptr, i32) -> !llvm.struct<()>
+    gpu.launch_func @cuda_device_mod::@_QMmod1Psub1 blocks in (%7, %7, %7) threads in (%12, %7, %7) : i64 dynamic_shared_memory_size %11 args(%14 : !llvm.ptr)
+    %46 = llvm.call @_FortranACUFDataTransferPtrPtr(%25, %14, %2, %10, %13, %4) : (!llvm.ptr, !llvm.ptr, i64, i32, !llvm.ptr, i32) -> !llvm.struct<()>
+    %47 = llvm.call @_FortranAioBeginExternalListOutput(%9, %13, %8) {fastmathFlags = #llvm.fastmath<contract>} : (i32, !llvm.ptr, i32) -> !llvm.ptr
+    %48 = llvm.mlir.constant(9 : i32) : i32
+    %49 = llvm.mlir.zero : !llvm.ptr
+    %50 = llvm.getelementptr %49[1] : (!llvm.ptr) -> !llvm.ptr, i32
+    %51 = llvm.ptrtoint %50 : !llvm.ptr to i64
+    %52 = llvm.mlir.undef : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)>
+    %53 = llvm.insertvalue %51, %52[1] : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)>
+    %54 = llvm.mlir.constant(20240719 : i32) : i32
+    %55 = llvm.insertvalue %54, %53[2] : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)>
+    %56 = llvm.mlir.constant(1 : i32) : i32
+    %57 = llvm.trunc %56 : i32 to i8
+    %58 = llvm.insertvalue %57, %55[3] : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)>
+    %59 = llvm.trunc %48 : i32 to i8
+    %60 = llvm.insertvalue %59, %58[4] : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)>
+    %61 = llvm.mlir.constant(0 : i32) : i32
+    %62 = llvm.trunc %61 : i32 to i8
+    %63 = llvm.insertvalue %62, %60[5] : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)>
+    %64 = llvm.mlir.constant(0 : i32) : i32
+    %65 = llvm.trunc %64 : i32 to i8
+    %66 = llvm.insertvalue %65, %63[6] : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)>
+    %67 = llvm.mlir.constant(0 : i64) : i64
+    %68 = llvm.mlir.constant(1 : i64) : i64
+    %69 = llvm.insertvalue %68, %66[7, 0, 0] : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)>
+    %70 = llvm.insertvalue %12, %69[7, 0, 1] : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)>
+    %71 = llvm.insertvalue %51, %70[7, 0, 2] : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)>
+    %72 = llvm.mul %51, %12 : i64
+    %73 = llvm.insertvalue %25, %71[0] : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)>
+    llvm.store %73, %1 : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)>, !llvm.ptr
+    llvm.return
+  }
+  llvm.func @_QMmod1Psub1(!llvm.ptr) -> ()
+  llvm.mlir.global linkonce constant @_QQclX91d13f6e74caa2f03965d7a7c6a8fdd5() {addr_space = 0 : i32} : !llvm.array<2 x i8> {
+    %0 = llvm.mlir.constant("a\00") : !llvm.array<2 x i8>
+    llvm.return %0 : !llvm.array<2 x i8>
+  }
+  llvm.func @_FortranAioBeginExternalListOutput(i32, !llvm.ptr, i32) -> !llvm.ptr attributes {fir.io, fir.runtime, sym_visibility = "private"}
+  llvm.func @_FortranACUFMemAlloc(i64, i32, !llvm.ptr, i32) -> !llvm.ptr attributes {fir.runtime, sym_visibility = "private"}
+  llvm.func @_FortranACUFDataTransferPtrPtr(!llvm.ptr, !llvm.ptr, i64, i32, !llvm.ptr, i32) -> !llvm.struct<()> attributes {fir.runtime, sym_visibility = "private"}
+  llvm.func @_FortranACUFMemFree(!llvm.ptr, i32, !llvm.ptr, i32) -> !llvm.struct<()> attributes {fir.runtime, sym_visibility = "private"}
+  gpu.binary @cuda_device_mod [#gpu.object<#nvvm.target, "">]
+}
+
+// CHECK-LABEL: _QMmod1Phost_sub
+
+// CHECK: %[[KERNEL_PTR:.*]] = llvm.mlir.addressof @_QMmod1Psub1 : !llvm.ptr
+// CHECK: llvm.call @_FortranACUFLaunchKernel(%[[KERNEL_PTR]], {{.*}})
diff --git a/flang/tools/fir-opt/fir-opt.cpp b/flang/tools/fir-opt/fir-opt.cpp
index 84a74770cf030..5f6a856116bc0 100644
--- a/flang/tools/fir-opt/fir-opt.cpp
+++ b/flang/tools/fir-opt/fir-opt.cpp
@@ -43,6 +43,7 @@ int main(int argc, char **argv) {
   DialectRegistry registry;
   fir::support::registerDialects(registry);
   registry.insert<mlir::NVVM::NVVMDialect>();
+  registry.insert<mlir::gpu::GPUDialect>();
   fir::support::addFIRExtensions(registry);
   return failed(MlirOptMain(argc, argv, "FIR modular optimizer driver\n",
                             registry));