mirror of
https://github.com/llvm/llvm-project.git
synced 2025-04-17 00:36:34 +00:00
[flang][cuda] Sync double descriptor after c_f_pointer call (#130194)
After a global device pointer is set through `c_f_pointer`, we need to sync the double descriptor so the version on the device is also up to date.
This commit is contained in:
parent
55f86cf023
commit
478e516140
@ -20,27 +20,6 @@
|
||||
#include "mlir/Dialect/OpenACC/OpenACC.h"
|
||||
|
||||
namespace Fortran::lower {
|
||||
// Check if the insertion point is currently in a device context. HostDevice
|
||||
// subprograms are not considered fully device context so it will return false
|
||||
// for it.
|
||||
// If the insertion point is inside an OpenACC region op, it is considered
|
||||
// device context.
|
||||
static bool inline isCudaDeviceContext(fir::FirOpBuilder &builder) {
|
||||
if (builder.getRegion().getParentOfType<cuf::KernelOp>())
|
||||
return true;
|
||||
if (builder.getRegion()
|
||||
.getParentOfType<mlir::acc::ComputeRegionOpInterface>())
|
||||
return true;
|
||||
if (auto funcOp = builder.getRegion().getParentOfType<mlir::func::FuncOp>()) {
|
||||
if (auto cudaProcAttr =
|
||||
funcOp.getOperation()->getAttrOfType<cuf::ProcAttributeAttr>(
|
||||
cuf::getProcAttrName())) {
|
||||
return cudaProcAttr.getValue() != cuf::ProcAttribute::Host &&
|
||||
cudaProcAttr.getValue() != cuf::ProcAttribute::HostDevice;
|
||||
}
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
static inline unsigned getAllocatorIdx(const Fortran::semantics::Symbol &sym) {
|
||||
std::optional<Fortran::common::CUDADataAttr> cudaAttr =
|
||||
|
@ -25,8 +25,10 @@ namespace cuf {
|
||||
mlir::gpu::GPUModuleOp getOrCreateGPUModule(mlir::ModuleOp mod,
|
||||
mlir::SymbolTable &symTab);
|
||||
|
||||
bool isInCUDADeviceContext(mlir::Operation *op);
|
||||
bool isCUDADeviceContext(mlir::Operation *op);
|
||||
bool isCUDADeviceContext(mlir::Region &);
|
||||
bool isRegisteredDeviceGlobal(fir::GlobalOp op);
|
||||
bool isRegisteredDeviceAttr(std::optional<cuf::DataAttribute> attr);
|
||||
|
||||
void genPointerSync(const mlir::Value box, fir::FirOpBuilder &builder);
|
||||
|
||||
|
@ -0,0 +1,31 @@
|
||||
//===-- Descriptor.h - CUDA descriptor runtime API calls --------*- C++ -*-===//
|
||||
//
|
||||
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|
||||
// See https://llvm.org/LICENSE.txt for license information.
|
||||
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
#ifndef FORTRAN_OPTIMIZER_BUILDER_RUNTIME_CUDA_DESCRIPTOR_H_
|
||||
#define FORTRAN_OPTIMIZER_BUILDER_RUNTIME_CUDA_DESCRIPTOR_H_
|
||||
|
||||
#include "mlir/IR/Value.h"
|
||||
|
||||
namespace mlir {
|
||||
class Location;
|
||||
} // namespace mlir
|
||||
|
||||
namespace fir {
|
||||
class FirOpBuilder;
|
||||
}
|
||||
|
||||
namespace fir::runtime::cuda {
|
||||
|
||||
/// Generate runtime call to sync the double descriptor referenced by
|
||||
/// \p hostPtr.
|
||||
void genSyncGlobalDescriptor(fir::FirOpBuilder &builder, mlir::Location loc,
|
||||
mlir::Value hostPtr);
|
||||
|
||||
} // namespace fir::runtime::cuda
|
||||
|
||||
#endif // FORTRAN_OPTIMIZER_BUILDER_RUNTIME_CUDA_DESCRIPTOR_H_
|
@ -470,7 +470,7 @@ private:
|
||||
void genSimpleAllocation(const Allocation &alloc,
|
||||
const fir::MutableBoxValue &box) {
|
||||
bool isCudaSymbol = Fortran::semantics::HasCUDAAttr(alloc.getSymbol());
|
||||
bool isCudaDeviceContext = Fortran::lower::isCudaDeviceContext(builder);
|
||||
bool isCudaDeviceContext = cuf::isCUDADeviceContext(builder.getRegion());
|
||||
bool inlineAllocation = !box.isDerived() && !errorManager.hasStatSpec() &&
|
||||
!alloc.type.IsPolymorphic() &&
|
||||
!alloc.hasCoarraySpec() && !useAllocateRuntime &&
|
||||
@ -862,7 +862,7 @@ genDeallocate(fir::FirOpBuilder &builder,
|
||||
mlir::Value declaredTypeDesc = {},
|
||||
const Fortran::semantics::Symbol *symbol = nullptr) {
|
||||
bool isCudaSymbol = symbol && Fortran::semantics::HasCUDAAttr(*symbol);
|
||||
bool isCudaDeviceContext = Fortran::lower::isCudaDeviceContext(builder);
|
||||
bool isCudaDeviceContext = cuf::isCUDADeviceContext(builder.getRegion());
|
||||
bool inlineDeallocation =
|
||||
!box.isDerived() && !box.isPolymorphic() && !box.hasAssumedRank() &&
|
||||
!box.isUnlimitedPolymorphic() && !errorManager.hasStatSpec() &&
|
||||
|
@ -4689,7 +4689,7 @@ private:
|
||||
mlir::Location loc = getCurrentLocation();
|
||||
fir::FirOpBuilder &builder = getFirOpBuilder();
|
||||
|
||||
bool isInDeviceContext = Fortran::lower::isCudaDeviceContext(builder);
|
||||
bool isInDeviceContext = cuf::isCUDADeviceContext(builder.getRegion());
|
||||
|
||||
bool isCUDATransfer =
|
||||
IsCUDADataTransfer(assign.lhs, assign.rhs) && !isInDeviceContext;
|
||||
|
@ -18,6 +18,7 @@ add_flang_library(FIRBuilder
|
||||
Runtime/Assign.cpp
|
||||
Runtime/Character.cpp
|
||||
Runtime/Command.cpp
|
||||
Runtime/CUDA/Descriptor.cpp
|
||||
Runtime/Derived.cpp
|
||||
Runtime/EnvironmentDefaults.cpp
|
||||
Runtime/Exceptions.cpp
|
||||
|
@ -12,6 +12,7 @@
|
||||
#include "flang/Optimizer/HLFIR/HLFIROps.h"
|
||||
#include "mlir/Dialect/Func/IR/FuncOps.h"
|
||||
#include "mlir/Dialect/LLVMIR/NVVMDialect.h"
|
||||
#include "mlir/Dialect/OpenACC/OpenACC.h"
|
||||
|
||||
/// Retrieve or create the CUDA Fortran GPU module in the given \p mod.
|
||||
mlir::gpu::GPUModuleOp cuf::getOrCreateGPUModule(mlir::ModuleOp mod,
|
||||
@ -31,30 +32,45 @@ mlir::gpu::GPUModuleOp cuf::getOrCreateGPUModule(mlir::ModuleOp mod,
|
||||
return gpuMod;
|
||||
}
|
||||
|
||||
bool cuf::isInCUDADeviceContext(mlir::Operation *op) {
|
||||
if (!op)
|
||||
bool cuf::isCUDADeviceContext(mlir::Operation *op) {
|
||||
if (!op || !op->getParentRegion())
|
||||
return false;
|
||||
if (op->getParentOfType<cuf::KernelOp>() ||
|
||||
op->getParentOfType<mlir::gpu::GPUFuncOp>())
|
||||
return isCUDADeviceContext(*op->getParentRegion());
|
||||
}
|
||||
|
||||
// Check if the insertion point is currently in a device context. HostDevice
|
||||
// subprograms are not considered fully device context so it will return false
|
||||
// for it.
|
||||
// If the insertion point is inside an OpenACC region op, it is considered
|
||||
// device context.
|
||||
bool cuf::isCUDADeviceContext(mlir::Region ®ion) {
|
||||
if (region.getParentOfType<cuf::KernelOp>())
|
||||
return true;
|
||||
if (auto funcOp = op->getParentOfType<mlir::func::FuncOp>()) {
|
||||
if (auto cudaProcAttr = funcOp->getAttrOfType<cuf::ProcAttributeAttr>(
|
||||
cuf::getProcAttrName())) {
|
||||
return cudaProcAttr.getValue() != cuf::ProcAttribute::Host;
|
||||
if (region.getParentOfType<mlir::acc::ComputeRegionOpInterface>())
|
||||
return true;
|
||||
if (auto funcOp = region.getParentOfType<mlir::func::FuncOp>()) {
|
||||
if (auto cudaProcAttr =
|
||||
funcOp.getOperation()->getAttrOfType<cuf::ProcAttributeAttr>(
|
||||
cuf::getProcAttrName())) {
|
||||
return cudaProcAttr.getValue() != cuf::ProcAttribute::Host &&
|
||||
cudaProcAttr.getValue() != cuf::ProcAttribute::HostDevice;
|
||||
}
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
/// Return true when \p attr holds a value and that value is one of
/// cuf::DataAttribute::Device, ::Managed or ::Constant. Return false for any
/// other attribute value and when \p attr is empty.
bool cuf::isRegisteredDeviceAttr(std::optional<cuf::DataAttribute> attr) {
|
||||
if (attr && (*attr == cuf::DataAttribute::Device ||
|
||||
*attr == cuf::DataAttribute::Managed ||
|
||||
*attr == cuf::DataAttribute::Constant))
|
||||
return true;
|
||||
return false;
|
||||
}
|
||||
|
||||
bool cuf::isRegisteredDeviceGlobal(fir::GlobalOp op) {
|
||||
if (op.getConstant())
|
||||
return false;
|
||||
auto attr = op.getDataAttr();
|
||||
if (attr && (*attr == cuf::DataAttribute::Device ||
|
||||
*attr == cuf::DataAttribute::Managed ||
|
||||
*attr == cuf::DataAttribute::Constant))
|
||||
return true;
|
||||
return false;
|
||||
return isRegisteredDeviceAttr(op.getDataAttr());
|
||||
}
|
||||
|
||||
void cuf::genPointerSync(const mlir::Value box, fir::FirOpBuilder &builder) {
|
||||
|
@ -16,12 +16,14 @@
|
||||
#include "flang/Optimizer/Builder/IntrinsicCall.h"
|
||||
#include "flang/Common/static-multimap-view.h"
|
||||
#include "flang/Optimizer/Builder/BoxValue.h"
|
||||
#include "flang/Optimizer/Builder/CUFCommon.h"
|
||||
#include "flang/Optimizer/Builder/Character.h"
|
||||
#include "flang/Optimizer/Builder/Complex.h"
|
||||
#include "flang/Optimizer/Builder/FIRBuilder.h"
|
||||
#include "flang/Optimizer/Builder/MutableBox.h"
|
||||
#include "flang/Optimizer/Builder/PPCIntrinsicCall.h"
|
||||
#include "flang/Optimizer/Builder/Runtime/Allocatable.h"
|
||||
#include "flang/Optimizer/Builder/Runtime/CUDA/Descriptor.h"
|
||||
#include "flang/Optimizer/Builder/Runtime/Character.h"
|
||||
#include "flang/Optimizer/Builder/Runtime/Command.h"
|
||||
#include "flang/Optimizer/Builder/Runtime/Derived.h"
|
||||
@ -38,6 +40,7 @@
|
||||
#include "flang/Optimizer/Dialect/FIROps.h"
|
||||
#include "flang/Optimizer/Dialect/FIROpsSupport.h"
|
||||
#include "flang/Optimizer/Dialect/Support/FIRContext.h"
|
||||
#include "flang/Optimizer/HLFIR/HLFIROps.h"
|
||||
#include "flang/Optimizer/Support/FatalError.h"
|
||||
#include "flang/Optimizer/Support/Utils.h"
|
||||
#include "flang/Runtime/entry-names.h"
|
||||
@ -3254,6 +3257,17 @@ void IntrinsicLibrary::genCFPointer(llvm::ArrayRef<fir::ExtendedValue> args) {
|
||||
|
||||
fir::factory::associateMutableBox(builder, loc, *fPtr, getCPtrExtVal(*fPtr),
|
||||
/*lbounds=*/mlir::ValueRange{});
|
||||
|
||||
// If the pointer is a registered CUDA fortran variable, the descriptor needs
|
||||
// to be synced.
|
||||
if (auto declare = mlir::dyn_cast_or_null<hlfir::DeclareOp>(
|
||||
fPtr->getAddr().getDefiningOp()))
|
||||
if (declare.getMemref().getDefiningOp() &&
|
||||
mlir::isa<fir::AddrOfOp>(declare.getMemref().getDefiningOp()))
|
||||
if (cuf::isRegisteredDeviceAttr(declare.getDataAttr()) &&
|
||||
!cuf::isCUDADeviceContext(builder.getRegion()))
|
||||
fir::runtime::cuda::genSyncGlobalDescriptor(builder, loc,
|
||||
declare.getMemref());
|
||||
}
|
||||
|
||||
// C_F_PROCPOINTER
|
||||
|
34
flang/lib/Optimizer/Builder/Runtime/CUDA/Descriptor.cpp
Normal file
34
flang/lib/Optimizer/Builder/Runtime/CUDA/Descriptor.cpp
Normal file
@ -0,0 +1,34 @@
|
||||
|
||||
//===-- Descriptor.cpp -- CUDA descriptor runtime API calls ---------------===//
|
||||
//
|
||||
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|
||||
// See https://llvm.org/LICENSE.txt for license information.
|
||||
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
//
|
||||
// Coding style: https://mlir.llvm.org/getting_started/DeveloperGuide/
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
#include "flang/Optimizer/Builder/Runtime/CUDA/Descriptor.h"
|
||||
#include "flang/Optimizer/Builder/FIRBuilder.h"
|
||||
#include "flang/Optimizer/Builder/Runtime/RTBuilder.h"
|
||||
#include "flang/Runtime/CUDA/descriptor.h"
|
||||
|
||||
using namespace Fortran::runtime::cuda;
|
||||
|
||||
/// Emit a fir.call to the CUFSyncGlobalDescriptor runtime entry point.
/// The call receives \p hostPtr followed by the source file name and the
/// source line number, both derived from \p loc. The created call's result
/// is not used.
void fir::runtime::cuda::genSyncGlobalDescriptor(fir::FirOpBuilder &builder,
|
||||
mlir::Location loc,
|
||||
mlir::Value hostPtr) {
|
||||
// Look up the runtime function declaration for CUFSyncGlobalDescriptor.
mlir::func::FuncOp callee =
|
||||
fir::runtime::getRuntimeFunc<mkRTKey(CUFSyncGlobalDescriptor)>(loc,
|
||||
builder);
|
||||
auto fTy = callee.getFunctionType();
|
||||
mlir::Value sourceFile = fir::factory::locationToFilename(builder, loc);
|
||||
// The line number value is created with the type of the callee's third
// input (index 2).
mlir::Value sourceLine =
|
||||
fir::factory::locationToLineNo(builder, loc, fTy.getInput(2));
|
||||
// Build the argument list against the callee's function type.
llvm::SmallVector<mlir::Value> args{fir::runtime::createArguments(
|
||||
builder, loc, fTy, hostPtr, sourceFile, sourceLine)};
|
||||
builder.create<fir::CallOp>(loc, callee, args);
|
||||
}
|
@ -8,6 +8,7 @@
|
||||
|
||||
#include "flang/Optimizer/Transforms/CUFOpConversion.h"
|
||||
#include "flang/Optimizer/Builder/CUFCommon.h"
|
||||
#include "flang/Optimizer/Builder/Runtime/CUDA/Descriptor.h"
|
||||
#include "flang/Optimizer/Builder/Runtime/RTBuilder.h"
|
||||
#include "flang/Optimizer/CodeGen/TypeConverter.h"
|
||||
#include "flang/Optimizer/Dialect/CUF/CUFOps.h"
|
||||
@ -904,16 +905,7 @@ struct CUFSyncDescriptorOpConversion
|
||||
|
||||
auto hostAddr = builder.create<fir::AddrOfOp>(
|
||||
loc, fir::ReferenceType::get(globalOp.getType()), op.getGlobalName());
|
||||
mlir::func::FuncOp callee =
|
||||
fir::runtime::getRuntimeFunc<mkRTKey(CUFSyncGlobalDescriptor)>(loc,
|
||||
builder);
|
||||
auto fTy = callee.getFunctionType();
|
||||
mlir::Value sourceFile = fir::factory::locationToFilename(builder, loc);
|
||||
mlir::Value sourceLine =
|
||||
fir::factory::locationToLineNo(builder, loc, fTy.getInput(2));
|
||||
llvm::SmallVector<mlir::Value> args{fir::runtime::createArguments(
|
||||
builder, loc, fTy, hostAddr, sourceFile, sourceLine)};
|
||||
builder.create<fir::CallOp>(loc, callee, args);
|
||||
fir::runtime::cuda::genSyncGlobalDescriptor(builder, loc, hostAddr);
|
||||
op.erase();
|
||||
return mlir::success();
|
||||
}
|
||||
|
@ -1279,7 +1279,7 @@ void SimplifyIntrinsicsPass::runOnOperation() {
|
||||
fir::KindMapping kindMap = fir::getKindMapping(module);
|
||||
module.walk([&](mlir::Operation *op) {
|
||||
if (auto call = mlir::dyn_cast<fir::CallOp>(op)) {
|
||||
if (cuf::isInCUDADeviceContext(op))
|
||||
if (cuf::isCUDADeviceContext(op))
|
||||
return;
|
||||
if (mlir::SymbolRefAttr callee = call.getCalleeAttr()) {
|
||||
mlir::StringRef funcName = callee.getLeafReference().getValue();
|
||||
|
@ -2,10 +2,31 @@
|
||||
|
||||
! Test lowering of CUDA pointers.
|
||||
|
||||
module mod1
|
||||
|
||||
integer, device, pointer :: x(:)
|
||||
|
||||
contains
|
||||
|
||||
subroutine allocate_pointer
|
||||
real, device, pointer :: pr(:)
|
||||
allocate(pr(10))
|
||||
end
|
||||
|
||||
! CHECK-LABEL: func.func @_QPallocate_pointer()
|
||||
! CHECK-LABEL: func.func @_QMmod1Pallocate_pointer()
|
||||
! CHECK-COUNT-2: fir.embox %{{.*}} {allocator_idx = 2 : i32} : (!fir.ptr<!fir.array<?xf32>>, !fir.shape<1>) -> !fir.box<!fir.ptr<!fir.array<?xf32>>>
|
||||
|
||||
subroutine c_f_pointer_sync
|
||||
use iso_c_binding
|
||||
use, intrinsic :: __fortran_builtins, only: c_devptr => __builtin_c_devptr
|
||||
type(c_devptr) :: cd1
|
||||
integer, parameter :: N = 2000
|
||||
call c_f_pointer(cd1, x, (/ 2000 /))
|
||||
end
|
||||
|
||||
! CHECK-LABEL: func.func @_QMmod1Pc_f_pointer_sync()
|
||||
! CHECK: %[[ADDR_X:.*]] = fir.address_of(@_QMmod1Ex) : !fir.ref<!fir.box<!fir.ptr<!fir.array<?xi32>>>>
|
||||
! CHECK: %[[CONV:.*]] = fir.convert %[[ADDR_X]] : (!fir.ref<!fir.box<!fir.ptr<!fir.array<?xi32>>>>) -> !fir.llvm_ptr<i8>
|
||||
! CHECK: fir.call @_FortranACUFSyncGlobalDescriptor(%[[CONV]], %{{.*}}, %{{.*}}) fastmath<contract> : (!fir.llvm_ptr<i8>, !fir.ref<i8>, i32) -> ()
|
||||
|
||||
end module
|
||||
|
Loading…
x
Reference in New Issue
Block a user