[flang][cuda] Sync double descriptor after c_f_pointer call (#130194)

After a global device pointer is set through `c_f_pointer`, we need to
sync the double descriptor so the version on the device is also up to
date.
This commit is contained in:
Valentin Clement (バレンタイン クレメン) 2025-03-06 19:19:51 -08:00 committed by GitHub
parent 55f86cf023
commit 478e516140
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
12 changed files with 141 additions and 51 deletions

View File

@ -20,27 +20,6 @@
#include "mlir/Dialect/OpenACC/OpenACC.h"
namespace Fortran::lower {
// Tell whether the builder's current region is in a CUDA device context.
// Returns true when the region is nested in a cuf.kernel op or in an OpenACC
// compute region op. Otherwise the answer comes from the CUDA procedure
// attribute of the enclosing function: Host and HostDevice subprograms are not
// treated as device context, any other attribute value is.
static bool inline isCudaDeviceContext(fir::FirOpBuilder &builder) {
  auto &currentRegion = builder.getRegion();

  // Directly nested in a CUDA kernel op.
  if (currentRegion.getParentOfType<cuf::KernelOp>())
    return true;

  // Nested in an OpenACC compute region op.
  if (currentRegion.getParentOfType<mlir::acc::ComputeRegionOpInterface>())
    return true;

  // Fall back to the procedure attribute of the enclosing function, if any.
  auto enclosingFunc = currentRegion.getParentOfType<mlir::func::FuncOp>();
  if (!enclosingFunc)
    return false;
  auto procAttr =
      enclosingFunc.getOperation()->getAttrOfType<cuf::ProcAttributeAttr>(
          cuf::getProcAttrName());
  if (!procAttr)
    return false;
  return !(procAttr.getValue() == cuf::ProcAttribute::Host ||
           procAttr.getValue() == cuf::ProcAttribute::HostDevice);
}
static inline unsigned getAllocatorIdx(const Fortran::semantics::Symbol &sym) {
std::optional<Fortran::common::CUDADataAttr> cudaAttr =

View File

@ -25,8 +25,10 @@ namespace cuf {
mlir::gpu::GPUModuleOp getOrCreateGPUModule(mlir::ModuleOp mod,
mlir::SymbolTable &symTab);
bool isInCUDADeviceContext(mlir::Operation *op);
bool isCUDADeviceContext(mlir::Operation *op);
bool isCUDADeviceContext(mlir::Region &);
bool isRegisteredDeviceGlobal(fir::GlobalOp op);
bool isRegisteredDeviceAttr(std::optional<cuf::DataAttribute> attr);
void genPointerSync(const mlir::Value box, fir::FirOpBuilder &builder);

View File

@ -0,0 +1,31 @@
//===-- Descriptor.h - CUDA descriptor runtime API calls --------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
#ifndef FORTRAN_OPTIMIZER_BUILDER_RUNTIME_CUDA_DESCRIPTOR_H_
#define FORTRAN_OPTIMIZER_BUILDER_RUNTIME_CUDA_DESCRIPTOR_H_
#include "mlir/IR/Value.h"
namespace mlir {
class Location;
} // namespace mlir
namespace fir {
class FirOpBuilder;
}
namespace fir::runtime::cuda {
/// Generate a runtime call to sync the double descriptor referenced by
/// \p hostPtr.
///
/// \param builder Builder used to create the call.
/// \param loc Location attached to the generated operations.
/// \param hostPtr Host address of the descriptor to sync.
void genSyncGlobalDescriptor(fir::FirOpBuilder &builder, mlir::Location loc,
mlir::Value hostPtr);
} // namespace fir::runtime::cuda
#endif // FORTRAN_OPTIMIZER_BUILDER_RUNTIME_CUDA_DESCRIPTOR_H_

View File

@ -470,7 +470,7 @@ private:
void genSimpleAllocation(const Allocation &alloc,
const fir::MutableBoxValue &box) {
bool isCudaSymbol = Fortran::semantics::HasCUDAAttr(alloc.getSymbol());
bool isCudaDeviceContext = Fortran::lower::isCudaDeviceContext(builder);
bool isCudaDeviceContext = cuf::isCUDADeviceContext(builder.getRegion());
bool inlineAllocation = !box.isDerived() && !errorManager.hasStatSpec() &&
!alloc.type.IsPolymorphic() &&
!alloc.hasCoarraySpec() && !useAllocateRuntime &&
@ -862,7 +862,7 @@ genDeallocate(fir::FirOpBuilder &builder,
mlir::Value declaredTypeDesc = {},
const Fortran::semantics::Symbol *symbol = nullptr) {
bool isCudaSymbol = symbol && Fortran::semantics::HasCUDAAttr(*symbol);
bool isCudaDeviceContext = Fortran::lower::isCudaDeviceContext(builder);
bool isCudaDeviceContext = cuf::isCUDADeviceContext(builder.getRegion());
bool inlineDeallocation =
!box.isDerived() && !box.isPolymorphic() && !box.hasAssumedRank() &&
!box.isUnlimitedPolymorphic() && !errorManager.hasStatSpec() &&

View File

@ -4689,7 +4689,7 @@ private:
mlir::Location loc = getCurrentLocation();
fir::FirOpBuilder &builder = getFirOpBuilder();
bool isInDeviceContext = Fortran::lower::isCudaDeviceContext(builder);
bool isInDeviceContext = cuf::isCUDADeviceContext(builder.getRegion());
bool isCUDATransfer =
IsCUDADataTransfer(assign.lhs, assign.rhs) && !isInDeviceContext;

View File

@ -18,6 +18,7 @@ add_flang_library(FIRBuilder
Runtime/Assign.cpp
Runtime/Character.cpp
Runtime/Command.cpp
Runtime/CUDA/Descriptor.cpp
Runtime/Derived.cpp
Runtime/EnvironmentDefaults.cpp
Runtime/Exceptions.cpp

View File

@ -12,6 +12,7 @@
#include "flang/Optimizer/HLFIR/HLFIROps.h"
#include "mlir/Dialect/Func/IR/FuncOps.h"
#include "mlir/Dialect/LLVMIR/NVVMDialect.h"
#include "mlir/Dialect/OpenACC/OpenACC.h"
/// Retrieve or create the CUDA Fortran GPU module in the given \p mod.
mlir::gpu::GPUModuleOp cuf::getOrCreateGPUModule(mlir::ModuleOp mod,
@ -31,30 +32,45 @@ mlir::gpu::GPUModuleOp cuf::getOrCreateGPUModule(mlir::ModuleOp mod,
return gpuMod;
}
bool cuf::isInCUDADeviceContext(mlir::Operation *op) {
if (!op)
bool cuf::isCUDADeviceContext(mlir::Operation *op) {
if (!op || !op->getParentRegion())
return false;
if (op->getParentOfType<cuf::KernelOp>() ||
op->getParentOfType<mlir::gpu::GPUFuncOp>())
return isCUDADeviceContext(*op->getParentRegion());
}
// Check whether the given region is in a device context. HostDevice
// subprograms are not considered fully device context, so false is returned
// for them.
// If the region is nested inside an OpenACC compute region op, it is
// considered device context.
bool cuf::isCUDADeviceContext(mlir::Region &region) {
if (region.getParentOfType<cuf::KernelOp>())
return true;
if (auto funcOp = op->getParentOfType<mlir::func::FuncOp>()) {
if (auto cudaProcAttr = funcOp->getAttrOfType<cuf::ProcAttributeAttr>(
cuf::getProcAttrName())) {
return cudaProcAttr.getValue() != cuf::ProcAttribute::Host;
if (region.getParentOfType<mlir::acc::ComputeRegionOpInterface>())
return true;
if (auto funcOp = region.getParentOfType<mlir::func::FuncOp>()) {
if (auto cudaProcAttr =
funcOp.getOperation()->getAttrOfType<cuf::ProcAttributeAttr>(
cuf::getProcAttrName())) {
return cudaProcAttr.getValue() != cuf::ProcAttribute::Host &&
cudaProcAttr.getValue() != cuf::ProcAttribute::HostDevice;
}
}
return false;
}
/// Return true when \p attr is set and is one of the Device, Managed or
/// Constant data attributes; return false for an empty optional or any other
/// attribute value.
bool cuf::isRegisteredDeviceAttr(std::optional<cuf::DataAttribute> attr) {
  if (!attr.has_value())
    return false;
  switch (attr.value()) {
  case cuf::DataAttribute::Device:
  case cuf::DataAttribute::Managed:
  case cuf::DataAttribute::Constant:
    return true;
  default:
    return false;
  }
}
bool cuf::isRegisteredDeviceGlobal(fir::GlobalOp op) {
if (op.getConstant())
return false;
auto attr = op.getDataAttr();
if (attr && (*attr == cuf::DataAttribute::Device ||
*attr == cuf::DataAttribute::Managed ||
*attr == cuf::DataAttribute::Constant))
return true;
return false;
return isRegisteredDeviceAttr(op.getDataAttr());
}
void cuf::genPointerSync(const mlir::Value box, fir::FirOpBuilder &builder) {

View File

@ -16,12 +16,14 @@
#include "flang/Optimizer/Builder/IntrinsicCall.h"
#include "flang/Common/static-multimap-view.h"
#include "flang/Optimizer/Builder/BoxValue.h"
#include "flang/Optimizer/Builder/CUFCommon.h"
#include "flang/Optimizer/Builder/Character.h"
#include "flang/Optimizer/Builder/Complex.h"
#include "flang/Optimizer/Builder/FIRBuilder.h"
#include "flang/Optimizer/Builder/MutableBox.h"
#include "flang/Optimizer/Builder/PPCIntrinsicCall.h"
#include "flang/Optimizer/Builder/Runtime/Allocatable.h"
#include "flang/Optimizer/Builder/Runtime/CUDA/Descriptor.h"
#include "flang/Optimizer/Builder/Runtime/Character.h"
#include "flang/Optimizer/Builder/Runtime/Command.h"
#include "flang/Optimizer/Builder/Runtime/Derived.h"
@ -38,6 +40,7 @@
#include "flang/Optimizer/Dialect/FIROps.h"
#include "flang/Optimizer/Dialect/FIROpsSupport.h"
#include "flang/Optimizer/Dialect/Support/FIRContext.h"
#include "flang/Optimizer/HLFIR/HLFIROps.h"
#include "flang/Optimizer/Support/FatalError.h"
#include "flang/Optimizer/Support/Utils.h"
#include "flang/Runtime/entry-names.h"
@ -3254,6 +3257,17 @@ void IntrinsicLibrary::genCFPointer(llvm::ArrayRef<fir::ExtendedValue> args) {
fir::factory::associateMutableBox(builder, loc, *fPtr, getCPtrExtVal(*fPtr),
/*lbounds=*/mlir::ValueRange{});
// If the pointer is a registered CUDA fortran variable, the descriptor needs
// to be synced.
if (auto declare = mlir::dyn_cast_or_null<hlfir::DeclareOp>(
fPtr->getAddr().getDefiningOp()))
if (declare.getMemref().getDefiningOp() &&
mlir::isa<fir::AddrOfOp>(declare.getMemref().getDefiningOp()))
if (cuf::isRegisteredDeviceAttr(declare.getDataAttr()) &&
!cuf::isCUDADeviceContext(builder.getRegion()))
fir::runtime::cuda::genSyncGlobalDescriptor(builder, loc,
declare.getMemref());
}
// C_F_PROCPOINTER

View File

@ -0,0 +1,34 @@
//===-- Descriptor.cpp -- CUDA descriptor runtime API calls ---------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// Coding style: https://mlir.llvm.org/getting_started/DeveloperGuide/
//
//===----------------------------------------------------------------------===//
#include "flang/Optimizer/Builder/Runtime/CUDA/Descriptor.h"
#include "flang/Optimizer/Builder/FIRBuilder.h"
#include "flang/Optimizer/Builder/Runtime/RTBuilder.h"
#include "flang/Runtime/CUDA/descriptor.h"
using namespace Fortran::runtime::cuda;
/// Emit a call to the CUFSyncGlobalDescriptor runtime entry point for the
/// descriptor at \p hostPtr, passing the source file name and line number
/// derived from \p loc as the trailing arguments.
void fir::runtime::cuda::genSyncGlobalDescriptor(fir::FirOpBuilder &builder,
                                                 mlir::Location loc,
                                                 mlir::Value hostPtr) {
  // Look up (or declare) the runtime function and grab its signature.
  mlir::func::FuncOp syncFunc =
      fir::runtime::getRuntimeFunc<mkRTKey(CUFSyncGlobalDescriptor)>(loc,
                                                                     builder);
  auto syncFuncTy = syncFunc.getFunctionType();

  // Source position operands; the line number uses the type of the callee's
  // third parameter.
  mlir::Value fileName = fir::factory::locationToFilename(builder, loc);
  mlir::Value lineNo =
      fir::factory::locationToLineNo(builder, loc, syncFuncTy.getInput(2));

  // Convert the operands to the callee's parameter types and emit the call.
  llvm::SmallVector<mlir::Value> callArgs{fir::runtime::createArguments(
      builder, loc, syncFuncTy, hostPtr, fileName, lineNo)};
  builder.create<fir::CallOp>(loc, syncFunc, callArgs);
}

View File

@ -8,6 +8,7 @@
#include "flang/Optimizer/Transforms/CUFOpConversion.h"
#include "flang/Optimizer/Builder/CUFCommon.h"
#include "flang/Optimizer/Builder/Runtime/CUDA/Descriptor.h"
#include "flang/Optimizer/Builder/Runtime/RTBuilder.h"
#include "flang/Optimizer/CodeGen/TypeConverter.h"
#include "flang/Optimizer/Dialect/CUF/CUFOps.h"
@ -904,16 +905,7 @@ struct CUFSyncDescriptorOpConversion
auto hostAddr = builder.create<fir::AddrOfOp>(
loc, fir::ReferenceType::get(globalOp.getType()), op.getGlobalName());
mlir::func::FuncOp callee =
fir::runtime::getRuntimeFunc<mkRTKey(CUFSyncGlobalDescriptor)>(loc,
builder);
auto fTy = callee.getFunctionType();
mlir::Value sourceFile = fir::factory::locationToFilename(builder, loc);
mlir::Value sourceLine =
fir::factory::locationToLineNo(builder, loc, fTy.getInput(2));
llvm::SmallVector<mlir::Value> args{fir::runtime::createArguments(
builder, loc, fTy, hostAddr, sourceFile, sourceLine)};
builder.create<fir::CallOp>(loc, callee, args);
fir::runtime::cuda::genSyncGlobalDescriptor(builder, loc, hostAddr);
op.erase();
return mlir::success();
}

View File

@ -1279,7 +1279,7 @@ void SimplifyIntrinsicsPass::runOnOperation() {
fir::KindMapping kindMap = fir::getKindMapping(module);
module.walk([&](mlir::Operation *op) {
if (auto call = mlir::dyn_cast<fir::CallOp>(op)) {
if (cuf::isInCUDADeviceContext(op))
if (cuf::isCUDADeviceContext(op))
return;
if (mlir::SymbolRefAttr callee = call.getCalleeAttr()) {
mlir::StringRef funcName = callee.getLeafReference().getValue();

View File

@ -2,10 +2,31 @@
! Test lowering of CUDA pointers.
module mod1
integer, device, pointer :: x(:)
contains
subroutine allocate_pointer
real, device, pointer :: pr(:)
allocate(pr(10))
end
! CHECK-LABEL: func.func @_QPallocate_pointer()
! CHECK-LABEL: func.func @_QMmod1Pallocate_pointer()
! CHECK-COUNT-2: fir.embox %{{.*}} {allocator_idx = 2 : i32} : (!fir.ptr<!fir.array<?xf32>>, !fir.shape<1>) -> !fir.box<!fir.ptr<!fir.array<?xf32>>>
subroutine c_f_pointer_sync
use iso_c_binding
use, intrinsic :: __fortran_builtins, only: c_devptr => __builtin_c_devptr
type(c_devptr) :: cd1
integer, parameter :: N = 2000
call c_f_pointer(cd1, x, (/ 2000 /))
end
! CHECK-LABEL: func.func @_QMmod1Pc_f_pointer_sync()
! CHECK: %[[ADDR_X:.*]] = fir.address_of(@_QMmod1Ex) : !fir.ref<!fir.box<!fir.ptr<!fir.array<?xi32>>>>
! CHECK: %[[CONV:.*]] = fir.convert %[[ADDR_X]] : (!fir.ref<!fir.box<!fir.ptr<!fir.array<?xi32>>>>) -> !fir.llvm_ptr<i8>
! CHECK: fir.call @_FortranACUFSyncGlobalDescriptor(%[[CONV]], %{{.*}}, %{{.*}}) fastmath<contract> : (!fir.llvm_ptr<i8>, !fir.ref<i8>, i32) -> ()
end module