[OPENMP][NVPTX] Enable support for lastprivates in SPMD constructs.

Previously, lastprivates could not be used in SPMD constructs. This patch
adds support for lastprivates in SPMD constructs with the uninitialized
(lightweight) runtime.

llvm-svn: 342738
Alexey Bataev 2018-09-21 14:22:53 +00:00
parent 022bf16b41
commit 2adecff1aa
5 changed files with 157 additions and 101 deletions
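
For context, a minimal sketch (hypothetical code, not part of the commit) of the kind of construct this patch affects: a combined target construct with a lastprivate clause, which previously forced Non-SPMD (generic) execution mode and can now be emitted as an SPMD kernel:

int foo(int n) {
  int last = 0;
  // Before this patch the lastprivate clause forced the generic (Non-SPMD)
  // kernel; with it, the region runs in SPMD mode and `last` is globalized
  // so the value from the sequentially last iteration survives the region.
#pragma omp target teams distribute parallel for map(tofrom: last) lastprivate(last)
  for (int i = 0; i < n; ++i)
    last = i;
  return last; // for n > 0, last == n - 1
}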


@@ -179,6 +179,54 @@ enum NamedBarrier : unsigned {
NB_Parallel = 1,
};
typedef std::pair<CharUnits /*Align*/, const ValueDecl *> VarsDataTy;
static bool stable_sort_comparator(const VarsDataTy P1, const VarsDataTy P2) {
return P1.first > P2.first;
}
static RecordDecl *buildRecordForGlobalizedVars(
ASTContext &C, ArrayRef<const ValueDecl *> EscapedDecls,
llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *>
&MappedDeclsFields) {
if (EscapedDecls.empty())
return nullptr;
SmallVector<VarsDataTy, 4> GlobalizedVars;
for (const ValueDecl *D : EscapedDecls)
GlobalizedVars.emplace_back(C.getDeclAlign(D), D);
std::stable_sort(GlobalizedVars.begin(), GlobalizedVars.end(),
stable_sort_comparator);
// Build struct _globalized_locals_ty {
// /* globalized vars */
// };
RecordDecl *GlobalizedRD = C.buildImplicitRecord("_globalized_locals_ty");
GlobalizedRD->startDefinition();
for (const auto &Pair : GlobalizedVars) {
const ValueDecl *VD = Pair.second;
QualType Type = VD->getType();
if (Type->isLValueReferenceType())
Type = C.getPointerType(Type.getNonReferenceType());
else
Type = Type.getNonReferenceType();
SourceLocation Loc = VD->getLocation();
auto *Field =
FieldDecl::Create(C, GlobalizedRD, Loc, Loc, VD->getIdentifier(), Type,
C.getTrivialTypeSourceInfo(Type, SourceLocation()),
/*BW=*/nullptr, /*Mutable=*/false,
/*InitStyle=*/ICIS_NoInit);
Field->setAccess(AS_public);
GlobalizedRD->addDecl(Field);
if (VD->hasAttrs()) {
for (specific_attr_iterator<AlignedAttr> I(VD->getAttrs().begin()),
E(VD->getAttrs().end());
I != E; ++I)
Field->addAttr(*I);
}
MappedDeclsFields.try_emplace(VD, Field);
}
GlobalizedRD->completeDefinition();
return GlobalizedRD;
}
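To illustrate the helper above (hypothetical escaped variables, not from the patch): the stable sort on C.getDeclAlign(D) is descending, so the most-aligned fields come first and interior padding is minimized:

// Hypothetical input: an int, a double, and a char escape their context.
// Sorted by descending alignment, the implicit record looks roughly like:
struct _globalized_locals_ty {
  double d; // align 8, placed first
  int i;    // align 4
  char c;   // align 1, placed last
};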
/// Get the list of variables that can escape their declaration context.
class CheckVarsEscapingDeclContext final
: public ConstStmtVisitor<CheckVarsEscapingDeclContext> {
@@ -292,51 +340,11 @@ class CheckVarsEscapingDeclContext final
}
}
typedef std::pair<CharUnits /*Align*/, const ValueDecl *> VarsDataTy;
static bool stable_sort_comparator(const VarsDataTy P1, const VarsDataTy P2) {
return P1.first > P2.first;
}
void buildRecordForGlobalizedVars() {
assert(!GlobalizedRD &&
"Record for globalized variables is built already.");
if (EscapedDecls.empty())
return;
ASTContext &C = CGF.getContext();
SmallVector<VarsDataTy, 4> GlobalizedVars;
for (const ValueDecl *D : EscapedDecls)
GlobalizedVars.emplace_back(C.getDeclAlign(D), D);
std::stable_sort(GlobalizedVars.begin(), GlobalizedVars.end(),
stable_sort_comparator);
// Build struct _globalized_locals_ty {
// /* globalized vars */
// };
GlobalizedRD = C.buildImplicitRecord("_globalized_locals_ty");
GlobalizedRD->startDefinition();
for (const auto &Pair : GlobalizedVars) {
const ValueDecl *VD = Pair.second;
QualType Type = VD->getType();
if (Type->isLValueReferenceType())
Type = C.getPointerType(Type.getNonReferenceType());
else
Type = Type.getNonReferenceType();
SourceLocation Loc = VD->getLocation();
auto *Field = FieldDecl::Create(
C, GlobalizedRD, Loc, Loc, VD->getIdentifier(), Type,
C.getTrivialTypeSourceInfo(Type, SourceLocation()),
/*BW=*/nullptr, /*Mutable=*/false,
/*InitStyle=*/ICIS_NoInit);
Field->setAccess(AS_public);
GlobalizedRD->addDecl(Field);
if (VD->hasAttrs()) {
for (specific_attr_iterator<AlignedAttr> I(VD->getAttrs().begin()),
E(VD->getAttrs().end());
I != E; ++I)
Field->addAttr(*I);
}
MappedDeclsFields.try_emplace(VD, Field);
}
GlobalizedRD->completeDefinition();
GlobalizedRD = ::buildRecordForGlobalizedVars(
CGF.getContext(), EscapedDecls.getArrayRef(), MappedDeclsFields);
}
public:
@@ -672,13 +680,6 @@ static bool hasParallelIfNumThreadsClause(ASTContext &Ctx,
return false;
}
/// Checks if the directive is the distribute clause with the lastprivate
/// clauses. This construct does not support SPMD execution mode.
static bool hasDistributeWithLastprivateClauses(const OMPExecutableDirective &D) {
return isOpenMPDistributeDirective(D.getDirectiveKind()) &&
D.hasClausesOfKind<OMPLastprivateClause>();
}
/// Check for inner (nested) SPMD construct, if any
static bool hasNestedSPMDDirective(ASTContext &Ctx,
const OMPExecutableDirective &D) {
@@ -692,8 +693,7 @@ static bool hasNestedSPMDDirective(ASTContext &Ctx,
switch (D.getDirectiveKind()) {
case OMPD_target:
if (isOpenMPParallelDirective(DKind) &&
!hasParallelIfNumThreadsClause(Ctx, *NestedDir) &&
!hasDistributeWithLastprivateClauses(*NestedDir))
!hasParallelIfNumThreadsClause(Ctx, *NestedDir))
return true;
if (DKind == OMPD_teams) {
Body = NestedDir->getInnermostCapturedStmt()->IgnoreContainers(
@@ -704,16 +704,14 @@ static bool hasNestedSPMDDirective(ASTContext &Ctx,
if (const auto *NND = dyn_cast<OMPExecutableDirective>(ChildStmt)) {
DKind = NND->getDirectiveKind();
if (isOpenMPParallelDirective(DKind) &&
!hasParallelIfNumThreadsClause(Ctx, *NND) &&
!hasDistributeWithLastprivateClauses(*NND))
!hasParallelIfNumThreadsClause(Ctx, *NND))
return true;
}
}
return false;
case OMPD_target_teams:
return isOpenMPParallelDirective(DKind) &&
!hasParallelIfNumThreadsClause(Ctx, *NestedDir) &&
!hasDistributeWithLastprivateClauses(*NestedDir);
!hasParallelIfNumThreadsClause(Ctx, *NestedDir);
case OMPD_target_simd:
case OMPD_target_parallel:
case OMPD_target_parallel_for:
@@ -786,8 +784,7 @@ static bool supportsSPMDExecutionMode(ASTContext &Ctx,
case OMPD_target_teams_distribute_parallel_for:
case OMPD_target_teams_distribute_parallel_for_simd:
// Distribute with lastprivates requires non-SPMD execution mode.
return !hasParallelIfNumThreadsClause(Ctx, D) &&
!hasDistributeWithLastprivateClauses(D);
return !hasParallelIfNumThreadsClause(Ctx, D);
case OMPD_target_simd:
case OMPD_target_teams_distribute:
case OMPD_target_teams_distribute_simd:
@@ -1799,28 +1796,88 @@ llvm::Value *CGOpenMPRuntimeNVPTX::emitParallelOutlinedFunction(
return OutlinedFun;
}
/// Get list of lastprivate variables from the teams distribute ... or
/// teams {distribute ...} directives.
static void
getDistributeLastprivateVars(const OMPExecutableDirective &D,
llvm::SmallVectorImpl<const ValueDecl *> &Vars) {
assert(isOpenMPTeamsDirective(D.getDirectiveKind()) &&
"expected teams directive.");
const OMPExecutableDirective *Dir = &D;
if (!isOpenMPDistributeDirective(D.getDirectiveKind())) {
if (const Stmt *S = getSingleCompoundChild(
D.getInnermostCapturedStmt()->getCapturedStmt()->IgnoreContainers(
/*IgnoreCaptured=*/true))) {
Dir = dyn_cast<OMPExecutableDirective>(S);
if (Dir && !isOpenMPDistributeDirective(Dir->getDirectiveKind()))
Dir = nullptr;
}
}
if (!Dir)
return;
for (const OMPLastprivateClause *C :
Dir->getClausesOfKind<OMPLastprivateClause>()) {
for (const Expr *E : C->getVarRefs()) {
const auto *DE = cast<DeclRefExpr>(E->IgnoreParens());
Vars.push_back(cast<ValueDecl>(DE->getDecl()->getCanonicalDecl()));
}
}
}
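For reference, the two directive shapes this helper recognizes (hypothetical source, assuming a variable x and bound N): a combined teams distribute directive carrying the clause itself, or a teams directive whose single child statement is a distribute directive:

// Shape 1: combined directive; the lastprivate clause sits on D itself.
#pragma omp target teams distribute parallel for lastprivate(x)
  for (int i = 0; i < N; ++i)
    x = i;

// Shape 2: nested; getSingleCompoundChild() finds the inner distribute
// directive, whose lastprivate clause is collected instead.
#pragma omp target teams
#pragma omp distribute parallel for lastprivate(x)
  for (int i = 0; i < N; ++i)
    x = i;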
llvm::Value *CGOpenMPRuntimeNVPTX::emitTeamsOutlinedFunction(
const OMPExecutableDirective &D, const VarDecl *ThreadIDVar,
OpenMPDirectiveKind InnermostKind, const RegionCodeGenTy &CodeGen) {
SourceLocation Loc = D.getBeginLoc();
const RecordDecl *GlobalizedRD = nullptr;
llvm::SmallVector<const ValueDecl *, 4> LastPrivates;
llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *> MappedDeclsFields;
if (getExecutionMode() == CGOpenMPRuntimeNVPTX::EM_SPMD) {
getDistributeLastprivateVars(D, LastPrivates);
if (!LastPrivates.empty())
GlobalizedRD = buildRecordForGlobalizedVars(
CGM.getContext(), LastPrivates, MappedDeclsFields);
}
// Emit target region as a standalone region.
class NVPTXPrePostActionTy : public PrePostActionTy {
SourceLocation &Loc;
const RecordDecl *GlobalizedRD;
llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *>
&MappedDeclsFields;
public:
NVPTXPrePostActionTy(SourceLocation &Loc) : Loc(Loc) {}
NVPTXPrePostActionTy(
SourceLocation &Loc, const RecordDecl *GlobalizedRD,
llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *>
&MappedDeclsFields)
: Loc(Loc), GlobalizedRD(GlobalizedRD),
MappedDeclsFields(MappedDeclsFields) {}
void Enter(CodeGenFunction &CGF) override {
static_cast<CGOpenMPRuntimeNVPTX &>(CGF.CGM.getOpenMPRuntime())
.emitGenericVarsProlog(CGF, Loc);
auto &Rt =
static_cast<CGOpenMPRuntimeNVPTX &>(CGF.CGM.getOpenMPRuntime());
if (GlobalizedRD) {
auto I = Rt.FunctionGlobalizedDecls.try_emplace(CGF.CurFn).first;
I->getSecond().GlobalRecord = GlobalizedRD;
I->getSecond().MappedParams =
llvm::make_unique<CodeGenFunction::OMPMapVars>();
DeclToAddrMapTy &Data = I->getSecond().LocalVarData;
for (const auto &Pair : MappedDeclsFields) {
assert(Pair.getFirst()->isCanonicalDecl() &&
"Expected canonical declaration");
Data.insert(std::make_pair(
Pair.getFirst(),
std::make_pair(Pair.getSecond(), Address::invalid())));
}
}
Rt.emitGenericVarsProlog(CGF, Loc);
}
void Exit(CodeGenFunction &CGF) override {
static_cast<CGOpenMPRuntimeNVPTX &>(CGF.CGM.getOpenMPRuntime())
.emitGenericVarsEpilog(CGF);
}
} Action(Loc);
if (getExecutionMode() != CGOpenMPRuntimeNVPTX::EM_SPMD)
CodeGen.setAction(Action);
} Action(Loc, GlobalizedRD, MappedDeclsFields);
CodeGen.setAction(Action);
llvm::Value *OutlinedFunVal = CGOpenMPRuntime::emitTeamsOutlinedFunction(
D, ThreadIDVar, InnermostKind, CodeGen);
llvm::Function *OutlinedFun = cast<llvm::Function>(OutlinedFunVal);
@@ -1834,7 +1891,8 @@ llvm::Value *CGOpenMPRuntimeNVPTX::emitTeamsOutlinedFunction(
void CGOpenMPRuntimeNVPTX::emitGenericVarsProlog(CodeGenFunction &CGF,
SourceLocation Loc,
bool WithSPMDCheck) {
if (getDataSharingMode(CGM) != CGOpenMPRuntimeNVPTX::Generic)
if (getDataSharingMode(CGM) != CGOpenMPRuntimeNVPTX::Generic &&
getExecutionMode() != CGOpenMPRuntimeNVPTX::EM_SPMD)
return;
CGBuilderTy &Bld = CGF.Builder;
@@ -1892,8 +1950,6 @@ void CGOpenMPRuntimeNVPTX::emitGenericVarsProlog(CodeGenFunction &CGF,
I->getSecond().GlobalRecordAddr = Phi;
I->getSecond().IsInSPMDModeFlag = IsSPMD;
} else {
assert(getExecutionMode() == CGOpenMPRuntimeNVPTX::EM_NonSPMD &&
"Expected Non-SPMD construct.");
// TODO: allow the usage of shared memory to be controlled by
// the user, for now, default to global.
llvm::Value *GlobalRecordSizeArg[] = {
@@ -1967,7 +2023,8 @@ void CGOpenMPRuntimeNVPTX::emitGenericVarsProlog(CodeGenFunction &CGF,
void CGOpenMPRuntimeNVPTX::emitGenericVarsEpilog(CodeGenFunction &CGF,
bool WithSPMDCheck) {
if (getDataSharingMode(CGM) != CGOpenMPRuntimeNVPTX::Generic)
if (getDataSharingMode(CGM) != CGOpenMPRuntimeNVPTX::Generic &&
getExecutionMode() != CGOpenMPRuntimeNVPTX::EM_SPMD)
return;
const auto I = FunctionGlobalizedDecls.find(CGF.CurFn);
@@ -1997,8 +2054,6 @@ void CGOpenMPRuntimeNVPTX::emitGenericVarsEpilog(CodeGenFunction &CGF,
CGF.EmitCastToVoidPtr(I->getSecond().GlobalRecordAddr));
CGF.EmitBlock(ExitBB);
} else {
assert(getExecutionMode() == CGOpenMPRuntimeNVPTX::EM_NonSPMD &&
"Expected Non-SPMD mode.");
CGF.EmitRuntimeCall(createNVPTXRuntimeFunction(
OMPRTL_NVPTX__kmpc_data_sharing_pop_stack),
I->getSecond().GlobalRecordAddr);
@@ -3950,6 +4005,9 @@ void CGOpenMPRuntimeNVPTX::emitFunctionProlog(CodeGenFunction &CGF,
} else if (const auto *CD = dyn_cast<CapturedDecl>(D)) {
Body = CD->getBody();
NeedToDelayGlobalization = CGF.CapturedStmtInfo->getKind() == CR_OpenMP;
if (NeedToDelayGlobalization &&
getExecutionMode() == CGOpenMPRuntimeNVPTX::EM_SPMD)
return;
}
if (!Body)
return;


@@ -8,8 +8,6 @@
#ifndef HEADER
#define HEADER
// CHECK-NOT: @__omp_offloading_{{.+}}_exec_mode = weak constant i8 1
// CHECK: @__omp_offloading_{{.+}}_l52_exec_mode = weak constant i8 1
// CHECK-NOT: @__omp_offloading_{{.+}}_exec_mode = weak constant i8 1
void foo() {
@@ -42,7 +40,7 @@ void foo() {
for (int i = 0; i < 10; ++i)
;
int a;
// CHECK: call void @__kmpc_kernel_init(
// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 0, i16 0)
// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 0, i16 0)
// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 0, i16 0)
// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1, i16 {{.+}})


@@ -8,8 +8,6 @@
#ifndef HEADER
#define HEADER
// CHECK-NOT: @__omp_offloading_{{.+}}_exec_mode = weak constant i8 1
// CHECK: @__omp_offloading_{{.+}}_l52_exec_mode = weak constant i8 1
// CHECK-NOT: @__omp_offloading_{{.+}}_exec_mode = weak constant i8 1
void foo() {
@@ -42,7 +40,7 @@ void foo() {
for (int i = 0; i < 10; ++i)
;
int a;
// CHECK: call void @__kmpc_kernel_init(
// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1, i16 {{.+}})
// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1, i16 {{.+}})
// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1, i16 {{.+}})
// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1, i16 {{.+}})


@@ -8,13 +8,12 @@
#ifndef HEADER
#define HEADER
// Check that the execution mode of the target region with lastprivates on the gpu is set to Non-SPMD Mode.
// CHECK-DAG: {{@__omp_offloading_.+l33}}_exec_mode = weak constant i8 1
// Check that the execution mode of all 4 target regions on the gpu is set to SPMD Mode.
// CHECK-DAG: {{@__omp_offloading_.+l39}}_exec_mode = weak constant i8 0
// CHECK-DAG: {{@__omp_offloading_.+l44}}_exec_mode = weak constant i8 0
// CHECK-DAG: {{@__omp_offloading_.+l49}}_exec_mode = weak constant i8 0
// CHECK-DAG: {{@__omp_offloading_.+l57}}_exec_mode = weak constant i8 0
// Check that the execution mode of all 5 target regions on the gpu is set to SPMD Mode.
// CHECK-DAG: {{@__omp_offloading_.+l32}}_exec_mode = weak constant i8 0
// CHECK-DAG: {{@__omp_offloading_.+l38}}_exec_mode = weak constant i8 0
// CHECK-DAG: {{@__omp_offloading_.+l43}}_exec_mode = weak constant i8 0
// CHECK-DAG: {{@__omp_offloading_.+l48}}_exec_mode = weak constant i8 0
// CHECK-DAG: {{@__omp_offloading_.+l56}}_exec_mode = weak constant i8 0
#define N 1000
#define M 10
@@ -68,14 +67,16 @@ int bar(int n){
return a;
}
// CHECK_LABEL: define internal void @__omp_offloading_{{.+}}_l33_worker()
// CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+}}_l33(
// CHECK: call void @__kmpc_kernel_init(i32 %{{.+}}, i16 1)
// CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+}}_l32(
// CHECK-DAG: [[THREAD_LIMIT:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
// CHECK: call void @__kmpc_spmd_kernel_init(i32 [[THREAD_LIMIT]], i16 0, i16 0)
// CHECK: [[TEAM_ALLOC:%.+]] = call i8* @__kmpc_data_sharing_push_stack(i{{[0-9]+}} 4, i16 0)
// CHECK: [[BC:%.+]] = bitcast i8* [[TEAM_ALLOC]] to [[REC:%.+]]*
// CHECK: getelementptr inbounds [[REC]], [[REC]]* [[BC]], i{{[0-9]+}} 0, i{{[0-9]+}} 0
// CHECK: call void @__kmpc_for_static_init_4({{.+}}, {{.+}}, {{.+}} 91,
// CHECK: call void @__kmpc_kernel_prepare_parallel(i8* bitcast (void (i16, i32)* [[OUTL1:@__omp_outlined.*]]_wrapper to i8*), i16 1)
// CHECK: {{call|invoke}} void [[OUTL1:@.+]](
// CHECK: call void @__kmpc_for_static_fini(
// CHECK: call void @__kmpc_kernel_deinit(i16 1)
// CHECK: call void @__kmpc_spmd_kernel_deinit()
// CHECK: ret void
// CHECK: define internal void [[OUTL1]](
@@ -127,7 +128,7 @@ int bar(int n){
// CHECK: call void @__kmpc_for_static_fini(
// CHECK: ret void
// CHECK: define weak void @__omp_offloading_{{.*}}_l57(i[[SZ:64|32]] %{{[^,]+}}, [1000 x i32]* dereferenceable{{.*}}, i32* %{{[^)]+}})
// CHECK: define weak void @__omp_offloading_{{.*}}_l56(i[[SZ:64|32]] %{{[^,]+}}, [1000 x i32]* dereferenceable{{.*}}, i32* %{{[^)]+}})
// CHECK: call void [[OUTLINED:@__omp_outlined.*]](i32* %{{.+}}, i32* %{{.+}}, i[[SZ]] %{{.*}}, i[[SZ]] %{{.*}}, i[[SZ]] %{{.*}}, [1000 x i32]* %{{.*}}, i32* %{{.*}})
// CHECK: define internal void [[OUTLINED]](i32* noalias %{{.*}}, i32* noalias %{{.*}} i[[SZ]] %{{.+}}, i[[SZ]] %{{.+}}, i[[SZ]] %{{.+}}, [1000 x i32]* dereferenceable{{.*}}, i32* %{{.*}})


@@ -8,12 +8,11 @@
#ifndef HEADER
#define HEADER
// Check that the execution mode of the target region with lastprivates on the gpu is set to Non-SPMD Mode.
// CHECK-DAG: {{@__omp_offloading_.+l31}}_exec_mode = weak constant i8 1
// Check that the execution mode of all 3 target regions on the gpu is set to SPMD Mode.
// CHECK-DAG: {{@__omp_offloading_.+l37}}_exec_mode = weak constant i8 0
// CHECK-DAG: {{@__omp_offloading_.+l42}}_exec_mode = weak constant i8 0
// CHECK-DAG: {{@__omp_offloading_.+l47}}_exec_mode = weak constant i8 0
// Check that the execution mode of all 4 target regions on the gpu is set to SPMD Mode.
// CHECK-DAG: {{@__omp_offloading_.+l30}}_exec_mode = weak constant i8 0
// CHECK-DAG: {{@__omp_offloading_.+l36}}_exec_mode = weak constant i8 0
// CHECK-DAG: {{@__omp_offloading_.+l41}}_exec_mode = weak constant i8 0
// CHECK-DAG: {{@__omp_offloading_.+l46}}_exec_mode = weak constant i8 0
#define N 1000
#define M 10
@@ -63,14 +62,16 @@ int bar(int n){
return a;
}
// CHECK_LABEL: define internal void @__omp_offloading_{{.+}}_l31_worker()
// CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+}}_l31(
// CHECK: call void @__kmpc_kernel_init(i32 %{{.+}}, i16 1)
// CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+}}_l30(
// CHECK-DAG: [[THREAD_LIMIT:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
// CHECK: call void @__kmpc_spmd_kernel_init(i32 [[THREAD_LIMIT]], i16 0, i16 0)
// CHECK: [[TEAM_ALLOC:%.+]] = call i8* @__kmpc_data_sharing_push_stack(i{{[0-9]+}} 4, i16 0)
// CHECK: [[BC:%.+]] = bitcast i8* [[TEAM_ALLOC]] to [[REC:%.+]]*
// CHECK: getelementptr inbounds [[REC]], [[REC]]* [[BC]], i{{[0-9]+}} 0, i{{[0-9]+}} 0
// CHECK: call void @__kmpc_for_static_init_4({{.+}}, {{.+}}, {{.+}} 91,
// CHECK: call void @__kmpc_kernel_prepare_parallel(i8* bitcast (void (i16, i32)* [[OUTL1:@__omp_outlined.*]]_wrapper to i8*), i16 1)
// CHECK: {{call|invoke}} void [[OUTL1:@.+]](
// CHECK: call void @__kmpc_for_static_fini(
// CHECK: call void @__kmpc_kernel_deinit(i16 1)
// CHECK: call void @__kmpc_spmd_kernel_deinit()
// CHECK: ret void
// CHECK: define internal void [[OUTL1]](