mirror of
https://github.com/llvm/llvm-project.git
synced 2025-04-26 19:46:06 +00:00

This patch fixes various issues with our prior `declare target` handling and extends it to support `omp begin declare target` as well. This started with PR49649 in mind, trying to provide a way for users to avoid the "ref" global use introduced for globals with internal linkage. From there it went down the rabbit hole, e.g., all variables, even `nohost` ones, were emitted into the device code so it was impossible to determine if "ref" was needed late in the game (based on the name only). To make it really useful, `begin declare target` was needed as it can carry the `device_type`. Not emitting variables eagerly had a ripple effect. Finally, the precedence of the (explicit) declare target list items needed to be taken into account; that meant we cannot just look for any declare target attribute to make a decision. This caused the handling of functions to require fixup as well. I tried to clean up things while I was at it, e.g., we should not "parse declarations and definitions" as part of OpenMP parsing; this will always break at some point. Instead, we keep track of which region we are in and act on definitions and declarations; this is what we do for declare variant and other begin/end directives already. Highlights: - new diagnostics for restrictions specified in the standard, - delayed emission of globals not mentioned in an explicit list of a declare target, - omission of `nohost` globals on the host and `host` globals on the device, - no explicit parsing of declarations in-between `omp [begin] declare variant` and the corresponding end anymore, regular parsing instead, - precedence for explicit mentions in `declare target` lists over implicit mentions in the declaration-definition-seq, and - `omp allocate` declarations will now replace an earlier emitted global, if necessary. --- Notes: The patch is larger than I hoped but it turns out that most changes do on their own lead to "inconsistent states", which seem less desirable overall. 
After working through this I feel the standard should remove the explicit declare target forms as the delayed emission is horrible. That said, while we delay things anyway, it seems to me we check too often for the current status even though that is often not sufficient to act upon. There seems to be a lot of duplication that can probably be trimmed down. Eagerly emitting some things seems pretty weak as an argument to keep so much logic around. --- Reviewed By: ABataev Differential Revision: https://reviews.llvm.org/D101030
156 lines
9.6 KiB
C
156 lines
9.6 KiB
C
// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --function-signature --include-generated-funcs
|
|
// RUN: %clang_cc1 -fopenmp-enable-irbuilder -verify -fopenmp -fopenmp-version=45 -x c++ -triple x86_64-unknown-unknown -emit-llvm %s -o - | FileCheck %s
|
|
// expected-no-diagnostics
|
|
|
|
#ifndef HEADER
|
|
#define HEADER
|
|
|
|
// Test subject: a worksharing loop whose induction variable is `unsigned`.
//
// The loop starts at 33, runs while i < 32000000 and advances by 7, so
// neither the lower bound nor the step is the trivial 0 / 1 case. The
// autogenerated assertions further down in this file expect:
//   - a call to `__kmpc_for_static_init_4u` around the loop,
//   - an unsigned compare (`icmp ult`) and unsigned divide (`udiv`) in the
//     `__captured_stmt` helper that computes the iteration count, and
//   - `start + 7 * logical` in the `__captured_stmt.1` helper that maps the
//     logical iteration number back to `i`.
//
// Declared `extern "C"` (the file is compiled as C++), and the assertions
// match the symbol by its plain name `@workshareloop_unsigned`.
//
// Do not reshape the loop or its body: the assertions pin the exact IR
// produced for this source, so any change here requires regenerating them.
extern "C" void workshareloop_unsigned(float *a, float *b, float *c, float *d) {
|
|
// Bare `omp for`: no schedule clause, and no enclosing parallel region is
// written in this function. The expected IR ends the loop with a
// `__kmpc_barrier` call.
#pragma omp for
|
|
for (unsigned i = 33; i < 32000000; i += 7) {
|
|
// Element-wise product of b, c and d stored into a; only indices
// 33, 40, 47, ... are touched.
a[i] = b[i] * c[i] * d[i];
|
|
}
|
|
}
|
|
|
|
#endif // HEADER
|
|
// CHECK-LABEL: define {{[^@]+}}@workshareloop_unsigned
|
|
// CHECK-SAME: (float* [[A:%.*]], float* [[B:%.*]], float* [[C:%.*]], float* [[D:%.*]]) #[[ATTR0:[0-9]+]] {
|
|
// CHECK-NEXT: entry:
|
|
// CHECK-NEXT: [[A_ADDR:%.*]] = alloca float*, align 8
|
|
// CHECK-NEXT: [[B_ADDR:%.*]] = alloca float*, align 8
|
|
// CHECK-NEXT: [[C_ADDR:%.*]] = alloca float*, align 8
|
|
// CHECK-NEXT: [[D_ADDR:%.*]] = alloca float*, align 8
|
|
// CHECK-NEXT: [[I:%.*]] = alloca i32, align 4
|
|
// CHECK-NEXT: [[AGG_CAPTURED:%.*]] = alloca [[STRUCT_ANON:%.*]], align 8
|
|
// CHECK-NEXT: [[AGG_CAPTURED1:%.*]] = alloca [[STRUCT_ANON_0:%.*]], align 4
|
|
// CHECK-NEXT: [[DOTCOUNT_ADDR:%.*]] = alloca i32, align 4
|
|
// CHECK-NEXT: [[P_LASTITER:%.*]] = alloca i32, align 4
|
|
// CHECK-NEXT: [[P_LOWERBOUND:%.*]] = alloca i32, align 4
|
|
// CHECK-NEXT: [[P_UPPERBOUND:%.*]] = alloca i32, align 4
|
|
// CHECK-NEXT: [[P_STRIDE:%.*]] = alloca i32, align 4
|
|
// CHECK-NEXT: store float* [[A]], float** [[A_ADDR]], align 8
|
|
// CHECK-NEXT: store float* [[B]], float** [[B_ADDR]], align 8
|
|
// CHECK-NEXT: store float* [[C]], float** [[C_ADDR]], align 8
|
|
// CHECK-NEXT: store float* [[D]], float** [[D_ADDR]], align 8
|
|
// CHECK-NEXT: store i32 33, i32* [[I]], align 4
|
|
// CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds [[STRUCT_ANON]], %struct.anon* [[AGG_CAPTURED]], i32 0, i32 0
|
|
// CHECK-NEXT: store i32* [[I]], i32** [[TMP0]], align 8
|
|
// CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds [[STRUCT_ANON_0]], %struct.anon.0* [[AGG_CAPTURED1]], i32 0, i32 0
|
|
// CHECK-NEXT: [[TMP2:%.*]] = load i32, i32* [[I]], align 4
|
|
// CHECK-NEXT: store i32 [[TMP2]], i32* [[TMP1]], align 4
|
|
// CHECK-NEXT: call void @__captured_stmt(i32* [[DOTCOUNT_ADDR]], %struct.anon* [[AGG_CAPTURED]])
|
|
// CHECK-NEXT: [[DOTCOUNT:%.*]] = load i32, i32* [[DOTCOUNT_ADDR]], align 4
|
|
// CHECK-NEXT: br label [[OMP_LOOP_PREHEADER:%.*]]
|
|
// CHECK: omp_loop.preheader:
|
|
// CHECK-NEXT: store i32 0, i32* [[P_LOWERBOUND]], align 4
|
|
// CHECK-NEXT: [[TMP3:%.*]] = sub i32 [[DOTCOUNT]], 1
|
|
// CHECK-NEXT: store i32 [[TMP3]], i32* [[P_UPPERBOUND]], align 4
|
|
// CHECK-NEXT: store i32 1, i32* [[P_STRIDE]], align 4
|
|
// CHECK-NEXT: [[OMP_GLOBAL_THREAD_NUM:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1:[0-9]+]])
|
|
// CHECK-NEXT: call void @__kmpc_for_static_init_4u(%struct.ident_t* @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM]], i32 34, i32* [[P_LASTITER]], i32* [[P_LOWERBOUND]], i32* [[P_UPPERBOUND]], i32* [[P_STRIDE]], i32 1, i32 1)
|
|
// CHECK-NEXT: [[TMP4:%.*]] = load i32, i32* [[P_LOWERBOUND]], align 4
|
|
// CHECK-NEXT: [[TMP5:%.*]] = load i32, i32* [[P_UPPERBOUND]], align 4
|
|
// CHECK-NEXT: [[TMP6:%.*]] = sub i32 [[TMP5]], [[TMP4]]
|
|
// CHECK-NEXT: [[TMP7:%.*]] = add i32 [[TMP6]], 1
|
|
// CHECK-NEXT: br label [[OMP_LOOP_HEADER:%.*]]
|
|
// CHECK: omp_loop.header:
|
|
// CHECK-NEXT: [[OMP_LOOP_IV:%.*]] = phi i32 [ 0, [[OMP_LOOP_PREHEADER]] ], [ [[OMP_LOOP_NEXT:%.*]], [[OMP_LOOP_INC:%.*]] ]
|
|
// CHECK-NEXT: br label [[OMP_LOOP_COND:%.*]]
|
|
// CHECK: omp_loop.cond:
|
|
// CHECK-NEXT: [[OMP_LOOP_CMP:%.*]] = icmp ult i32 [[OMP_LOOP_IV]], [[TMP7]]
|
|
// CHECK-NEXT: br i1 [[OMP_LOOP_CMP]], label [[OMP_LOOP_BODY:%.*]], label [[OMP_LOOP_EXIT:%.*]]
|
|
// CHECK: omp_loop.body:
|
|
// CHECK-NEXT: [[TMP8:%.*]] = add i32 [[OMP_LOOP_IV]], [[TMP4]]
|
|
// CHECK-NEXT: call void @__captured_stmt.1(i32* [[I]], i32 [[TMP8]], %struct.anon.0* [[AGG_CAPTURED1]])
|
|
// CHECK-NEXT: [[TMP9:%.*]] = load float*, float** [[B_ADDR]], align 8
|
|
// CHECK-NEXT: [[TMP10:%.*]] = load i32, i32* [[I]], align 4
|
|
// CHECK-NEXT: [[IDXPROM:%.*]] = zext i32 [[TMP10]] to i64
|
|
// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[TMP9]], i64 [[IDXPROM]]
|
|
// CHECK-NEXT: [[TMP11:%.*]] = load float, float* [[ARRAYIDX]], align 4
|
|
// CHECK-NEXT: [[TMP12:%.*]] = load float*, float** [[C_ADDR]], align 8
|
|
// CHECK-NEXT: [[TMP13:%.*]] = load i32, i32* [[I]], align 4
|
|
// CHECK-NEXT: [[IDXPROM2:%.*]] = zext i32 [[TMP13]] to i64
|
|
// CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds float, float* [[TMP12]], i64 [[IDXPROM2]]
|
|
// CHECK-NEXT: [[TMP14:%.*]] = load float, float* [[ARRAYIDX3]], align 4
|
|
// CHECK-NEXT: [[MUL:%.*]] = fmul float [[TMP11]], [[TMP14]]
|
|
// CHECK-NEXT: [[TMP15:%.*]] = load float*, float** [[D_ADDR]], align 8
|
|
// CHECK-NEXT: [[TMP16:%.*]] = load i32, i32* [[I]], align 4
|
|
// CHECK-NEXT: [[IDXPROM4:%.*]] = zext i32 [[TMP16]] to i64
|
|
// CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds float, float* [[TMP15]], i64 [[IDXPROM4]]
|
|
// CHECK-NEXT: [[TMP17:%.*]] = load float, float* [[ARRAYIDX5]], align 4
|
|
// CHECK-NEXT: [[MUL6:%.*]] = fmul float [[MUL]], [[TMP17]]
|
|
// CHECK-NEXT: [[TMP18:%.*]] = load float*, float** [[A_ADDR]], align 8
|
|
// CHECK-NEXT: [[TMP19:%.*]] = load i32, i32* [[I]], align 4
|
|
// CHECK-NEXT: [[IDXPROM7:%.*]] = zext i32 [[TMP19]] to i64
|
|
// CHECK-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds float, float* [[TMP18]], i64 [[IDXPROM7]]
|
|
// CHECK-NEXT: store float [[MUL6]], float* [[ARRAYIDX8]], align 4
|
|
// CHECK-NEXT: br label [[OMP_LOOP_INC]]
|
|
// CHECK: omp_loop.inc:
|
|
// CHECK-NEXT: [[OMP_LOOP_NEXT]] = add nuw i32 [[OMP_LOOP_IV]], 1
|
|
// CHECK-NEXT: br label [[OMP_LOOP_HEADER]]
|
|
// CHECK: omp_loop.exit:
|
|
// CHECK-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM]])
|
|
// CHECK-NEXT: [[OMP_GLOBAL_THREAD_NUM9:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
|
|
// CHECK-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2:[0-9]+]], i32 [[OMP_GLOBAL_THREAD_NUM9]])
|
|
// CHECK-NEXT: br label [[OMP_LOOP_AFTER:%.*]]
|
|
// CHECK: omp_loop.after:
|
|
// CHECK-NEXT: ret void
|
|
//
|
|
//
|
|
// CHECK-LABEL: define {{[^@]+}}@__captured_stmt
|
|
// CHECK-SAME: (i32* nonnull align 4 dereferenceable(4) [[DISTANCE:%.*]], %struct.anon* noalias [[__CONTEXT:%.*]]) #[[ATTR1:[0-9]+]] {
|
|
// CHECK-NEXT: entry:
|
|
// CHECK-NEXT: [[DISTANCE_ADDR:%.*]] = alloca i32*, align 8
|
|
// CHECK-NEXT: [[__CONTEXT_ADDR:%.*]] = alloca %struct.anon*, align 8
|
|
// CHECK-NEXT: [[DOTSTART:%.*]] = alloca i32, align 4
|
|
// CHECK-NEXT: [[DOTSTOP:%.*]] = alloca i32, align 4
|
|
// CHECK-NEXT: [[DOTSTEP:%.*]] = alloca i32, align 4
|
|
// CHECK-NEXT: store i32* [[DISTANCE]], i32** [[DISTANCE_ADDR]], align 8
|
|
// CHECK-NEXT: store %struct.anon* [[__CONTEXT]], %struct.anon** [[__CONTEXT_ADDR]], align 8
|
|
// CHECK-NEXT: [[TMP0:%.*]] = load %struct.anon*, %struct.anon** [[__CONTEXT_ADDR]], align 8
|
|
// CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds [[STRUCT_ANON:%.*]], %struct.anon* [[TMP0]], i32 0, i32 0
|
|
// CHECK-NEXT: [[TMP2:%.*]] = load i32*, i32** [[TMP1]], align 8
|
|
// CHECK-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP2]], align 4
|
|
// CHECK-NEXT: store i32 [[TMP3]], i32* [[DOTSTART]], align 4
|
|
// CHECK-NEXT: store i32 32000000, i32* [[DOTSTOP]], align 4
|
|
// CHECK-NEXT: store i32 7, i32* [[DOTSTEP]], align 4
|
|
// CHECK-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTSTART]], align 4
|
|
// CHECK-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTSTOP]], align 4
|
|
// CHECK-NEXT: [[CMP:%.*]] = icmp ult i32 [[TMP4]], [[TMP5]]
|
|
// CHECK-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
|
|
// CHECK: cond.true:
|
|
// CHECK-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTSTOP]], align 4
|
|
// CHECK-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTSTART]], align 4
|
|
// CHECK-NEXT: [[SUB:%.*]] = sub i32 [[TMP6]], [[TMP7]]
|
|
// CHECK-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTSTEP]], align 4
|
|
// CHECK-NEXT: [[DIV:%.*]] = udiv i32 [[SUB]], [[TMP8]]
|
|
// CHECK-NEXT: br label [[COND_END:%.*]]
|
|
// CHECK: cond.false:
|
|
// CHECK-NEXT: br label [[COND_END]]
|
|
// CHECK: cond.end:
|
|
// CHECK-NEXT: [[COND:%.*]] = phi i32 [ [[DIV]], [[COND_TRUE]] ], [ 0, [[COND_FALSE]] ]
|
|
// CHECK-NEXT: [[TMP9:%.*]] = load i32*, i32** [[DISTANCE_ADDR]], align 8
|
|
// CHECK-NEXT: store i32 [[COND]], i32* [[TMP9]], align 4
|
|
// CHECK-NEXT: ret void
|
|
//
|
|
//
|
|
// CHECK-LABEL: define {{[^@]+}}@__captured_stmt.1
|
|
// CHECK-SAME: (i32* nonnull align 4 dereferenceable(4) [[LOOPVAR:%.*]], i32 [[LOGICAL:%.*]], %struct.anon.0* noalias [[__CONTEXT:%.*]]) #[[ATTR1]] {
|
|
// CHECK-NEXT: entry:
|
|
// CHECK-NEXT: [[LOOPVAR_ADDR:%.*]] = alloca i32*, align 8
|
|
// CHECK-NEXT: [[LOGICAL_ADDR:%.*]] = alloca i32, align 4
|
|
// CHECK-NEXT: [[__CONTEXT_ADDR:%.*]] = alloca %struct.anon.0*, align 8
|
|
// CHECK-NEXT: store i32* [[LOOPVAR]], i32** [[LOOPVAR_ADDR]], align 8
|
|
// CHECK-NEXT: store i32 [[LOGICAL]], i32* [[LOGICAL_ADDR]], align 4
|
|
// CHECK-NEXT: store %struct.anon.0* [[__CONTEXT]], %struct.anon.0** [[__CONTEXT_ADDR]], align 8
|
|
// CHECK-NEXT: [[TMP0:%.*]] = load %struct.anon.0*, %struct.anon.0** [[__CONTEXT_ADDR]], align 8
|
|
// CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds [[STRUCT_ANON_0:%.*]], %struct.anon.0* [[TMP0]], i32 0, i32 0
|
|
// CHECK-NEXT: [[TMP2:%.*]] = load i32, i32* [[TMP1]], align 4
|
|
// CHECK-NEXT: [[TMP3:%.*]] = load i32, i32* [[LOGICAL_ADDR]], align 4
|
|
// CHECK-NEXT: [[MUL:%.*]] = mul i32 7, [[TMP3]]
|
|
// CHECK-NEXT: [[ADD:%.*]] = add i32 [[TMP2]], [[MUL]]
|
|
// CHECK-NEXT: [[TMP4:%.*]] = load i32*, i32** [[LOOPVAR_ADDR]], align 8
|
|
// CHECK-NEXT: store i32 [[ADD]], i32* [[TMP4]], align 4
|
|
// CHECK-NEXT: ret void
|
|
//
|