mirror of
https://github.com/llvm/llvm-project.git
synced 2025-04-18 17:56:50 +00:00
[NVPTX] Fix internal indirect call prototypes not obeying the ABI (#100131)
Summary: The NVPTX backend optimizes the ABI for functions that are internal; however, this is not legal for indirect call prototypes. Previously, we would modify the ABI on an aggregate byval type passed to an indirect call prototype, which would cause PTXAS to report an error. This patch passes nullptr as the function argument to the helper function, which forces strict ABI compliance without modification. Fixes https://github.com/llvm/llvm-project/issues/100055 (cherry picked from commit e0649a5dfc6b859d652318f578bc3d49674787a4)
This commit is contained in:
parent
aa425eb0e2
commit
dcc22f9844
@ -1,13 +1,3 @@
|
||||
if(LIBC_TARGET_ARCHITECTURE_IS_AMDGPU)
|
||||
set(extra_entrypoints
|
||||
# stdio.h entrypoints
|
||||
libc.src.stdio.snprintf
|
||||
libc.src.stdio.sprintf
|
||||
libc.src.stdio.vsnprintf
|
||||
libc.src.stdio.vsprintf
|
||||
)
|
||||
endif()
|
||||
|
||||
set(TARGET_LIBC_ENTRYPOINTS
|
||||
# assert.h entrypoints
|
||||
libc.src.assert.__assert_fail
|
||||
@ -186,13 +176,16 @@ set(TARGET_LIBC_ENTRYPOINTS
|
||||
libc.src.errno.errno
|
||||
|
||||
# stdio.h entrypoints
|
||||
${extra_entrypoints}
|
||||
libc.src.stdio.clearerr
|
||||
libc.src.stdio.fclose
|
||||
libc.src.stdio.printf
|
||||
libc.src.stdio.vprintf
|
||||
libc.src.stdio.fprintf
|
||||
libc.src.stdio.vfprintf
|
||||
libc.src.stdio.snprintf
|
||||
libc.src.stdio.sprintf
|
||||
libc.src.stdio.vsnprintf
|
||||
libc.src.stdio.vsprintf
|
||||
libc.src.stdio.feof
|
||||
libc.src.stdio.ferror
|
||||
libc.src.stdio.fflush
|
||||
|
@ -1429,7 +1429,6 @@ std::string NVPTXTargetLowering::getPrototype(
|
||||
|
||||
bool first = true;
|
||||
|
||||
const Function *F = CB.getFunction();
|
||||
unsigned NumArgs = VAInfo ? VAInfo->first : Args.size();
|
||||
for (unsigned i = 0, OIdx = 0; i != NumArgs; ++i, ++OIdx) {
|
||||
Type *Ty = Args[i].Ty;
|
||||
@ -1471,10 +1470,12 @@ std::string NVPTXTargetLowering::getPrototype(
|
||||
continue;
|
||||
}
|
||||
|
||||
// Indirect calls need strict ABI alignment so we disable optimizations by
|
||||
// not providing a function to optimize.
|
||||
Type *ETy = Args[i].IndirectType;
|
||||
Align InitialAlign = Outs[OIdx].Flags.getNonZeroByValAlign();
|
||||
Align ParamByValAlign =
|
||||
- getFunctionByValParamAlign(F, ETy, InitialAlign, DL);
|
||||
+ getFunctionByValParamAlign(/*F=*/nullptr, ETy, InitialAlign, DL);
|
||||
|
||||
O << ".param .align " << ParamByValAlign.value() << " .b8 ";
|
||||
O << "_";
|
||||
|
94
llvm/test/CodeGen/NVPTX/indirect_byval.ll
Normal file
94
llvm/test/CodeGen/NVPTX/indirect_byval.ll
Normal file
@ -0,0 +1,94 @@
|
||||
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
|
||||
; RUN: llc < %s -march=nvptx64 -mcpu=sm_52 -mattr=+ptx64 | FileCheck %s
|
||||
; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_52 -mattr=+ptx64 | %ptxas-verify %}
|
||||
|
||||
target triple = "nvptx64-nvidia-cuda"
|
||||
|
||||
%struct.S = type { i8 }
|
||||
%struct.U = type { i64 }
|
||||
|
||||
@ptr = external global ptr, align 8
|
||||
|
||||
define internal i32 @foo() {
|
||||
; CHECK-LABEL: foo(
|
||||
; CHECK: {
|
||||
; CHECK-NEXT: .local .align 1 .b8 __local_depot0[2];
|
||||
; CHECK-NEXT: .reg .b64 %SP;
|
||||
; CHECK-NEXT: .reg .b64 %SPL;
|
||||
; CHECK-NEXT: .reg .b16 %rs<2>;
|
||||
; CHECK-NEXT: .reg .b32 %r<3>;
|
||||
; CHECK-NEXT: .reg .b64 %rd<3>;
|
||||
; CHECK-EMPTY:
|
||||
; CHECK-NEXT: // %bb.0: // %entry
|
||||
; CHECK-NEXT: mov.u64 %SPL, __local_depot0;
|
||||
; CHECK-NEXT: cvta.local.u64 %SP, %SPL;
|
||||
; CHECK-NEXT: ld.global.u64 %rd1, [ptr];
|
||||
; CHECK-NEXT: ld.u8 %rs1, [%SP+1];
|
||||
; CHECK-NEXT: add.u64 %rd2, %SP, 0;
|
||||
; CHECK-NEXT: { // callseq 0, 0
|
||||
; CHECK-NEXT: .param .align 1 .b8 param0[1];
|
||||
; CHECK-NEXT: st.param.b8 [param0+0], %rs1;
|
||||
; CHECK-NEXT: .param .b64 param1;
|
||||
; CHECK-NEXT: st.param.b64 [param1+0], %rd2;
|
||||
; CHECK-NEXT: .param .b32 retval0;
|
||||
; CHECK-NEXT: prototype_0 : .callprototype (.param .b32 _) _ (.param .align 1 .b8 _[1], .param .b64 _);
|
||||
; CHECK-NEXT: call (retval0),
|
||||
; CHECK-NEXT: %rd1,
|
||||
; CHECK-NEXT: (
|
||||
; CHECK-NEXT: param0,
|
||||
; CHECK-NEXT: param1
|
||||
; CHECK-NEXT: )
|
||||
; CHECK-NEXT: , prototype_0;
|
||||
; CHECK-NEXT: ld.param.b32 %r1, [retval0+0];
|
||||
; CHECK-NEXT: } // callseq 0
|
||||
; CHECK-NEXT: st.param.b32 [func_retval0+0], %r1;
|
||||
; CHECK-NEXT: ret;
|
||||
entry:
|
||||
%s = alloca %struct.S, align 1
|
||||
%agg.tmp = alloca %struct.S, align 1
|
||||
%0 = load ptr, ptr @ptr, align 8
|
||||
%call = call i32 %0(ptr byval(%struct.S) align 1 %agg.tmp, ptr noundef %s)
|
||||
ret i32 %call
|
||||
}
|
||||
|
||||
define internal i32 @bar() {
|
||||
; CHECK-LABEL: bar(
|
||||
; CHECK: // @bar
|
||||
; CHECK-NEXT: {
|
||||
; CHECK-NEXT: .local .align 8 .b8 __local_depot1[16];
|
||||
; CHECK-NEXT: .reg .b64 %SP;
|
||||
; CHECK-NEXT: .reg .b64 %SPL;
|
||||
; CHECK-NEXT: .reg .b32 %r<3>;
|
||||
; CHECK-NEXT: .reg .b64 %rd<4>;
|
||||
; CHECK-EMPTY:
|
||||
; CHECK-NEXT: // %bb.0: // %entry
|
||||
; CHECK-NEXT: mov.u64 %SPL, __local_depot1;
|
||||
; CHECK-NEXT: cvta.local.u64 %SP, %SPL;
|
||||
; CHECK-NEXT: ld.global.u64 %rd1, [ptr];
|
||||
; CHECK-NEXT: ld.u64 %rd2, [%SP+8];
|
||||
; CHECK-NEXT: add.u64 %rd3, %SP, 0;
|
||||
; CHECK-NEXT: { // callseq 1, 0
|
||||
; CHECK-NEXT: .param .align 8 .b8 param0[8];
|
||||
; CHECK-NEXT: st.param.b64 [param0+0], %rd2;
|
||||
; CHECK-NEXT: .param .b64 param1;
|
||||
; CHECK-NEXT: st.param.b64 [param1+0], %rd3;
|
||||
; CHECK-NEXT: .param .b32 retval0;
|
||||
; CHECK-NEXT: prototype_1 : .callprototype (.param .b32 _) _ (.param .align 8 .b8 _[8], .param .b64 _);
|
||||
; CHECK-NEXT: call (retval0),
|
||||
; CHECK-NEXT: %rd1,
|
||||
; CHECK-NEXT: (
|
||||
; CHECK-NEXT: param0,
|
||||
; CHECK-NEXT: param1
|
||||
; CHECK-NEXT: )
|
||||
; CHECK-NEXT: , prototype_1;
|
||||
; CHECK-NEXT: ld.param.b32 %r1, [retval0+0];
|
||||
; CHECK-NEXT: } // callseq 1
|
||||
; CHECK-NEXT: st.param.b32 [func_retval0+0], %r1;
|
||||
; CHECK-NEXT: ret;
|
||||
entry:
|
||||
%s = alloca %struct.U, align 8
|
||||
%agg.tmp = alloca %struct.U, align 8
|
||||
%0 = load ptr, ptr @ptr, align 8
|
||||
%call = call noundef i32 %0(ptr byval(%struct.U) align 8 %agg.tmp, ptr %s)
|
||||
ret i32 %call
|
||||
}
|
Loading…
x
Reference in New Issue
Block a user