0
0
mirror of https://github.com/llvm/llvm-project.git synced 2025-04-21 19:57:00 +00:00

[PGO][Offload] Profile profraw generation for GPU instrumentation ()

This pull request is the second part of an ongoing effort to extend PGO
instrumentation to GPU device code and depends on . This PR makes
the following changes:

- Introduces `__llvm_write_custom_profile` to PGO compiler-rt library.
This is an external function that can be used to write profiles with
custom data to target-specific files.
- Adds `__llvm_write_custom_profile` as weak symbol to libomptarget so
that it can write the collected data to a profraw file.
- Adds `PGODump` debug flag and only displays dump when the
aforementioned flag is set.
This commit is contained in:
Ethan Luis McDonough 2025-02-11 21:30:54 -08:00 committed by GitHub
parent 84e3c6ff95
commit 9e5c136d5a
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
10 changed files with 209 additions and 28 deletions
compiler-rt/lib/profile
offload
openmp/docs/design

@ -304,6 +304,17 @@ int __llvm_profile_get_padding_sizes_for_counters(
*/
void __llvm_profile_set_dumped(void);
/*!
* \brief Write custom target-specific profiling data to a separate file.
* Used by offload PGO.
*
* The output file name is the current profile file name with "<Target>."
* inserted in front of its base name (any directory part is kept as is).
*
* \param Target NUL-terminated target name used as the file name prefix.
* \param DataBegin start of the profile data records to write.
* \param DataEnd end of the profile data records.
* \param CountersBegin start of the counter bytes to write.
* \param CountersEnd end of the counter bytes.
* \param NamesBegin start of the names data to write.
* \param NamesEnd end of the names data.
* \return 0 when the write is skipped because the profile was already dumped
* or continuous mode is enabled; -1 on a runtime/instrumentation version
* mismatch, when no profile file name is set, or when the output file cannot
* be opened; otherwise the result of the underlying profile write.
*/
int __llvm_write_custom_profile(const char *Target,
const __llvm_profile_data *DataBegin,
const __llvm_profile_data *DataEnd,
const char *CountersBegin,
const char *CountersEnd, const char *NamesBegin,
const char *NamesEnd);
/*!
* This variable is defined in InstrProfilingRuntime.cpp as a hidden
* symbol. Its main purpose is to enable profile runtime user to

@ -541,6 +541,17 @@ static FILE *getFileObject(const char *OutputName) {
return fopen(OutputName, "ab");
}
/* Release \c OutputFile after a write. A stream other than the registered
 * profile file is simply closed. The registered profile file is kept open: it
 * is flushed, and additionally unlocked when merging is on and continuous mode
 * is off. */
static void closeFileObject(FILE *OutputFile) {
  if (OutputFile != getProfileFile()) {
    fclose(OutputFile);
    return;
  }

  fflush(OutputFile);
  if (doMerging() && !__llvm_profile_is_continuous_mode_enabled())
    lprofUnlockFileHandle(OutputFile);
}
/* Write profile data to file \c OutputName. */
static int writeFile(const char *OutputName) {
int RetVal;
@ -562,15 +573,7 @@ static int writeFile(const char *OutputName) {
initFileWriter(&fileWriter, OutputFile);
RetVal = lprofWriteData(&fileWriter, lprofGetVPDataReader(), MergeDone);
if (OutputFile == getProfileFile()) {
fflush(OutputFile);
if (doMerging() && !__llvm_profile_is_continuous_mode_enabled()) {
lprofUnlockFileHandle(OutputFile);
}
} else {
fclose(OutputFile);
}
closeFileObject(OutputFile);
return RetVal;
}
@ -1359,4 +1362,107 @@ COMPILER_RT_VISIBILITY int __llvm_profile_set_file_object(FILE *File,
return 0;
}
/*
 * Write custom, target-specific profile data to its own raw profile file.
 *
 * The output name is derived from the current profile file name by inserting
 * "<Target>." in front of its base name, e.g. "dir/name.profraw" becomes
 * "dir/<Target>.name.profraw". An existing file of that name is truncated.
 *
 * The runtime's registered profile file is swapped to the new file for the
 * duration of the write and is restored before returning, including on
 * failure paths.
 *
 * \param Target NUL-terminated target name used as the file name prefix.
 * \param DataBegin,DataEnd bounds of the profile data records.
 * \param CountersBegin,CountersEnd bounds of the counter bytes.
 * \param NamesBegin,NamesEnd bounds of the names data.
 * \return 0 when the write is skipped (profile already dumped or continuous
 *         mode enabled); -1 on version mismatch, missing target or file name,
 *         or when the output file cannot be opened or closed; otherwise the
 *         value returned by lprofWriteDataImpl.
 */
int __llvm_write_custom_profile(const char *Target,
                                const __llvm_profile_data *DataBegin,
                                const __llvm_profile_data *DataEnd,
                                const char *CountersBegin,
                                const char *CountersEnd, const char *NamesBegin,
                                const char *NamesEnd) {
  int ReturnValue = 0, FilenameLength, TargetLength;
  char *FilenameBuf, *TargetFilename;
  const char *Filename;

  /* Save old profile data */
  FILE *oldFile = getProfileFile();

  // Temporarily suspend getting SIGKILL when the parent exits.
  int PDeathSig = lprofSuspendSigKill();

  if (lprofProfileDumped() || __llvm_profile_is_continuous_mode_enabled()) {
    PROF_NOTE("Profile data not written to file: %s.\n", "already written");
    if (PDeathSig == 1)
      lprofRestoreSigKill();
    return 0;
  }

  /* Check if there is llvm/runtime version mismatch. */
  if (GET_VERSION(__llvm_profile_get_version()) != INSTR_PROF_RAW_VERSION) {
    PROF_ERR("Runtime and instrumentation version mismatch : "
             "expected %d, but get %d\n",
             INSTR_PROF_RAW_VERSION,
             (int)GET_VERSION(__llvm_profile_get_version()));
    if (PDeathSig == 1)
      lprofRestoreSigKill();
    return -1;
  }

  /* A target name is required to build the output file name. */
  if (!Target) {
    PROF_ERR("Failed to write file : %s\n", "Target not set");
    if (PDeathSig == 1)
      lprofRestoreSigKill();
    return -1;
  }

  /* Get current filename */
  FilenameLength = getCurFilenameLength();
  FilenameBuf = (char *)COMPILER_RT_ALLOCA(FilenameLength + 1);
  Filename = getCurFilename(FilenameBuf, 0);

  /* Check the filename. */
  if (!Filename) {
    PROF_ERR("Failed to write file : %s\n", "Filename not set");
    if (PDeathSig == 1)
      lprofRestoreSigKill();
    return -1;
  }

  /* Allocate new space for our target-specific PGO filename:
   * directory + Target + '.' + basename + NUL. */
  TargetLength = strlen(Target);
  TargetFilename =
      (char *)COMPILER_RT_ALLOCA(FilenameLength + TargetLength + 2);

  /* Find file basename and path sizes */
  int32_t DirEnd = FilenameLength - 1;
  while (DirEnd >= 0 && !IS_DIR_SEPARATOR(Filename[DirEnd])) {
    DirEnd--;
  }
  uint32_t DirSize = DirEnd + 1, BaseSize = FilenameLength - DirSize;

  /* Prepend "TARGET." to current filename */
  if (DirSize > 0) {
    memcpy(TargetFilename, Filename, DirSize);
  }
  memcpy(TargetFilename + DirSize, Target, TargetLength);
  TargetFilename[TargetLength + DirSize] = '.';
  memcpy(TargetFilename + DirSize + 1 + TargetLength, Filename + DirSize,
         BaseSize);
  TargetFilename[FilenameLength + 1 + TargetLength] = 0;

  /* Open and truncate target-specific PGO file. The raw profile is binary
   * data, so open in binary mode to avoid newline translation. */
  FILE *OutputFile = fopen(TargetFilename, "wb");
  if (!OutputFile) {
    /* The registered profile file has not been touched yet, so there is
     * nothing to restore besides the signal state. */
    PROF_ERR("Failed to open file : %s\n", TargetFilename);
    if (PDeathSig == 1)
      lprofRestoreSigKill();
    return -1;
  }
  setProfileFile(OutputFile);

  FreeHook = &free;
  setupIOBuffer();

  /* Write custom data to the file */
  ProfDataWriter fileWriter;
  initFileWriter(&fileWriter, OutputFile);
  ReturnValue = lprofWriteDataImpl(
      &fileWriter, DataBegin, DataEnd, CountersBegin, CountersEnd, NULL, NULL,
      lprofGetVPDataReader(), NULL, NULL, NULL, NULL, NamesBegin, NamesEnd, 0);

  /* Restore old profiling file before releasing ours. */
  setProfileFile(oldFile);

  /* This function opened OutputFile itself and never locked it, so it must be
   * closed here rather than merely flushed. fclose also flushes; a failure
   * means the data may not have reached the file. */
  if (fclose(OutputFile) != 0 && ReturnValue == 0) {
    PROF_ERR("Failed to write file : %s\n", TargetFilename);
    ReturnValue = -1;
  }

  // Restore SIGKILL.
  if (PDeathSig == 1)
    lprofRestoreSigKill();

  return ReturnValue;
}
#endif

@ -30,6 +30,7 @@ enum class DeviceDebugKind : uint32_t {
FunctionTracing = 1U << 1,
CommonIssues = 1U << 2,
AllocationTracker = 1U << 3,
PGODump = 1U << 4,
};
struct DeviceEnvironmentTy {

@ -63,14 +63,22 @@ struct __llvm_profile_data {
#include "llvm/ProfileData/InstrProfData.inc"
};
extern "C" {
// Profile-writing entry point with C linkage. It is declared weak so that the
// symbol is allowed to be missing; GPUProfGlobals::write() tests it for null
// before calling and returns an error asking for the compiler-rt profiling
// library to be linked when it is absent.
extern int __attribute__((weak)) __llvm_write_custom_profile(
const char *Target, const __llvm_profile_data *DataBegin,
const __llvm_profile_data *DataEnd, const char *CountersBegin,
const char *CountersEnd, const char *NamesBegin, const char *NamesEnd);
}
/// PGO profiling data extracted from a GPU device
struct GPUProfGlobals {
SmallVector<uint8_t> NamesData;
SmallVector<SmallVector<int64_t>> Counts;
SmallVector<int64_t> Counts;
SmallVector<__llvm_profile_data> Data;
SmallVector<uint8_t> NamesData;
Triple TargetTriple;
void dump() const;
Error write() const;
};
/// Subclass of GlobalTy that holds the memory for a global of \p Ty.

@ -206,7 +206,7 @@ GenericGlobalHandlerTy::readProfilingGlobals(GenericDeviceTy &Device,
GlobalTy CountGlobal(NameOrErr->str(), Sym.getSize(), Counts.data());
if (auto Err = readGlobalFromDevice(Device, Image, CountGlobal))
return Err;
DeviceProfileData.Counts.push_back(std::move(Counts));
DeviceProfileData.Counts.append(std::move(Counts));
} else if (NameOrErr->starts_with(getInstrProfDataVarPrefix())) {
// Read profiling data for this global variable
__llvm_profile_data Data{};
@ -224,15 +224,14 @@ void GPUProfGlobals::dump() const {
<< "\n";
outs() << "======== Counters =========\n";
for (const auto &Count : Counts) {
outs() << "[";
for (size_t i = 0; i < Count.size(); i++) {
if (i == 0)
outs() << " ";
outs() << Count[i] << " ";
}
outs() << "]\n";
for (size_t i = 0; i < Counts.size(); i++) {
if (i > 0 && i % 10 == 0)
outs() << "\n";
else if (i != 0)
outs() << " ";
outs() << Counts[i];
}
outs() << "\n";
outs() << "========== Data ===========\n";
for (const auto &ProfData : Data) {
@ -264,3 +263,43 @@ void GPUProfGlobals::dump() const {
Symtab.dumpNames(outs());
outs() << "===========================\n";
}
/// Hand the collected counters, data records and names to
/// __llvm_write_custom_profile, using the target triple string as the target
/// name. Returns an error if that symbol is unavailable or if it reports a
/// nonzero result.
Error GPUProfGlobals::write() const {
  if (!__llvm_write_custom_profile)
    return Plugin::error("Could not find symbol __llvm_write_custom_profile. "
                         "The compiler-rt profiling library must be linked for "
                         "GPU PGO to work.");

  const size_t CountsBytes = Counts.size() * sizeof(int64_t);
  const size_t DataBytes = Data.size() * sizeof(__llvm_profile_data);
  const size_t NamesBytes = NamesData.size();

  // Stage the three sections back to back in a single buffer (counters, then
  // data records, then names). Keeping each section contiguous lets the PGO
  // library compute deltas properly.
  SmallVector<uint8_t> Staging(NamesBytes + DataBytes + CountsBytes);
  uint8_t *const CountersRegion = Staging.data();
  uint8_t *const DataRegion = CountersRegion + CountsBytes;
  uint8_t *const NamesRegion = DataRegion + DataBytes;
  uint8_t *const RegionEnd = NamesRegion + NamesBytes;

  // Fill each section from its source vector.
  memcpy(DataRegion, Data.data(), DataBytes);
  memcpy(CountersRegion, Counts.data(), CountsBytes);
  memcpy(NamesRegion, NamesData.data(), NamesBytes);

  // Each section ends where the next one begins.
  const int Status = __llvm_write_custom_profile(
      TargetTriple.str().c_str(),
      reinterpret_cast<__llvm_profile_data *>(DataRegion),
      reinterpret_cast<__llvm_profile_data *>(NamesRegion),
      reinterpret_cast<char *>(CountersRegion),
      reinterpret_cast<char *>(DataRegion),
      reinterpret_cast<char *>(NamesRegion),
      reinterpret_cast<char *>(RegionEnd));

  if (Status == 0)
    return Plugin::success();
  return Plugin::error("Error writing GPU PGO data to file");
}

@ -861,8 +861,14 @@ Error GenericDeviceTy::deinit(GenericPluginTy &Plugin) {
if (!ProfOrErr)
return ProfOrErr.takeError();
// TODO: write data to profiling file
ProfOrErr->dump();
// Dump out profdata
if ((OMPX_DebugKind.get() & uint32_t(DeviceDebugKind::PGODump)) ==
uint32_t(DeviceDebugKind::PGODump))
ProfOrErr->dump();
// Write data to profiling file
if (auto Err = ProfOrErr->write())
return Err;
}
// Delete the memory manager before deinitializing the device. Otherwise,

@ -112,8 +112,10 @@ config.available_features.add(config.libomptarget_current_target)
if config.libomptarget_has_libc:
config.available_features.add('libc')
profdata_path = os.path.join(config.bin_llvm_tools_dir, "llvm-profdata")
if config.libomptarget_test_pgo:
config.available_features.add('pgo')
config.substitutions.append(("%profdata", profdata_path))
# Determine whether the test system supports unified memory.
# For CUDA, this is the case with compute capability 70 (Volta) or higher.
@ -407,6 +409,8 @@ if config.test_fortran_compiler:
config.available_features.add('flang')
config.substitutions.append(("%flang", config.test_fortran_compiler))
config.substitutions.append(("%target_triple", config.libomptarget_current_target))
config.substitutions.append(("%openmp_flags", config.test_openmp_flags))
if config.libomptarget_current_target.startswith('nvptx') and config.cuda_path:
config.substitutions.append(("%cuda_flags", "--cuda-path=" + config.cuda_path))

@ -1,6 +1,6 @@
@AUTO_GEN_COMMENT@
config.bin_llvm_tools_dir = "@CMAKE_BINARY_DIR@/bin"
config.bin_llvm_tools_dir = "@LLVM_RUNTIME_OUTPUT_INTDIR@"
config.test_c_compiler = "@OPENMP_TEST_C_COMPILER@"
config.test_cxx_compiler = "@OPENMP_TEST_CXX_COMPILER@"
config.test_fortran_compiler="@OPENMP_TEST_Fortran_COMPILER@"

@ -1,12 +1,17 @@
// RUN: %libomptarget-compile-generic -fprofile-instr-generate \
// RUN: -Xclang "-fprofile-instrument=clang"
// RUN: %libomptarget-run-generic 2>&1 | %fcheck-generic \
// RUN: --check-prefix="CLANG-PGO"
// RUN: %libomptarget-compile-generic -fprofile-generate \
// RUN: -Xclang "-fprofile-instrument=llvm"
// RUN: %libomptarget-run-generic 2>&1 | %fcheck-generic \
// RUN: env LLVM_PROFILE_FILE=llvm.profraw %libomptarget-run-generic 2>&1
// RUN: %profdata show --all-functions --counts \
// RUN: %target_triple.llvm.profraw | %fcheck-generic \
// RUN: --check-prefix="LLVM-PGO"
// RUN: %libomptarget-compile-generic -fprofile-instr-generate \
// RUN: -Xclang "-fprofile-instrument=clang"
// RUN: env LLVM_PROFILE_FILE=clang.profraw %libomptarget-run-generic 2>&1
// RUN: %profdata show --all-functions --counts \
// RUN: %target_triple.clang.profraw | %fcheck-generic \
// RUN: --check-prefix="CLANG-PGO"
// REQUIRES: gpu
// REQUIRES: pgo

@ -1522,3 +1522,4 @@ debugging features are supported.
* Enable debugging assertions in the device. ``0x01``
* Enable diagnosing common problems during offloading. ``0x4``
* Enable device malloc statistics (amdgpu only). ``0x8``
* Dump device PGO counters (only if PGO on GPU is enabled). ``0x10``