//===-LTO.cpp - LLVM Link Time Optimizer ----------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements functions and classes used to support LTO.
//
//===----------------------------------------------------------------------===//
#include "llvm/LTO/LTO.h"
#include "llvm/ADT/ScopeExit.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/StableHashing.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/Analysis/OptimizationRemarkEmitter.h"
#include "llvm/Analysis/StackSafetyAnalysis.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Bitcode/BitcodeReader.h"
#include "llvm/Bitcode/BitcodeWriter.h"
#include "llvm/CGData/CodeGenData.h"
#include "llvm/CodeGen/Analysis.h"
#include "llvm/Config/llvm-config.h"
#include "llvm/IR/AutoUpgrade.h"
#include "llvm/IR/DiagnosticPrinter.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/LLVMRemarkStreamer.h"
#include "llvm/IR/LegacyPassManager.h"
#include "llvm/IR/Mangler.h"
#include "llvm/IR/Metadata.h"
#include "llvm/IR/RuntimeLibcalls.h"
#include "llvm/LTO/LTOBackend.h"
#include "llvm/Linker/IRMover.h"
#include "llvm/MC/TargetRegistry.h"
#include "llvm/Object/IRObjectFile.h"
#include "llvm/Support/Caching.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Error.h"
#include "llvm/Support/FileSystem.h"
#include "llvm/Support/MemoryBuffer.h"
#include "llvm/Support/Path.h"
#include "llvm/Support/SHA1.h"
#include "llvm/Support/SourceMgr.h"
#include "llvm/Support/ThreadPool.h"
#include "llvm/Support/Threading.h"
#include "llvm/Support/TimeProfiler.h"
#include "llvm/Support/ToolOutputFile.h"
#include "llvm/Support/VCSRevision.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetOptions.h"
#include "llvm/Transforms/IPO.h"
#include "llvm/Transforms/IPO/MemProfContextDisambiguation.h"
#include "llvm/Transforms/IPO/WholeProgramDevirt.h"
#include "llvm/Transforms/Utils/FunctionImportUtils.h"
#include "llvm/Transforms/Utils/SplitModule.h"
#include <optional>
#include <set>
using namespace llvm;
using namespace lto;
using namespace object;
#define DEBUG_TYPE "lto"
extern cl::opt<bool> UseNewDbgInfoFormat;
static cl::opt<bool>
    DumpThinCGSCCs("dump-thin-cg-sccs", cl::init(false), cl::Hidden,
                   cl::desc("Dump the SCCs in the ThinLTO index's callgraph"));
extern cl::opt<bool> CodeGenDataThinLTOTwoRounds;
namespace llvm {
/// Enable global value internalization in LTO.
cl::opt<bool> EnableLTOInternalization(
    "enable-lto-internalization", cl::init(true), cl::Hidden,
    cl::desc("Enable global value internalization in LTO"));
Re-apply "[NFCI][LTO][lld] Optimize away symbol copies within LTO global resolution in ELF" (#107792) Fix the use-after-free bug and re-apply https://github.com/llvm/llvm-project/pull/106193 * Without the fix, the string referenced by `objSym.Name` could be destroyed even if string saver keeps a copy of the referenced string. This caused use-after-free. * The fix ([latest commit](https://github.com/llvm/llvm-project/pull/107792/commits/9776ed44cfb26172480145aed8f59ba78a6fa2ea)) updates `objSym.Name` to reference (via `StringRef`) the string saver's copy. Test: 1. For `lld/test/ELF/lto/asmundef.ll`, its test failure is reproducible with `-DLLVM_USE_SANITIZER=Address` and gone with the fix. 3. Run all tests by following https://github.com/google/sanitizers/wiki/SanitizerBotReproduceBuild#try-local-changes. * Without the fix, `ELF/lto/asmundef.ll` aborted the multi-stage test at `@@@BUILD_STEP stage2/asan_ubsan check@@@`, defined [here](https://github.com/llvm/llvm-zorg/blob/main/zorg/buildbot/builders/sanitizers/buildbot_fast.sh#L30) * With the fix, the [multi-stage test](https://github.com/llvm/llvm-zorg/blob/main/zorg/buildbot/builders/sanitizers/buildbot_fast.sh) pass stage2 {asan, ubsan, masan}. This is also the test used by https://lab.llvm.org/buildbot/#/builders/169 **Original commit message** `StringMap<T>` creates a [copy of the string](https://github.com/llvm/llvm-project/blob/d4c519e7b2ac21350ec08b23eda44bf4a2d3c974/llvm/include/llvm/ADT/StringMapEntry.h#L55-L58) for entry insertions and intentionally keep copies [since the implementation optimizes string memory usage](https://github.com/llvm/llvm-project/blob/d4c519e7b2ac21350ec08b23eda44bf4a2d3c974/llvm/include/llvm/ADT/StringMap.h#L124). On the other hand, linker keeps copies of symbol names [1] in `lld::elf::parseFiles` [2] before invoking `compileBitcodeFiles` [3]. This change proposes to optimize away string copies inside [LTO::GlobalResolutions](https://github.com/llvm/llvm-project/blob/24e791b4164986a1ca7776e3ae0292ef20d20c47/llvm/include/llvm/LTO/LTO.h#L409), which will make LTO indexing more memory efficient for ELF. There are similar opportunities for other (COFF, wasm, MachO) formats. The optimization takes place for lld (ELF) only. For the rest of use cases (gold plugin, `llvm-lto2`, etc), LTO owns a string saver to keep copies and use global resolution key for de-duplication. Together with @kazutakahirata's work to make `ComputeCrossModuleImport` more memory efficient, we see a ~20% peak memory usage reduction in a binary where peak memory usage needs to go down. Thanks to the optimization in https://github.com/llvm/llvm-project/commit/329ba523ccbbe68a12434926c92fd9a86494d958, the max (as opposed to the sum) of `ComputeCrossModuleImport` or `GlobalResolution` shows up in peak memory usage. * Regarding correctness, the set of [resolved](https://github.com/llvm/llvm-project/blob/80c47ad3aec9d7f22e1b1bdc88960a91b66f89f1/llvm/lib/LTO/LTO.cpp#L739) [per-module symbols](https://github.com/llvm/llvm-project/blob/80c47ad3aec9d7f22e1b1bdc88960a91b66f89f1/llvm/include/llvm/LTO/LTO.h#L188-L191) is a subset of [llvm::lto::InputFile::Symbols](https://github.com/llvm/llvm-project/blob/80c47ad3aec9d7f22e1b1bdc88960a91b66f89f1/llvm/include/llvm/LTO/LTO.h#L120). And bitcode symbol parsing saves symbol name when iterating `obj->symbols` in `BitcodeFile::parse` already. This change updates `BitcodeFile::parseLazy` to keep copies of per-module undefined symbols. 
* Presumably the undefined symbols in a LTO unit (copied in this patch in linker unique saver) is a small set compared with the set of symbols in global-resolution (copied before this patch), making this a worthwhile trade-off. Benchmarking this change alone shows measurable memory savings across various benchmarks. [1] ELF https://github.com/llvm/llvm-project/blob/1cea5c2138bef3d8fec75508df6dbb858e6e3560/lld/ELF/InputFiles.cpp#L1748 [2] https://github.com/llvm/llvm-project/blob/ef7b18a53c0d186dcda1e322be6035407fdedb55/lld/ELF/Driver.cpp#L2863 [3] https://github.com/llvm/llvm-project/blob/ef7b18a53c0d186dcda1e322be6035407fdedb55/lld/ELF/Driver.cpp#L2995
2024-09-09 11:16:58 -07:00
static cl::opt<bool>
    LTOKeepSymbolCopies("lto-keep-symbol-copies", cl::init(false), cl::Hidden,
                        cl::desc("Keep copies of symbols in LTO indexing"));
/// Indicate we are linking with an allocator that supports hot/cold operator
/// new interfaces.
extern cl::opt<bool> SupportsHotColdNew;
/// Enable MemProf context disambiguation for thin link.
extern cl::opt<bool> EnableMemProfContextDisambiguation;
} // namespace llvm
// Computes a unique hash for the Module considering the current list of
// export/import and other global analysis results.
// Returns the hash in its hexadecimal representation.
std::string llvm::computeLTOCacheKey(
const Config &Conf, const ModuleSummaryIndex &Index, StringRef ModuleID,
const FunctionImporter::ImportMapTy &ImportList,
const FunctionImporter::ExportSetTy &ExportList,
const std::map<GlobalValue::GUID, GlobalValue::LinkageTypes> &ResolvedODR,
const GVSummaryMapTy &DefinedGlobals,
const DenseSet<GlobalValue::GUID> &CfiFunctionDefs,
const DenseSet<GlobalValue::GUID> &CfiFunctionDecls) {
// Compute the unique hash for this entry.
// This is based on the current compiler version, the module itself, the
// export list, the hash for every single module in the import list, the
// list of ResolvedODR for the module, and the list of preserved symbols.
SHA1 Hasher;
// Start with the compiler revision
Hasher.update(LLVM_VERSION_STRING);
#ifdef LLVM_REVISION
Hasher.update(LLVM_REVISION);
#endif
// Include the parts of the LTO configuration that affect code generation.
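// Helper lambdas for feeding typed values into the hasher. AddString appends a
// trailing zero byte so that consecutive strings hash unambiguously;
// AddUnsigned and AddUint64 hash fixed-width little-endian encodings.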
auto AddString = [&](StringRef Str) {
Hasher.update(Str);
Hasher.update(ArrayRef<uint8_t>{0});
};
auto AddUnsigned = [&](unsigned I) {
uint8_t Data[4];
support::endian::write32le(Data, I);
Hasher.update(Data);
};
auto AddUint64 = [&](uint64_t I) {
uint8_t Data[8];
support::endian::write64le(Data, I);
Hasher.update(Data);
};
Reland "[ThinLTO] Populate declaration import status except for distributed ThinLTO under a default-off new option" (#95482) Make `FunctionsToImportTy` an `unordered_map` rather than `DenseMap`. Credit goes to jvoung@ for the 'DenseMap -> unordered_map' change. This is a reland of https://github.com/llvm/llvm-project/pull/92718 * `DenseMap` allocates space for a large number of key/value pairs and wastes space when the number of elements are small. * While init bucket size is zero [1], it quickly allocates buckets for 64 elements [2] when the number of elements is small (for example, 3 or 4 elements). The programmer manual [3] also mentions it could waste space. * Experiments show `FunctionsToImportTy.size()` is smaller than 4 for multiple binaries with high indexing ram usage. `unordered_map` grows factor is at most 2 in llvm libc [4] for insert operations. With this change, `ComputeCrossModuleImport` ram increase is smaller than 0.5G on a couple of binaries with high indexing ram usage. A wider range of (pre-release) tests pass. [1] https://github.com/llvm/llvm-project/blob/ad79a14c9e5ec4a369eed4adf567c22cc029863f/llvm/include/llvm/ADT/DenseMap.h#L431-L432 [2] https://github.com/llvm/llvm-project/blob/ad79a14c9e5ec4a369eed4adf567c22cc029863f/llvm/include/llvm/ADT/DenseMap.h#L849 [3] https://llvm.org/docs/ProgrammersManual.html#llvm-adt-densemap-h [4] https://github.com/llvm/llvm-project/blob/ad79a14c9e5ec4a369eed4adf567c22cc029863f/libcxx/include/__hash_table#L1525-L1526 **Original commit message** The goal is to populate `declaration` import status if a new flag `-import-declaration` is on. * For in-process ThinLTO, the `declaration` status is visible to backend `function-import` pass, so `FunctionImporter::importFunctions` should read the import status and be no-op for declaration summaries. Basically, the postlink pipeline is updated to keep its current behavior (import definitions), but not updated to handle `declaration` summaries. Two use cases ([better call-graph sort](https://discourse.llvm.org/t/rfc-for-better-call-graph-sort-build-a-more-complete-call-graph-by-adding-more-indirect-call-edges/74029#support-cross-module-function-declaration-import-5) or [cross-module auto-init](https://github.com/llvm/llvm-project/pull/87597#discussion_r1556067195)) would use this bit differently. * For distributed ThinLTO, the `declaration` status is not serialized to bitcode. As discussed, https://github.com/llvm/llvm-project/pull/87600 will do this.
2024-06-20 10:50:31 -07:00
auto AddUint8 = [&](const uint8_t I) {
Hasher.update(ArrayRef<uint8_t>((const uint8_t *)&I, 1));
};
AddString(Conf.CPU);
// FIXME: Hash more of Options. For now all clients initialize Options from
// command-line flags (which is unsupported in production), but may set
// X86RelaxRelocations. The clang driver can also pass FunctionSections,
// DataSections and DebuggerTuning via command line flags.
AddUnsigned(Conf.Options.MCOptions.X86RelaxRelocations);
AddUnsigned(Conf.Options.FunctionSections);
AddUnsigned(Conf.Options.DataSections);
AddUnsigned((unsigned)Conf.Options.DebuggerTuning);
for (auto &A : Conf.MAttrs)
AddString(A);
if (Conf.RelocModel)
AddUnsigned(*Conf.RelocModel);
else
AddUnsigned(-1);
if (Conf.CodeModel)
AddUnsigned(*Conf.CodeModel);
else
AddUnsigned(-1);
for (const auto &S : Conf.MllvmArgs)
AddString(S);
AddUnsigned(static_cast<int>(Conf.CGOptLevel));
AddUnsigned(static_cast<int>(Conf.CGFileType));
AddUnsigned(Conf.OptLevel);
AddUnsigned(Conf.Freestanding);
AddString(Conf.OptPipeline);
AddString(Conf.AAPipeline);
AddString(Conf.OverrideTriple);
AddString(Conf.DefaultTriple);
AddString(Conf.DwoDir);
// Include the hash for the current module
auto ModHash = Index.getModuleHash(ModuleID);
Hasher.update(ArrayRef<uint8_t>((uint8_t *)&ModHash[0], sizeof(ModHash)));
// TODO: `ExportList` is determined by `ImportList`. Since `ImportList` is
// used to compute cache key, we could omit hashing `ExportList` here.
std::vector<uint64_t> ExportsGUID;
ExportsGUID.reserve(ExportList.size());
for (const auto &VI : ExportList)
ExportsGUID.push_back(VI.getGUID());
// Sort the export list elements GUIDs.
llvm::sort(ExportsGUID);
for (auto GUID : ExportsGUID)
Hasher.update(ArrayRef<uint8_t>((uint8_t *)&GUID, sizeof(GUID)));
// Order using module hash, to be both independent of module name and
// module order.
auto Comp = [&](const std::pair<StringRef, GlobalValue::GUID> &L,
const std::pair<StringRef, GlobalValue::GUID> &R) {
return std::make_pair(Index.getModule(L.first)->second, L.second) <
std::make_pair(Index.getModule(R.first)->second, R.second);
};
FunctionImporter::SortedImportList SortedImportList(ImportList, Comp);
// Count the number of imports for each source module.
DenseMap<StringRef, unsigned> ModuleToNumImports;
for (const auto &[FromModule, GUID, Type] : SortedImportList)
++ModuleToNumImports[FromModule];
std::optional<StringRef> LastModule;
for (const auto &[FromModule, GUID, Type] : SortedImportList) {
if (LastModule != FromModule) {
// Include the hash for every module we import functions from. The set of
// imported symbols for each module may affect code generation and is
// sensitive to link order, so include that as well.
LastModule = FromModule;
auto ModHash = Index.getModule(FromModule)->second;
Hasher.update(ArrayRef<uint8_t>((uint8_t *)&ModHash[0], sizeof(ModHash)));
AddUint64(ModuleToNumImports[FromModule]);
Reland "[ThinLTO] Populate declaration import status except for distributed ThinLTO under a default-off new option" (#95482) Make `FunctionsToImportTy` an `unordered_map` rather than `DenseMap`. Credit goes to jvoung@ for the 'DenseMap -> unordered_map' change. This is a reland of https://github.com/llvm/llvm-project/pull/92718 * `DenseMap` allocates space for a large number of key/value pairs and wastes space when the number of elements are small. * While init bucket size is zero [1], it quickly allocates buckets for 64 elements [2] when the number of elements is small (for example, 3 or 4 elements). The programmer manual [3] also mentions it could waste space. * Experiments show `FunctionsToImportTy.size()` is smaller than 4 for multiple binaries with high indexing ram usage. `unordered_map` grows factor is at most 2 in llvm libc [4] for insert operations. With this change, `ComputeCrossModuleImport` ram increase is smaller than 0.5G on a couple of binaries with high indexing ram usage. A wider range of (pre-release) tests pass. [1] https://github.com/llvm/llvm-project/blob/ad79a14c9e5ec4a369eed4adf567c22cc029863f/llvm/include/llvm/ADT/DenseMap.h#L431-L432 [2] https://github.com/llvm/llvm-project/blob/ad79a14c9e5ec4a369eed4adf567c22cc029863f/llvm/include/llvm/ADT/DenseMap.h#L849 [3] https://llvm.org/docs/ProgrammersManual.html#llvm-adt-densemap-h [4] https://github.com/llvm/llvm-project/blob/ad79a14c9e5ec4a369eed4adf567c22cc029863f/libcxx/include/__hash_table#L1525-L1526 **Original commit message** The goal is to populate `declaration` import status if a new flag `-import-declaration` is on. * For in-process ThinLTO, the `declaration` status is visible to backend `function-import` pass, so `FunctionImporter::importFunctions` should read the import status and be no-op for declaration summaries. Basically, the postlink pipeline is updated to keep its current behavior (import definitions), but not updated to handle `declaration` summaries. Two use cases ([better call-graph sort](https://discourse.llvm.org/t/rfc-for-better-call-graph-sort-build-a-more-complete-call-graph-by-adding-more-indirect-call-edges/74029#support-cross-module-function-declaration-import-5) or [cross-module auto-init](https://github.com/llvm/llvm-project/pull/87597#discussion_r1556067195)) would use this bit differently. * For distributed ThinLTO, the `declaration` status is not serialized to bitcode. As discussed, https://github.com/llvm/llvm-project/pull/87600 will do this.
2024-06-20 10:50:31 -07:00
}
AddUint64(GUID);
AddUint8(Type);
}
// Include the hash for the resolved ODR.
for (auto &Entry : ResolvedODR) {
Hasher.update(ArrayRef<uint8_t>((const uint8_t *)&Entry.first,
sizeof(GlobalValue::GUID)));
Hasher.update(ArrayRef<uint8_t>((const uint8_t *)&Entry.second,
sizeof(GlobalValue::LinkageTypes)));
}
// Members of CfiFunctionDefs and CfiFunctionDecls that are referenced or
// defined in this module.
std::set<GlobalValue::GUID> UsedCfiDefs;
std::set<GlobalValue::GUID> UsedCfiDecls;
// Typeids used in this module.
std::set<GlobalValue::GUID> UsedTypeIds;
auto AddUsedCfiGlobal = [&](GlobalValue::GUID ValueGUID) {
if (CfiFunctionDefs.contains(ValueGUID))
UsedCfiDefs.insert(ValueGUID);
if (CfiFunctionDecls.contains(ValueGUID))
UsedCfiDecls.insert(ValueGUID);
};
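// Record everything in a summary that can influence code generation for this
// module: visibility, liveness, auto-hide and read/write-only attributes, plus
// the type test and virtual call information reachable through its references
// and calls (which also feeds the CFI def/decl sets above).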
auto AddUsedThings = [&](GlobalValueSummary *GS) {
if (!GS) return;
AddUnsigned(GS->getVisibility());
AddUnsigned(GS->isLive());
AddUnsigned(GS->canAutoHide());
for (const ValueInfo &VI : GS->refs()) {
AddUnsigned(VI.isDSOLocal(Index.withDSOLocalPropagation()));
AddUsedCfiGlobal(VI.getGUID());
}
if (auto *GVS = dyn_cast<GlobalVarSummary>(GS)) {
AddUnsigned(GVS->maybeReadOnly());
AddUnsigned(GVS->maybeWriteOnly());
}
if (auto *FS = dyn_cast<FunctionSummary>(GS)) {
for (auto &TT : FS->type_tests())
UsedTypeIds.insert(TT);
for (auto &TT : FS->type_test_assume_vcalls())
UsedTypeIds.insert(TT.GUID);
for (auto &TT : FS->type_checked_load_vcalls())
UsedTypeIds.insert(TT.GUID);
for (auto &TT : FS->type_test_assume_const_vcalls())
UsedTypeIds.insert(TT.VFunc.GUID);
for (auto &TT : FS->type_checked_load_const_vcalls())
UsedTypeIds.insert(TT.VFunc.GUID);
for (auto &ET : FS->calls()) {
AddUnsigned(ET.first.isDSOLocal(Index.withDSOLocalPropagation()));
AddUsedCfiGlobal(ET.first.getGUID());
}
}
};
// Include the hash for the linkage type to reflect internalization and weak
// resolution, and collect any used type identifier resolutions.
for (auto &GS : DefinedGlobals) {
GlobalValue::LinkageTypes Linkage = GS.second->linkage();
Hasher.update(
ArrayRef<uint8_t>((const uint8_t *)&Linkage, sizeof(Linkage)));
AddUsedCfiGlobal(GS.first);
AddUsedThings(GS.second);
}
// Imported functions may introduce new uses of type identifier resolutions,
// so we need to collect their used resolutions as well.
for (const auto &[FromModule, GUID, Type] : SortedImportList) {
GlobalValueSummary *S = Index.findSummaryInModule(GUID, FromModule);
AddUsedThings(S);
// If this is an alias, we also care about any types/etc. that the aliasee
// may reference.
if (auto *AS = dyn_cast_or_null<AliasSummary>(S))
AddUsedThings(AS->getBaseObject());
}
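// Hash the fields of a type identifier summary that can affect lowering: the
// type test resolution and each whole-program devirtualization resolution.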
auto AddTypeIdSummary = [&](StringRef TId, const TypeIdSummary &S) {
AddString(TId);
AddUnsigned(S.TTRes.TheKind);
AddUnsigned(S.TTRes.SizeM1BitWidth);
AddUint64(S.TTRes.AlignLog2);
AddUint64(S.TTRes.SizeM1);
AddUint64(S.TTRes.BitMask);
AddUint64(S.TTRes.InlineBits);
AddUint64(S.WPDRes.size());
for (auto &WPD : S.WPDRes) {
AddUnsigned(WPD.first);
AddUnsigned(WPD.second.TheKind);
AddString(WPD.second.SingleImplName);
AddUint64(WPD.second.ResByArg.size());
for (auto &ByArg : WPD.second.ResByArg) {
AddUint64(ByArg.first.size());
for (uint64_t Arg : ByArg.first)
AddUint64(Arg);
AddUnsigned(ByArg.second.TheKind);
AddUint64(ByArg.second.Info);
AddUnsigned(ByArg.second.Byte);
AddUnsigned(ByArg.second.Bit);
}
}
};
// Include the hash for all type identifiers used by this module.
for (GlobalValue::GUID TId : UsedTypeIds) {
auto TidIter = Index.typeIds().equal_range(TId);
for (const auto &I : make_range(TidIter))
AddTypeIdSummary(I.second.first, I.second.second);
}
AddUnsigned(UsedCfiDefs.size());
for (auto &V : UsedCfiDefs)
AddUint64(V);
AddUnsigned(UsedCfiDecls.size());
for (auto &V : UsedCfiDecls)
AddUint64(V);
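// The sample profile (and any profile remapping file) feeds the optimizer, so
// include the file contents in the key when present.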
if (!Conf.SampleProfile.empty()) {
auto FileOrErr = MemoryBuffer::getFile(Conf.SampleProfile);
if (FileOrErr) {
Hasher.update(FileOrErr.get()->getBuffer());
if (!Conf.ProfileRemapping.empty()) {
FileOrErr = MemoryBuffer::getFile(Conf.ProfileRemapping);
if (FileOrErr)
Hasher.update(FileOrErr.get()->getBuffer());
}
}
}
return toHex(Hasher.result());
}
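// Derive a new cache key from an existing one by rehashing it together with an
// extra identifier.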
std::string llvm::recomputeLTOCacheKey(const std::string &Key,
StringRef ExtraID) {
SHA1 Hasher;
auto AddString = [&](StringRef Str) {
Hasher.update(Str);
Hasher.update(ArrayRef<uint8_t>{0});
};
AddString(Key);
AddString(ExtraID);
return toHex(Hasher.result());
}
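// Resolve linkage and visibility for all summaries of a single value: the
// prevailing linkonce copy is promoted to weak so that a definition is kept,
// while non-prevailing copies are demoted to available_externally where legal.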
static void thinLTOResolvePrevailingGUID(
const Config &C, ValueInfo VI,
DenseSet<GlobalValueSummary *> &GlobalInvolvedWithAlias,
function_ref<bool(GlobalValue::GUID, const GlobalValueSummary *)>
isPrevailing,
function_ref<void(StringRef, GlobalValue::GUID, GlobalValue::LinkageTypes)>
recordNewLinkage,
const DenseSet<GlobalValue::GUID> &GUIDPreservedSymbols) {
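// Under the ELF visibility scheme, start from the ELF-level visibility
// computed for this value; otherwise start from default visibility and adopt
// the prevailing copy's visibility below.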
GlobalValue::VisibilityTypes Visibility =
C.VisibilityScheme == Config::ELF ? VI.getELFVisibility()
: GlobalValue::DefaultVisibility;
for (auto &S : VI.getSummaryList()) {
GlobalValue::LinkageTypes OriginalLinkage = S->linkage();
// Ignore local and appending linkage values since the linker
// doesn't resolve them.
if (GlobalValue::isLocalLinkage(OriginalLinkage) ||
GlobalValue::isAppendingLinkage(S->linkage()))
continue;
// We need to emit only one of these. The prevailing module will keep it,
// converted to weak linkage, while the others will drop it when possible.
// This is both a compile-time optimization and a correctness
// transformation. This is necessary for correctness when we have exported
// a reference - we need to convert the linkonce to weak to
// ensure a copy is kept to satisfy the exported reference.
// FIXME: We may want to split the compile time and correctness
// aspects into separate routines.
if (isPrevailing(VI.getGUID(), S.get())) {
if (GlobalValue::isLinkOnceLinkage(OriginalLinkage)) {
S->setLinkage(GlobalValue::getWeakLinkage(
GlobalValue::isLinkOnceODRLinkage(OriginalLinkage)));
// The kept copy is eligible for auto-hiding (hidden visibility) if all
// copies were (i.e. they were all linkonce_odr global unnamed addr).
// If any copy is not (e.g. it was originally weak_odr), then the symbol
// must remain externally available (e.g. a weak_odr from an explicitly
// instantiated template). Additionally, if it is in the
// GUIDPreservedSymbols set, that means that it is visible outside
// the summary (e.g. in a native object or a bitcode file without
// summary), and in that case we cannot hide it as it isn't possible to
// check all copies.
S->setCanAutoHide(VI.canAutoHide() &&
!GUIDPreservedSymbols.count(VI.getGUID()));
}
if (C.VisibilityScheme == Config::FromPrevailing)
Visibility = S->getVisibility();
}
// Alias and aliasee can't be turned into available_externally.
else if (!isa<AliasSummary>(S.get()) &&
!GlobalInvolvedWithAlias.count(S.get()))
S->setLinkage(GlobalValue::AvailableExternallyLinkage);
// For ELF, set visibility to the computed visibility from summaries. We
// don't track visibility from declarations so this may be more relaxed than
// the most constraining one.
if (C.VisibilityScheme == Config::ELF)
S->setVisibility(Visibility);
if (S->linkage() != OriginalLinkage)
recordNewLinkage(S->modulePath(), VI.getGUID(), S->linkage());
}
if (C.VisibilityScheme == Config::FromPrevailing) {
for (auto &S : VI.getSummaryList()) {
GlobalValue::LinkageTypes OriginalLinkage = S->linkage();
if (GlobalValue::isLocalLinkage(OriginalLinkage) ||
GlobalValue::isAppendingLinkage(S->linkage()))
continue;
S->setVisibility(Visibility);
}
}
}
/// Resolve linkage for prevailing symbols in the \p Index.
//
// We'd like to drop these functions if they are no longer referenced in the
// current module. However there is a chance that another module is still
// referencing them because of the import. We make sure we always emit at least
// one copy.
void llvm::thinLTOResolvePrevailingInIndex(
const Config &C, ModuleSummaryIndex &Index,
function_ref<bool(GlobalValue::GUID, const GlobalValueSummary *)>
isPrevailing,
function_ref<void(StringRef, GlobalValue::GUID, GlobalValue::LinkageTypes)>
recordNewLinkage,
const DenseSet<GlobalValue::GUID> &GUIDPreservedSymbols) {
// We won't optimize the globals that are referenced by an alias for now.
// Ideally we should turn the alias into a global and duplicate the definition
// when needed.
DenseSet<GlobalValueSummary *> GlobalInvolvedWithAlias;
for (auto &I : Index)
for (auto &S : I.second.SummaryList)
if (auto AS = dyn_cast<AliasSummary>(S.get()))
GlobalInvolvedWithAlias.insert(&AS->getAliasee());
for (auto &I : Index)
thinLTOResolvePrevailingGUID(C, Index.getValueInfo(I),
GlobalInvolvedWithAlias, isPrevailing,
recordNewLinkage, GUIDPreservedSymbols);
}
static void thinLTOInternalizeAndPromoteGUID(
ValueInfo VI, function_ref<bool(StringRef, ValueInfo)> isExported,
function_ref<bool(GlobalValue::GUID, const GlobalValueSummary *)>
isPrevailing) {
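// Count the copies of this value that are externally visible (non-local
// linkage); used below to decide whether a single prevailing copy may be
// internalized.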
auto ExternallyVisibleCopies =
llvm::count_if(VI.getSummaryList(),
[](const std::unique_ptr<GlobalValueSummary> &Summary) {
return !GlobalValue::isLocalLinkage(Summary->linkage());
});
for (auto &S : VI.getSummaryList()) {
// First see if we need to promote an internal value because it is not
// exported.
if (isExported(S->modulePath(), VI)) {
if (GlobalValue::isLocalLinkage(S->linkage()))
S->setLinkage(GlobalValue::ExternalLinkage);
continue;
}
// Otherwise, see if we can internalize.
if (!EnableLTOInternalization)
continue;
// Non-exported values with external linkage can be internalized.
if (GlobalValue::isExternalLinkage(S->linkage())) {
S->setLinkage(GlobalValue::InternalLinkage);
continue;
}
// Non-exported function and variable definitions with a weak-for-linker
// linkage can be internalized in certain cases. The minimum legality
// requirements would be that they are not address taken to ensure that we
// don't break pointer equality checks, and that variables are either read-
// or write-only. For functions, this is the case if either all copies are
// [local_]unnamed_addr, or we can propagate reference edge attributes
// (which is how this is guaranteed for variables, when analyzing whether
// they are read or write-only).
//
// However, we only get to this code for weak-for-linkage values in one of
// two cases:
// 1) The prevailing copy is not in IR (it is in native code).
// 2) The prevailing copy in IR is not exported from its module.
// Additionally, at least for the new LTO API, case 2 will only happen if
// there is exactly one definition of the value (i.e. in exactly one
// module), as duplicate defs result in the value being marked exported.
// Users of the legacy LTO API are likely similar; however, currently there
// are llvm-lto based tests of the legacy LTO API that do not mark
// duplicate linkonce_odr copies as exported via the tool, so we need
// to handle that case below by checking the number of copies.
//
// Generally, we only want to internalize a weak-for-linker value in case
// 2, because in case 1 we cannot see how the value is used to know if it
// is read or write-only. We also don't want to bloat the binary with
// multiple internalized copies of non-prevailing linkonce/weak functions.
// Note if we don't internalize, we will convert non-prevailing copies to
// available_externally anyway, so that we drop them after inlining. The
// only reason to internalize such a function is if we indeed have a single
// copy, because internalizing it won't increase binary size, and enables
// use of inliner heuristics that are more aggressive in the face of a
// single call to a static (local). For variables, internalizing a read or
// write only variable can enable more aggressive optimization. However, we
// already perform this elsewhere in the ThinLTO backend handling for
// read or write-only variables (processGlobalForThinLTO).
//
// Therefore, only internalize linkonce/weak if there is a single copy, that
// is prevailing in this IR module. We can do so aggressively, without
// requiring the address to be insignificant, or that a variable be read or
// write-only.
if (!GlobalValue::isWeakForLinker(S->linkage()) ||
GlobalValue::isExternalWeakLinkage(S->linkage()))
continue;
if (isPrevailing(VI.getGUID(), S.get()) && ExternallyVisibleCopies == 1)
S->setLinkage(GlobalValue::InternalLinkage);
}
}
// Update the linkages in the given \p Index to mark exported values
// as external and non-exported values as internal.
void llvm::thinLTOInternalizeAndPromoteInIndex(
ModuleSummaryIndex &Index,
function_ref<bool(StringRef, ValueInfo)> isExported,
function_ref<bool(GlobalValue::GUID, const GlobalValueSummary *)>
isPrevailing) {
for (auto &I : Index)
thinLTOInternalizeAndPromoteGUID(Index.getValueInfo(I), isExported,
isPrevailing);
}
// Requires a destructor for std::vector<InputModule>.
InputFile::~InputFile() = default;
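// Create an InputFile by reading the IR symbol table (irsymtab) from
// \p Object, recording the target triple, source file name, linker options,
// dependent libraries, comdat table, and the per-module ranges of
// LTO-relevant symbols.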
Expected<std::unique_ptr<InputFile>> InputFile::create(MemoryBufferRef Object) {
std::unique_ptr<InputFile> File(new InputFile);
Expected<IRSymtabFile> FOrErr = readIRSymtab(Object);
if (!FOrErr)
return FOrErr.takeError();
File->TargetTriple = FOrErr->TheReader.getTargetTriple();
File->SourceFileName = FOrErr->TheReader.getSourceFileName();
File->COFFLinkerOpts = FOrErr->TheReader.getCOFFLinkerOpts();
File->DependentLibraries = FOrErr->TheReader.getDependentLibraries();
File->ComdatTable = FOrErr->TheReader.getComdatTable();
for (unsigned I = 0; I != FOrErr->Mods.size(); ++I) {
size_t Begin = File->Symbols.size();
for (const irsymtab::Reader::SymbolRef &Sym :
FOrErr->TheReader.module_symbols(I))
// Skip symbols that are irrelevant to LTO. Note that this condition needs
// to match the one in Skip() in LTO::addRegularLTO().
if (Sym.isGlobal() && !Sym.isFormatSpecific())
File->Symbols.push_back(Sym);
File->ModuleSymIndices.push_back({Begin, File->Symbols.size()});
}
File->Mods = FOrErr->Mods;
File->Strtab = std::move(FOrErr->Strtab);
return std::move(File);
}
StringRef InputFile::getName() const {
return Mods[0].getModuleIdentifier();
}
BitcodeModule &InputFile::getSingleBitcodeModule() {
assert(Mods.size() == 1 && "Expect only one bitcode module");
return Mods[0];
}
LTO::RegularLTOState::RegularLTOState(unsigned ParallelCodeGenParallelismLevel,
const Config &Conf)
: ParallelCodeGenParallelismLevel(ParallelCodeGenParallelismLevel),
Ctx(Conf), CombinedModule(std::make_unique<Module>("ld-temp.o", Ctx)),
Mover(std::make_unique<IRMover>(*CombinedModule)) {
CombinedModule->IsNewDbgInfoFormat = UseNewDbgInfoFormat;
}
LTO::ThinLTOState::ThinLTOState(ThinBackend BackendParam)
: Backend(std::move(BackendParam)), CombinedIndex(/*HaveGVs*/ false) {
if (!Backend.isValid())
Backend =
createInProcessThinBackend(llvm::heavyweight_hardware_concurrency());
}
LTO::LTO(Config Conf, ThinBackend Backend,
unsigned ParallelCodeGenParallelismLevel, LTOKind LTOMode)
: Conf(std::move(Conf)),
RegularLTO(ParallelCodeGenParallelismLevel, this->Conf),
ThinLTO(std::move(Backend)),
Re-apply "[NFCI][LTO][lld] Optimize away symbol copies within LTO global resolution in ELF" (#107792) Fix the use-after-free bug and re-apply https://github.com/llvm/llvm-project/pull/106193 * Without the fix, the string referenced by `objSym.Name` could be destroyed even if string saver keeps a copy of the referenced string. This caused use-after-free. * The fix ([latest commit](https://github.com/llvm/llvm-project/pull/107792/commits/9776ed44cfb26172480145aed8f59ba78a6fa2ea)) updates `objSym.Name` to reference (via `StringRef`) the string saver's copy. Test: 1. For `lld/test/ELF/lto/asmundef.ll`, its test failure is reproducible with `-DLLVM_USE_SANITIZER=Address` and gone with the fix. 3. Run all tests by following https://github.com/google/sanitizers/wiki/SanitizerBotReproduceBuild#try-local-changes. * Without the fix, `ELF/lto/asmundef.ll` aborted the multi-stage test at `@@@BUILD_STEP stage2/asan_ubsan check@@@`, defined [here](https://github.com/llvm/llvm-zorg/blob/main/zorg/buildbot/builders/sanitizers/buildbot_fast.sh#L30) * With the fix, the [multi-stage test](https://github.com/llvm/llvm-zorg/blob/main/zorg/buildbot/builders/sanitizers/buildbot_fast.sh) pass stage2 {asan, ubsan, masan}. This is also the test used by https://lab.llvm.org/buildbot/#/builders/169 **Original commit message** `StringMap<T>` creates a [copy of the string](https://github.com/llvm/llvm-project/blob/d4c519e7b2ac21350ec08b23eda44bf4a2d3c974/llvm/include/llvm/ADT/StringMapEntry.h#L55-L58) for entry insertions and intentionally keep copies [since the implementation optimizes string memory usage](https://github.com/llvm/llvm-project/blob/d4c519e7b2ac21350ec08b23eda44bf4a2d3c974/llvm/include/llvm/ADT/StringMap.h#L124). On the other hand, linker keeps copies of symbol names [1] in `lld::elf::parseFiles` [2] before invoking `compileBitcodeFiles` [3]. This change proposes to optimize away string copies inside [LTO::GlobalResolutions](https://github.com/llvm/llvm-project/blob/24e791b4164986a1ca7776e3ae0292ef20d20c47/llvm/include/llvm/LTO/LTO.h#L409), which will make LTO indexing more memory efficient for ELF. There are similar opportunities for other (COFF, wasm, MachO) formats. The optimization takes place for lld (ELF) only. For the rest of use cases (gold plugin, `llvm-lto2`, etc), LTO owns a string saver to keep copies and use global resolution key for de-duplication. Together with @kazutakahirata's work to make `ComputeCrossModuleImport` more memory efficient, we see a ~20% peak memory usage reduction in a binary where peak memory usage needs to go down. Thanks to the optimization in https://github.com/llvm/llvm-project/commit/329ba523ccbbe68a12434926c92fd9a86494d958, the max (as opposed to the sum) of `ComputeCrossModuleImport` or `GlobalResolution` shows up in peak memory usage. * Regarding correctness, the set of [resolved](https://github.com/llvm/llvm-project/blob/80c47ad3aec9d7f22e1b1bdc88960a91b66f89f1/llvm/lib/LTO/LTO.cpp#L739) [per-module symbols](https://github.com/llvm/llvm-project/blob/80c47ad3aec9d7f22e1b1bdc88960a91b66f89f1/llvm/include/llvm/LTO/LTO.h#L188-L191) is a subset of [llvm::lto::InputFile::Symbols](https://github.com/llvm/llvm-project/blob/80c47ad3aec9d7f22e1b1bdc88960a91b66f89f1/llvm/include/llvm/LTO/LTO.h#L120). And bitcode symbol parsing saves symbol name when iterating `obj->symbols` in `BitcodeFile::parse` already. This change updates `BitcodeFile::parseLazy` to keep copies of per-module undefined symbols. 
* Presumably the undefined symbols in a LTO unit (copied in this patch in linker unique saver) is a small set compared with the set of symbols in global-resolution (copied before this patch), making this a worthwhile trade-off. Benchmarking this change alone shows measurable memory savings across various benchmarks. [1] ELF https://github.com/llvm/llvm-project/blob/1cea5c2138bef3d8fec75508df6dbb858e6e3560/lld/ELF/InputFiles.cpp#L1748 [2] https://github.com/llvm/llvm-project/blob/ef7b18a53c0d186dcda1e322be6035407fdedb55/lld/ELF/Driver.cpp#L2863 [3] https://github.com/llvm/llvm-project/blob/ef7b18a53c0d186dcda1e322be6035407fdedb55/lld/ELF/Driver.cpp#L2995
2024-09-09 11:16:58 -07:00
GlobalResolutions(
std::make_unique<DenseMap<StringRef, GlobalResolution>>()),
LTOMode(LTOMode) {
if (Conf.KeepSymbolNameCopies || LTOKeepSymbolCopies) {
Alloc = std::make_unique<BumpPtrAllocator>();
GlobalResolutionSymbolSaver = std::make_unique<llvm::StringSaver>(*Alloc);
}
}
// Requires a destructor for MapVector<BitcodeModule>.
LTO::~LTO() = default;
// Add the symbols in the given module to the GlobalResolutions map, and resolve
// their partitions.
void LTO::addModuleToGlobalRes(ArrayRef<InputFile::Symbol> Syms,
ArrayRef<SymbolResolution> Res,
unsigned Partition, bool InSummary) {
auto *ResI = Res.begin();
auto *ResE = Res.end();
(void)ResE;
const Triple TT(RegularLTO.CombinedModule->getTargetTriple());
for (const InputFile::Symbol &Sym : Syms) {
assert(ResI != ResE);
SymbolResolution Res = *ResI++;
Re-apply "[NFCI][LTO][lld] Optimize away symbol copies within LTO global resolution in ELF" (#107792) Fix the use-after-free bug and re-apply https://github.com/llvm/llvm-project/pull/106193 * Without the fix, the string referenced by `objSym.Name` could be destroyed even if string saver keeps a copy of the referenced string. This caused use-after-free. * The fix ([latest commit](https://github.com/llvm/llvm-project/pull/107792/commits/9776ed44cfb26172480145aed8f59ba78a6fa2ea)) updates `objSym.Name` to reference (via `StringRef`) the string saver's copy. Test: 1. For `lld/test/ELF/lto/asmundef.ll`, its test failure is reproducible with `-DLLVM_USE_SANITIZER=Address` and gone with the fix. 3. Run all tests by following https://github.com/google/sanitizers/wiki/SanitizerBotReproduceBuild#try-local-changes. * Without the fix, `ELF/lto/asmundef.ll` aborted the multi-stage test at `@@@BUILD_STEP stage2/asan_ubsan check@@@`, defined [here](https://github.com/llvm/llvm-zorg/blob/main/zorg/buildbot/builders/sanitizers/buildbot_fast.sh#L30) * With the fix, the [multi-stage test](https://github.com/llvm/llvm-zorg/blob/main/zorg/buildbot/builders/sanitizers/buildbot_fast.sh) pass stage2 {asan, ubsan, masan}. This is also the test used by https://lab.llvm.org/buildbot/#/builders/169 **Original commit message** `StringMap<T>` creates a [copy of the string](https://github.com/llvm/llvm-project/blob/d4c519e7b2ac21350ec08b23eda44bf4a2d3c974/llvm/include/llvm/ADT/StringMapEntry.h#L55-L58) for entry insertions and intentionally keep copies [since the implementation optimizes string memory usage](https://github.com/llvm/llvm-project/blob/d4c519e7b2ac21350ec08b23eda44bf4a2d3c974/llvm/include/llvm/ADT/StringMap.h#L124). On the other hand, linker keeps copies of symbol names [1] in `lld::elf::parseFiles` [2] before invoking `compileBitcodeFiles` [3]. This change proposes to optimize away string copies inside [LTO::GlobalResolutions](https://github.com/llvm/llvm-project/blob/24e791b4164986a1ca7776e3ae0292ef20d20c47/llvm/include/llvm/LTO/LTO.h#L409), which will make LTO indexing more memory efficient for ELF. There are similar opportunities for other (COFF, wasm, MachO) formats. The optimization takes place for lld (ELF) only. For the rest of use cases (gold plugin, `llvm-lto2`, etc), LTO owns a string saver to keep copies and use global resolution key for de-duplication. Together with @kazutakahirata's work to make `ComputeCrossModuleImport` more memory efficient, we see a ~20% peak memory usage reduction in a binary where peak memory usage needs to go down. Thanks to the optimization in https://github.com/llvm/llvm-project/commit/329ba523ccbbe68a12434926c92fd9a86494d958, the max (as opposed to the sum) of `ComputeCrossModuleImport` or `GlobalResolution` shows up in peak memory usage. * Regarding correctness, the set of [resolved](https://github.com/llvm/llvm-project/blob/80c47ad3aec9d7f22e1b1bdc88960a91b66f89f1/llvm/lib/LTO/LTO.cpp#L739) [per-module symbols](https://github.com/llvm/llvm-project/blob/80c47ad3aec9d7f22e1b1bdc88960a91b66f89f1/llvm/include/llvm/LTO/LTO.h#L188-L191) is a subset of [llvm::lto::InputFile::Symbols](https://github.com/llvm/llvm-project/blob/80c47ad3aec9d7f22e1b1bdc88960a91b66f89f1/llvm/include/llvm/LTO/LTO.h#L120). And bitcode symbol parsing saves symbol name when iterating `obj->symbols` in `BitcodeFile::parse` already. This change updates `BitcodeFile::parseLazy` to keep copies of per-module undefined symbols. 
* Presumably the undefined symbols in a LTO unit (copied in this patch in linker unique saver) is a small set compared with the set of symbols in global-resolution (copied before this patch), making this a worthwhile trade-off. Benchmarking this change alone shows measurable memory savings across various benchmarks. [1] ELF https://github.com/llvm/llvm-project/blob/1cea5c2138bef3d8fec75508df6dbb858e6e3560/lld/ELF/InputFiles.cpp#L1748 [2] https://github.com/llvm/llvm-project/blob/ef7b18a53c0d186dcda1e322be6035407fdedb55/lld/ELF/Driver.cpp#L2863 [3] https://github.com/llvm/llvm-project/blob/ef7b18a53c0d186dcda1e322be6035407fdedb55/lld/ELF/Driver.cpp#L2995
2024-09-09 11:16:58 -07:00
StringRef SymbolName = Sym.getName();
// Keep a copy of the symbol name if the LTO client asked us to.
if (GlobalResolutionSymbolSaver && !GlobalResolutions->contains(SymbolName))
SymbolName = GlobalResolutionSymbolSaver->save(SymbolName);
auto &GlobalRes = (*GlobalResolutions)[SymbolName];
GlobalRes.UnnamedAddr &= Sym.isUnnamedAddr();
if (Res.Prevailing) {
assert(!GlobalRes.Prevailing &&
"Multiple prevailing defs are not allowed");
GlobalRes.Prevailing = true;
GlobalRes.IRName = std::string(Sym.getIRName());
} else if (!GlobalRes.Prevailing && GlobalRes.IRName.empty()) {
// Sometimes there can be two copies of a symbol in a module, and the
// prevailing symbol can have no IR name. That can happen if the symbol is
// defined in a module-level inline asm block. If we have multiple modules
// with the same symbol, we want to use the IR name of the prevailing symbol.
// Otherwise, if we haven't seen a prevailing symbol, set the name so that
// we can later use it to check if there is any prevailing copy in IR.
GlobalRes.IRName = std::string(Sym.getIRName());
}
// On rare occasions, the symbol used to initialize GlobalRes has a different
// IRName from the inspected Symbol. This can happen on macOS and iOS when a
// symbol is referenced through its mangled name, say @"\01_symbol", while
// the IRName is @symbol (the prefix underscore comes from MachO mangling).
// In that case the same actual Symbol can get two different GUIDs, leading
// to invalid internalization. Work around this by marking the GlobalRes
// external.
// FIXME: instead of this check, it would be desirable to compute GUIDs
// based on mangled name, but this requires an access to the Target Triple
// and would be relatively invasive on the codebase.
if (GlobalRes.IRName != Sym.getIRName()) {
GlobalRes.Partition = GlobalResolution::External;
GlobalRes.VisibleOutsideSummary = true;
}
// Set the partition to external if we know the symbol is re-defined by the
// linker with the -defsym or -wrap options, or if it is used elsewhere,
// e.g. it is visible to a regular object, is referenced from
// llvm.compiler.used/llvm.used, or was already recorded as being referenced
// from a different partition.
if (Res.LinkerRedefined || Res.VisibleToRegularObj || Sym.isUsed() ||
(GlobalRes.Partition != GlobalResolution::Unknown &&
GlobalRes.Partition != Partition)) {
GlobalRes.Partition = GlobalResolution::External;
} else
// First recorded reference, save the current partition.
GlobalRes.Partition = Partition;
// Flag as visible outside of summary if visible from a regular object or
// from a module that does not have a summary.
GlobalRes.VisibleOutsideSummary |=
(Res.VisibleToRegularObj || Sym.isUsed() || !InSummary);
GlobalRes.ExportDynamic |= Res.ExportDynamic;
}
}
Re-apply "[NFCI][LTO][lld] Optimize away symbol copies within LTO global resolution in ELF" (#107792) Fix the use-after-free bug and re-apply https://github.com/llvm/llvm-project/pull/106193 * Without the fix, the string referenced by `objSym.Name` could be destroyed even if string saver keeps a copy of the referenced string. This caused use-after-free. * The fix ([latest commit](https://github.com/llvm/llvm-project/pull/107792/commits/9776ed44cfb26172480145aed8f59ba78a6fa2ea)) updates `objSym.Name` to reference (via `StringRef`) the string saver's copy. Test: 1. For `lld/test/ELF/lto/asmundef.ll`, its test failure is reproducible with `-DLLVM_USE_SANITIZER=Address` and gone with the fix. 3. Run all tests by following https://github.com/google/sanitizers/wiki/SanitizerBotReproduceBuild#try-local-changes. * Without the fix, `ELF/lto/asmundef.ll` aborted the multi-stage test at `@@@BUILD_STEP stage2/asan_ubsan check@@@`, defined [here](https://github.com/llvm/llvm-zorg/blob/main/zorg/buildbot/builders/sanitizers/buildbot_fast.sh#L30) * With the fix, the [multi-stage test](https://github.com/llvm/llvm-zorg/blob/main/zorg/buildbot/builders/sanitizers/buildbot_fast.sh) pass stage2 {asan, ubsan, masan}. This is also the test used by https://lab.llvm.org/buildbot/#/builders/169 **Original commit message** `StringMap<T>` creates a [copy of the string](https://github.com/llvm/llvm-project/blob/d4c519e7b2ac21350ec08b23eda44bf4a2d3c974/llvm/include/llvm/ADT/StringMapEntry.h#L55-L58) for entry insertions and intentionally keep copies [since the implementation optimizes string memory usage](https://github.com/llvm/llvm-project/blob/d4c519e7b2ac21350ec08b23eda44bf4a2d3c974/llvm/include/llvm/ADT/StringMap.h#L124). On the other hand, linker keeps copies of symbol names [1] in `lld::elf::parseFiles` [2] before invoking `compileBitcodeFiles` [3]. This change proposes to optimize away string copies inside [LTO::GlobalResolutions](https://github.com/llvm/llvm-project/blob/24e791b4164986a1ca7776e3ae0292ef20d20c47/llvm/include/llvm/LTO/LTO.h#L409), which will make LTO indexing more memory efficient for ELF. There are similar opportunities for other (COFF, wasm, MachO) formats. The optimization takes place for lld (ELF) only. For the rest of use cases (gold plugin, `llvm-lto2`, etc), LTO owns a string saver to keep copies and use global resolution key for de-duplication. Together with @kazutakahirata's work to make `ComputeCrossModuleImport` more memory efficient, we see a ~20% peak memory usage reduction in a binary where peak memory usage needs to go down. Thanks to the optimization in https://github.com/llvm/llvm-project/commit/329ba523ccbbe68a12434926c92fd9a86494d958, the max (as opposed to the sum) of `ComputeCrossModuleImport` or `GlobalResolution` shows up in peak memory usage. * Regarding correctness, the set of [resolved](https://github.com/llvm/llvm-project/blob/80c47ad3aec9d7f22e1b1bdc88960a91b66f89f1/llvm/lib/LTO/LTO.cpp#L739) [per-module symbols](https://github.com/llvm/llvm-project/blob/80c47ad3aec9d7f22e1b1bdc88960a91b66f89f1/llvm/include/llvm/LTO/LTO.h#L188-L191) is a subset of [llvm::lto::InputFile::Symbols](https://github.com/llvm/llvm-project/blob/80c47ad3aec9d7f22e1b1bdc88960a91b66f89f1/llvm/include/llvm/LTO/LTO.h#L120). And bitcode symbol parsing saves symbol name when iterating `obj->symbols` in `BitcodeFile::parse` already. This change updates `BitcodeFile::parseLazy` to keep copies of per-module undefined symbols. 
* Presumably the undefined symbols in a LTO unit (copied in this patch in linker unique saver) is a small set compared with the set of symbols in global-resolution (copied before this patch), making this a worthwhile trade-off. Benchmarking this change alone shows measurable memory savings across various benchmarks. [1] ELF https://github.com/llvm/llvm-project/blob/1cea5c2138bef3d8fec75508df6dbb858e6e3560/lld/ELF/InputFiles.cpp#L1748 [2] https://github.com/llvm/llvm-project/blob/ef7b18a53c0d186dcda1e322be6035407fdedb55/lld/ELF/Driver.cpp#L2863 [3] https://github.com/llvm/llvm-project/blob/ef7b18a53c0d186dcda1e322be6035407fdedb55/lld/ELF/Driver.cpp#L2995
2024-09-09 11:16:58 -07:00
void LTO::releaseGlobalResolutionsMemory() {
// Release GlobalResolutions dense-map itself.
GlobalResolutions.reset();
// Release the string saver memory.
GlobalResolutionSymbolSaver.reset();
Alloc.reset();
}
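// Write the linker's resolutions for the symbols in \p Input to the
// resolution file, one "-r=<file>,<symbol>,<flags>" line per symbol, where
// the flags are p (prevailing), l (final definition in linkage unit),
// x (visible to regular object) and r (linker redefined). For example, a
// prevailing definition of foo in a.o that is also visible to a regular
// object is recorded as "-r=a.o,foo,px".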
static void writeToResolutionFile(raw_ostream &OS, InputFile *Input,
ArrayRef<SymbolResolution> Res) {
StringRef Path = Input->getName();
OS << Path << '\n';
auto ResI = Res.begin();
for (const InputFile::Symbol &Sym : Input->symbols()) {
assert(ResI != Res.end());
SymbolResolution Res = *ResI++;
OS << "-r=" << Path << ',' << Sym.getName() << ',';
if (Res.Prevailing)
OS << 'p';
if (Res.FinalDefinitionInLinkageUnit)
OS << 'l';
if (Res.VisibleToRegularObj)
OS << 'x';
if (Res.LinkerRedefined)
OS << 'r';
OS << '\n';
}
OS.flush();
assert(ResI == Res.end());
}
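// Add an input file and the linker's resolutions for its symbols to the
// link. The resolutions must be given in the same order as the symbols
// appear in \p Input.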
Error LTO::add(std::unique_ptr<InputFile> Input,
ArrayRef<SymbolResolution> Res) {
assert(!CalledGetMaxTasks);
if (Conf.ResolutionFile)
writeToResolutionFile(*Conf.ResolutionFile, Input.get(), Res);
if (RegularLTO.CombinedModule->getTargetTriple().empty()) {
Triple InputTriple(Input->getTargetTriple());
RegularLTO.CombinedModule->setTargetTriple(InputTriple);
if (InputTriple.isOSBinFormatELF())
Conf.VisibilityScheme = Config::ELF;
}
const SymbolResolution *ResI = Res.begin();
for (unsigned I = 0; I != Input->Mods.size(); ++I)
if (Error Err = addModule(*Input, I, ResI, Res.end()))
return Err;
assert(ResI == Res.end());
return Error::success();
}
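// Add a single bitcode module from \p Input to the link, routing it to
// either the ThinLTO or the regular LTO pipeline based on its LTO info and
// the selected (possibly unified) LTO mode.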
Error LTO::addModule(InputFile &Input, unsigned ModI,
const SymbolResolution *&ResI,
const SymbolResolution *ResE) {
Expected<BitcodeLTOInfo> LTOInfo = Input.Mods[ModI].getLTOInfo();
if (!LTOInfo)
return LTOInfo.takeError();
if (EnableSplitLTOUnit) {
// If only some modules were split, flag this in the index so that
// we can skip or error on optimizations that need consistently split
// modules (whole program devirt and lower type tests).
if (*EnableSplitLTOUnit != LTOInfo->EnableSplitLTOUnit)
ThinLTO.CombinedIndex.setPartiallySplitLTOUnits();
} else
EnableSplitLTOUnit = LTOInfo->EnableSplitLTOUnit;
BitcodeModule BM = Input.Mods[ModI];
if ((LTOMode == LTOK_UnifiedRegular || LTOMode == LTOK_UnifiedThin) &&
!LTOInfo->UnifiedLTO)
return make_error<StringError>(
"unified LTO compilation must use "
"compatible bitcode modules (use -funified-lto)",
inconvertibleErrorCode());
if (LTOInfo->UnifiedLTO && LTOMode == LTOK_Default)
LTOMode = LTOK_UnifiedThin;
bool IsThinLTO = LTOInfo->IsThinLTO && (LTOMode != LTOK_UnifiedRegular);
auto ModSyms = Input.module_symbols(ModI);
addModuleToGlobalRes(ModSyms, {ResI, ResE},
IsThinLTO ? ThinLTO.ModuleMap.size() + 1 : 0,
LTOInfo->HasSummary);
if (IsThinLTO)
return addThinLTO(BM, ModSyms, ResI, ResE);
RegularLTO.EmptyCombinedModule = false;
Expected<RegularLTOState::AddedModule> ModOrErr =
addRegularLTO(BM, ModSyms, ResI, ResE);
if (!ModOrErr)
return ModOrErr.takeError();
if (!LTOInfo->HasSummary)
return linkRegularLTO(std::move(*ModOrErr), /*LivenessFromIndex=*/false);
// Regular LTO module summaries are added to a dummy module that represents
// the combined regular LTO module.
if (Error Err = BM.readSummary(ThinLTO.CombinedIndex, ""))
return Err;
RegularLTO.ModsWithSummaries.push_back(std::move(*ModOrErr));
return Error::success();
}
// Checks whether the given global value is in a non-prevailing comdat
// (comdat containing values the linker indicated were not prevailing,
// which we then dropped to available_externally), and if so, removes
// it from the comdat. This is called for all global values to ensure the
// comdat is empty rather than leaving an incomplete comdat. It is needed for
// regular LTO modules in case this is a mixed LTO compilation (with both
// regular and ThinLTO modules). Since the regular LTO module will be
// linked first in the final native link, we want to make sure the linker
// doesn't select any of these incomplete comdats that would be left
// in the regular LTO module without this cleanup.
static void
handleNonPrevailingComdat(GlobalValue &GV,
std::set<const Comdat *> &NonPrevailingComdats) {
Comdat *C = GV.getComdat();
if (!C)
return;
if (!NonPrevailingComdats.count(C))
return;
// Additionally, we need to drop all global values in the comdat to
// available_externally, to satisfy the COMDAT requirement that all members
// are discarded as a unit. For the non-local linkage global values this
// also avoids duplicate-definition linker errors.
GV.setLinkage(GlobalValue::AvailableExternallyLinkage);
if (auto GO = dyn_cast<GlobalObject>(&GV))
GO->setComdat(nullptr);
}
// Add a regular LTO object to the link.
// The resulting module needs to be linked into the combined LTO module with
// linkRegularLTO.
Expected<LTO::RegularLTOState::AddedModule>
LTO::addRegularLTO(BitcodeModule BM, ArrayRef<InputFile::Symbol> Syms,
const SymbolResolution *&ResI,
const SymbolResolution *ResE) {
RegularLTOState::AddedModule Mod;
Expected<std::unique_ptr<Module>> MOrErr =
BM.getLazyModule(RegularLTO.Ctx, /*ShouldLazyLoadMetadata*/ true,
/*IsImporting*/ false);
if (!MOrErr)
return MOrErr.takeError();
Module &M = **MOrErr;
Mod.M = std::move(*MOrErr);
if (Error Err = M.materializeMetadata())
return std::move(Err);
// If cfi.functions is present and we are in regular LTO mode, LowerTypeTests
// will rename local functions in the merged module as "<function name>.1".
// This causes linking errors, since other parts of the module expect the
// original function name.
if (LTOMode == LTOK_UnifiedRegular)
if (NamedMDNode *CfiFunctionsMD = M.getNamedMetadata("cfi.functions"))
M.eraseNamedMetadata(CfiFunctionsMD);
UpgradeDebugInfo(M);
ModuleSymbolTable SymTab;
SymTab.addModule(&M);
for (GlobalVariable &GV : M.globals())
if (GV.hasAppendingLinkage())
Mod.Keep.push_back(&GV);
DenseSet<GlobalObject *> AliasedGlobals;
for (auto &GA : M.aliases())
if (GlobalObject *GO = GA.getAliaseeObject())
AliasedGlobals.insert(GO);
// In this function we need IR GlobalValues matching the symbols in Syms
// (which is not backed by a module), so we need to enumerate them in the same
// order. The symbol enumeration order of a ModuleSymbolTable intentionally
// matches the order of an irsymtab, but when we read the irsymtab in
// InputFile::create we omit some symbols that are irrelevant to LTO. The
// Skip() function skips the same symbols from the module as InputFile does
// from the symbol table.
auto MsymI = SymTab.symbols().begin(), MsymE = SymTab.symbols().end();
auto Skip = [&]() {
while (MsymI != MsymE) {
auto Flags = SymTab.getSymbolFlags(*MsymI);
if ((Flags & object::BasicSymbolRef::SF_Global) &&
!(Flags & object::BasicSymbolRef::SF_FormatSpecific))
return;
++MsymI;
}
};
Skip();
std::set<const Comdat *> NonPrevailingComdats;
SmallSet<StringRef, 2> NonPrevailingAsmSymbols;
for (const InputFile::Symbol &Sym : Syms) {
assert(ResI != ResE);
SymbolResolution Res = *ResI++;
assert(MsymI != MsymE);
ModuleSymbolTable::Symbol Msym = *MsymI++;
Skip();
if (GlobalValue *GV = dyn_cast_if_present<GlobalValue *>(Msym)) {
if (Res.Prevailing) {
if (Sym.isUndefined())
continue;
Mod.Keep.push_back(GV);
// For symbols re-defined with linker -wrap and -defsym options,
// set the linkage to weak to inhibit IPO. The linkage will be
// restored by the linker.
if (Res.LinkerRedefined)
GV->setLinkage(GlobalValue::WeakAnyLinkage);
GlobalValue::LinkageTypes OriginalLinkage = GV->getLinkage();
if (GlobalValue::isLinkOnceLinkage(OriginalLinkage))
GV->setLinkage(GlobalValue::getWeakLinkage(
GlobalValue::isLinkOnceODRLinkage(OriginalLinkage)));
} else if (isa<GlobalObject>(GV) &&
(GV->hasLinkOnceODRLinkage() || GV->hasWeakODRLinkage() ||
GV->hasAvailableExternallyLinkage()) &&
!AliasedGlobals.count(cast<GlobalObject>(GV))) {
// Any of the above three types of linkage indicates that the
// chosen prevailing symbol will have the same semantics as this copy of
// the symbol, so we may be able to link it with available_externally
// linkage. We will decide later whether to do that when we link this
// module (in linkRegularLTO), based on whether it is undefined.
Mod.Keep.push_back(GV);
GV->setLinkage(GlobalValue::AvailableExternallyLinkage);
if (GV->hasComdat())
NonPrevailingComdats.insert(GV->getComdat());
cast<GlobalObject>(GV)->setComdat(nullptr);
}
// Set the 'local' flag based on the linker resolution for this symbol.
if (Res.FinalDefinitionInLinkageUnit) {
GV->setDSOLocal(true);
if (GV->hasDLLImportStorageClass())
GV->setDLLStorageClass(GlobalValue::DLLStorageClassTypes::
DefaultStorageClass);
}
} else if (auto *AS =
dyn_cast_if_present<ModuleSymbolTable::AsmSymbol *>(Msym)) {
// Collect non-prevailing symbols.
if (!Res.Prevailing)
NonPrevailingAsmSymbols.insert(AS->first);
} else {
llvm_unreachable("unknown symbol type");
}
// Common resolution: collect the maximum size/alignment over all commons.
// We also record if we see an instance of a common as prevailing, so that
// if none is prevailing we can ignore it later.
if (Sym.isCommon()) {
// FIXME: We should figure out what to do about commons defined by asm.
// For now they aren't reported correctly by ModuleSymbolTable.
auto &CommonRes = RegularLTO.Commons[std::string(Sym.getIRName())];
CommonRes.Size = std::max(CommonRes.Size, Sym.getCommonSize());
if (uint32_t SymAlignValue = Sym.getCommonAlignment()) {
CommonRes.Alignment =
std::max(Align(SymAlignValue), CommonRes.Alignment);
}
CommonRes.Prevailing |= Res.Prevailing;
}
}
if (!M.getComdatSymbolTable().empty())
for (GlobalValue &GV : M.global_values())
handleNonPrevailingComdat(GV, NonPrevailingComdats);
// Prepend ".lto_discard <sym>, <sym>*" directive to each module inline asm
// block.
if (!M.getModuleInlineAsm().empty()) {
std::string NewIA = ".lto_discard";
if (!NonPrevailingAsmSymbols.empty()) {
// Don't discard a symbol if there is a live .symver for it.
ModuleSymbolTable::CollectAsmSymvers(
M, [&](StringRef Name, StringRef Alias) {
if (!NonPrevailingAsmSymbols.count(Alias))
NonPrevailingAsmSymbols.erase(Name);
});
NewIA += " " + llvm::join(NonPrevailingAsmSymbols, ", ");
}
NewIA += "\n";
M.setModuleInlineAsm(NewIA + M.getModuleInlineAsm());
}
assert(MsymI == MsymE);
return std::move(Mod);
}
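// Link a module previously added with addRegularLTO into the combined
// module. Values that the combined index shows to be dead are skipped, and
// available_externally definitions are only linked in if the combined
// module does not already have a definition.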
Error LTO::linkRegularLTO(RegularLTOState::AddedModule Mod,
bool LivenessFromIndex) {
std::vector<GlobalValue *> Keep;
for (GlobalValue *GV : Mod.Keep) {
if (LivenessFromIndex && !ThinLTO.CombinedIndex.isGUIDLive(GV->getGUID())) {
if (Function *F = dyn_cast<Function>(GV)) {
if (DiagnosticOutputFile) {
if (Error Err = F->materialize())
return Err;
OptimizationRemarkEmitter ORE(F, nullptr);
ORE.emit(OptimizationRemark(DEBUG_TYPE, "deadfunction", F)
<< ore::NV("Function", F)
<< " not added to the combined module ");
}
}
continue;
}
if (!GV->hasAvailableExternallyLinkage()) {
Keep.push_back(GV);
continue;
}
// Only link available_externally definitions if we don't already have a
// definition.
GlobalValue *CombinedGV =
RegularLTO.CombinedModule->getNamedValue(GV->getName());
if (CombinedGV && !CombinedGV->isDeclaration())
continue;
Keep.push_back(GV);
}
return RegularLTO.Mover->move(std::move(Mod.M), Keep, nullptr,
/* IsPerformingImport */ false);
}
// Add a ThinLTO module to the link.
Error LTO::addThinLTO(BitcodeModule BM, ArrayRef<InputFile::Symbol> Syms,
const SymbolResolution *&ResI,
const SymbolResolution *ResE) {
const SymbolResolution *ResITmp = ResI;
for (const InputFile::Symbol &Sym : Syms) {
assert(ResITmp != ResE);
SymbolResolution Res = *ResITmp++;
if (!Sym.getIRName().empty()) {
auto GUID = GlobalValue::getGUID(GlobalValue::getGlobalIdentifier(
Sym.getIRName(), GlobalValue::ExternalLinkage, ""));
if (Res.Prevailing)
ThinLTO.PrevailingModuleForGUID[GUID] = BM.getModuleIdentifier();
}
}
if (Error Err =
BM.readSummary(ThinLTO.CombinedIndex, BM.getModuleIdentifier(),
[&](GlobalValue::GUID GUID) {
return ThinLTO.PrevailingModuleForGUID[GUID] ==
BM.getModuleIdentifier();
}))
return Err;
LLVM_DEBUG(dbgs() << "Module " << BM.getModuleIdentifier() << "\n");
for (const InputFile::Symbol &Sym : Syms) {
assert(ResI != ResE);
SymbolResolution Res = *ResI++;
if (!Sym.getIRName().empty()) {
auto GUID = GlobalValue::getGUID(GlobalValue::getGlobalIdentifier(
Sym.getIRName(), GlobalValue::ExternalLinkage, ""));
if (Res.Prevailing) {
assert(ThinLTO.PrevailingModuleForGUID[GUID] ==
BM.getModuleIdentifier());
// For linker redefined symbols (via --wrap or --defsym) we want to
// switch the linkage to `weak` to prevent IPOs from happening.
// Find the summary in the module for this very GV and record the new
// linkage so that we can switch it when we import the GV.
if (Res.LinkerRedefined)
if (auto S = ThinLTO.CombinedIndex.findSummaryInModule(
GUID, BM.getModuleIdentifier()))
S->setLinkage(GlobalValue::WeakAnyLinkage);
}
// If the linker resolved the symbol to a local definition then mark it
// as local in the summary for the module we are adding.
if (Res.FinalDefinitionInLinkageUnit) {
if (auto S = ThinLTO.CombinedIndex.findSummaryInModule(
GUID, BM.getModuleIdentifier())) {
S->setDSOLocal(true);
}
}
}
}
if (!ThinLTO.ModuleMap.insert({BM.getModuleIdentifier(), BM}).second)
return make_error<StringError>(
"Expected at most one ThinLTO module per bitcode file",
inconvertibleErrorCode());
if (!Conf.ThinLTOModulesToCompile.empty()) {
if (!ThinLTO.ModulesToCompile)
ThinLTO.ModulesToCompile = ModuleMapType();
// This is fuzzy name matching: only modules whose names contain one of the
// specified switch values are going to be compiled.
for (const std::string &Name : Conf.ThinLTOModulesToCompile) {
if (BM.getModuleIdentifier().contains(Name)) {
ThinLTO.ModulesToCompile->insert({BM.getModuleIdentifier(), BM});
LLVM_DEBUG(dbgs() << "[ThinLTO] Selecting " << BM.getModuleIdentifier()
<< " to compile\n");
}
}
}
return Error::success();
}
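// Return an upper bound on the number of backend tasks: the regular LTO
// parallel code generation level plus one task per ThinLTO module selected
// for compilation.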
unsigned LTO::getMaxTasks() const {
CalledGetMaxTasks = true;
auto ModuleCount = ThinLTO.ModulesToCompile ? ThinLTO.ModulesToCompile->size()
: ThinLTO.ModuleMap.size();
return RegularLTO.ParallelCodeGenParallelismLevel + ModuleCount;
}
// If only some of the modules were split, we cannot correctly handle
// code that contains type tests or type checked loads.
Error LTO::checkPartiallySplit() {
if (!ThinLTO.CombinedIndex.partiallySplitLTOUnits())
return Error::success();
const Module *Combined = RegularLTO.CombinedModule.get();
Function *TypeTestFunc =
Intrinsic::getDeclarationIfExists(Combined, Intrinsic::type_test);
Function *TypeCheckedLoadFunc =
Intrinsic::getDeclarationIfExists(Combined, Intrinsic::type_checked_load);
Function *TypeCheckedLoadRelativeFunc = Intrinsic::getDeclarationIfExists(
Combined, Intrinsic::type_checked_load_relative);
// First check if there are type tests / type checked loads in the
// merged regular LTO module IR.
if ((TypeTestFunc && !TypeTestFunc->use_empty()) ||
(TypeCheckedLoadFunc && !TypeCheckedLoadFunc->use_empty()) ||
(TypeCheckedLoadRelativeFunc &&
!TypeCheckedLoadRelativeFunc->use_empty()))
return make_error<StringError>(
"inconsistent LTO Unit splitting (recompile with -fsplit-lto-unit)",
inconvertibleErrorCode());
// Otherwise check if there are any recorded in the combined summary from the
// ThinLTO modules.
for (auto &P : ThinLTO.CombinedIndex) {
for (auto &S : P.second.SummaryList) {
auto *FS = dyn_cast<FunctionSummary>(S.get());
if (!FS)
continue;
if (!FS->type_test_assume_vcalls().empty() ||
!FS->type_checked_load_vcalls().empty() ||
!FS->type_test_assume_const_vcalls().empty() ||
!FS->type_checked_load_const_vcalls().empty() ||
!FS->type_tests().empty())
return make_error<StringError>(
"inconsistent LTO Unit splitting (recompile with -fsplit-lto-unit)",
inconvertibleErrorCode());
}
}
return Error::success();
}
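// Drive the link: compute dead symbols from the linker resolutions, set up
// the statistics file if requested, then run regular LTO followed by
// ThinLTO.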
Error LTO::run(AddStreamFn AddStream, FileCache Cache) {
// Compute "dead" symbols, we don't want to import/export these!
DenseSet<GlobalValue::GUID> GUIDPreservedSymbols;
DenseMap<GlobalValue::GUID, PrevailingType> GUIDPrevailingResolutions;
for (auto &Res : *GlobalResolutions) {
// Normally a resolution has the IR name of the symbol; otherwise we can do
// nothing here. See comments in the GlobalResolution struct for details.
if (Res.second.IRName.empty())
continue;
GlobalValue::GUID GUID = GlobalValue::getGUID(
GlobalValue::dropLLVMManglingEscape(Res.second.IRName));
if (Res.second.VisibleOutsideSummary && Res.second.Prevailing)
GUIDPreservedSymbols.insert(GUID);
if (Res.second.ExportDynamic)
DynamicExportSymbols.insert(GUID);
GUIDPrevailingResolutions[GUID] =
Res.second.Prevailing ? PrevailingType::Yes : PrevailingType::No;
}
auto isPrevailing = [&](GlobalValue::GUID G) {
auto It = GUIDPrevailingResolutions.find(G);
if (It == GUIDPrevailingResolutions.end())
return PrevailingType::Unknown;
return It->second;
};
computeDeadSymbolsWithConstProp(ThinLTO.CombinedIndex, GUIDPreservedSymbols,
isPrevailing, Conf.OptLevel > 0);
// Setup output file to emit statistics.
auto StatsFileOrErr = setupStatsFile(Conf.StatsFile);
if (!StatsFileOrErr)
return StatsFileOrErr.takeError();
std::unique_ptr<ToolOutputFile> StatsFile = std::move(StatsFileOrErr.get());
// TODO: Ideally this would be controlled automatically by detecting that we
// are linking with an allocator that supports these interfaces, rather than
// an internal option (which would still be needed for tests, however). For
// example, if the library exported a symbol like __malloc_hot_cold the linker
// could recognize that and set a flag in the lto::Config.
if (SupportsHotColdNew)
ThinLTO.CombinedIndex.setWithSupportsHotColdNew();
Error Result = runRegularLTO(AddStream);
if (!Result)
// This will release the GlobalResolutions map once done with it to
// reduce peak memory before importing.
Result = runThinLTO(AddStream, Cache, GUIDPreservedSymbols);
if (StatsFile)
PrintStatisticsJSON(StatsFile->os());
return Result;
}
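// If the combined index indicates that this link does not support the
// hot/cold operator new interfaces, strip memprof attributes and metadata
// from \p Mod so that no calls to those interfaces are generated.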
void lto::updateMemProfAttributes(Module &Mod,
const ModuleSummaryIndex &Index) {
if (Index.withSupportsHotColdNew())
return;
// The profile matcher applies hotness attributes directly for allocations,
// and those will cause us to generate calls to the hot/cold interfaces
// unconditionally. If supports-hot-cold-new was not enabled in the LTO
// link then assume we don't want these calls (e.g. not linking with
// the appropriate library, or otherwise trying to disable this behavior).
for (auto &F : Mod) {
for (auto &BB : F) {
for (auto &I : BB) {
auto *CI = dyn_cast<CallBase>(&I);
if (!CI)
continue;
if (CI->hasFnAttr("memprof"))
CI->removeFnAttr("memprof");
// Strip off all memprof metadata as it is no longer needed.
// Importantly, this avoids the addition of new memprof attributes
// after inlining propagation.
// TODO: If we support additional types of MemProf metadata beyond hot
// and cold, we will need to update the metadata based on the allocator
// APIs supported instead of completely stripping all.
CI->setMetadata(LLVMContext::MD_memprof, nullptr);
CI->setMetadata(LLVMContext::MD_callsite, nullptr);
}
}
}
}
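// Run the regular LTO pipeline: set up optimization remarks, finalize
// linking of summary-carrying regular LTO modules using liveness from the
// combined index, check for inconsistent LTO unit splitting, and give
// common symbols their final (largest prevailing) size and alignment.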
Error LTO::runRegularLTO(AddStreamFn AddStream) {
// Setup optimization remarks.
auto DiagFileOrErr = lto::setupLLVMOptimizationRemarks(
RegularLTO.CombinedModule->getContext(), Conf.RemarksFilename,
Conf.RemarksPasses, Conf.RemarksFormat, Conf.RemarksWithHotness,
Conf.RemarksHotnessThreshold);
LLVM_DEBUG(dbgs() << "Running regular LTO\n");
if (!DiagFileOrErr)
return DiagFileOrErr.takeError();
DiagnosticOutputFile = std::move(*DiagFileOrErr);
// Finalize linking of regular LTO modules containing summaries now that
// we have computed liveness information.
for (auto &M : RegularLTO.ModsWithSummaries)
if (Error Err = linkRegularLTO(std::move(M),
/*LivenessFromIndex=*/true))
return Err;
// Ensure we don't have inconsistently split LTO units with type tests.
// FIXME: this checks both regular LTO and ThinLTO. It happens to work as we
// take this path in both cases, but eventually this should be split in two,
// with the ThinLTO checks done in `runThinLTO`.
if (Error Err = checkPartiallySplit())
return Err;
// Make sure commons have the right size/alignment: we kept the largest from
// all the prevailing copies when adding the inputs, and we apply it here.
const DataLayout &DL = RegularLTO.CombinedModule->getDataLayout();
for (auto &I : RegularLTO.Commons) {
if (!I.second.Prevailing)
// Don't do anything if no instance of this common was prevailing.
continue;
GlobalVariable *OldGV = RegularLTO.CombinedModule->getNamedGlobal(I.first);
if (OldGV && DL.getTypeAllocSize(OldGV->getValueType()) == I.second.Size) {
// Don't create a new global if the type is already correct, just make
// sure the alignment is correct.
OldGV->setAlignment(I.second.Alignment);
continue;
}
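// Otherwise, recreate the common as a zero-initialized i8 array of the
// prevailing (largest) size, redirect any uses of an existing global to the
// new one, and apply the prevailing alignment.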
ArrayType *Ty =
ArrayType::get(Type::getInt8Ty(RegularLTO.Ctx), I.second.Size);
auto *GV = new GlobalVariable(*RegularLTO.CombinedModule, Ty, false,
GlobalValue::CommonLinkage,
ConstantAggregateZero::get(Ty), "");
GV->setAlignment(I.second.Alignment);
if (OldGV) {
OldGV->replaceAllUsesWith(GV);
GV->takeName(OldGV);
OldGV->eraseFromParent();
} else {
GV->setName(I.first);
}
}
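// Strip memprof hot/cold attributes and metadata from the combined module
// when the combined index indicates the link does not use an allocator that
// supports the hot/cold operator new interfaces.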
updateMemProfAttributes(*RegularLTO.CombinedModule, ThinLTO.CombinedIndex);
bool WholeProgramVisibilityEnabledInLTO =
Conf.HasWholeProgramVisibility &&
// If validation is enabled, upgrade visibility only when all vtables
// have typeinfos.
(!Conf.ValidateAllVtablesHaveTypeInfos || Conf.AllVtablesHaveTypeInfos);
// This returns true when the name is local or not defined. Locals are
// expected to be handled separately.
auto IsVisibleToRegularObj = [&](StringRef name) {
auto It = GlobalResolutions->find(name);
return (It == GlobalResolutions->end() ||
It->second.VisibleOutsideSummary || !It->second.Prevailing);
};
// If allowed, upgrade public vcall visibility metadata to linkage unit
// visibility before whole program devirtualization in the optimizer.
updateVCallVisibilityInModule(
*RegularLTO.CombinedModule, WholeProgramVisibilityEnabledInLTO,
DynamicExportSymbols, Conf.ValidateAllVtablesHaveTypeInfos,
IsVisibleToRegularObj);
updatePublicTypeTestCalls(*RegularLTO.CombinedModule,
WholeProgramVisibilityEnabledInLTO);
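// Give the client a chance to inspect the combined module before
// optimization; returning false from the hook stops regular LTO here.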
if (Conf.PreOptModuleHook &&
!Conf.PreOptModuleHook(0, *RegularLTO.CombinedModule))
return finalizeOptimizationRemarks(std::move(DiagnosticOutputFile));
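// Apply symbol resolutions to the combined module: propagate unnamed_addr
// and, when internalization is enabled, internalize prevailing definitions
// that are confined to the regular LTO partition.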
if (!Conf.CodeGenOnly) {
for (const auto &R : *GlobalResolutions) {
GlobalValue *GV =
RegularLTO.CombinedModule->getNamedValue(R.second.IRName);
if (!R.second.isPrevailingIRSymbol())
continue;
if (R.second.Partition != 0 &&
R.second.Partition != GlobalResolution::External)
continue;
// Ignore symbols defined in other partitions.
// Also skip declarations, which are not allowed to have internal linkage.
if (!GV || GV->hasLocalLinkage() || GV->isDeclaration())
continue;
// Symbols that are marked DLLImport or DLLExport should not be
// internalized, as they are either externally visible or referencing
// external symbols. Symbols that have AvailableExternally or Appending
// linkage might be used by future passes and should be kept as is.
// These linkages are seen in Unified regular LTO, because the process
// of creating split LTO units introduces symbols with that linkage into
// one of the created modules. Normally, only the ThinLTO backend would
// compile this module, but Unified Regular LTO processes both
// modules created by the splitting process as regular LTO modules.
if ((LTOMode == LTOKind::LTOK_UnifiedRegular) &&
((GV->getDLLStorageClass() != GlobalValue::DefaultStorageClass) ||
GV->hasAvailableExternallyLinkage() || GV->hasAppendingLinkage()))
continue;
GV->setUnnamedAddr(R.second.UnnamedAddr ? GlobalValue::UnnamedAddr::Global
: GlobalValue::UnnamedAddr::None);
if (EnableLTOInternalization && R.second.Partition == 0)
GV->setLinkage(GlobalValue::InternalLinkage);
}
if (Conf.PostInternalizeModuleHook &&
!Conf.PostInternalizeModuleHook(0, *RegularLTO.CombinedModule))
return finalizeOptimizationRemarks(std::move(DiagnosticOutputFile));
}
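// Run optimization and code generation on the combined module unless it is
// empty and we were not asked to always emit a regular LTO object.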
if (!RegularLTO.EmptyCombinedModule || Conf.AlwaysEmitRegularLTOObj) {
if (Error Err =
backend(Conf, AddStream, RegularLTO.ParallelCodeGenParallelismLevel,
*RegularLTO.CombinedModule, ThinLTO.CombinedIndex))
return Err;
}
return finalizeOptimizationRemarks(std::move(DiagnosticOutputFile));
}
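// Return the names of the runtime library calls for the given target,
// skipping entries that have no name (null pointers) for this triple.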
SmallVector<const char *> LTO::getRuntimeLibcallSymbols(const Triple &TT) {
RTLIB::RuntimeLibcallsInfo Libcalls(TT);
SmallVector<const char *> LibcallSymbols;
copy_if(Libcalls.getLibcallNames(), std::back_inserter(LibcallSymbols),
[](const char *Name) { return Name; });
return LibcallSymbols;
}
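// Write the per-module outputs used by distributed ThinLTO backends: an
// individual <NewModulePath>.thinlto.bc index covering this module and its
// imports and, if requested, a .imports file listing the modules it imports
// from.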
Error ThinBackendProc::emitFiles(
const FunctionImporter::ImportMapTy &ImportList, llvm::StringRef ModulePath,
const std::string &NewModulePath) const {
ModuleToSummariesForIndexTy ModuleToSummariesForIndex;
GVSummaryPtrSet DeclarationSummaries;
std::error_code EC;
gatherImportedSummariesForModule(ModulePath, ModuleToDefinedGVSummaries,
ImportList, ModuleToSummariesForIndex,
DeclarationSummaries);
raw_fd_ostream OS(NewModulePath + ".thinlto.bc", EC,
sys::fs::OpenFlags::OF_None);
if (EC)
return createFileError("cannot open " + NewModulePath + ".thinlto.bc", EC);
writeIndexToFile(CombinedIndex, OS, &ModuleToSummariesForIndex,
&DeclarationSummaries);
if (ShouldEmitImportsFiles) {
Error ImportFilesError = EmitImportsFiles(
ModulePath, NewModulePath + ".imports", ModuleToSummariesForIndex);
if (ImportFilesError)
return ImportFilesError;
}
return Error::success();
}
namespace {
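// ThinLTO backend that runs optimization and code generation for each module
// in the current process on a thread pool, consulting the file cache (when
// valid) to reuse previously generated objects.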
class InProcessThinBackend : public ThinBackendProc {
protected:
AddStreamFn AddStream;
FileCache Cache;
DenseSet<GlobalValue::GUID> CfiFunctionDefs;
DenseSet<GlobalValue::GUID> CfiFunctionDecls;
bool ShouldEmitIndexFiles;
public:
InProcessThinBackend(
const Config &Conf, ModuleSummaryIndex &CombinedIndex,
ThreadPoolStrategy ThinLTOParallelism,
const DenseMap<StringRef, GVSummaryMapTy> &ModuleToDefinedGVSummaries,
AddStreamFn AddStream, FileCache Cache, lto::IndexWriteCallback OnWrite,
bool ShouldEmitIndexFiles, bool ShouldEmitImportsFiles)
: ThinBackendProc(Conf, CombinedIndex, ModuleToDefinedGVSummaries,
OnWrite, ShouldEmitImportsFiles, ThinLTOParallelism),
AddStream(std::move(AddStream)), Cache(std::move(Cache)),
ShouldEmitIndexFiles(ShouldEmitIndexFiles) {
auto &Defs = CombinedIndex.cfiFunctionDefs();
CfiFunctionDefs.insert_range(Defs.guids());
auto &Decls = CombinedIndex.cfiFunctionDecls();
CfiFunctionDecls.insert_range(Decls.guids());
}
virtual Error runThinLTOBackendThread(
AddStreamFn AddStream, FileCache Cache, unsigned Task, BitcodeModule BM,
ModuleSummaryIndex &CombinedIndex,
const FunctionImporter::ImportMapTy &ImportList,
const FunctionImporter::ExportSetTy &ExportList,
const std::map<GlobalValue::GUID, GlobalValue::LinkageTypes> &ResolvedODR,
const GVSummaryMapTy &DefinedGlobals,
MapVector<StringRef, BitcodeModule> &ModuleMap) {
auto RunThinBackend = [&](AddStreamFn AddStream) {
LTOLLVMContext BackendContext(Conf);
Expected<std::unique_ptr<Module>> MOrErr = BM.parseModule(BackendContext);
if (!MOrErr)
return MOrErr.takeError();
return thinBackend(Conf, Task, AddStream, **MOrErr, CombinedIndex,
ImportList, DefinedGlobals, &ModuleMap,
Conf.CodeGenOnly);
};
auto ModuleID = BM.getModuleIdentifier();
if (ShouldEmitIndexFiles) {
if (auto E = emitFiles(ImportList, ModuleID, ModuleID.str()))
return E;
}
if (!Cache.isValid() || !CombinedIndex.modulePaths().count(ModuleID) ||
all_of(CombinedIndex.getModuleHash(ModuleID),
[](uint32_t V) { return V == 0; }))
// Cache disabled or no entry for this module in the combined index or
// no module hash.
return RunThinBackend(AddStream);
// The module may be cached; compute the key used to query or populate the
// cache.
std::string Key = computeLTOCacheKey(
Conf, CombinedIndex, ModuleID, ImportList, ExportList, ResolvedODR,
DefinedGlobals, CfiFunctionDefs, CfiFunctionDecls);
Expected<AddStreamFn> CacheAddStreamOrErr = Cache(Task, Key, ModuleID);
if (Error Err = CacheAddStreamOrErr.takeError())
return Err;
AddStreamFn &CacheAddStream = *CacheAddStreamOrErr;
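// A null stream means the result was already found in the cache and has been
// handed to the linker; otherwise run the backend and write the output
// through the cache's stream.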
if (CacheAddStream)
return RunThinBackend(CacheAddStream);
return Error::success();
}
Error start(
unsigned Task, BitcodeModule BM,
const FunctionImporter::ImportMapTy &ImportList,
const FunctionImporter::ExportSetTy &ExportList,
const std::map<GlobalValue::GUID, GlobalValue::LinkageTypes> &ResolvedODR,
MapVector<StringRef, BitcodeModule> &ModuleMap) override {
StringRef ModulePath = BM.getModuleIdentifier();
assert(ModuleToDefinedGVSummaries.count(ModulePath));
const GVSummaryMapTy &DefinedGlobals =
ModuleToDefinedGVSummaries.find(ModulePath)->second;
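// Hand the per-module backend off to the thread pool; any error from the
// worker is merged into Err under ErrMu so it can be reported once all
// backend threads finish.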
BackendThreadPool.async(
[=](BitcodeModule BM, ModuleSummaryIndex &CombinedIndex,
const FunctionImporter::ImportMapTy &ImportList,
const FunctionImporter::ExportSetTy &ExportList,
const std::map<GlobalValue::GUID, GlobalValue::LinkageTypes>
&ResolvedODR,
const GVSummaryMapTy &DefinedGlobals,
MapVector<StringRef, BitcodeModule> &ModuleMap) {
if (LLVM_ENABLE_THREADS && Conf.TimeTraceEnabled)
timeTraceProfilerInitialize(Conf.TimeTraceGranularity,
"thin backend");
Error E = runThinLTOBackendThread(
AddStream, Cache, Task, BM, CombinedIndex, ImportList, ExportList,
ResolvedODR, DefinedGlobals, ModuleMap);
if (E) {
std::unique_lock<std::mutex> L(ErrMu);
if (Err)
Err = joinErrors(std::move(*Err), std::move(E));
else
Err = std::move(E);
}
if (LLVM_ENABLE_THREADS && Conf.TimeTraceEnabled)
timeTraceProfilerFinishThread();
},
BM, std::ref(CombinedIndex), std::ref(ImportList), std::ref(ExportList),
std::ref(ResolvedODR), std::ref(DefinedGlobals), std::ref(ModuleMap));
if (OnWrite)
OnWrite(std::string(ModulePath));
return Error::success();
}
};
/// This backend is utilized in the first round of a two-codegen round process.
/// It first saves optimized bitcode files to disk before the codegen process
/// begins. After codegen, it stores the resulting object files in a scratch
/// buffer. Note the codegen data stored in the scratch buffer will be extracted
/// and merged in the subsequent step.
class FirstRoundThinBackend : public InProcessThinBackend {
AddStreamFn IRAddStream;
FileCache IRCache;
public:
FirstRoundThinBackend(
const Config &Conf, ModuleSummaryIndex &CombinedIndex,
ThreadPoolStrategy ThinLTOParallelism,
const DenseMap<StringRef, GVSummaryMapTy> &ModuleToDefinedGVSummaries,
AddStreamFn CGAddStream, FileCache CGCache, AddStreamFn IRAddStream,
FileCache IRCache)
: InProcessThinBackend(Conf, CombinedIndex, ThinLTOParallelism,
ModuleToDefinedGVSummaries, std::move(CGAddStream),
std::move(CGCache), /*OnWrite=*/nullptr,
/*ShouldEmitIndexFiles=*/false,
/*ShouldEmitImportsFiles=*/false),
IRAddStream(std::move(IRAddStream)), IRCache(std::move(IRCache)) {}
Error runThinLTOBackendThread(
AddStreamFn CGAddStream, FileCache CGCache, unsigned Task,
BitcodeModule BM, ModuleSummaryIndex &CombinedIndex,
const FunctionImporter::ImportMapTy &ImportList,
const FunctionImporter::ExportSetTy &ExportList,
const std::map<GlobalValue::GUID, GlobalValue::LinkageTypes> &ResolvedODR,
const GVSummaryMapTy &DefinedGlobals,
MapVector<StringRef, BitcodeModule> &ModuleMap) override {
auto RunThinBackend = [&](AddStreamFn CGAddStream,
AddStreamFn IRAddStream) {
LTOLLVMContext BackendContext(Conf);
Expected<std::unique_ptr<Module>> MOrErr = BM.parseModule(BackendContext);
if (!MOrErr)
return MOrErr.takeError();
return thinBackend(Conf, Task, CGAddStream, **MOrErr, CombinedIndex,
ImportList, DefinedGlobals, &ModuleMap,
Conf.CodeGenOnly, IRAddStream);
};
auto ModuleID = BM.getModuleIdentifier();
// Like InProcessThinBackend, we produce index files as needed for
// FirstRoundThinBackend. However, these files are not generated for
// SecondRoundThinBackend.
if (ShouldEmitIndexFiles) {
if (auto E = emitFiles(ImportList, ModuleID, ModuleID.str()))
return E;
}
assert((CGCache.isValid() == IRCache.isValid()) &&
"Both caches for CG and IR should have matching availability");
if (!CGCache.isValid() || !CombinedIndex.modulePaths().count(ModuleID) ||
all_of(CombinedIndex.getModuleHash(ModuleID),
[](uint32_t V) { return V == 0; }))
// Cache disabled or no entry for this module in the combined index or
// no module hash.
return RunThinBackend(CGAddStream, IRAddStream);
// Get CGKey for caching object in CGCache.
std::string CGKey = computeLTOCacheKey(
Conf, CombinedIndex, ModuleID, ImportList, ExportList, ResolvedODR,
DefinedGlobals, CfiFunctionDefs, CfiFunctionDecls);
Expected<AddStreamFn> CacheCGAddStreamOrErr =
CGCache(Task, CGKey, ModuleID);
if (Error Err = CacheCGAddStreamOrErr.takeError())
return Err;
AddStreamFn &CacheCGAddStream = *CacheCGAddStreamOrErr;
// Get IRKey for caching (optimized) IR in IRCache with an extra ID.
std::string IRKey = recomputeLTOCacheKey(CGKey, /*ExtraID=*/"IR");
Expected<AddStreamFn> CacheIRAddStreamOrErr =
IRCache(Task, IRKey, ModuleID);
if (Error Err = CacheIRAddStreamOrErr.takeError())
return Err;
AddStreamFn &CacheIRAddStream = *CacheIRAddStreamOrErr;
// Ideally, both CG and IR caching should be synchronized. However, in
// practice, their availability may differ due to different expiration
// times. Therefore, if either cache is missing, the backend process is
// triggered.
if (CacheCGAddStream || CacheIRAddStream) {
LLVM_DEBUG(dbgs() << "[FirstRound] Cache Miss for "
<< BM.getModuleIdentifier() << "\n");
return RunThinBackend(CacheCGAddStream ? CacheCGAddStream : CGAddStream,
CacheIRAddStream ? CacheIRAddStream : IRAddStream);
}
return Error::success();
}
};
/// This backend operates in the second round of a two-codegen round process.
/// It starts by reading the optimized bitcode files that were saved during the
/// first round. The backend then executes the codegen only to further optimize
/// the code, utilizing the codegen data merged from the first round. Finally,
/// it writes the resulting object files as usual.
class SecondRoundThinBackend : public InProcessThinBackend {
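// Optimized IR files produced by the first round (one per task) and the
// stable hash of the merged codegen data, which is folded into the cache key
// below.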
std::unique_ptr<SmallVector<StringRef>> IRFiles;
stable_hash CombinedCGDataHash;
public:
SecondRoundThinBackend(
const Config &Conf, ModuleSummaryIndex &CombinedIndex,
ThreadPoolStrategy ThinLTOParallelism,
const DenseMap<StringRef, GVSummaryMapTy> &ModuleToDefinedGVSummaries,
AddStreamFn AddStream, FileCache Cache,
std::unique_ptr<SmallVector<StringRef>> IRFiles,
stable_hash CombinedCGDataHash)
: InProcessThinBackend(Conf, CombinedIndex, ThinLTOParallelism,
ModuleToDefinedGVSummaries, std::move(AddStream),
std::move(Cache),
/*OnWrite=*/nullptr,
/*ShouldEmitIndexFiles=*/false,
/*ShouldEmitImportsFiles=*/false),
IRFiles(std::move(IRFiles)), CombinedCGDataHash(CombinedCGDataHash) {}
virtual Error runThinLTOBackendThread(
AddStreamFn AddStream, FileCache Cache, unsigned Task, BitcodeModule BM,
ModuleSummaryIndex &CombinedIndex,
const FunctionImporter::ImportMapTy &ImportList,
const FunctionImporter::ExportSetTy &ExportList,
const std::map<GlobalValue::GUID, GlobalValue::LinkageTypes> &ResolvedODR,
const GVSummaryMapTy &DefinedGlobals,
MapVector<StringRef, BitcodeModule> &ModuleMap) override {
auto RunThinBackend = [&](AddStreamFn AddStream) {
LTOLLVMContext BackendContext(Conf);
std::unique_ptr<Module> LoadedModule =
cgdata::loadModuleForTwoRounds(BM, Task, BackendContext, *IRFiles);
return thinBackend(Conf, Task, AddStream, *LoadedModule, CombinedIndex,
ImportList, DefinedGlobals, &ModuleMap,
/*CodeGenOnly=*/true);
};
auto ModuleID = BM.getModuleIdentifier();
if (!Cache.isValid() || !CombinedIndex.modulePaths().count(ModuleID) ||
all_of(CombinedIndex.getModuleHash(ModuleID),
[](uint32_t V) { return V == 0; }))
// Cache disabled or no entry for this module in the combined index or
// no module hash.
return RunThinBackend(AddStream);
// Get Key for caching the final object file in Cache with the combined
// CGData hash.
std::string Key = computeLTOCacheKey(
Conf, CombinedIndex, ModuleID, ImportList, ExportList, ResolvedODR,
DefinedGlobals, CfiFunctionDefs, CfiFunctionDecls);
Key = recomputeLTOCacheKey(Key,
/*ExtraID=*/std::to_string(CombinedCGDataHash));
Expected<AddStreamFn> CacheAddStreamOrErr = Cache(Task, Key, ModuleID);
if (Error Err = CacheAddStreamOrErr.takeError())
return Err;
AddStreamFn &CacheAddStream = *CacheAddStreamOrErr;
if (CacheAddStream) {
LLVM_DEBUG(dbgs() << "[SecondRound] Cache Miss for "
<< BM.getModuleIdentifier() << "\n");
return RunThinBackend(CacheAddStream);
}
return Error::success();
}
};
} // end anonymous namespace
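// Create the default ThinLTO backend: each module is compiled in-process on a
// thread pool with the given parallelism strategy.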
ThinBackend lto::createInProcessThinBackend(ThreadPoolStrategy Parallelism,
lto::IndexWriteCallback OnWrite,
bool ShouldEmitIndexFiles,
bool ShouldEmitImportsFiles) {
auto Func =
[=](const Config &Conf, ModuleSummaryIndex &CombinedIndex,
const DenseMap<StringRef, GVSummaryMapTy> &ModuleToDefinedGVSummaries,
AddStreamFn AddStream, FileCache Cache) {
return std::make_unique<InProcessThinBackend>(
Conf, CombinedIndex, Parallelism, ModuleToDefinedGVSummaries,
AddStream, Cache, OnWrite, ShouldEmitIndexFiles,
ShouldEmitImportsFiles);
};
return ThinBackend(Func, Parallelism);
}
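// Default CPUs used for ThinLTO code generation on Darwin targets; every
// other target gets an empty CPU string.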
StringLiteral lto::getThinLTODefaultCPU(const Triple &TheTriple) {
if (!TheTriple.isOSDarwin())
return "";
if (TheTriple.getArch() == Triple::x86_64)
return "core2";
if (TheTriple.getArch() == Triple::x86)
return "yonah";
if (TheTriple.isArm64e())
return "apple-a12";
if (TheTriple.getArch() == Triple::aarch64 ||
TheTriple.getArch() == Triple::aarch64_32)
return "cyclone";
return "";
}
// Given the original \p Path to an output file, replace any path
// prefix matching \p OldPrefix with \p NewPrefix. Also, create the
// resulting directory if it does not yet exist.
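// For example, mapping "in/obj/a.o" with OldPrefix "in/obj" and NewPrefix
// "out/obj" yields "out/obj/a.o", creating "out/obj" if it does not exist.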
std::string lto::getThinLTOOutputFile(StringRef Path, StringRef OldPrefix,
StringRef NewPrefix) {
if (OldPrefix.empty() && NewPrefix.empty())
return std::string(Path);
SmallString<128> NewPath(Path);
llvm::sys::path::replace_path_prefix(NewPath, OldPrefix, NewPrefix);
StringRef ParentPath = llvm::sys::path::parent_path(NewPath.str());
if (!ParentPath.empty()) {
// Make sure the new directory exists, creating it if necessary.
if (std::error_code EC = llvm::sys::fs::create_directories(ParentPath))
llvm::errs() << "warning: could not create directory '" << ParentPath
<< "': " << EC.message() << '\n';
}
return std::string(NewPath);
}
namespace {
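// Backend used for distributed ThinLTO thin links: rather than running code
// generation, it writes each module's individual index (and optional imports
// file) and, if requested, records the native object paths the final link
// should expect.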
class WriteIndexesThinBackend : public ThinBackendProc {
std::string OldPrefix, NewPrefix, NativeObjectPrefix;
raw_fd_ostream *LinkedObjectsFile;
public:
WriteIndexesThinBackend(
const Config &Conf, ModuleSummaryIndex &CombinedIndex,
ThreadPoolStrategy ThinLTOParallelism,
const DenseMap<StringRef, GVSummaryMapTy> &ModuleToDefinedGVSummaries,
std::string OldPrefix, std::string NewPrefix,
std::string NativeObjectPrefix, bool ShouldEmitImportsFiles,
raw_fd_ostream *LinkedObjectsFile, lto::IndexWriteCallback OnWrite)
: ThinBackendProc(Conf, CombinedIndex, ModuleToDefinedGVSummaries,
OnWrite, ShouldEmitImportsFiles, ThinLTOParallelism),
OldPrefix(OldPrefix), NewPrefix(NewPrefix),
NativeObjectPrefix(NativeObjectPrefix),
LinkedObjectsFile(LinkedObjectsFile) {}
Error start(
unsigned Task, BitcodeModule BM,
const FunctionImporter::ImportMapTy &ImportList,
const FunctionImporter::ExportSetTy &ExportList,
const std::map<GlobalValue::GUID, GlobalValue::LinkageTypes> &ResolvedODR,
MapVector<StringRef, BitcodeModule> &ModuleMap) override {
StringRef ModulePath = BM.getModuleIdentifier();
// The contents of this file may be used as input to a native link, and must
// therefore contain the processed modules in a deterministic order that
// matches the order they are provided on the command line. For that reason,
// we cannot include this in the asynchronously executed lambda below.
if (LinkedObjectsFile) {
std::string ObjectPrefix =
NativeObjectPrefix.empty() ? NewPrefix : NativeObjectPrefix;
std::string LinkedObjectsFilePath =
getThinLTOOutputFile(ModulePath, OldPrefix, ObjectPrefix);
*LinkedObjectsFile << LinkedObjectsFilePath << '\n';
}
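// Writing the individual index and imports files for this module does not
// depend on other modules, so it is done asynchronously on the thread pool.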
BackendThreadPool.async(
[this](const StringRef ModulePath,
const FunctionImporter::ImportMapTy &ImportList,
const std::string &OldPrefix, const std::string &NewPrefix) {
std::string NewModulePath =
getThinLTOOutputFile(ModulePath, OldPrefix, NewPrefix);
auto E = emitFiles(ImportList, ModulePath, NewModulePath);
if (E) {
std::unique_lock<std::mutex> L(ErrMu);
if (Err)
Err = joinErrors(std::move(*Err), std::move(E));
else
Err = std::move(E);
return;
}
},
ModulePath, ImportList, OldPrefix, NewPrefix);
if (OnWrite)
OnWrite(std::string(ModulePath));
return Error::success();
}
bool isSensitiveToInputOrder() override {
// The order which modules are written to LinkedObjectsFile should be
// deterministic and match the order they are passed on the command line.
return true;
}
};
} // end anonymous namespace
ThinBackend lto::createWriteIndexesThinBackend(
ThreadPoolStrategy Parallelism, std::string OldPrefix,
std::string NewPrefix, std::string NativeObjectPrefix,
bool ShouldEmitImportsFiles, raw_fd_ostream *LinkedObjectsFile,
IndexWriteCallback OnWrite) {
auto Func =
[=](const Config &Conf, ModuleSummaryIndex &CombinedIndex,
const DenseMap<StringRef, GVSummaryMapTy> &ModuleToDefinedGVSummaries,
AddStreamFn AddStream, FileCache Cache) {
return std::make_unique<WriteIndexesThinBackend>(
Conf, CombinedIndex, Parallelism, ModuleToDefinedGVSummaries,
OldPrefix, NewPrefix, NativeObjectPrefix, ShouldEmitImportsFiles,
LinkedObjectsFile, OnWrite);
};
return ThinBackend(Func, Parallelism);
}
Error LTO::runThinLTO(AddStreamFn AddStream, FileCache Cache,
const DenseSet<GlobalValue::GUID> &GUIDPreservedSymbols) {
LLVM_DEBUG(dbgs() << "Running ThinLTO\n");
ThinLTO.CombinedIndex.releaseTemporaryMemory();
timeTraceProfilerBegin("ThinLink", StringRef(""));
auto TimeTraceScopeExit = llvm::make_scope_exit([]() {
if (llvm::timeTraceProfilerEnabled())
llvm::timeTraceProfilerEnd();
});
if (ThinLTO.ModuleMap.empty())
return Error::success();
if (ThinLTO.ModulesToCompile && ThinLTO.ModulesToCompile->empty()) {
llvm::errs() << "warning: [ThinLTO] No module compiled\n";
return Error::success();
}
if (Conf.CombinedIndexHook &&
!Conf.CombinedIndexHook(ThinLTO.CombinedIndex, GUIDPreservedSymbols))
return Error::success();
// Collect for each module the list of functions it defines (GUID ->
// Summary).
DenseMap<StringRef, GVSummaryMapTy> ModuleToDefinedGVSummaries(
ThinLTO.ModuleMap.size());
ThinLTO.CombinedIndex.collectDefinedGVSummariesPerModule(
ModuleToDefinedGVSummaries);
// Create entries for any modules that didn't have any GV summaries
// (either they didn't have any GVs to start with, or we suppressed
// generation of the summaries because they e.g. had inline assembly
// uses that couldn't be promoted/renamed on export). This is so
// InProcessThinBackend::start can still launch a backend thread, which
// is passed the map of summaries for the module, without any special
// handling for this case.
for (auto &Mod : ThinLTO.ModuleMap)
if (!ModuleToDefinedGVSummaries.count(Mod.first))
ModuleToDefinedGVSummaries.try_emplace(Mod.first);
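// Containers filled in by the thin-link steps below: per-module import and
// export lists and, per module, the linkage resolved for each ODR symbol.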
FunctionImporter::ImportListsTy ImportLists(ThinLTO.ModuleMap.size());
DenseMap<StringRef, FunctionImporter::ExportSetTy> ExportLists(
ThinLTO.ModuleMap.size());
StringMap<std::map<GlobalValue::GUID, GlobalValue::LinkageTypes>> ResolvedODR;
if (DumpThinCGSCCs)
ThinLTO.CombinedIndex.dumpSCCs(outs());
std::set<GlobalValue::GUID> ExportedGUIDs;
bool WholeProgramVisibilityEnabledInLTO =
Conf.HasWholeProgramVisibility &&
// If validation is enabled, upgrade visibility only when all vtables
// have typeinfos.
(!Conf.ValidateAllVtablesHaveTypeInfos || Conf.AllVtablesHaveTypeInfos);
if (hasWholeProgramVisibility(WholeProgramVisibilityEnabledInLTO))
ThinLTO.CombinedIndex.setWithWholeProgramVisibility();
// If we're validating, get the vtable symbols that should not be
// upgraded because they correspond to typeIDs outside of index-based
// WPD info.
DenseSet<GlobalValue::GUID> VisibleToRegularObjSymbols;
if (WholeProgramVisibilityEnabledInLTO &&
Conf.ValidateAllVtablesHaveTypeInfos) {
// This returns true when the name is local or not defined. Locals are
// expected to be handled separately.
auto IsVisibleToRegularObj = [&](StringRef name) {
auto It = GlobalResolutions->find(name);
return (It == GlobalResolutions->end() ||
It->second.VisibleOutsideSummary || !It->second.Prevailing);
};
getVisibleToRegularObjVtableGUIDs(ThinLTO.CombinedIndex,
VisibleToRegularObjSymbols,
IsVisibleToRegularObj);
}
// If allowed, upgrade public vcall visibility to linkage unit visibility in
// the summaries before whole program devirtualization below.
updateVCallVisibilityInIndex(
ThinLTO.CombinedIndex, WholeProgramVisibilityEnabledInLTO,
DynamicExportSymbols, VisibleToRegularObjSymbols);
// Perform index-based WPD. This will return immediately if there are
// no index entries in the typeIdMetadata map (e.g. if we are instead
// performing IR-based WPD in hybrid regular/thin LTO mode).
std::map<ValueInfo, std::vector<VTableSlotSummary>> LocalWPDTargetsMap;
runWholeProgramDevirtOnIndex(ThinLTO.CombinedIndex, ExportedGUIDs,
LocalWPDTargetsMap);
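// A summary is treated as prevailing if its module is the one recorded as
// prevailing for the GUID during symbol resolution.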
auto isPrevailing = [&](GlobalValue::GUID GUID, const GlobalValueSummary *S) {
return ThinLTO.PrevailingModuleForGUID[GUID] == S->modulePath();
};
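// Optionally run whole-program MemProf context disambiguation over the
// combined index, using the prevailing-module predicate above.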
if (EnableMemProfContextDisambiguation) {
MemProfContextDisambiguation ContextDisambiguation;
ContextDisambiguation.run(ThinLTO.CombinedIndex, isPrevailing);
}
// Figure out which symbols need to be internalized. This also needs to happen
// at -O0 because summary-based DCE is implemented using internalization, and
// we must apply DCE consistently with the full LTO module in order to avoid
// undefined references during the final link.
for (auto &Res : *GlobalResolutions) {
// If the symbol does not have external references or it is not prevailing,
// then there is no need to mark it as exported from a ThinLTO partition.
if (Res.second.Partition != GlobalResolution::External ||
!Res.second.isPrevailingIRSymbol())
continue;
auto GUID = GlobalValue::getGUID(
GlobalValue::dropLLVMManglingEscape(Res.second.IRName));
// Mark exported unless index-based analysis determined it to be dead.
if (ThinLTO.CombinedIndex.isGUIDLive(GUID))
ExportedGUIDs.insert(GUID);
}
// Reset the GlobalResolutions to deallocate the associated memory, as there
// are no further accesses. We specifically want to do this before computing
// cross module importing, which adds to peak memory via the computed import
// and export lists.
Re-apply "[NFCI][LTO][lld] Optimize away symbol copies within LTO global resolution in ELF" (#107792) Fix the use-after-free bug and re-apply https://github.com/llvm/llvm-project/pull/106193 * Without the fix, the string referenced by `objSym.Name` could be destroyed even if string saver keeps a copy of the referenced string. This caused use-after-free. * The fix ([latest commit](https://github.com/llvm/llvm-project/pull/107792/commits/9776ed44cfb26172480145aed8f59ba78a6fa2ea)) updates `objSym.Name` to reference (via `StringRef`) the string saver's copy. Test: 1. For `lld/test/ELF/lto/asmundef.ll`, its test failure is reproducible with `-DLLVM_USE_SANITIZER=Address` and gone with the fix. 3. Run all tests by following https://github.com/google/sanitizers/wiki/SanitizerBotReproduceBuild#try-local-changes. * Without the fix, `ELF/lto/asmundef.ll` aborted the multi-stage test at `@@@BUILD_STEP stage2/asan_ubsan check@@@`, defined [here](https://github.com/llvm/llvm-zorg/blob/main/zorg/buildbot/builders/sanitizers/buildbot_fast.sh#L30) * With the fix, the [multi-stage test](https://github.com/llvm/llvm-zorg/blob/main/zorg/buildbot/builders/sanitizers/buildbot_fast.sh) pass stage2 {asan, ubsan, masan}. This is also the test used by https://lab.llvm.org/buildbot/#/builders/169 **Original commit message** `StringMap<T>` creates a [copy of the string](https://github.com/llvm/llvm-project/blob/d4c519e7b2ac21350ec08b23eda44bf4a2d3c974/llvm/include/llvm/ADT/StringMapEntry.h#L55-L58) for entry insertions and intentionally keep copies [since the implementation optimizes string memory usage](https://github.com/llvm/llvm-project/blob/d4c519e7b2ac21350ec08b23eda44bf4a2d3c974/llvm/include/llvm/ADT/StringMap.h#L124). On the other hand, linker keeps copies of symbol names [1] in `lld::elf::parseFiles` [2] before invoking `compileBitcodeFiles` [3]. This change proposes to optimize away string copies inside [LTO::GlobalResolutions](https://github.com/llvm/llvm-project/blob/24e791b4164986a1ca7776e3ae0292ef20d20c47/llvm/include/llvm/LTO/LTO.h#L409), which will make LTO indexing more memory efficient for ELF. There are similar opportunities for other (COFF, wasm, MachO) formats. The optimization takes place for lld (ELF) only. For the rest of use cases (gold plugin, `llvm-lto2`, etc), LTO owns a string saver to keep copies and use global resolution key for de-duplication. Together with @kazutakahirata's work to make `ComputeCrossModuleImport` more memory efficient, we see a ~20% peak memory usage reduction in a binary where peak memory usage needs to go down. Thanks to the optimization in https://github.com/llvm/llvm-project/commit/329ba523ccbbe68a12434926c92fd9a86494d958, the max (as opposed to the sum) of `ComputeCrossModuleImport` or `GlobalResolution` shows up in peak memory usage. * Regarding correctness, the set of [resolved](https://github.com/llvm/llvm-project/blob/80c47ad3aec9d7f22e1b1bdc88960a91b66f89f1/llvm/lib/LTO/LTO.cpp#L739) [per-module symbols](https://github.com/llvm/llvm-project/blob/80c47ad3aec9d7f22e1b1bdc88960a91b66f89f1/llvm/include/llvm/LTO/LTO.h#L188-L191) is a subset of [llvm::lto::InputFile::Symbols](https://github.com/llvm/llvm-project/blob/80c47ad3aec9d7f22e1b1bdc88960a91b66f89f1/llvm/include/llvm/LTO/LTO.h#L120). And bitcode symbol parsing saves symbol name when iterating `obj->symbols` in `BitcodeFile::parse` already. This change updates `BitcodeFile::parseLazy` to keep copies of per-module undefined symbols. 
* Presumably the undefined symbols in a LTO unit (copied in this patch in linker unique saver) is a small set compared with the set of symbols in global-resolution (copied before this patch), making this a worthwhile trade-off. Benchmarking this change alone shows measurable memory savings across various benchmarks. [1] ELF https://github.com/llvm/llvm-project/blob/1cea5c2138bef3d8fec75508df6dbb858e6e3560/lld/ELF/InputFiles.cpp#L1748 [2] https://github.com/llvm/llvm-project/blob/ef7b18a53c0d186dcda1e322be6035407fdedb55/lld/ELF/Driver.cpp#L2863 [3] https://github.com/llvm/llvm-project/blob/ef7b18a53c0d186dcda1e322be6035407fdedb55/lld/ELF/Driver.cpp#L2995
2024-09-09 11:16:58 -07:00
releaseGlobalResolutionsMemory();
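// Compute the per-module import and export lists used for cross-module
// importing. This is skipped at -O0, where importing would not enable any
// further optimization.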
if (Conf.OptLevel > 0)
ComputeCrossModuleImport(ThinLTO.CombinedIndex, ModuleToDefinedGVSummaries,
isPrevailing, ImportLists, ExportLists);
// Any functions referenced by the jump table in the regular LTO object must
// be exported.
auto &Defs = ThinLTO.CombinedIndex.cfiFunctionDefs();
ExportedGUIDs.insert(Defs.guid_begin(), Defs.guid_end());
auto &Decls = ThinLTO.CombinedIndex.cfiFunctionDecls();
ExportedGUIDs.insert(Decls.guid_begin(), Decls.guid_end());
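// A value is exported if it appears in its defining module's export list or
// if its GUID was added to the ExportedGUIDs set above.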
auto isExported = [&](StringRef ModuleIdentifier, ValueInfo VI) {
const auto &ExportList = ExportLists.find(ModuleIdentifier);
return (ExportList != ExportLists.end() && ExportList->second.count(VI)) ||
ExportedGUIDs.count(VI.getGUID());
};
// Update local devirtualized targets that were exported by cross-module
// importing or by other devirtualizations marked in the ExportedGUIDs set.
updateIndexWPDForExports(ThinLTO.CombinedIndex, isExported,
LocalWPDTargetsMap);
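// Update linkages in the combined index so that non-exported values can be
// internalized and exported locals promoted by the backends.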
thinLTOInternalizeAndPromoteInIndex(ThinLTO.CombinedIndex, isExported,
isPrevailing);
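// Record the new linkage computed for each GUID in each module so the
// backends can apply it when compiling that module.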
auto recordNewLinkage = [&](StringRef ModuleIdentifier,
GlobalValue::GUID GUID,
GlobalValue::LinkageTypes NewLinkage) {
ResolvedODR[ModuleIdentifier][GUID] = NewLinkage;
};
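// Resolve prevailing linkages in the index, e.g. turning non-prevailing
// weak/linkonce ODR definitions into available_externally copies, and record
// the resulting linkages via recordNewLinkage.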
thinLTOResolvePrevailingInIndex(Conf, ThinLTO.CombinedIndex, isPrevailing,
recordNewLinkage, GUIDPreservedSymbols);
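// Propagate function attributes across the combined index and generate the
// parameter access summary used by the stack safety analysis.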
thinLTOPropagateFunctionAttrs(ThinLTO.CombinedIndex, isPrevailing);
generateParamAccessSummary(ThinLTO.CombinedIndex);
if (llvm::timeTraceProfilerEnabled())
llvm::timeTraceProfilerEnd();
TimeTraceScopeExit.release();
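// Restrict code generation to the requested subset of modules, if one was
// specified; otherwise compile every module in the combined module map.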
auto &ModuleMap =
ThinLTO.ModulesToCompile ? *ThinLTO.ModulesToCompile : ThinLTO.ModuleMap;
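// Dispatch each module in ModuleMap to the given backend process, then wait
// for all outstanding backend work to complete.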
auto RunBackends = [&](ThinBackendProc *BackendProcess) -> Error {
auto ProcessOneModule = [&](int I) -> Error {
auto &Mod = *(ModuleMap.begin() + I);
// Tasks 0 through ParallelCodeGenParallelismLevel-1 are reserved for the
// combined module and its parallel code generation partitions.
return BackendProcess->start(
RegularLTO.ParallelCodeGenParallelismLevel + I, Mod.second,
ImportLists[Mod.first], ExportLists[Mod.first],
ResolvedODR[Mod.first], ThinLTO.ModuleMap);
};
if (BackendProcess->getThreadCount() == 1 ||
BackendProcess->isSensitiveToInputOrder()) {
// Process the modules in the order they were provided on the
// command-line. It is important for this codepath to be used for
// WriteIndexesThinBackend, to ensure the emitted LinkedObjectsFile lists
// ThinLTO objects in the same order as the inputs, which otherwise would
// affect the final link order.
for (int I = 0, E = ModuleMap.size(); I != E; ++I)
if (Error E = ProcessOneModule(I))
return E;
} else {
// When executing in parallel, process largest bitsize modules first to
// improve parallelism, and avoid starving the thread pool near the end.
// This saves about 15 sec on a 36-core machine while linking `clang.exe`
// (out of 100 sec).
std::vector<BitcodeModule *> ModulesVec;
ModulesVec.reserve(ModuleMap.size());
for (auto &Mod : ModuleMap)
ModulesVec.push_back(&Mod.second);
for (int I : generateModulesOrdering(ModulesVec))
if (Error E = ProcessOneModule(I))
return E;
}
return BackendProcess->wait();
};
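// In the common single-round case, create the configured ThinLTO backend and
// run it once over all modules.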
if (!CodeGenDataThinLTOTwoRounds) {
std::unique_ptr<ThinBackendProc> BackendProc =
ThinLTO.Backend(Conf, ThinLTO.CombinedIndex, ModuleToDefinedGVSummaries,
AddStream, Cache);
return RunBackends(BackendProc.get());
}
// Perform two rounds of code generation for ThinLTO:
// 1. First round: Perform optimization and code generation, outputting to
// temporary scratch objects.
// 2. Merge code generation data extracted from the temporary scratch objects.
// 3. Second round: Execute code generation again using the merged data.
LLVM_DEBUG(dbgs() << "[TwoRounds] Initializing ThinLTO two-codegen rounds\n");
unsigned MaxTasks = getMaxTasks();
auto Parallelism = ThinLTO.Backend.getParallelism();
// Set up two additional streams and caches for storing temporary scratch
// objects and optimized IRs, using the same cache directory as the original.
cgdata::StreamCacheData CG(MaxTasks, Cache, "CG"), IR(MaxTasks, Cache, "IR");
// First round: Execute optimization and code generation, outputting to
// temporary scratch objects. Serialize the optimized IRs before initiating
// code generation.
LLVM_DEBUG(dbgs() << "[TwoRounds] Running the first round of codegen\n");
auto FirstRoundLTO = std::make_unique<FirstRoundThinBackend>(
Conf, ThinLTO.CombinedIndex, Parallelism, ModuleToDefinedGVSummaries,
CG.AddStream, CG.Cache, IR.AddStream, IR.Cache);
if (Error E = RunBackends(FirstRoundLTO.get()))
return E;
LLVM_DEBUG(dbgs() << "[TwoRounds] Merging codegen data\n");
auto CombinedHashOrErr = cgdata::mergeCodeGenData(*CG.getResult());
if (Error E = CombinedHashOrErr.takeError())
return E;
auto CombinedHash = *CombinedHashOrErr;
LLVM_DEBUG(dbgs() << "[TwoRounds] CGData hash: " << CombinedHash << "\n");
// Second round: Read the optimized IRs and execute code generation using the
// merged data.
LLVM_DEBUG(dbgs() << "[TwoRounds] Running the second round of codegen\n");
auto SecondRoundLTO = std::make_unique<SecondRoundThinBackend>(
Conf, ThinLTO.CombinedIndex, Parallelism, ModuleToDefinedGVSummaries,
AddStream, Cache, IR.getResult(), CombinedHash);
return RunBackends(SecondRoundLTO.get());
}
Expected<std::unique_ptr<ToolOutputFile>> lto::setupLLVMOptimizationRemarks(
LLVMContext &Context, StringRef RemarksFilename, StringRef RemarksPasses,
StringRef RemarksFormat, bool RemarksWithHotness,
std::optional<uint64_t> RemarksHotnessThreshold, int Count) {
std::string Filename = std::string(RemarksFilename);
// For ThinLTO, file.opt.<format> becomes
// file.opt.<format>.thin.<num>.<format>.
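// For example, with Count == 2 and RemarksFormat "yaml", "foo.opt.yaml"
// becomes "foo.opt.yaml.thin.2.yaml".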
if (!Filename.empty() && Count != -1)
Filename =
(Twine(Filename) + ".thin." + llvm::utostr(Count) + "." + RemarksFormat)
.str();
auto ResultOrErr = llvm::setupLLVMOptimizationRemarks(
Context, Filename, RemarksPasses, RemarksFormat, RemarksWithHotness,
RemarksHotnessThreshold);
if (Error E = ResultOrErr.takeError())
return std::move(E);
if (*ResultOrErr)
(*ResultOrErr)->keep();
return ResultOrErr;
}
Expected<std::unique_ptr<ToolOutputFile>>
lto::setupStatsFile(StringRef StatsFilename) {
// Set up the output file used to emit statistics.
if (StatsFilename.empty())
return nullptr;
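// Enable statistics collection; the false argument suppresses automatic
// printing of statistics on process exit.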
llvm::EnableStatistics(false);
std::error_code EC;
auto StatsFile =
std::make_unique<ToolOutputFile>(StatsFilename, EC, sys::fs::OF_None);
if (EC)
return errorCodeToError(EC);
StatsFile->keep();
return std::move(StatsFile);
}
// Compute the ordering in which we will process the inputs: the rough
// heuristic here is to sort them by size so that the largest modules get
// scheduled as soon as possible. This is purely a compile-time optimization.
std::vector<int> lto::generateModulesOrdering(ArrayRef<BitcodeModule *> R) {
auto Seq = llvm::seq<int>(0, R.size());
std::vector<int> ModulesOrdering(Seq.begin(), Seq.end());
llvm::sort(ModulesOrdering, [&](int LeftIndex, int RightIndex) {
auto LSize = R[LeftIndex]->getBuffer().size();
auto RSize = R[RightIndex]->getBuffer().size();
return LSize > RSize;
});
return ModulesOrdering;
}