llvm-project/lld/ELF/OutputSections.h

//===- OutputSections.h -----------------------------------------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#ifndef LLD_ELF_OUTPUT_SECTIONS_H
#define LLD_ELF_OUTPUT_SECTIONS_H

#include "InputSection.h"
#include "LinkerScript.h"
#include "lld/Common/LLVM.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Support/Parallel.h"
#include <array>

namespace lld::elf {

struct PhdrEntry;
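
// Holds the sharded result of compressing an output section, as used by
// --compress-debug-sections and --compress-sections. A rough sketch of the
// scheme (described in https://reviews.llvm.org/D117853): the uncompressed
// content is split into ~1MiB shards that are compressed in parallel. Every
// shard but the last is flushed to a byte boundary (Z_SYNC_FLUSH for zlib),
// and the last shard is finished with Z_FINISH to set the BFINAL flag, so the
// per-shard streams can simply be concatenated into one valid stream.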
struct CompressedData {
  std::unique_ptr<SmallVector<uint8_t, 0>[]> shards;
  uint32_t type = 0;
  uint32_t numShards = 0;
  uint32_t checksum = 0;
  uint64_t uncompressedSize;
};

// This represents a section in an output file.
// It is composed of multiple InputSections.
// The writer creates multiple OutputSections and assigns them unique,
// non-overlapping file offsets and VAs.
class OutputSection final : public SectionBase {
public:
  OutputSection(Ctx &, StringRef name, uint32_t type, uint64_t flags);

  static bool classof(const SectionBase *s) {
    return s->kind() == SectionBase::Output;
  }

  uint64_t getLMA() const;
  template <typename ELFT> void writeHeaderTo(typename ELFT::Shdr *sHdr);
  Ctx &ctx;
  uint32_t sectionIndex = UINT32_MAX;
  unsigned sortRank;

  uint32_t getPhdrFlags() const;

  // Pointer to the PT_LOAD segment in which this section resides. This field
  // is used to correctly compute the file offset of a section. When two
  // sections share the same load segment, the difference between their file
  // offsets must equal the difference between their virtual addresses. To
  // compute a section's offset we use the formula Off = Off_first + VA -
  // VA_first, where Off_first and VA_first are the file offset and VA of the
  // first section in the PT_LOAD.
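  // A worked example with hypothetical values: if the first section in the
  // PT_LOAD has Off_first = 0x1000 and VA_first = 0x401000, a section at
  // VA = 0x403000 is assigned Off = 0x1000 + (0x403000 - 0x401000) = 0x3000.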
  PhdrEntry *ptLoad = nullptr;

  // Pointer to a relocation section for this section. Usually nullptr because
  // we consume relocations, but if --emit-relocs is specified (which is rare),
  // it may have a non-null value.
  OutputSection *relocationSection = nullptr;

  // Before assignAddresses is called, this is the number of InputSections
  // added to the OutputSection so far. Afterwards, it corresponds to the
  // Elf_Shdr member (sh_size).
  uint64_t size = 0;

  // The following fields correspond to Elf_Shdr members.
  uint64_t offset = 0;
  uint64_t addr = 0;
  uint32_t shName = 0;

  void recordSection(InputSectionBase *isec);
  void commitSection(InputSection *isec);
  void finalizeInputSections();

  // The following members are normally only used in linker scripts.
  MemoryRegion *memRegion = nullptr;
  MemoryRegion *lmaRegion = nullptr;
  Expr addrExpr;
  Expr alignExpr;
  Expr lmaExpr;
  Expr subalignExpr;

  // Used by non-alloc SHT_CREL to hold the header and content byte stream.
  uint64_t crelHeader = 0;
  SmallVector<char, 0> crelBody;
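  // (Non-alloc SHT_CREL sections arise in -r and --emit-relocs links; see
  // https://github.com/llvm/llvm-project/pull/98115. They are re-encoded by
  // finalizeNonAllocCrel because r_offset/r_symidx/r_type/r_addend may change,
  // e.g. when relocations referencing a symbol in a discarded section are
  // converted to R_*_NONE.)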

  SmallVector<SectionCommand *, 0> commands;
  SmallVector<StringRef, 0> phdrs;
  std::optional<std::array<uint8_t, 4>> filler;
  ConstraintKind constraint = ConstraintKind::NoConstraint;
  std::string location;
  std::string memoryRegionName;
  std::string lmaRegionName;
  bool nonAlloc = false;
  bool typeIsSet = false;
  bool expressionsUseSymbols = false;
  bool usedInExpression = false;
  bool inOverlay = false;
  bool firstInOverlay = false;

  // Tracks whether the section has ever had an input section added to it, even
  // if the section was later removed (e.g. because it is a synthetic section
  // that wasn't needed). This is needed for orphan placement.
  bool hasInputSections = false;

  // True if the output section description is specified between
  // DATA_SEGMENT_ALIGN and DATA_SEGMENT_RELRO_END in a linker script.
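  // A minimal linker script sketch (hypothetical layout) in which the
  // .data.rel.ro description would get relro = true:
  //   . = DATA_SEGMENT_ALIGN(CONSTANT(MAXPAGESIZE), CONSTANT(COMMONPAGESIZE));
  //   .data.rel.ro : { *(.data.rel.ro) }
  //   . = DATA_SEGMENT_RELRO_END(0, .);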
  bool relro = false;

  template <bool is64> void finalizeNonAllocCrel(Ctx &);
  void finalize(Ctx &);
  template <class ELFT>
  void writeTo(Ctx &, uint8_t *buf, llvm::parallel::TaskGroup &tg);
  // Check that the addends for dynamic relocations were written correctly.
  void checkDynRelAddends(Ctx &);
  template <class ELFT> void maybeCompress(Ctx &);

  void sort(llvm::function_ref<int(InputSectionBase *s)> order);
  void sortInitFini();
  void sortCtorsDtors();

  std::array<uint8_t, 4> getFiller(Ctx &);

  // Used for the implementation of --compress-debug-sections and
  // --compress-sections.
  CompressedData compressed;

private:
  SmallVector<InputSection *, 0> storage;
};

struct OutputDesc final : SectionCommand {
  OutputSection osec;
  OutputDesc(Ctx &ctx, StringRef name, uint32_t type, uint64_t flags)
      : SectionCommand(OutputSectionKind), osec(ctx, name, type, flags) {}

  static bool classof(const SectionCommand *c) {
    return c->kind == OutputSectionKind;
  }
};

// This represents a CLASS(class_name) { ... } that can be referenced by output
// section descriptions. If it is referenced more than once, its sections can
// be spilled to the next reference, as with --enable-non-contiguous-regions.
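// A minimal script sketch (hypothetical names and regions): if .text.ram
// overflows RAM, the class's remaining sections spill to the .text reference.
//   SECTIONS {
//     CLASS(hot) { *(.text.hot) }
//     .text.ram : { CLASS(hot) } > RAM
//     .text : { *(.text .text.*) CLASS(hot) } > FLASH
//   }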
struct SectionClass final : public SectionBase {
  SmallVector<InputSectionDescription *, 0> commands;
  bool assigned = false;

  SectionClass(StringRef name)
      : SectionBase(Class, nullptr, name, 0, 0, 0, 0, 0, 0) {}
  static bool classof(const SectionBase *s) { return s->kind() == Class; }
};

struct SectionClassDesc : SectionCommand {
  SectionClass sc;
  SectionClassDesc(StringRef name) : SectionCommand(ClassKind), sc(name) {}
  static bool classof(const SectionCommand *c) { return c->kind == ClassKind; }
};

int getPriority(StringRef s);

InputSection *getFirstInputSection(const OutputSection *os);
llvm::ArrayRef<InputSection *>
getInputSections(const OutputSection &os,
                 SmallVector<InputSection *, 0> &storage);
uint64_t getHeaderSize(Ctx &);

} // namespace lld::elf

#endif