2015-09-21 21:38:08 +00:00
|
|
|
//===- OutputSections.h -----------------------------------------*- C++ -*-===//
|
|
|
|
//
|
2019-01-19 08:50:56 +00:00
|
|
|
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|
|
|
|
// See https://llvm.org/LICENSE.txt for license information.
|
|
|
|
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
2015-09-21 21:38:08 +00:00
|
|
|
//
|
|
|
|
//===----------------------------------------------------------------------===//
|
|
|
|
|
|
|
|
#ifndef LLD_ELF_OUTPUT_SECTIONS_H
|
|
|
|
#define LLD_ELF_OUTPUT_SECTIONS_H
|
|
|
|
|
2017-03-08 22:36:28 +00:00
|
|
|
#include "InputSection.h"
|
2017-07-27 19:22:43 +00:00
|
|
|
#include "LinkerScript.h"
|
2017-10-02 21:00:41 +00:00
|
|
|
#include "lld/Common/LLVM.h"
|
2022-10-02 13:23:52 -07:00
|
|
|
#include "llvm/Support/Compiler.h"
|
2022-08-24 09:40:03 -07:00
|
|
|
#include "llvm/Support/Parallel.h"
|
2022-02-07 21:53:34 -08:00
|
|
|
|
2018-11-14 21:05:20 +00:00
|
|
|
#include <array>
|
2015-09-21 21:38:08 +00:00
|
|
|
|
2022-08-10 15:31:58 -04:00
|
|
|
namespace lld::elf {
|
2015-09-21 21:38:08 +00:00
|
|
|
|
2016-12-19 17:01:01 +00:00
|
|
|
struct PhdrEntry;
|
2015-09-21 21:38:08 +00:00
|
|
|
|
[ELF] Parallelize --compress-debug-sections=zlib
When linking a Debug build clang (265MiB SHF_ALLOC sections, 920MiB uncompressed
debug info), in a --threads=1 link "Compress debug sections" takes 2/3 time and
in a --threads=8 link "Compress debug sections" takes ~70% time.
This patch splits a section into 1MiB shards and calls zlib `deflate` in parallel.
DEFLATE blocks are a bit sequence. We need to ensure every shard starts
at a byte boundary for concatenation. We use Z_SYNC_FLUSH for all shards
but the last to flush the output to a byte boundary. (Z_FULL_FLUSH can
be used as well, but Z_FULL_FLUSH clears the hash table which just
wastes time.)
The last block requires the BFINAL flag. We call deflate with Z_FINISH
to set the flag as well as flush the output to a byte boundary. Under
the hood, all of Z_SYNC_FLUSH, Z_FULL_FLUSH, and Z_FINISH emit a
non-compressed block (called stored block in zlib). RFC1951 says "Any
bits of input up to the next byte boundary are ignored."
In a --threads=8 link, "Compress debug sections" is 5.7x as fast and the total
speed is 2.54x. Because the hash table for one shard is not shared with the next
shard, the output is slightly larger. Better compression ratio can be achieved
by preloading the window size from the previous shard as dictionary
(`deflateSetDictionary`), but that is overkill.
```
# 1MiB shards
% bloaty clang.new -- clang.old
FILE SIZE VM SIZE
-------------- --------------
+0.3% +129Ki [ = ] 0 .debug_str
+0.1% +105Ki [ = ] 0 .debug_info
+0.3% +101Ki [ = ] 0 .debug_line
+0.2% +2.66Ki [ = ] 0 .debug_abbrev
+0.0% +1.19Ki [ = ] 0 .debug_ranges
+0.1% +341Ki [ = ] 0 TOTAL
# 2MiB shards
% bloaty clang.new -- clang.old
FILE SIZE VM SIZE
-------------- --------------
+0.2% +74.2Ki [ = ] 0 .debug_line
+0.1% +72.3Ki [ = ] 0 .debug_str
+0.0% +69.9Ki [ = ] 0 .debug_info
+0.1% +976 [ = ] 0 .debug_abbrev
+0.0% +882 [ = ] 0 .debug_ranges
+0.0% +218Ki [ = ] 0 TOTAL
```
Bonus in not using zlib::compress
* we can compress a debug section larger than 4GiB
* peak memory usage is lower because for most shards the output size is less
than 50% input size (all less than 55% for a large binary I tested, but
decreasing the initial output size does not decrease memory usage)
Reviewed By: ikudrin
Differential Revision: https://reviews.llvm.org/D117853
2022-01-25 10:29:04 -08:00
|
|
|
// Holds the result of compressing an output section's contents (used by
// --compress-debug-sections / --compress-sections). The uncompressed data is
// split into shards that are compressed in parallel and concatenated when the
// section is written out.
struct CompressedData {
  // Per-shard compressed byte streams; numShards entries.
  std::unique_ptr<SmallVector<uint8_t, 0>[]> shards;
  // Compression format identifier; 0 means the section is not compressed.
  // NOTE(review): presumably an ELFCOMPRESS_* value — confirm in callers.
  uint32_t type = 0;
  // Number of entries in `shards`.
  uint32_t numShards = 0;
  // Checksum over the uncompressed data. NOTE(review): presumably the
  // zlib/zstd trailer checksum computed alongside compression — confirm.
  uint32_t checksum = 0;
  // Size of the original (uncompressed) section contents in bytes.
  uint64_t uncompressedSize;
};
|
|
|
|
|
2015-09-22 00:16:19 +00:00
|
|
|
// This represents a section in an output file.
// It is composed of multiple InputSections.
// The writer creates multiple OutputSections and assign them unique,
// non-overlapping file offsets and VAs.
class OutputSection final : public SectionBase {
public:
  OutputSection(Ctx &, StringRef name, uint32_t type, uint64_t flags);

  static bool classof(const SectionBase *s) {
    return s->kind() == SectionBase::Output;
  }

  // Returns the load memory address of this section.
  uint64_t getLMA() const;

  // Writes this section's attributes to the section header *sHdr.
  template <typename ELFT> void writeHeaderTo(typename ELFT::Shdr *sHdr);

  Ctx &ctx;

  // Index in the output section header table; UINT32_MAX until assigned.
  uint32_t sectionIndex = UINT32_MAX;

  // Rank used to order output sections; computed outside this class.
  unsigned sortRank;

  // Computes flags (presumably PF_* bits) for a program header covering this
  // section — NOTE(review): confirm derivation in the implementation.
  uint32_t getPhdrFlags() const;

  // Pointer to the PT_LOAD segment, which this section resides in. This field
  // is used to correctly compute file offset of a section. When two sections
  // share the same load segment, difference between their file offsets should
  // be equal to difference between their virtual addresses. To compute some
  // section offset we use the following formula: Off = Off_first + VA -
  // VA_first, where Off_first and VA_first is file offset and VA of first
  // section in PT_LOAD.
  PhdrEntry *ptLoad = nullptr;

  // Pointer to a relocation section for this section. Usually nullptr because
  // we consume relocations, but if --emit-relocs is specified (which is rare),
  // it may have a non-null value.
  OutputSection *relocationSection = nullptr;

  // Initially this field is the number of InputSections that have been added to
  // the OutputSection so far. Later on, after a call to assignAddresses, it
  // corresponds to the Elf_Shdr member.
  uint64_t size = 0;

  // The following fields correspond to Elf_Shdr members.
  uint64_t offset = 0;
  uint64_t addr = 0;
  uint32_t shName = 0;

  // Registers isec as belonging to this output section. NOTE(review): see the
  // implementation for exactly which state each of these three updates.
  void recordSection(InputSectionBase *isec);
  void commitSection(InputSection *isec);
  void finalizeInputSections();

  // The following members are normally only used in linker scripts.
  MemoryRegion *memRegion = nullptr;
  MemoryRegion *lmaRegion = nullptr;
  Expr addrExpr;
  Expr alignExpr;
  Expr lmaExpr;
  Expr subalignExpr;

  // Used by non-alloc SHT_CREL to hold the header and content byte stream.
  uint64_t crelHeader = 0;
  SmallVector<char, 0> crelBody;

  // Linker script commands attached to this output section description.
  SmallVector<SectionCommand *, 0> commands;
  // Names of program headers this section was assigned to by the script.
  SmallVector<StringRef, 0> phdrs;
  // 4-byte fill pattern for gaps, if one was specified.
  std::optional<std::array<uint8_t, 4>> filler;
  ConstraintKind constraint = ConstraintKind::NoConstraint;
  // Script location string, presumably for diagnostics — TODO confirm.
  std::string location;
  std::string memoryRegionName;
  std::string lmaRegionName;
  bool nonAlloc = false;
  // True if the section type was set explicitly rather than inferred.
  bool typeIsSet = false;
  bool expressionsUseSymbols = false;
  bool usedInExpression = false;
  bool inOverlay = false;
  bool firstInOverlay = false;

  // Tracks whether the section has ever had an input section added to it, even
  // if the section was later removed (e.g. because it is a synthetic section
  // that wasn't needed). This is needed for orphan placement.
  bool hasInputSections = false;

  // The output section description is specified between DATA_SEGMENT_ALIGN and
  // DATA_RELRO_END.
  bool relro = false;

  // Builds the CREL header/body for a non-alloc SHT_CREL section (see
  // crelHeader/crelBody above).
  template <bool is64> void finalizeNonAllocCrel(Ctx &);
  void finalize(Ctx &);
  // Writes the section contents to buf; may schedule parallel work on tg.
  template <class ELFT>
  void writeTo(Ctx &, uint8_t *buf, llvm::parallel::TaskGroup &tg);
  // Check that the addends for dynamic relocations were written correctly.
  void checkDynRelAddends(Ctx &);
  // Compresses the contents into `compressed` if compression applies to this
  // section.
  template <class ELFT> void maybeCompress(Ctx &);

  // Sorts the input sections by the rank returned by `order`.
  void sort(llvm::function_ref<int(InputSectionBase *s)> order);
  void sortInitFini();
  void sortCtorsDtors();

  std::array<uint8_t, 4> getFiller(Ctx &);

  // Used for implementation of --compress-debug-sections and
  // --compress-sections.
  CompressedData compressed;

private:
  SmallVector<InputSection *, 0> storage;
};
|
|
|
|
|
2022-03-08 11:23:41 -08:00
|
|
|
// A SectionCommand that owns an OutputSection, representing an output section
// description in the command list.
struct OutputDesc final : SectionCommand {
  OutputSection osec;
  OutputDesc(Ctx &ctx, StringRef name, uint32_t type, uint64_t flags)
      : SectionCommand(OutputSectionKind), osec(ctx, name, type, flags) {}

  static bool classof(const SectionCommand *c) {
    return c->kind == OutputSectionKind;
  }
};
|
|
|
|
|
2024-08-05 13:06:45 -07:00
|
|
|
// This represents a CLASS(class_name) { ... } that can be referenced by output
// section descriptions. If referenced more than once, the sections can be
// spilled to the next reference like --enable-non-contiguous-regions.
struct SectionClass final : public SectionBase {
  // Input section descriptions appearing inside the CLASS body.
  SmallVector<InputSectionDescription *, 0> commands;
  // NOTE(review): presumably set once the class's sections have been assigned
  // to a reference — confirm in the linker script implementation.
  bool assigned = false;

  SectionClass(StringRef name)
      : SectionBase(Class, nullptr, name, 0, 0, 0, 0, 0, 0) {}
  static bool classof(const SectionBase *s) { return s->kind() == Class; }
};
|
|
|
|
|
|
|
|
// A SectionCommand that owns a SectionClass, so CLASS definitions can appear
// in the command list alongside output section descriptions.
struct SectionClassDesc : SectionCommand {
  SectionClass sc;

  SectionClassDesc(StringRef name) : SectionCommand(ClassKind), sc(name) {}

  static bool classof(const SectionCommand *c) { return c->kind == ClassKind; }
};
|
|
|
|
|
2017-07-27 19:22:43 +00:00
|
|
|
// Returns an ordering priority derived from a section name. NOTE(review):
// presumably parses a numeric suffix such as the N in ".init_array.N" —
// confirm exact parsing in the implementation.
int getPriority(StringRef s);

// Returns the first input section of os (presumably nullptr if it has none —
// confirm in the implementation).
InputSection *getFirstInputSection(const OutputSection *os);

// Returns all input sections of os; `storage` provides backing store in case
// the result must be materialized into a flat list.
llvm::ArrayRef<InputSection *>
getInputSections(const OutputSection &os,
                 SmallVector<InputSection *, 0> &storage);

// NOTE(review): presumably the combined size of the ELF header and program
// headers reserved at the start of the first PT_LOAD — confirm semantics in
// the implementation.
uint64_t getHeaderSize(Ctx &);
|
2022-08-10 15:31:58 -04:00
|
|
|
} // namespace lld::elf
|
2015-11-04 02:11:57 +00:00
|
|
|
|
2016-07-12 09:49:43 +00:00
|
|
|
#endif
|