[scudo][standalone] Introduce the thread specific data structures

Summary: This CL adds the structures dealing with thread specific data for the allocator. This includes the thread specific data structure itself and two registries for said structures: an exclusive one, where each thread will have its own TSD struct, and a shared one, where a pool of TSD structs will be shared by all threads, with dynamic reassignment at runtime based on contention. This departs from the current Scudo implementation: we intend to make the Registry a template parameter of the allocator (as opposed to a single global entity), allowing various allocators to coexist with different TSD registry models. As a result, TSD registry and Allocator are tightly coupled. This also corrects a couple of things in other files that I noticed while adding this. Reviewers: eugenis, vitalybuka, morehouse, hctim Reviewed By: morehouse Subscribers: srhines, mgorny, delcypher, jfb, #sanitizers, llvm-commits Tags: #llvm, #sanitizers Differential Revision: https://reviews.llvm.org/D62258 llvm-svn: 362962
2025-04-27 05:46:06 +00:00 · 2019-06-10 16:50:52 +00:00 · 2019-06-10 16:50:52 +00:00 · 52bfd673d1
commit 52bfd673d1
parent 9aff1ddc14
10 changed files with 512 additions and 11 deletions
--- a/compiler-rt/lib/scudo/standalone/CMakeLists.txt
+++ b/compiler-rt/lib/scudo/standalone/CMakeLists.txt
@ -80,6 +80,9 @@ set(SCUDO_HEADERS
  size_class_map.h
  stats.h
  string_utils.h
+  tsd.h
+  tsd_exclusive.h
+  tsd_shared.h
  vector.h)

 if(COMPILER_RT_HAS_SCUDO_STANDALONE)
--- a/compiler-rt/lib/scudo/standalone/internal_defs.h
+++ b/compiler-rt/lib/scudo/standalone/internal_defs.h
@ -17,7 +17,7 @@
 #define SCUDO_DEBUG 0
 #endif

-#define ARRAY_SIZE(a) (sizeof(a) / sizeof((a)[0]))
+#define ARRAY_SIZE(A) (sizeof(A) / sizeof((A)[0]))

 // String related macros.

--- a/compiler-rt/lib/scudo/standalone/mutex.h
+++ b/compiler-rt/lib/scudo/standalone/mutex.h
@ -16,7 +16,6 @@ namespace scudo {

 class StaticSpinMutex {
 public:
-  void initLinkerInitialized() {}
  void init() { atomic_store_relaxed(&State, 0); }

  void lock() {
--- a/compiler-rt/lib/scudo/standalone/quarantine.h
+++ b/compiler-rt/lib/scudo/standalone/quarantine.h
@ -185,8 +185,6 @@ public:
    atomic_store_relaxed(&MaxCacheSize, CacheSize);

    Cache.initLinkerInitialized();
-    CacheMutex.initLinkerInitialized();
-    RecyleMutex.initLinkerInitialized();
  }
  void init(uptr Size, uptr CacheSize) {
    memset(this, 0, sizeof(*this));
--- a/compiler-rt/lib/scudo/standalone/tests/CMakeLists.txt
+++ b/compiler-rt/lib/scudo/standalone/tests/CMakeLists.txt
@ -65,6 +65,7 @@ set(SCUDO_UNIT_TEST_SOURCES
  size_class_map_test.cc
  stats_test.cc
  strings_test.cc
+  tsd_test.cc
  vector_test.cc
  scudo_unit_test_main.cc)

--- a/compiler-rt/lib/scudo/standalone/tests/primary_test.cc
+++ b/compiler-rt/lib/scudo/standalone/tests/primary_test.cc
@ -47,7 +47,7 @@ template <typename Primary> static void testPrimary() {

 TEST(ScudoPrimaryTest, BasicPrimary) {
  using SizeClassMap = scudo::DefaultSizeClassMap;
-  testPrimary<scudo::SizeClassAllocator32<SizeClassMap, 24U>>();
+  testPrimary<scudo::SizeClassAllocator32<SizeClassMap, 18U>>();
  testPrimary<scudo::SizeClassAllocator64<SizeClassMap, 24U>>();
 }

@ -121,10 +121,14 @@ template <typename Primary> static void testIteratePrimary() {

 TEST(ScudoPrimaryTest, PrimaryIterate) {
  using SizeClassMap = scudo::DefaultSizeClassMap;
-  testIteratePrimary<scudo::SizeClassAllocator32<SizeClassMap, 24U>>();
+  testIteratePrimary<scudo::SizeClassAllocator32<SizeClassMap, 18U>>();
  testIteratePrimary<scudo::SizeClassAllocator64<SizeClassMap, 24U>>();
 }

+// TODO(kostyak): reenable on 32-bit after implementing unmapTestOnly for the
+//                primary: we are running out of addressable space without.
+#if SCUDO_WORDSIZE == 64U
+
 static std::mutex Mutex;
 static std::condition_variable Cv;
 static bool Ready = false;
@ -142,7 +146,8 @@ template <typename Primary> static void performAllocations(Primary *Allocator) {
    const scudo::uptr Size = std::rand() % Primary::SizeClassMap::MaxSize;
    const scudo::uptr ClassId = Primary::SizeClassMap::getClassIdBySize(Size);
    void *P = Cache.allocate(ClassId);
-    V.push_back(std::make_pair(ClassId, P));
+    if (P)
+      V.push_back(std::make_pair(ClassId, P));
  }
  while (!V.empty()) {
    auto Pair = V.back();
@ -155,8 +160,8 @@ template <typename Primary> static void performAllocations(Primary *Allocator) {
 template <typename Primary> static void testPrimaryThreaded() {
  std::unique_ptr<Primary> Allocator(new Primary);
  Allocator->init(/*ReleaseToOsInterval=*/-1);
-  std::thread Threads[10];
-  for (scudo::uptr I = 0; I < 10U; I++)
+  std::thread Threads[32];
+  for (scudo::uptr I = 0; I < ARRAY_SIZE(Threads); I++)
    Threads[I] = std::thread(performAllocations<Primary>, Allocator.get());
  {
    std::unique_lock<std::mutex> Lock(Mutex);
@ -171,6 +176,8 @@ template <typename Primary> static void testPrimaryThreaded() {

 TEST(ScudoPrimaryTest, PrimaryThreaded) {
  using SizeClassMap = scudo::SvelteSizeClassMap;
-  testPrimaryThreaded<scudo::SizeClassAllocator32<SizeClassMap, 24U>>();
-  testPrimaryThreaded<scudo::SizeClassAllocator64<SizeClassMap, 24U>>();
+  testPrimaryThreaded<scudo::SizeClassAllocator32<SizeClassMap, 18U>>();
+  testPrimaryThreaded<scudo::SizeClassAllocator64<SizeClassMap, 28U>>();
 }
+
+#endif // SCUDO_WORDSIZE == 64U
--- a/compiler-rt/lib/scudo/standalone/tests/tsd_test.cc
+++ b/compiler-rt/lib/scudo/standalone/tests/tsd_test.cc
@ -0,0 +1,152 @@
+//===-- tsd_test.cc ---------------------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "tsd_exclusive.h"
+#include "tsd_shared.h"
+
+#include "gtest/gtest.h"
+
+#include <condition_variable>
+#include <mutex>
+#include <thread>
+
+// We mock out an allocator with a TSD registry, mostly using empty stubs. The
+// cache contains a single volatile uptr, to be able to test that several
+// concurrent threads will not access or modify the same cache at the same time.
+template <class Config> class MockAllocator {
+public:
+  using ThisT = MockAllocator<Config>;
+  using TSDRegistryT = typename Config::template TSDRegistryT<ThisT>;
+  using CacheT = struct MockCache { volatile scudo::uptr Canary; };
+  using QuarantineCacheT = struct MockQuarantine {};
+
+  void initLinkerInitialized() {
+    // This should only be called once by the registry.
+    EXPECT_FALSE(Initialized);
+    Initialized = true;
+  }
+  void reset() { memset(this, 0, sizeof(*this)); }
+
+  void initCache(CacheT *Cache) { memset(Cache, 0, sizeof(*Cache)); }
+  void commitBack(scudo::TSD<MockAllocator> *TSD) {}
+  TSDRegistryT *getTSDRegistry() { return &TSDRegistry; }
+
+  bool isInitialized() { return Initialized; }
+
+private:
+  bool Initialized;
+  TSDRegistryT TSDRegistry;
+};
+
+struct OneCache {
+  template <class Allocator>
+  using TSDRegistryT = scudo::TSDRegistrySharedT<Allocator, 1U>;
+};
+
+struct SharedCaches {
+  template <class Allocator>
+  using TSDRegistryT = scudo::TSDRegistrySharedT<Allocator, 16U>;
+};
+
+struct ExclusiveCaches {
+  template <class Allocator>
+  using TSDRegistryT = scudo::TSDRegistryExT<Allocator>;
+};
+
+TEST(ScudoTSDTest, TSDRegistryInit) {
+  using AllocatorT = MockAllocator<OneCache>;
+  std::unique_ptr<AllocatorT> Allocator(new AllocatorT);
+  Allocator->reset();
+  EXPECT_FALSE(Allocator->isInitialized());
+
+  auto Registry = Allocator->getTSDRegistry();
+  Registry->initLinkerInitialized(Allocator.get());
+  EXPECT_TRUE(Allocator->isInitialized());
+}
+
+template <class AllocatorT> static void testRegistry() {
+  std::unique_ptr<AllocatorT> Allocator(new AllocatorT);
+  Allocator->reset();
+  EXPECT_FALSE(Allocator->isInitialized());
+
+  auto Registry = Allocator->getTSDRegistry();
+  Registry->initThreadMaybe(Allocator.get(), /*MinimalInit=*/true);
+  EXPECT_TRUE(Allocator->isInitialized());
+
+  bool UnlockRequired;
+  auto TSD = Registry->getTSDAndLock(&UnlockRequired);
+  EXPECT_NE(TSD, nullptr);
+  EXPECT_EQ(TSD->Cache.Canary, 0U);
+  if (UnlockRequired)
+    TSD->unlock();
+
+  Registry->initThreadMaybe(Allocator.get(), /*MinimalInit=*/false);
+  TSD = Registry->getTSDAndLock(&UnlockRequired);
+  EXPECT_NE(TSD, nullptr);
+  EXPECT_EQ(TSD->Cache.Canary, 0U);
+  memset(&TSD->Cache, 0x42, sizeof(TSD->Cache));
+  if (UnlockRequired)
+    TSD->unlock();
+}
+
+TEST(ScudoTSDTest, TSDRegistryBasic) {
+  testRegistry<MockAllocator<OneCache>>();
+  testRegistry<MockAllocator<SharedCaches>>();
+  testRegistry<MockAllocator<ExclusiveCaches>>();
+}
+
+static std::mutex Mutex;
+static std::condition_variable Cv;
+static bool Ready = false;
+
+template <typename AllocatorT> static void stressCache(AllocatorT *Allocator) {
+  auto Registry = Allocator->getTSDRegistry();
+  {
+    std::unique_lock<std::mutex> Lock(Mutex);
+    while (!Ready)
+      Cv.wait(Lock);
+  }
+  Registry->initThreadMaybe(Allocator, /*MinimalInit=*/false);
+  bool UnlockRequired;
+  auto TSD = Registry->getTSDAndLock(&UnlockRequired);
+  EXPECT_NE(TSD, nullptr);
+  // For an exclusive TSD, the cache should be empty. We cannot guarantee the
+  // same for a shared TSD.
+  if (!UnlockRequired)
+    EXPECT_EQ(TSD->Cache.Canary, 0U);
+  // Transform the thread id to a uptr to use it as canary.
+  const scudo::uptr Canary = static_cast<scudo::uptr>(
+      std::hash<std::thread::id>{}(std::this_thread::get_id()));
+  TSD->Cache.Canary = Canary;
+  // Loop a few times to make sure that a concurrent thread isn't modifying it.
+  for (scudo::uptr I = 0; I < 4096U; I++)
+    EXPECT_EQ(TSD->Cache.Canary, Canary);
+  if (UnlockRequired)
+    TSD->unlock();
+}
+
+template <class AllocatorT> static void testRegistryThreaded() {
+  std::unique_ptr<AllocatorT> Allocator(new AllocatorT);
+  Allocator->reset();
+  std::thread Threads[32];
+  for (scudo::uptr I = 0; I < ARRAY_SIZE(Threads); I++)
+    Threads[I] = std::thread(stressCache<AllocatorT>, Allocator.get());
+  {
+    std::unique_lock<std::mutex> Lock(Mutex);
+    Ready = true;
+    Cv.notify_all();
+  }
+  for (auto &T : Threads)
+    T.join();
+}
+
+TEST(ScudoTSDTest, TSDRegistryThreaded) {
+  testRegistryThreaded<MockAllocator<OneCache>>();
+  testRegistryThreaded<MockAllocator<SharedCaches>>();
+  testRegistryThreaded<MockAllocator<ExclusiveCaches>>();
+}
--- a/compiler-rt/lib/scudo/standalone/tsd.h
+++ b/compiler-rt/lib/scudo/standalone/tsd.h
@ -0,0 +1,61 @@
+//===-- tsd.h ---------------------------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef SCUDO_TSD_H_
+#define SCUDO_TSD_H_
+
+#include "atomic_helpers.h"
+#include "common.h"
+#include "mutex.h"
+
+#include <limits.h> // for PTHREAD_DESTRUCTOR_ITERATIONS
+
+namespace scudo {
+
+template <class Allocator> struct ALIGNED(SCUDO_CACHE_LINE_SIZE) TSD {
+  typename Allocator::CacheT Cache;
+  typename Allocator::QuarantineCacheT QuarantineCache;
+  u8 DestructorIterations;
+
+  void initLinkerInitialized(Allocator *Instance) {
+    Instance->initCache(&Cache);
+    DestructorIterations = PTHREAD_DESTRUCTOR_ITERATIONS;
+  }
+  void init(Allocator *Instance) {
+    memset(this, 0, sizeof(*this));
+    initLinkerInitialized(Instance);
+  }
+
+  void commitBack(Allocator *Instance) { Instance->commitBack(this); }
+
+  INLINE bool tryLock() {
+    if (Mutex.tryLock()) {
+      atomic_store_relaxed(&Precedence, 0);
+      return true;
+    }
+    if (atomic_load_relaxed(&Precedence) == 0)
+      atomic_store_relaxed(
+          &Precedence,
+          static_cast<uptr>(getMonotonicTime() >> FIRST_32_SECOND_64(16, 0)));
+    return false;
+  }
+  INLINE void lock() {
+    atomic_store_relaxed(&Precedence, 0);
+    Mutex.lock();
+  }
+  INLINE void unlock() { Mutex.unlock(); }
+  INLINE uptr getPrecedence() { return atomic_load_relaxed(&Precedence); }
+
+private:
+  StaticSpinMutex Mutex;
+  atomic_uptr Precedence;
+};
+
+} // namespace scudo
+
+#endif // SCUDO_TSD_H_
--- a/compiler-rt/lib/scudo/standalone/tsd_exclusive.h
+++ b/compiler-rt/lib/scudo/standalone/tsd_exclusive.h
@ -0,0 +1,114 @@
+//===-- tsd_exclusive.h -----------------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef SCUDO_TSD_EXCLUSIVE_H_
+#define SCUDO_TSD_EXCLUSIVE_H_
+
+#include "tsd.h"
+
+#include <pthread.h>
+
+namespace scudo {
+
+enum class ThreadState : u8 {
+  NotInitialized = 0,
+  Initialized,
+  TornDown,
+};
+
+template <class Allocator> void teardownThread(void *Ptr);
+
+template <class Allocator> struct TSDRegistryExT {
+  void initLinkerInitialized(Allocator *Instance) {
+    Instance->initLinkerInitialized();
+    CHECK_EQ(pthread_key_create(&PThreadKey, teardownThread<Allocator>), 0);
+    FallbackTSD = reinterpret_cast<TSD<Allocator> *>(
+        map(nullptr, sizeof(TSD<Allocator>), "scudo:tsd"));
+    FallbackTSD->initLinkerInitialized(Instance);
+    Initialized = true;
+  }
+  void init(Allocator *Instance) {
+    memset(this, 0, sizeof(*this));
+    initLinkerInitialized(Instance);
+  }
+
+  ALWAYS_INLINE void initThreadMaybe(Allocator *Instance, bool MinimalInit) {
+    if (LIKELY(State != ThreadState::NotInitialized))
+      return;
+    initThread(Instance, MinimalInit);
+  }
+
+  ALWAYS_INLINE TSD<Allocator> *getTSDAndLock(bool *UnlockRequired) {
+    if (LIKELY(State == ThreadState::Initialized)) {
+      *UnlockRequired = false;
+      return &ThreadTSD;
+    }
+    DCHECK(FallbackTSD);
+    FallbackTSD->lock();
+    *UnlockRequired = true;
+    return FallbackTSD;
+  }
+
+private:
+  void initOnceMaybe(Allocator *Instance) {
+    SpinMutexLock L(&Mutex);
+    if (Initialized)
+      return;
+    initLinkerInitialized(Instance); // Sets Initialized.
+  }
+
+  // Using minimal initialization allows for global initialization while keeping
+  // the thread specific structure untouched. The fallback structure will be
+  // used instead.
+  NOINLINE void initThread(Allocator *Instance, bool MinimalInit) {
+    initOnceMaybe(Instance);
+    if (MinimalInit)
+      return;
+    CHECK_EQ(
+        pthread_setspecific(PThreadKey, reinterpret_cast<void *>(Instance)), 0);
+    ThreadTSD.initLinkerInitialized(Instance);
+    State = ThreadState::Initialized;
+  }
+
+  pthread_key_t PThreadKey;
+  bool Initialized;
+  TSD<Allocator> *FallbackTSD;
+  StaticSpinMutex Mutex;
+  static THREADLOCAL ThreadState State;
+  static THREADLOCAL TSD<Allocator> ThreadTSD;
+
+  friend void teardownThread<Allocator>(void *Ptr);
+};
+
+template <class Allocator>
+THREADLOCAL TSD<Allocator> TSDRegistryExT<Allocator>::ThreadTSD;
+template <class Allocator>
+THREADLOCAL ThreadState TSDRegistryExT<Allocator>::State;
+
+template <class Allocator> void teardownThread(void *Ptr) {
+  typedef TSDRegistryExT<Allocator> TSDRegistryT;
+  Allocator *Instance = reinterpret_cast<Allocator *>(Ptr);
+  // The glibc POSIX thread-local-storage deallocation routine calls user
+  // provided destructors in a loop of PTHREAD_DESTRUCTOR_ITERATIONS.
+  // We want to be called last since other destructors might call free and the
+  // like, so we wait until PTHREAD_DESTRUCTOR_ITERATIONS before draining the
+  // quarantine and swallowing the cache.
+  if (TSDRegistryT::ThreadTSD.DestructorIterations > 1) {
+    TSDRegistryT::ThreadTSD.DestructorIterations--;
+    // If pthread_setspecific fails, we will go ahead with the teardown.
+    if (LIKELY(pthread_setspecific(Instance->getTSDRegistry()->PThreadKey,
+                                   Ptr) == 0))
+      return;
+  }
+  TSDRegistryT::ThreadTSD.commitBack(Instance);
+  TSDRegistryT::State = ThreadState::TornDown;
+}
+
+} // namespace scudo
+
+#endif // SCUDO_TSD_EXCLUSIVE_H_
--- a/compiler-rt/lib/scudo/standalone/tsd_shared.h
+++ b/compiler-rt/lib/scudo/standalone/tsd_shared.h
@ -0,0 +1,166 @@
+//===-- tsd_shared.h --------------------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef SCUDO_TSD_SHARED_H_
+#define SCUDO_TSD_SHARED_H_
+
+#include "linux.h" // for getAndroidTlsPtr()
+#include "tsd.h"
+
+#include <pthread.h>
+
+namespace scudo {
+
+template <class Allocator, u32 MaxTSDCount> struct TSDRegistrySharedT {
+  void initLinkerInitialized(Allocator *Instance) {
+    Instance->initLinkerInitialized();
+    CHECK_EQ(pthread_key_create(&PThreadKey, nullptr), 0); // For non-TLS
+    NumberOfTSDs = Min(Max(1U, getNumberOfCPUs()), MaxTSDCount);
+    TSDs = reinterpret_cast<TSD<Allocator> *>(
+        map(nullptr, sizeof(TSD<Allocator>) * NumberOfTSDs, "scudo:tsd"));
+    for (u32 I = 0; I < NumberOfTSDs; I++)
+      TSDs[I].initLinkerInitialized(Instance);
+    // Compute all the coprimes of NumberOfTSDs. This will be used to walk the
+    // array of TSDs in a random order. For details, see:
+    // https://lemire.me/blog/2017/09/18/visiting-all-values-in-an-array-exactly-once-in-random-order/
+    for (u32 I = 0; I < NumberOfTSDs; I++) {
+      u32 A = I + 1;
+      u32 B = NumberOfTSDs;
+      // Find the GCD between I + 1 and NumberOfTSDs. If 1, they are coprimes.
+      while (B != 0) {
+        const u32 T = A;
+        A = B;
+        B = T % B;
+      }
+      if (A == 1)
+        CoPrimes[NumberOfCoPrimes++] = I + 1;
+    }
+    Initialized = true;
+  }
+  void init(Allocator *Instance) {
+    memset(this, 0, sizeof(*this));
+    initLinkerInitialized(Instance);
+  }
+
+  ALWAYS_INLINE void initThreadMaybe(Allocator *Instance,
+                                     UNUSED bool MinimalInit) {
+    if (LIKELY(getCurrentTSD()))
+      return;
+    initThread(Instance);
+  }
+
+  ALWAYS_INLINE TSD<Allocator> *getTSDAndLock(bool *UnlockRequired) {
+    TSD<Allocator> *TSD = getCurrentTSD();
+    DCHECK(TSD);
+    *UnlockRequired = true;
+    // Try to lock the currently associated context.
+    if (TSD->tryLock())
+      return TSD;
+    // If that fails, go down the slow path.
+    return getTSDAndLockSlow(TSD);
+  }
+
+private:
+  ALWAYS_INLINE void setCurrentTSD(TSD<Allocator> *CurrentTSD) {
+#if SCUDO_ANDROID
+    *getAndroidTlsPtr() = reinterpret_cast<uptr>(CurrentTSD);
+#elif SCUDO_LINUX
+    ThreadTSD = CurrentTSD;
+#else
+    CHECK_EQ(
+        pthread_setspecific(PThreadKey, reinterpret_cast<void *>(CurrentTSD)),
+        0);
+#endif
+  }
+
+  ALWAYS_INLINE TSD<Allocator> *getCurrentTSD() {
+#if SCUDO_ANDROID
+    return reinterpret_cast<TSD<Allocator> *>(*getAndroidTlsPtr());
+#elif SCUDO_LINUX
+    return ThreadTSD;
+#else
+    return reinterpret_cast<TSD<Allocator> *>(pthread_getspecific(PThreadKey));
+#endif
+  }
+
+  void initOnceMaybe(Allocator *Instance) {
+    SpinMutexLock L(&Mutex);
+    if (Initialized)
+      return;
+    initLinkerInitialized(Instance); // Sets Initialized.
+  }
+
+  NOINLINE void initThread(Allocator *Instance) {
+    initOnceMaybe(Instance);
+    // Initial context assignment is done in a plain round-robin fashion.
+    const u32 Index = atomic_fetch_add(&CurrentIndex, 1U, memory_order_relaxed);
+    setCurrentTSD(&TSDs[Index % NumberOfTSDs]);
+  }
+
+  NOINLINE TSD<Allocator> *getTSDAndLockSlow(TSD<Allocator> *CurrentTSD) {
+    if (MaxTSDCount > 1U && NumberOfTSDs > 1U) {
+      // Use the Precedence of the current TSD as our random seed. Since we are
+      // in the slow path, it means that tryLock failed, and as a result it's
+      // very likely that said Precedence is non-zero.
+      u32 RandState = static_cast<u32>(CurrentTSD->getPrecedence());
+      const u32 R = getRandomU32(&RandState);
+      const u32 Inc = CoPrimes[R % NumberOfCoPrimes];
+      u32 Index = R % NumberOfTSDs;
+      uptr LowestPrecedence = UINTPTR_MAX;
+      TSD<Allocator> *CandidateTSD = nullptr;
+      // Go randomly through at most 4 contexts and find a candidate.
+      for (u32 I = 0; I < Min(4U, NumberOfTSDs); I++) {
+        if (TSDs[Index].tryLock()) {
+          setCurrentTSD(&TSDs[Index]);
+          return &TSDs[Index];
+        }
+        const uptr Precedence = TSDs[Index].getPrecedence();
+        // A 0 precedence here means another thread just locked this TSD.
+        if (UNLIKELY(Precedence == 0))
+          continue;
+        if (Precedence < LowestPrecedence) {
+          CandidateTSD = &TSDs[Index];
+          LowestPrecedence = Precedence;
+        }
+        Index += Inc;
+        if (Index >= NumberOfTSDs)
+          Index -= NumberOfTSDs;
+      }
+      if (CandidateTSD) {
+        CandidateTSD->lock();
+        setCurrentTSD(CandidateTSD);
+        return CandidateTSD;
+      }
+    }
+    // Last resort, stick with the current one.
+    CurrentTSD->lock();
+    return CurrentTSD;
+  }
+
+  pthread_key_t PThreadKey;
+  atomic_u32 CurrentIndex;
+  u32 NumberOfTSDs;
+  TSD<Allocator> *TSDs;
+  u32 NumberOfCoPrimes;
+  u32 CoPrimes[MaxTSDCount];
+  bool Initialized;
+  StaticSpinMutex Mutex;
+#if SCUDO_LINUX && !SCUDO_ANDROID
+  static THREADLOCAL TSD<Allocator> *ThreadTSD;
+#endif
+};
+
+#if SCUDO_LINUX && !SCUDO_ANDROID
+template <class Allocator, u32 MaxTSDCount>
+THREADLOCAL TSD<Allocator>
+    *TSDRegistrySharedT<Allocator, MaxTSDCount>::ThreadTSD;
+#endif
+
+} // namespace scudo
+
+#endif // SCUDO_TSD_SHARED_H_