[Clang][C++23] P2071 Named universal character escapes

Implements [[ https://wg21.link/p2071r1 | P2071 Named Universal Character Escapes ]] as an extension in all language modes. The change to stop warning in C++23 mode will be done later, once this paper is plenary approved (in July). We add: * A code generator that transforms `UnicodeData.txt` and `NameAliases.txt` into a space-efficient data structure that can be queried in `O(NameLength)` * A set of functions in `Unicode.h` to query that data, including * A function to find an exact match of a given Unicode character name * A function to perform loose (ignoring case, space, underscore, medial hyphen) matching * A function returning the best matching codepoint for a given string per edit distance * Support for `\N{}` escape sequences in string and character literals, with loose-matching and typo diagnostics/fix-its * Support for `\N{}` as a UCN, with loose-matching diagnostics/fix-its. Loose matching is considered an error, to closely match the semantics of P2071. The generated data contributes 280kB to the binaries. `UnicodeData.txt` and `NameAliases.txt` are not committed to the repository in this patch, and regenerating the data is a manual process. Reviewed By: tahonermann Differential Revision: https://reviews.llvm.org/D123064
2025-04-17 18:16:42 +00:00 · 2022-04-04 12:41:12 +02:00 · 2022-04-04 12:41:12 +02:00 · c92056d038
commit c92056d038
parent f8c1c9afd3
18 changed files with 22720 additions and 53 deletions
--- a/clang/include/clang/Basic/DiagnosticLexKinds.td
+++ b/clang/include/clang/Basic/DiagnosticLexKinds.td
@ -128,7 +128,7 @@ def warn_utf8_symbol_zero_width : Warning<
  "some environments">, InGroup<DiagGroup<"unicode-zero-width">>;

 def ext_delimited_escape_sequence : Extension<
-  "delimited escape sequences are a Clang extension">,
+  "%select{delimited|named}0 escape sequences are a Clang extension">,
  InGroup<DiagGroup<"delimited-escape-sequence-extension">>;
 def err_delimited_escape_empty : Error<
  "delimited escape sequence cannot be empty">;
@ -138,6 +138,13 @@ def err_delimited_escape_invalid : Error<
  "invalid digit '%0' in escape sequence">;
 def err_hex_escape_no_digits : Error<
  "\\%0 used with no following hex digits">;
+def err_invalid_ucn_name : Error<
+  "'%0' is not a valid Unicode character name">;
+def note_invalid_ucn_name_loose_matching : Note<
+  "characters names in Unicode escape sequences are sensitive to case and whitespaces">;
+def note_invalid_ucn_name_candidate : Note<
+  "did you mean %0 ('%2' U+%1)?">;
+
 def warn_ucn_escape_no_digits : Warning<
  "\\%0 used with no following hex digits; "
  "treating as '\\' followed by identifier">, InGroup<Unicode>;
@ -145,10 +152,10 @@ def err_ucn_escape_incomplete : Error<
  "incomplete universal character name">;
 def warn_delimited_ucn_incomplete : Warning<
  "incomplete delimited universal character name; "
-  "treating as '\\' 'u' '{' identifier">, InGroup<Unicode>;
+  "treating as '\\' '%0' '{' identifier">, InGroup<Unicode>;
 def warn_delimited_ucn_empty : Warning<
  "empty delimited universal character name; "
-  "treating as '\\' 'u' '{' '}'">, InGroup<Unicode>;
+  "treating as '\\' '%0' '{' '}'">, InGroup<Unicode>;
 def warn_ucn_escape_incomplete : Warning<
  "incomplete universal character name; "
  "treating as '\\' followed by identifier">, InGroup<Unicode>;
--- a/clang/include/clang/Lex/Lexer.h
+++ b/clang/include/clang/Lex/Lexer.h
@ -769,6 +769,11 @@ private:
  void codeCompleteIncludedFile(const char *PathStart,
                                const char *CompletionPoint, bool IsAngled);

+  llvm::Optional<uint32_t>
+  tryReadNumericUCN(const char *&StartPtr, const char *SlashLoc, Token *Result);
+  llvm::Optional<uint32_t> tryReadNamedUCN(const char *&StartPtr,
+                                           Token *Result);
+
  /// Read a universal character name.
  ///
  /// \param StartPtr The position in the source buffer after the initial '\'.
--- a/clang/lib/Lex/Lexer.cpp
+++ b/clang/lib/Lex/Lexer.cpp
@ -37,6 +37,7 @@
 #include "llvm/Support/MathExtras.h"
 #include "llvm/Support/MemoryBufferRef.h"
 #include "llvm/Support/NativeFormatting.h"
+#include "llvm/Support/Unicode.h"
 #include "llvm/Support/UnicodeCharRanges.h"
 #include <algorithm>
 #include <cassert>
@ -3119,27 +3120,28 @@ bool Lexer::isCodeCompletionPoint(const char *CurPtr) const {
  return false;
 }

-uint32_t Lexer::tryReadUCN(const char *&StartPtr, const char *SlashLoc,
-                           Token *Result) {
+llvm::Optional<uint32_t> Lexer::tryReadNumericUCN(const char *&StartPtr,
+                                                  const char *SlashLoc,
+                                                  Token *Result) {
  unsigned CharSize;
  char Kind = getCharAndSize(StartPtr, CharSize);
-  bool Delimited = false;
-  bool FoundEndDelimiter = false;
-  unsigned Count = 0;
-  bool Diagnose = Result && !isLexingRawMode();
+  assert((Kind == 'u' || Kind == 'U') && "expected a UCN");

  unsigned NumHexDigits;
  if (Kind == 'u')
    NumHexDigits = 4;
  else if (Kind == 'U')
    NumHexDigits = 8;
-  else
-    return 0;
+
+  bool Delimited = false;
+  bool FoundEndDelimiter = false;
+  unsigned Count = 0;
+  bool Diagnose = Result && !isLexingRawMode();

  if (!LangOpts.CPlusPlus && !LangOpts.C99) {
    if (Diagnose)
      Diag(SlashLoc, diag::warn_ucn_not_valid_in_c89);
-    return 0;
+    return llvm::None;
  }

  const char *CurPtr = StartPtr + CharSize;
@ -3166,14 +3168,14 @@ uint32_t Lexer::tryReadUCN(const char *&StartPtr, const char *SlashLoc,
        break;
      if (Diagnose)
        Diag(BufferPtr, diag::warn_delimited_ucn_incomplete)
-            << StringRef(&C, 1);
-      return 0;
+            << StringRef(KindLoc, 1);
+      return llvm::None;
    }

    if (CodePoint & 0xF000'0000) {
      if (Diagnose)
        Diag(KindLoc, diag::err_escape_too_large) << 0;
-      return 0;
+      return llvm::None;
    }

    CodePoint <<= 4;
@ -3187,7 +3189,13 @@ uint32_t Lexer::tryReadUCN(const char *&StartPtr, const char *SlashLoc,
      Diag(StartPtr, FoundEndDelimiter ? diag::warn_delimited_ucn_empty
                                       : diag::warn_ucn_escape_no_digits)
          << StringRef(KindLoc, 1);
-    return 0;
+    return llvm::None;
+  }
+
+  if (Delimited && Kind == 'U') {
+    if (Diagnose)
+      Diag(StartPtr, diag::err_hex_escape_no_digits) << StringRef(KindLoc, 1);
+    return llvm::None;
  }

  if (!Delimited && Count != NumHexDigits) {
@ -3200,11 +3208,11 @@ uint32_t Lexer::tryReadUCN(const char *&StartPtr, const char *SlashLoc,
            << FixItHint::CreateReplacement(URange, "u");
      }
    }
-    return 0;
+    return llvm::None;
  }

  if (Delimited && PP) {
-    Diag(BufferPtr, diag::ext_delimited_escape_sequence);
+    Diag(BufferPtr, diag::ext_delimited_escape_sequence) << /*delimited*/ 0;
  }

  if (Result) {
@ -3217,6 +3225,110 @@ uint32_t Lexer::tryReadUCN(const char *&StartPtr, const char *SlashLoc,
  } else {
    StartPtr = CurPtr;
  }
+  return CodePoint;
+}
+
+/// Try to read a named universal character name of the form \N{NAME}.
+///
+/// \param StartPtr The position in the source buffer of the 'N' that follows
+///                 the initial '\'. It is advanced past the closing '}' when a
+///                 code point is returned and left untouched otherwise.
+/// \param Result   If non-null, the token being formed; it is flagged with
+///                 Token::HasUCN when a code point is returned.
+///
+/// \return The code point named by the escape (possibly found through loose
+///         matching after an error has been reported), or llvm::None if the
+///         escape is malformed or names no character.
+llvm::Optional<uint32_t> Lexer::tryReadNamedUCN(const char *&StartPtr,
+                                                Token *Result) {
+  unsigned CharSize;
+  bool Diagnose = Result && !isLexingRawMode();
+
+  char C = getCharAndSize(StartPtr, CharSize);
+  assert(C == 'N' && "expected \\N{...}");
+
+  const char *CurPtr = StartPtr + CharSize;
+  const char *KindLoc = &CurPtr[-1];
+
+  C = getCharAndSize(CurPtr, CharSize);
+  if (C != '{') {
+    if (Diagnose)
+      Diag(StartPtr, diag::warn_ucn_escape_incomplete);
+    return llvm::None;
+  }
+  CurPtr += CharSize;
+  const char *StartName = CurPtr;
+  bool FoundEndDelimiter = false;
+  llvm::SmallVector<char, 30> Buffer;
+  // Collect the characters of the name up to the closing '}'. Any character
+  // that cannot appear in a name ends the scan without a delimiter.
+  while (C) {
+    C = getCharAndSize(CurPtr, CharSize);
+    CurPtr += CharSize;
+    if (C == '}') {
+      FoundEndDelimiter = true;
+      break;
+    }
+
+    if (!isAlphanumeric(C) && C != '_' && C != '-' && C != ' ')
+      break;
+    Buffer.push_back(C);
+  }
+
+  if (!FoundEndDelimiter || Buffer.empty()) {
+    if (Diagnose)
+      Diag(StartPtr, FoundEndDelimiter ? diag::warn_delimited_ucn_empty
+                                       : diag::warn_delimited_ucn_incomplete)
+          << StringRef(KindLoc, 1);
+    return llvm::None;
+  }
+
+  StringRef Name(Buffer.data(), Buffer.size());
+  llvm::Optional<char32_t> Res =
+      llvm::sys::unicode::nameToCodepointStrict(Name);
+  llvm::Optional<llvm::sys::unicode::LooseMatchingResult> LooseMatch;
+  if (!Res) {
+    // NOTE(review): this is gated on !isLexingRawMode() rather than on
+    // Diagnose, so the error is also reported when Result is null - confirm
+    // that this is intended.
+    if (!isLexingRawMode()) {
+      Diag(StartPtr, diag::err_invalid_ucn_name) << Name;
+      LooseMatch = llvm::sys::unicode::nameToCodepointLooseMatching(Name);
+      if (LooseMatch) {
+        Diag(StartName, diag::note_invalid_ucn_name_loose_matching)
+            << FixItHint::CreateReplacement(
+                   makeCharRange(*this, StartName, CurPtr - CharSize),
+                   LooseMatch->Name);
+      }
+    }
+    // When finding a match using Unicode loose matching rules
+    // recover after having emitted a diagnostic.
+    if (!LooseMatch)
+      return llvm::None;
+    // We do not offer misspelled character name suggestions here
+    // as the set of what would be a valid suggestion depends on context,
+    // and we should not make invalid suggestions.
+  }
+
+  if (Diagnose && PP && !LooseMatch)
+    Diag(BufferPtr, diag::ext_delimited_escape_sequence) << /*named*/ 1;
+
+  if (LooseMatch)
+    Res = LooseMatch->CodePoint;
+
+  if (Result) {
+    Result->setFlag(Token::HasUCN);
+    // Measured from StartPtr (the 'N'), an escape whose characters were each
+    // read as a single byte spans 'N' + '{' + the name + '}', that is
+    // Buffer.size() + 3 bytes, and can be skipped in one step. A longer span
+    // means some character took several bytes to read, so walk it again with
+    // getAndAdvanceChar, which is handed Result.
+    if (CurPtr - StartPtr == static_cast<ptrdiff_t>(Buffer.size() + 3))
+      StartPtr = CurPtr;
+    else
+      while (StartPtr != CurPtr)
+        (void)getAndAdvanceChar(StartPtr, *Result);
+  } else {
+    StartPtr = CurPtr;
+  }
+  return *Res;
+}
+
+uint32_t Lexer::tryReadUCN(const char *&StartPtr, const char *SlashLoc,
+                           Token *Result) {
+
+  unsigned CharSize;
+  llvm::Optional<uint32_t> CodePointOpt;
+  char Kind = getCharAndSize(StartPtr, CharSize);
+  if (Kind == 'u' || Kind == 'U')
+    CodePointOpt = tryReadNumericUCN(StartPtr, SlashLoc, Result);
+  else if (Kind == 'N')
+    CodePointOpt = tryReadNamedUCN(StartPtr, Result);
+
+  if (!CodePointOpt)
+    return 0;
+
+  uint32_t CodePoint = *CodePointOpt;

  // Don't apply C family restrictions to UCNs in assembly mode
  if (LangOpts.AsmPreprocessor)
--- a/clang/lib/Lex/LiteralSupport.cpp
+++ b/clang/lib/Lex/LiteralSupport.cpp
@ -27,6 +27,7 @@
 #include "llvm/Support/ConvertUTF.h"
 #include "llvm/Support/Error.h"
 #include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/Unicode.h"
 #include <algorithm>
 #include <cassert>
 #include <cstddef>
@ -233,7 +234,8 @@ static unsigned ProcessCharEscape(const char *ThisTokBegin,
      HadError = true;
      if (Diags)
        Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf,
-             diag::err_delimited_escape_missing_brace);
+             diag::err_delimited_escape_missing_brace)
+            << "o";

      break;
    }
@ -309,7 +311,8 @@ static unsigned ProcessCharEscape(const char *ThisTokBegin,
          << tok::r_brace;
    else if (!HadError) {
      Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf,
-           diag::ext_delimited_escape_sequence);
+           diag::ext_delimited_escape_sequence)
+          << /*delimited*/ 0;
    }
  }

@ -335,7 +338,7 @@ void clang::expandUCNs(SmallVectorImpl<char> &Buf, StringRef Input) {
    char Kind = *I;
    ++I;

-    assert(Kind == 'u' || Kind == 'U');
+    assert(Kind == 'u' || Kind == 'U' || Kind == 'N');
    uint32_t CodePoint = 0;

    if (Kind == 'u' && *I == '{') {
@ -349,6 +352,22 @@ void clang::expandUCNs(SmallVectorImpl<char> &Buf, StringRef Input) {
      continue;
    }

+    if (Kind == 'N') {
+      assert(*I == '{');
+      ++I;
+      auto Delim = std::find(I, Input.end(), '}');
+      assert(Delim != Input.end());
+      llvm::Optional<llvm::sys::unicode::LooseMatchingResult> Res =
+          llvm::sys::unicode::nameToCodepointLooseMatching(
+              StringRef(I, std::distance(I, Delim)));
+      assert(Res);
+      CodePoint = Res->CodePoint;
+      assert(CodePoint != 0xFFFFFFFF);
+      appendCodePoint(CodePoint, Buf);
+      I = Delim;
+      continue;
+    }
+
    unsigned NumHexDigits;
    if (Kind == 'u')
      NumHexDigits = 4;
@ -370,23 +389,20 @@ void clang::expandUCNs(SmallVectorImpl<char> &Buf, StringRef Input) {
  }
 }

-/// ProcessUCNEscape - Read the Universal Character Name, check constraints and
-/// return the UTF32.
-static bool ProcessUCNEscape(const char *ThisTokBegin, const char *&ThisTokBuf,
-                             const char *ThisTokEnd,
-                             uint32_t &UcnVal, unsigned short &UcnLen,
-                             FullSourceLoc Loc, DiagnosticsEngine *Diags,
-                             const LangOptions &Features,
-                             bool in_char_string_literal = false) {
+static bool ProcessNumericUCNEscape(const char *ThisTokBegin,
+                                    const char *&ThisTokBuf,
+                                    const char *ThisTokEnd, uint32_t &UcnVal,
+                                    unsigned short &UcnLen, bool &Delimited,
+                                    FullSourceLoc Loc, DiagnosticsEngine *Diags,
+                                    const LangOptions &Features,
+                                    bool in_char_string_literal = false) {
  const char *UcnBegin = ThisTokBuf;
+  bool HasError = false;
+  bool EndDelimiterFound = false;

  // Skip the '\u' char's.
  ThisTokBuf += 2;
-
-  bool Delimited = false;
-  bool EndDelimiterFound = false;
-  bool HasError = false;
-
+  Delimited = false;
  if (UcnBegin[1] == 'u' && in_char_string_literal &&
      ThisTokBuf != ThisTokEnd && *ThisTokBuf == '{') {
    Delimited = true;
@ -394,7 +410,8 @@ static bool ProcessUCNEscape(const char *ThisTokBegin, const char *&ThisTokBuf,
  } else if (ThisTokBuf == ThisTokEnd || !isHexDigit(*ThisTokBuf)) {
    if (Diags)
      Diag(Diags, Features, Loc, ThisTokBegin, UcnBegin, ThisTokBuf,
-           diag::err_hex_escape_no_digits) << StringRef(&ThisTokBuf[-1], 1);
+           diag::err_hex_escape_no_digits)
+          << StringRef(&ThisTokBuf[-1], 1);
    return false;
  }
  UcnLen = (ThisTokBuf[-1] == 'u' ? 4 : 8);
@ -455,7 +472,136 @@ static bool ProcessUCNEscape(const char *ThisTokBegin, const char *&ThisTokBuf,
                     : diag::err_ucn_escape_incomplete);
    return false;
  }
+  return !HasError;
+}

+/// Report that \p Name is not a valid Unicode character name and attach
+/// notes suggesting replacements.
+///
+/// If \p Name matches a character under loose matching, a single note with a
+/// fix-it to the matched name is emitted. Otherwise up to five nearest names
+/// are requested and offered as candidates, stopping at the first candidate
+/// whose distance differs from the previous one by more than 3.
+static void DiagnoseInvalidUnicodeCharacterName(
+    DiagnosticsEngine *Diags, const LangOptions &Features, FullSourceLoc Loc,
+    const char *TokBegin, const char *TokRangeBegin, const char *TokRangeEnd,
+    llvm::StringRef Name) {
+  namespace u = llvm::sys::unicode;
+
+  // Source range covered by the fix-its; built on demand for each note.
+  auto ReplacementRange = [&] {
+    return MakeCharSourceRange(Features, Loc, TokBegin, TokRangeBegin,
+                               TokRangeEnd);
+  };
+
+  Diag(Diags, Features, Loc, TokBegin, TokRangeBegin, TokRangeEnd,
+       diag::err_invalid_ucn_name)
+      << Name;
+
+  // A loose match is reported on its own; no further candidates are offered.
+  if (llvm::Optional<u::LooseMatchingResult> Loose =
+          u::nameToCodepointLooseMatching(Name)) {
+    Diag(Diags, Features, Loc, TokBegin, TokRangeBegin, TokRangeEnd,
+         diag::note_invalid_ucn_name_loose_matching)
+        << FixItHint::CreateReplacement(ReplacementRange(), Loose->Name);
+    return;
+  }
+
+  SmallVector<u::MatchForCodepointName> Candidates =
+      u::nearestMatchesForCodepointName(Name, 5);
+  assert(!Candidates.empty() && "No unicode characters found");
+
+  // Distance of the previously reported candidate; 0 means "none yet".
+  unsigned PreviousDistance = 0;
+  for (const auto &Candidate : Candidates) {
+    if (PreviousDistance == 0)
+      PreviousDistance = Candidate.Distance;
+
+    const auto Gap = PreviousDistance > Candidate.Distance
+                         ? PreviousDistance - Candidate.Distance
+                         : Candidate.Distance - PreviousDistance;
+    if (Gap > 3)
+      break;
+    PreviousDistance = Candidate.Distance;
+
+    // Render the candidate character itself as UTF-8 for the note text.
+    llvm::UTF32 CodeUnit = Candidate.Value;
+    std::string Rendered;
+    LLVM_ATTRIBUTE_UNUSED bool Converted = llvm::convertUTF32ToUTF8String(
+        llvm::ArrayRef<llvm::UTF32>(&CodeUnit, 1), Rendered);
+    assert(Converted && "Found a match wich is not a unicode character");
+
+    Diag(Diags, Features, Loc, TokBegin, TokRangeBegin, TokRangeEnd,
+         diag::note_invalid_ucn_name_candidate)
+        << Candidate.Name << llvm::utohexstr(Candidate.Value)
+        << Rendered // FIXME: Fix the rendering of non printable characters
+        << FixItHint::CreateReplacement(ReplacementRange(), Candidate.Name);
+  }
+}
+
+/// Parse a named universal character escape (\N{NAME}) inside a character or
+/// string literal.
+///
+/// \param ThisTokBegin Start of the token, used to position diagnostics.
+/// \param ThisTokBuf   In/out cursor. On entry it points at the '\' of "\N";
+///                     on return it has been moved past the part of the escape
+///                     that was consumed and never lies beyond \p ThisTokEnd.
+/// \param ThisTokEnd   One past the last character that may be read.
+/// \param UcnVal       Receives the code point on success.
+/// \param UcnLen       Receives 8 if the code point is above 0xFFFF, else 4.
+/// \param Diags        If non-null, used to report errors.
+///
+/// \return true if the escape names a character exactly, false otherwise.
+static bool ProcessNamedUCNEscape(const char *ThisTokBegin,
+                                  const char *&ThisTokBuf,
+                                  const char *ThisTokEnd, uint32_t &UcnVal,
+                                  unsigned short &UcnLen, FullSourceLoc Loc,
+                                  DiagnosticsEngine *Diags,
+                                  const LangOptions &Features) {
+  const char *UcnBegin = ThisTokBuf;
+  assert(UcnBegin[0] == '\\' && UcnBegin[1] == 'N');
+  ThisTokBuf += 2;
+  if (ThisTokBuf == ThisTokEnd || *ThisTokBuf != '{') {
+    if (Diags) {
+      Diag(Diags, Features, Loc, ThisTokBegin, UcnBegin, ThisTokBuf,
+           diag::err_delimited_escape_missing_brace)
+          << StringRef(&ThisTokBuf[-1], 1);
+    }
+    // Skip the character that follows "\N", but only if there is one:
+    // ThisTokBuf is handed back to the caller and must not be left beyond
+    // ThisTokEnd when the input stops right after "\N".
+    if (ThisTokBuf != ThisTokEnd)
+      ThisTokBuf++;
+    return false;
+  }
+  ThisTokBuf++;
+  // Find the first character that cannot be part of a name; for a well-formed
+  // escape this is the closing '}'.
+  const char *ClosingBrace =
+      std::find_if_not(ThisTokBuf, ThisTokEnd, [](char C) {
+        return llvm::isAlnum(C) || llvm::isSpace(C) || C == '_' || C == '-';
+      });
+  bool Incomplete = ClosingBrace == ThisTokEnd || *ClosingBrace != '}';
+  bool Empty = ClosingBrace == ThisTokBuf;
+  if (Incomplete || Empty) {
+    if (Diags) {
+      Diag(Diags, Features, Loc, ThisTokBegin, UcnBegin, ThisTokBuf,
+           Incomplete ? diag::err_ucn_escape_incomplete
+                      : diag::err_delimited_escape_empty)
+          << StringRef(&UcnBegin[1], 1);
+    }
+    // Resume after the offending character, staying within the token.
+    ThisTokBuf = ClosingBrace == ThisTokEnd ? ClosingBrace : ClosingBrace + 1;
+    return false;
+  }
+  StringRef Name(ThisTokBuf, ClosingBrace - ThisTokBuf);
+  ThisTokBuf = ClosingBrace + 1;
+  llvm::Optional<char32_t> Res =
+      llvm::sys::unicode::nameToCodepointStrict(Name);
+  if (!Res) {
+    // &UcnBegin[3] is the first character of the name, just after "\N{".
+    if (Diags)
+      DiagnoseInvalidUnicodeCharacterName(Diags, Features, Loc, ThisTokBegin,
+                                          &UcnBegin[3], ClosingBrace, Name);
+    return false;
+  }
+  UcnVal = *Res;
+  UcnLen = UcnVal > 0xFFFF ? 8 : 4;
+  return true;
+}
+
+/// ProcessUCNEscape - Read the Universal Character Name, check constraints and
+/// return the UTF32.
+static bool ProcessUCNEscape(const char *ThisTokBegin, const char *&ThisTokBuf,
+                             const char *ThisTokEnd, uint32_t &UcnVal,
+                             unsigned short &UcnLen, FullSourceLoc Loc,
+                             DiagnosticsEngine *Diags,
+                             const LangOptions &Features,
+                             bool in_char_string_literal = false) {
+
+  bool HasError;
+  const char *UcnBegin = ThisTokBuf;
+  bool IsDelimitedEscapeSequence = false;
+  bool IsNamedEscapeSequence = false;
+  if (ThisTokBuf[1] == 'N') {
+    IsNamedEscapeSequence = true;
+    HasError = !ProcessNamedUCNEscape(ThisTokBegin, ThisTokBuf, ThisTokEnd,
+                                      UcnVal, UcnLen, Loc, Diags, Features);
+  } else {
+    HasError =
+        !ProcessNumericUCNEscape(ThisTokBegin, ThisTokBuf, ThisTokEnd, UcnVal,
+                                 UcnLen, IsDelimitedEscapeSequence, Loc, Diags,
+                                 Features, in_char_string_literal);
+  }
  if (HasError)
    return false;

@ -493,9 +639,10 @@ static bool ProcessUCNEscape(const char *ThisTokBegin, const char *&ThisTokBuf,
    Diag(Diags, Features, Loc, ThisTokBegin, UcnBegin, ThisTokBuf,
         diag::warn_ucn_not_valid_in_c89_literal);

-  if (Delimited && Diags)
+  if ((IsDelimitedEscapeSequence || IsNamedEscapeSequence) && Diags)
    Diag(Diags, Features, Loc, ThisTokBegin, UcnBegin, ThisTokBuf,
-         diag::ext_delimited_escape_sequence);
+         diag::ext_delimited_escape_sequence)
+        << (IsNamedEscapeSequence ? 1 : 0);

  return true;
 }
@ -1559,7 +1706,7 @@ CharLiteralParser::CharLiteralParser(const char *begin, const char *end,
      continue;
    }
    // Is this a Universal Character Name escape?
-    if (begin[1] == 'u' || begin[1] == 'U') {
+    if (begin[1] == 'u' || begin[1] == 'U' || begin[1] == 'N') {
      unsigned short UcnLen = 0;
      if (!ProcessUCNEscape(TokBegin, begin, end, *buffer_begin, UcnLen,
                            FullSourceLoc(Loc, PP.getSourceManager()),
@ -1919,7 +2066,8 @@ void StringLiteralParser::init(ArrayRef<Token> StringToks){
          continue;
        }
        // Is this a Universal Character Name escape?
-        if (ThisTokBuf[1] == 'u' || ThisTokBuf[1] == 'U') {
+        if (ThisTokBuf[1] == 'u' || ThisTokBuf[1] == 'U' ||
+            ThisTokBuf[1] == 'N') {
          EncodeUCNEscape(ThisTokBegin, ThisTokBuf, ThisTokEnd,
                          ResultPtr, hadError,
                          FullSourceLoc(StringToks[i].getLocation(), SM),
@ -2112,7 +2260,8 @@ unsigned StringLiteralParser::getOffsetOfStringByte(const Token &Tok,

    // Otherwise, this is an escape character.  Advance over it.
    bool HadError = false;
-    if (SpellingPtr[1] == 'u' || SpellingPtr[1] == 'U') {
+    if (SpellingPtr[1] == 'u' || SpellingPtr[1] == 'U' ||
+        SpellingPtr[1] == 'N') {
      const char *EscapePtr = SpellingPtr;
      unsigned Len = MeasureUCNEscape(SpellingStart, SpellingPtr, SpellingEnd,
                                      1, Features, HadError);
--- a/clang/test/FixIt/fixit-unicode-named-escape-sequences.c
+++ b/clang/test/FixIt/fixit-unicode-named-escape-sequences.c
@ -0,0 +1,29 @@
+// RUN: not %clang_cc1 -fsyntax-only -fdiagnostics-parseable-fixits %s 2>&1 | FileCheck -check-prefix=CHECK-MACHINE %s
+const char*
+\N{GREEK_SMALL_LETTER-OMICRON} = // expected-error {{'GREEK_SMALL_LETTER-OMICRON' is not a valid Unicode character name}} \
+                                 // expected-note {{sensitive to case and whitespaces}}
+// CHECK-MACHINE: fix-it:"{{.*}}":{[[@LINE-2]]:4-[[@LINE-2]]:30}:"GREEK SMALL LETTER OMICRON"
+
+"\N{zero width no break space}" // expected-error {{'zero width no break space' is not a valid Unicode character name}} \
+                               // expected-note {{sensitive to case and whitespaces}}
+// CHECK-MACHINE: fix-it:"{{.*}}":{[[@LINE-2]]:5-[[@LINE-2]]:30}:"ZERO WIDTH NO-BREAK SPACE"
+
+"abc\N{MAN IN A BUSINESS SUIT LEVITATING}" // expected-error {{'MAN IN A BUSINESS SUIT LEVITATING' is not a valid Unicode character name}} \
+                                           // expected-note {{did you mean MAN IN BUSINESS SUIT LEVITATING ('🕴' U+1F574)?}}
+// CHECK-MACHINE: fix-it:"{{.*}}":{[[@LINE-2]]:8-[[@LINE-2]]:41}:"MAN IN BUSINESS SUIT LEVITATING"
+
+"\N{AAA}" // expected-error {{'AAA' is not a valid Unicode character name}} \
+          // expected-note 5{{did you mean}}
+// CHECK-MACHINE: fix-it:"{{.*}}":{[[@LINE-2]]:5-[[@LINE-2]]:8}:"ANT"
+// CHECK-MACHINE: fix-it:"{{.*}}":{[[@LINE-3]]:5-[[@LINE-3]]:8}:"ARC"
+// CHECK-MACHINE: fix-it:"{{.*}}":{[[@LINE-4]]:5-[[@LINE-4]]:8}:"AXE"
+// CHECK-MACHINE: fix-it:"{{.*}}":{[[@LINE-5]]:5-[[@LINE-5]]:8}:"BAT"
+// CHECK-MACHINE: fix-it:"{{.*}}":{[[@LINE-6]]:5-[[@LINE-6]]:8}:"CAT"
+
+"\N{BLACKCHESSBISHOP}" // expected-error {{'BLACKCHESSBISHOP' is not a valid Unicode character name}} \
+                       // expected-note {{sensitive to case and whitespaces}}
+// CHECK-MACHINE: fix-it:"{{.*}}":{[[@LINE-2]]:5-[[@LINE-2]]:21}:"BLACK CHESS BISHOP"
+
+;
+
+
--- a/clang/test/Lexer/char-escapes-delimited.c
+++ b/clang/test/Lexer/char-escapes-delimited.c
@ -2,17 +2,20 @@
 // RUN: %clang_cc1 -x c -std=gnu11 -fsyntax-only -pedantic -verify %s
 // RUN: %clang_cc1 -x c++ -std=gnu++11 -fwchar-type=short -fno-signed-wchar -fsyntax-only -pedantic -verify %s
 // RUN: %clang_cc1 -x c -std=gnu11 -fwchar-type=short -fno-signed-wchar -fsyntax-only -pedantic -verify %s
+// RUN: %clang_cc1 -x c++ -std=c++17 -ftrigraphs -fsyntax-only -pedantic -verify -DTRIGRAPHS=1 %s

 const char *errors =
-    "\u{}"  //expected-error {{delimited escape sequence cannot be empty}}
-    "\u{"   //expected-error {{expected '}'}}
-    "\u{h}" //expected-error {{invalid digit 'h' in escape sequence}}
-    "\x{}"  //expected-error {{delimited escape sequence cannot be empty}}
-    "\x{"   //expected-error {{expected '}'}}
-    "\x{h}" //expected-error {{invalid digit 'h' in escape sequence}}
-    "\o{}"  //expected-error {{delimited escape sequence cannot be empty}}
-    "\o{"   //expected-error {{expected '}'}}
-    "\o{8}" //expected-error {{invalid digit '8' in escape sequence}}
+    "\u{}"  // expected-error {{delimited escape sequence cannot be empty}}
+    "\u{"   // expected-error {{expected '}'}}
+    "\u{h}" // expected-error {{invalid digit 'h' in escape sequence}}
+    "\x{}"  // expected-error {{delimited escape sequence cannot be empty}}
+    "\x{"   // expected-error {{expected '}'}}
+    "\x{h}" // expected-error {{invalid digit 'h' in escape sequence}}
+    "\o{}"  // expected-error {{delimited escape sequence cannot be empty}}
+    "\o{"   // expected-error {{expected '}'}}
+    "\o"    // expected-error {{expected '{' after '\o' escape sequence}}
+    "\o{8}" // expected-error {{invalid digit '8' in escape sequence}}
+    "\U{8}" // expected-error {{\U used with no following hex digits}}
    ;

 void ucn(void) {
@ -70,6 +73,30 @@ void concat(void) {
  (void)"\o{12" "}"; // expected-error {{expected '}'}}
 }

+void named(void) {
+  char a = '\N{LOTUS}'; // expected-error{{character too large for enclosing character literal type}} \
+                        // expected-warning {{extension}}
+
+  char b  = '\N{DOLLAR SIGN}'; // expected-warning {{extension}}
+  char b_ = '\N{ DOL-LAR _SIGN }'; // expected-error {{' DOL-LAR _SIGN ' is not a valid Unicode character name}} \
+                               // expected-note {{characters names in Unicode escape sequences are sensitive to case and whitespaces}}
+
+  char c = '\N{NOTATHING}'; // expected-error {{'NOTATHING' is not a valid Unicode character name}} \
+                            // expected-note 5{{did you mean}}
+  char d = '\N{}';          // expected-error {{delimited escape sequence cannot be empty}}
+  char e = '\N{';           // expected-error {{incomplete universal character name}}
+
+  unsigned f = L'\N{GREEK CAPITAL LETTER DELTA}'; // expected-warning {{extension}}
+
+  unsigned g = u'\N{LOTUS}'; // expected-error {{character too large for enclosing character literal type}} \
+                             // expected-warning {{extension}}
+
+  unsigned h = U'\N{LOTUS}';                      // expected-warning {{extension}}
+  unsigned i = u'\N{GREEK CAPITAL LETTER DELTA}'; // expected-warning {{extension}}
+  char j = '\NN';                                 // expected-error {{expected '{' after '\N' escape sequence}}
+  unsigned k = u'\N{LOTUS';                       // expected-error {{incomplete universal character name}}
+}
+
 void separators(void) {
  (void)"\x{12'3}"; // expected-error {{invalid digit ''' in escape sequence}}
  (void)"\u{12'3}"; // expected-error {{invalid digit ''' in escape sequence}}
@ -79,3 +106,12 @@ void separators(void) {
                 // expected-error@-1 2{{expected ';'}}
                 // expected-warning@-2 3{{expression result unused}}
 }
+
+#if L'\N{GREEK CAPITAL LETTER GAMMA}' != L'Γ' // expected-warning {{extension}}
+#error "oh no!"
+#endif
+
+#ifdef TRIGRAPHS
+static_assert('\N??<DOLLAR SIGN??>' == '$'); // expected-warning 2{{trigraph converted}} \
+                                             // expected-warning {{named escape sequences are a Clang extension}}
+#endif
--- a/clang/test/Lexer/unicode.c
+++ b/clang/test/Lexer/unicode.c
@ -39,9 +39,14 @@ extern int 𐠈;
 extern int ꙮ;
 extern int  \u1B4C;     // BALINESE LETTER ARCHAIC JNYA - Added in Unicode 14
 extern int  \U00016AA2; // TANGSA LETTER GA - Added in Unicode 14
+extern int _\N{TANGSA LETTER GA};
+extern int _\N{TANGSALETTERGA}; // expected-error {{'TANGSALETTERGA' is not a valid Unicode character name}} \
+                                // expected-note {{characters names in Unicode escape sequences are sensitive to case and whitespace}}
+
+
+
 // This character doesn't have the XID_Start property
 extern int  \U00016AC0; // TANGSA DIGIT ZERO  // expected-error {{expected unqualified-id}}
-extern int _\U00016AC0; // TANGSA DIGIT ZERO

 extern int 🌹; // expected-error {{unexpected character <U+1F339>}} \
                  expected-warning {{declaration does not declare anything}}
--- a/clang/test/Parser/cxx11-user-defined-literals.cpp
+++ b/clang/test/Parser/cxx11-user-defined-literals.cpp
@ -131,6 +131,7 @@ int operator""_\u212e""_\U0000212e""_℮""(const char*, size_t);
 int operator""_\U0000212e""_℮""_\u212e""(const char*, size_t);

 int operator""_\u{212f}(char);
+int operator""_\N{SCRIPT SMALL E}(char);

 int mix_ucn_utf8 = ""_℮""_\u212e""_\U0000212e"";

--- a/clang/test/Preprocessor/ucn-pp-identifier.c
+++ b/clang/test/Preprocessor/ucn-pp-identifier.c
@ -1,5 +1,6 @@
 // RUN: %clang_cc1 %s -fsyntax-only -std=c99 -pedantic -verify -Wundef
 // RUN: %clang_cc1 %s -fsyntax-only -x c++ -pedantic -verify -Wundef
+// RUN: %clang_cc1 %s -fsyntax-only -x c++ -pedantic -verify -Wundef -ftrigraphs -DTRIGRAPHS=1
 // RUN: not %clang_cc1 %s -fsyntax-only -std=c99 -pedantic -Wundef 2>&1 | FileCheck -strict-whitespace %s

 #define \u00FC
@ -29,9 +30,14 @@

 // Make sure we reject disallowed UCNs
 #define \ufffe // expected-error {{macro name must be an identifier}}
-#define \U10000000  // expected-error {{macro name must be an identifier}}
-#define \u0061  // expected-error {{character 'a' cannot be specified by a universal character name}} expected-error {{macro name must be an identifier}}
-#define \u{fffe} // expected-error {{macro name must be an identifier}} expected-warning {{Clang extension}}
+#define \U10000000       // expected-error {{macro name must be an identifier}}
+#define \u0061           // expected-error {{character 'a' cannot be specified by a universal character name}} expected-error {{macro name must be an identifier}}
+#define \u{fffe}        // expected-error {{macro name must be an identifier}} expected-warning {{Clang extension}}
+#define \N{ALERT}       // expected-error {{universal character name refers to a control character}} \
+                   // expected-error {{macro name must be an identifier}} \
+                   // expected-warning {{Clang extension}}
+#define \N{WASTEBASKET} // expected-error {{macro name must be an identifier}} \
+                        // expected-warning {{Clang extension}}

 #define a\u0024

@ -113,3 +119,20 @@ C 1
 #define \u{123456789}  // expected-error {{hex escape sequence out of range}} expected-error {{macro name must be an identifier}}
 #define \u{            // expected-warning {{incomplete delimited universal character name; treating as '\' 'u' '{' identifier}} expected-error {{macro name must be an identifier}}
 #define \u{fgh}        // expected-warning {{incomplete delimited universal character name; treating as '\' 'u' '{' identifier}} expected-error {{macro name must be an identifier}}
+#define \N{            // expected-warning {{incomplete delimited universal character name; treating as '\' 'N' '{' identifier}} expected-error {{macro name must be an identifier}}
+#define \N{}           // expected-warning {{empty delimited universal character name; treating as '\' 'N' '{' '}'}} expected-error {{macro name must be an identifier}}
+#define \N{NOTATHING}  // expected-error {{'NOTATHING' is not a valid Unicode character name}} \
+                       // expected-error {{macro name must be an identifier}}
+#define \NN            // expected-warning {{incomplete universal character name; treating as '\' followed by identifier}} expected-error {{macro name must be an identifier}}
+#define \N{GREEK_SMALL-LETTERALPHA}  // expected-error {{'GREEK_SMALL-LETTERALPHA' is not a valid Unicode character name}} \
+                                     // expected-note {{characters names in Unicode escape sequences are sensitive to case and whitespaces}}
+
+#define CONCAT(A, B) A##B
+int CONCAT(\N{GREEK, CAPITALLETTERALPHA}); // expected-error{{expected}} \
+                                           // expected-warning {{incomplete delimited universal character name}}
+
+#ifdef TRIGRAPHS
+int \N??<GREEK CAPITAL LETTER ALPHA??> = 0; // expected-warning{{amed escape sequences are a Clang extension}} \
+                                            // expected-warning 2{{trigraph converted}}
+
+#endif
--- a/clang/test/Sema/ucn-identifiers.c
+++ b/clang/test/Sema/ucn-identifiers.c
@ -18,6 +18,7 @@ void goodCalls(void) {
  über(2);
  \U000000FCber(3);
  \u{FC}ber(4); // expected-warning {{Clang extension}}
+  \N{LATIN SMALL LETTER U WITH DIAERESIS}ber(4); // expected-warning {{Clang extension}}
 }

 void badCalls(void) {
--- a/llvm/CMakeLists.txt
+++ b/llvm/CMakeLists.txt
@ -1107,6 +1107,7 @@ if( LLVM_INCLUDE_UTILS )
  add_subdirectory(utils/PerfectShuffle)
  add_subdirectory(utils/count)
  add_subdirectory(utils/not)
+  add_subdirectory(utils/UnicodeData)
  add_subdirectory(utils/yaml-bench)
 else()
  if ( LLVM_INCLUDE_TESTS )
--- a/llvm/include/llvm/Support/Unicode.h
+++ b/llvm/include/llvm/Support/Unicode.h
@ -14,6 +14,10 @@
 #ifndef LLVM_SUPPORT_UNICODE_H
 #define LLVM_SUPPORT_UNICODE_H

+#include "llvm/ADT/Optional.h"
+#include "llvm/ADT/SmallString.h"
+#include <string>
+
 namespace llvm {
 class StringRef;

@ -63,6 +67,30 @@ int columnWidthUTF8(StringRef Text);
 /// rules.
 int foldCharSimple(int C);

+/// Maps the name or the alias of a Unicode character to its associated
+/// codepoints.
+/// The names and aliases are derived from UnicodeData.txt and NameAliases.txt
+/// For compatibility with the semantics of named character escape sequences in
+/// C++, this mapping does an exact match sensitive to casing and spacing.
+/// \return The codepoint of the corresponding character, if any.
+Optional<char32_t> nameToCodepointStrict(StringRef Name);
+
+/// Result of a successful lookup by nameToCodepointLooseMatching().
+struct LooseMatchingResult {
+  /// Code point of the character that was matched.
+  char32_t CodePoint;
+  /// Character name reported by the lookup for the matched code point.
+  SmallString<64> Name;
+};
+
+/// Maps the name or the alias of a Unicode character to its codepoint using
+/// loose matching: case, spaces, underscores and medial hyphens are ignored
+/// when comparing names.
+/// \return The codepoint together with the name reported for it, or None when
+/// nothing matches.
+Optional<LooseMatchingResult> nameToCodepointLooseMatching(StringRef Name);
+
+/// One candidate returned by nearestMatchesForCodepointName().
+struct MatchForCodepointName {
+  /// Full name of the candidate character.
+  std::string Name;
+  /// Edit distance between the searched pattern and Name, both reduced to
+  /// their alphanumeric characters.
+  uint32_t Distance = 0;
+  /// Code point of the candidate character.
+  char32_t Value = 0;
+};
+
+/// Finds the character names closest to \p Pattern in edit distance.
+/// \return At most \p MaxMatchesCount candidates, ordered by increasing
+/// distance and, for equal distances, by name.
+SmallVector<MatchForCodepointName>
+nearestMatchesForCodepointName(StringRef Pattern, std::size_t MaxMatchesCount);
+
 } // namespace unicode
 } // namespace sys
 } // namespace llvm
--- a/llvm/lib/Support/CMakeLists.txt
+++ b/llvm/lib/Support/CMakeLists.txt
@ -221,6 +221,8 @@ add_llvm_component_library(LLVMSupport
  TypeSize.cpp
  Unicode.cpp
  UnicodeCaseFold.cpp
+  UnicodeNameToCodepoint.cpp
+  UnicodeNameToCodepointGenerated.cpp
  VersionTuple.cpp
  VirtualFileSystem.cpp
  WithColor.cpp
--- a/llvm/lib/Support/UnicodeNameToCodepoint.cpp
+++ b/llvm/lib/Support/UnicodeNameToCodepoint.cpp
@ -0,0 +1,551 @@
+//===- llvm/Support/UnicodeNameToCodepoint.cpp - Name lookup ----*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements functions to map the name or alias of a unicode
+// character to its codepoint.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/Support/Unicode.h"
+
+namespace llvm {
+namespace sys {
+namespace unicode {
+
+// Lookup tables used by readNode() and nearestMatchesForCodepointName().
+// They are only declared here; presumably they are defined in
+// UnicodeNameToCodepointGenerated.cpp -- TODO confirm.
+
+// Characters from which the name fragment of each trie node is taken.
+extern const char *UnicodeNameToCodepointDict;
+// Byte-encoded trie nodes, decoded by readNode().
+extern const uint8_t *UnicodeNameToCodepointIndex;
+// Size of UnicodeNameToCodepointIndex, used by readNode() as a read bound.
+extern const std::size_t UnicodeNameToCodepointIndexSize;
+// Length of the longest name; sizes the edit-distance matrix.
+extern const std::size_t UnicodeNameToCodepointLargestNameSize;
+
+// Scratch buffer in which the name of a matched character is assembled.
+using BufferType = SmallString<64>;
+
+/// A node of the name trie, as decoded by readNode().
+struct Node {
+  /// Set only on the synthetic root produced by createRoot().
+  bool IsRoot = false;
+  /// Code point attached to this node; 0xFFFFFFFF means "no code point".
+  char32_t Value = 0xFFFFFFFF;
+  /// Index offset of the first child; 0 means the node has no children.
+  uint32_t ChildrenOffset = 0;
+  /// True when another node follows this one in its parent's child list.
+  bool HasSibling = false;
+  /// Number of index bytes occupied by the encoded node.
+  uint32_t Size = 0;
+  /// Fragment of the character name contributed by this node.
+  StringRef Name;
+  /// Node this one was reached from, or nullptr.
+  const Node *Parent = nullptr;
+
+  constexpr bool isValid() const {
+    return Value == 0xFFFFFFFF || !Name.empty();
+  }
+  constexpr bool hasChildren() const { return IsRoot || ChildrenOffset != 0; }
+
+  /// Concatenates the name fragments found on the path from the outermost
+  /// ancestor down to this node.
+  std::string fullName() const {
+    std::string Reversed;
+    // 46 characters covers the 99th percentile of name sizes as of
+    // Unicode 14, so most names need a single allocation.
+    Reversed.reserve(46);
+    // Walk towards the root, storing every fragment backwards ...
+    for (const Node *Cur = this; Cur != nullptr; Cur = Cur->Parent)
+      Reversed.append(std::make_reverse_iterator(Cur->Name.end()),
+                      std::make_reverse_iterator(Cur->Name.begin()));
+    // ... so that one final reversal yields the name in reading order.
+    std::reverse(Reversed.begin(), Reversed.end());
+    return Reversed;
+  }
+};
+
+/// Builds the synthetic root node: it carries no name and no value, and its
+/// children start at index offset 1.
+static Node createRoot() {
+  Node Root;
+  Root.Size = 1;
+  Root.ChildrenOffset = 1;
+  Root.IsRoot = true;
+  return Root;
+}
+
+/// Decodes the trie node stored at \p Offset in UnicodeNameToCodepointIndex.
+///
+/// Offset 0 designates the synthetic root. When the node header lies too
+/// close to the end of the index, a default-initialized node (empty name,
+/// Size == 0) with only Parent set is returned.
+///
+/// \param Offset position of the node in the index.
+/// \param Parent node recorded as the parent of the decoded node.
+/// \return the decoded node; its Size is the number of bytes consumed.
+static Node readNode(uint32_t Offset, const Node *Parent = nullptr) {
+  if (Offset == 0)
+    return createRoot();
+
+  uint32_t Origin = Offset;
+  Node N;
+  N.Parent = Parent;
+  // Header byte: bit 7 = the node carries a value, bit 6 = long name,
+  // bits 0-5 = size field (interpreted below).
+  // NOTE(review): this byte is read before the bounds check that follows.
+  uint8_t NameInfo = UnicodeNameToCodepointIndex[Offset++];
+  if (Offset + 6 >= UnicodeNameToCodepointIndexSize)
+    return N;
+
+  bool LongName = NameInfo & 0x40;
+  bool HasValue = NameInfo & 0x80;
+  std::size_t Size = NameInfo & ~0xC0;
+  if (LongName) {
+    // Long name: a 16-bit big-endian dictionary offset follows, and the size
+    // field is the length of the fragment.
+    uint32_t NameOffset = (UnicodeNameToCodepointIndex[Offset++] << 8);
+    NameOffset |= UnicodeNameToCodepointIndex[Offset++];
+    N.Name = StringRef(UnicodeNameToCodepointDict + NameOffset, Size);
+  } else {
+    // Short name: the size field is itself the dictionary index of a
+    // one-character fragment.
+    N.Name = StringRef(UnicodeNameToCodepointDict + Size, 1);
+  }
+  if (HasValue) {
+    // Three bytes form a 24-bit word: the code point sits above the low
+    // three bits, bit 1 flags children and bit 0 flags a sibling.
+    uint8_t H = UnicodeNameToCodepointIndex[Offset++];
+    uint8_t M = UnicodeNameToCodepointIndex[Offset++];
+    uint8_t L = UnicodeNameToCodepointIndex[Offset++];
+    N.Value = ((H << 16) | (M << 8) | L) >> 3;
+
+    bool HasChildren = L & 0x02;
+    N.HasSibling = L & 0x01;
+
+    if (HasChildren) {
+      // 24-bit big-endian offset of the first child.
+      N.ChildrenOffset = UnicodeNameToCodepointIndex[Offset++] << 16;
+      N.ChildrenOffset |= UnicodeNameToCodepointIndex[Offset++] << 8;
+      N.ChildrenOffset |= UnicodeNameToCodepointIndex[Offset++];
+    }
+  } else {
+    // No value: one byte holds the sibling flag (bit 7), the children flag
+    // (bit 6) and the six high bits of the children offset.
+    uint8_t H = UnicodeNameToCodepointIndex[Offset++];
+    N.HasSibling = H & 0x80;
+    bool HasChildren = H & 0x40;
+    H &= ~0xC0;
+    if (HasChildren) {
+      // The two following bytes complete the children offset.
+      N.ChildrenOffset = (H << 16);
+      N.ChildrenOffset |=
+          (uint32_t(UnicodeNameToCodepointIndex[Offset++]) << 8);
+      N.ChildrenOffset |= UnicodeNameToCodepointIndex[Offset++];
+    }
+  }
+  N.Size = Offset - Origin;
+  return N;
+}
+
+/// Tests whether \p Name begins with \p Needle.
+///
+/// In strict mode this is an exact prefix comparison. Otherwise the comparison
+/// ignores case and skips spaces, underscores and medial hyphens in both
+/// strings.
+///
+/// \param Consummed receives the number of characters of \p Name that were
+/// consumed (Needle.size() on a strict match, 0 on a strict mismatch).
+/// \param PreviousCharInName last character examined in the name before this
+/// call; updated on success so that a later call can tell whether a leading
+/// hyphen is medial, and restored to its incoming value on failure.
+/// \param PreviousCharInNeedle same as above, for the needle.
+/// \param IsPrefix when true, a hyphen that ends \p Needle right after an
+/// alphanumeric character is skipped as well.
+/// \return true when the whole needle was matched.
+static bool startsWith(StringRef Name, StringRef Needle, bool Strict,
+                       std::size_t &Consummed, char &PreviousCharInName,
+                       char &PreviousCharInNeedle, bool IsPrefix = false) {
+
+  Consummed = 0;
+  if (Strict) {
+    if (!Name.startswith(Needle))
+      return false;
+    Consummed = Needle.size();
+    return true;
+  }
+  // An empty needle matches without consuming anything.
+  if (Needle.empty())
+    return true;
+
+  auto NamePos = Name.begin();
+  auto NeedlePos = Needle.begin();
+
+  // Saved so that a failed match leaves the caller's state untouched.
+  char PreviousCharInNameOrigin = PreviousCharInName;
+  char PreviousCharInNeedleOrigin = PreviousCharInNeedle;
+
+  // Advances It past ignorable characters and returns the new position.
+  // PreviousChar is updated with every character looked at, including the
+  // first one that is not ignorable.
+  auto IgnoreSpaces = [](auto It, auto End, char &PreviousChar,
+                         bool IgnoreEnd = false) {
+    while (It != End) {
+      const auto Next = std::next(It);
+      // Ignore spaces, underscore, medial hyphens
+      // https://unicode.org/reports/tr44/#UAX44-LM2.
+      bool Ignore =
+          *It == ' ' || *It == '_' ||
+          (*It == '-' && isAlnum(PreviousChar) &&
+           ((Next != End && isAlnum(*Next)) || (Next == End && IgnoreEnd)));
+      PreviousChar = *It;
+      if (!Ignore)
+        break;
+      ++It;
+    }
+    return It;
+  };
+
+  while (true) {
+    NamePos = IgnoreSpaces(NamePos, Name.end(), PreviousCharInName);
+    NeedlePos =
+        IgnoreSpaces(NeedlePos, Needle.end(), PreviousCharInNeedle, IsPrefix);
+    // The needle is exhausted: match.
+    if (NeedlePos == Needle.end())
+      break;
+    // The name ran out first: mismatch.
+    if (NamePos == Name.end())
+      break;
+    if (toUpper(*NeedlePos) != toUpper(*NamePos))
+      break;
+    NeedlePos++;
+    NamePos++;
+  }
+  Consummed = std::distance(Name.begin(), NamePos);
+  if (NeedlePos != Needle.end()) {
+    PreviousCharInName = PreviousCharInNameOrigin;
+    PreviousCharInNeedle = PreviousCharInNeedleOrigin;
+  }
+  return NeedlePos == Needle.end();
+}
+
+/// Matches \p Name against the trie node stored at \p Offset and, recursively,
+/// against its descendants.
+///
+/// On a successful match the name fragment of every matched descendant is
+/// appended to \p Buffer back to front, deepest node first; the caller is
+/// expected to reverse the buffer to obtain the name in reading order.
+///
+/// \return the decoded node, whether a match was found, and the matched code
+/// point (0 when there is no match).
+static std::tuple<Node, bool, uint32_t>
+compareNode(uint32_t Offset, StringRef Name, bool Strict,
+            char PreviousCharInName, char PreviousCharInNeedle,
+            BufferType &Buffer, const Node *Parent = nullptr) {
+  using ResultType = std::tuple<Node, bool, uint32_t>;
+
+  Node Current = readNode(Offset, Parent);
+  std::size_t Used = 0;
+  // The root has no fragment of its own; every other node must match a prefix
+  // of what is left of the name.
+  if (!Current.IsRoot &&
+      !startsWith(Name, Current.Name, Strict, Used, PreviousCharInName,
+                  PreviousCharInNeedle))
+    return ResultType(Current, false, 0);
+
+  // The whole name was consumed on a node that carries a code point.
+  if (Used == Name.size() && Current.Value != 0xFFFFFFFF)
+    return ResultType(Current, true, Current.Value);
+
+  if (!Current.hasChildren())
+    return ResultType(Current, false, 0);
+
+  const StringRef Remaining = Name.substr(Used);
+  uint32_t ChildOffset = Current.ChildrenOffset;
+  while (true) {
+    ResultType Sub =
+        compareNode(ChildOffset, Remaining, Strict, PreviousCharInName,
+                    PreviousCharInNeedle, Buffer, &Current);
+    const Node &Child = std::get<0>(Sub);
+    if (std::get<1>(Sub)) {
+      // Record the child's fragment backwards.
+      for (auto It = Child.Name.end(); It != Child.Name.begin();)
+        Buffer.push_back(*--It);
+      return ResultType(Current, true, std::get<2>(Sub));
+    }
+    // Siblings are stored one after the other in the index.
+    ChildOffset += Child.Size;
+    if (!Child.HasSibling)
+      break;
+  }
+  return ResultType(Current, false, 0);
+}
+
+/// Entry point of the trie walk: starts with no "previous character" on
+/// either side and no parent node.
+static std::tuple<Node, bool, uint32_t>
+compareNode(uint32_t Offset, StringRef Name, bool Strict, BufferType &Buffer) {
+  return compareNode(Offset, Name, Strict, /*PreviousCharInName=*/0,
+                     /*PreviousCharInNeedle=*/0, Buffer);
+}
+
+// clang-format off
+// Name fragments used to compose the names of Hangul syllables.
+// Column 0 holds the LCount leading fragments, column 1 the VCount middle
+// fragments and column 2 the TCount trailing fragments; findSyllable() only
+// reads the first LCount/VCount/TCount rows of the respective column, so the
+// 0 entries that pad the shorter columns are never dereferenced.
+constexpr const char *const HangulSyllables[][3] = {
+    { "G",  "A",   ""   },
+    { "GG", "AE",  "G"  },
+    { "N",  "YA",  "GG" },
+    { "D",  "YAE", "GS" },
+    { "DD", "EO",  "N", },
+    { "R",  "E",   "NJ" },
+    { "M",  "YEO", "NH" },
+    { "B",  "YE",  "D"  },
+    { "BB", "O",   "L"  },
+    { "S",  "WA",  "LG" },
+    { "SS", "WAE", "LM" },
+    { "",   "OE",  "LB" },
+    { "J",  "YO",  "LS" },
+    { "JJ", "U",   "LT" },
+    { "C",  "WEO", "LP" },
+    { "K",  "WE",  "LH" },
+    { "T",  "WI",  "M"  },
+    { "P",  "YU",  "B"  },
+    { "H",  "EU",  "BS" },
+    { 0,    "YI",  "S"  },
+    { 0,    "I",   "SS" },
+    { 0,    0,     "NG" },
+    { 0,    0,     "J"  },
+    { 0,    0,     "C"  },
+    { 0,    0,     "K"  },
+    { 0,    0,     "T"  },
+    { 0,    0,     "P"  },
+    { 0,    0,     "H"  }
+    };
+// clang-format on
+
+// Unicode 14.0
+// 3.12 Conjoining Jamo Behavior Common constants
+// Code point the computed syllable index is added to.
+constexpr const char32_t SBase = 0xAC00;
+// Number of usable rows in column 0 of HangulSyllables.
+constexpr const uint32_t LCount = 19;
+// Number of usable rows in column 1 of HangulSyllables.
+constexpr const uint32_t VCount = 21;
+// Number of usable rows in column 2 of HangulSyllables.
+constexpr const uint32_t TCount = 28;
+
+/// Finds the entry of column \p Column of HangulSyllables that matches the
+/// longest prefix of \p Name.
+///
+/// \param PreviousInName last character examined in the name before this
+/// call; updated only when an entry matches.
+/// \param Pos receives the row of the selected entry; left untouched when no
+/// entry matches.
+/// \return the number of characters of \p Name consumed by the selected
+/// entry, or 0 when no entry matches.
+static std::size_t findSyllable(StringRef Name, bool Strict,
+                                char &PreviousInName, int &Pos, int Column) {
+  assert(Column == 0 || Column == 1 || Column == 2);
+  static std::size_t CountPerColumn[] = {LCount, VCount, TCount};
+  const std::size_t RowCount = CountPerColumn[Column];
+
+  int BestLength = -1;
+  int BestPrevious = PreviousInName;
+  char PreviousInNeedle = 0;
+  for (std::size_t Row = 0; Row < RowCount; ++Row) {
+    StringRef Candidate(HangulSyllables[Row][Column]);
+    // Only a candidate longer than the current best can improve on it.
+    if (BestLength >= int(Candidate.size()))
+      continue;
+    std::size_t Used = 0;
+    // Work on a copy so that a failed or discarded attempt does not disturb
+    // the caller's state.
+    char PreviousAfterMatch = PreviousInName;
+    if (startsWith(Name, Candidate, Strict, Used, PreviousAfterMatch,
+                   PreviousInNeedle)) {
+      BestLength = Used;
+      Pos = Row;
+      BestPrevious = PreviousAfterMatch;
+    }
+  }
+  if (BestLength < 0)
+    return 0;
+  PreviousInName = BestPrevious;
+  return size_t(BestLength);
+}
+
+/// Resolves names of the form "HANGUL SYLLABLE " followed by one fragment from
+/// each column of HangulSyllables.
+///
+/// \p Buffer is always cleared; after a successful loose (non-strict) lookup
+/// it holds the name rebuilt from the matched table entries.
+/// \return the code point of the syllable, or None when \p Name is not such a
+/// name.
+static llvm::Optional<char32_t>
+nameToHangulCodePoint(StringRef Name, bool Strict, BufferType &Buffer) {
+  Buffer.clear();
+  // Hangul Syllable Decomposition
+  std::size_t PrefixLength = 0;
+  char PreviousInName = 0, PreviousInNeedle = 0;
+  if (!startsWith(Name, "HANGUL SYLLABLE ", Strict, PrefixLength,
+                  PreviousInName, PreviousInNeedle))
+    return None;
+
+  // Consume one fragment per column, in column order.
+  StringRef Rest = Name.substr(PrefixLength);
+  int Rows[3] = {-1, -1, -1};
+  for (int Column = 0; Column < 3; ++Column)
+    Rest = Rest.substr(
+        findSyllable(Rest, Strict, PreviousInName, Rows[Column], Column));
+
+  // Every column must have matched and nothing may be left over; otherwise
+  // it's an illegal syllable name.
+  const bool IsComplete =
+      Rows[0] != -1 && Rows[1] != -1 && Rows[2] != -1 && Rest.empty();
+  if (!IsComplete)
+    return None;
+
+  if (!Strict) {
+    Buffer.append("HANGUL SYLLABLE ");
+    for (int Column = 0; Column < 3; ++Column)
+      Buffer.append(HangulSyllables[Rows[Column]][Column]);
+  }
+  return SBase +
+         (std::uint32_t(Rows[0]) * VCount + std::uint32_t(Rows[1])) * TCount +
+         std::uint32_t(Rows[2]);
+}
+
+/// A range of code points whose names are a fixed prefix followed by the code
+/// point written in hexadecimal.
+struct GeneratedNamesData {
+  /// Text that precedes the hexadecimal code point in the name.
+  StringRef Prefix;
+  /// First code point of the range (inclusive).
+  uint32_t Start;
+  /// Last code point of the range (inclusive).
+  uint32_t End;
+};
+
+// Unicode 14.0 Table 4-8. Name Derivation Rule Prefix Strings
+// This needs to be kept in sync with
+// llvm/utils/UnicodeData/UnicodeNameMappingGenerator.cpp
+// Each entry is {prefix, first code point, last code point}, both inclusive.
+// NOTE(review): some upper bounds (0x9FFC, 0x2A6DD, 0x2B734) look like the
+// Unicode 13 ends of these blocks rather than the Unicode 14 ones -- confirm
+// against UnicodeData.txt and the generator before changing them.
+static const GeneratedNamesData GeneratedNamesDataTable[] = {
+    {"CJK UNIFIED IDEOGRAPH-", 0x3400, 0x4DBF},
+    {"CJK UNIFIED IDEOGRAPH-", 0x4E00, 0x9FFC},
+    {"CJK UNIFIED IDEOGRAPH-", 0x20000, 0x2A6DD},
+    {"CJK UNIFIED IDEOGRAPH-", 0x2A700, 0x2B734},
+    {"CJK UNIFIED IDEOGRAPH-", 0x2B740, 0x2B81D},
+    {"CJK UNIFIED IDEOGRAPH-", 0x2B820, 0x2CEA1},
+    {"CJK UNIFIED IDEOGRAPH-", 0x2CEB0, 0x2EBE0},
+    {"CJK UNIFIED IDEOGRAPH-", 0x30000, 0x3134A},
+    {"TANGUT IDEOGRAPH-", 0x17000, 0x187F7},
+    {"TANGUT IDEOGRAPH-", 0x18D00, 0x18D08},
+    {"KHITAN SMALL SCRIPT CHARACTER-", 0x18B00, 0x18CD5},
+    {"NUSHU CHARACTER-", 0x1B170, 0x1B2FB},
+    {"CJK COMPATIBILITY IDEOGRAPH-", 0xF900, 0xFA6D},
+    {"CJK COMPATIBILITY IDEOGRAPH-", 0xFA70, 0xFAD9},
+    {"CJK COMPATIBILITY IDEOGRAPH-", 0x2F800, 0x2FA1D},
+};
+
+/// Resolves names made of one of the prefixes of GeneratedNamesDataTable
+/// followed by the code point written in hexadecimal.
+///
+/// In strict mode the hexadecimal part must be spelled exactly as in the
+/// canonical name: upper-case digits and no superfluous leading zeros.
+///
+/// \p Buffer is cleared for every table entry that is tried; after a
+/// successful loose (non-strict) lookup it holds the canonical name, i.e. one
+/// that strict matching accepts.
+/// \return the code point, or None when \p Name does not designate a code
+/// point of any range in the table.
+static llvm::Optional<char32_t>
+nameToGeneratedCodePoint(StringRef Name, bool Strict, BufferType &Buffer) {
+  for (auto &&Item : GeneratedNamesDataTable) {
+    Buffer.clear();
+    std::size_t Consummed = 0;
+    char NameStart = 0, NeedleStart = 0;
+    bool DoesStartWith = startsWith(Name, Item.Prefix, Strict, Consummed,
+                                    NameStart, NeedleStart, /*isPrefix*/ true);
+    if (!DoesStartWith)
+      continue;
+    auto Number = Name.substr(Consummed);
+    unsigned long long V = 0;
+    if (getAsUnsignedInteger(Number, 16, V) || V < Item.Start || V > Item.End)
+      continue;
+    // utohexstr() produces upper-case digits by default, which is how the
+    // code point is spelled in the canonical name.
+    const std::string Canonical = utohexstr(V);
+    // Be consistent about mandating the canonical spelling: the integer
+    // parser also accepts lower-case digits and leading zeros, neither of
+    // which appear in a real name.
+    if (Strict && Number != StringRef(Canonical))
+      continue;
+    if (!Strict) {
+      Buffer.append(Item.Prefix);
+      Buffer.append(Canonical);
+    }
+    return V;
+  }
+  return None;
+}
+
+/// Common implementation of the strict and loose lookups.
+///
+/// Tries, in order, the Hangul syllable names, the names generated from a
+/// prefix and a code point, and finally the trie of names and aliases.
+/// After a successful loose lookup \p Buffer holds the name reported for the
+/// matched character.
+static llvm::Optional<char32_t> nameToCodepoint(StringRef Name, bool Strict,
+                                                BufferType &Buffer) {
+  if (Name.empty())
+    return None;
+
+  if (llvm::Optional<char32_t> Hangul =
+          nameToHangulCodePoint(Name, Strict, Buffer))
+    return *Hangul;
+  if (llvm::Optional<char32_t> Generated =
+          nameToGeneratedCodePoint(Name, Strict, Buffer))
+    return *Generated;
+
+  Buffer.clear();
+  std::tuple<Node, bool, uint32_t> Found =
+      compareNode(0, Name, Strict, Buffer);
+  if (!std::get<1>(Found))
+    return None;
+
+  uint32_t Value = std::get<2>(Found);
+  // compareNode() records the name back to front.
+  std::reverse(Buffer.begin(), Buffer.end());
+  // UAX44-LM2. Ignore case, whitespace, underscore ('_'), and all medial
+  // hyphens except the hyphen in U+1180 HANGUL JUNGSEONG O-E.
+  const bool IsJungseongOE =
+      !Strict && Value == 0x116c &&
+      Name.find_insensitive("O-E") != StringRef::npos;
+  if (IsJungseongOE) {
+    Buffer = "HANGUL JUNGSEONG O-E";
+    Value = 0x1180;
+  }
+  return Value;
+}
+
+/// Exact lookup, sensitive to case and spacing. The name buffer that the
+/// shared implementation requires is discarded.
+llvm::Optional<char32_t> nameToCodepointStrict(StringRef Name) {
+  BufferType Scratch;
+  return nameToCodepoint(Name, /*Strict=*/true, Scratch);
+}
+
+/// Loose lookup: returns the matched code point together with the name that
+/// the lookup recorded for it, or None when nothing matches.
+llvm::Optional<LooseMatchingResult>
+nameToCodepointLooseMatching(StringRef Name) {
+  BufferType MatchedName;
+  if (auto CodePoint = nameToCodepoint(Name, /*Strict=*/false, MatchedName))
+    return LooseMatchingResult{*CodePoint, MatchedName};
+  return None;
+}
+
+// Find the Unicode characters whose names have the shortest edit distance to
+// Pattern, using the Wagner–Fischer algorithm.
+//
+// Pattern and the candidate names are compared after dropping every
+// character that is not alphanumeric; the pattern is also upper-cased.
+// At most MaxMatchesCount candidates are returned, ordered by increasing
+// distance and, for equal distances, by name.
+llvm::SmallVector<MatchForCodepointName>
+nearestMatchesForCodepointName(StringRef Pattern, std::size_t MaxMatchesCount) {
+  // We maintain a fixed size vector of matches,
+  // sorted by distance
+  // The worst match (with the biggest distance) are discarded when new elements
+  // are added.
+  std::size_t LargestEditDistance = 0;
+  llvm::SmallVector<MatchForCodepointName> Matches;
+  Matches.reserve(MaxMatchesCount + 1);
+
+  // Records Node as a candidate if it ranks among the best matches seen so
+  // far. Returns whether it was recorded.
+  auto Insert = [&](const Node &Node, uint32_t Distance,
+                    char32_t Value) -> bool {
+    if (Distance > LargestEditDistance) {
+      // Worse than everything kept so far, and there is no room left.
+      if (Matches.size() == MaxMatchesCount)
+        return false;
+      LargestEditDistance = Distance;
+    }
+    // To avoid allocations, the creation of the name is delayed
+    // as much as possible. The name is handed out by reference so that it is
+    // built once instead of being copied for every comparison.
+    std::string Name;
+    auto GetName = [&]() -> const std::string & {
+      if (Name.empty())
+        Name = Node.fullName();
+      return Name;
+    };
+
+    // Candidates are ordered by distance first, then by name.
+    auto It = std::lower_bound(
+        Matches.begin(), Matches.end(), Distance,
+        [&](const MatchForCodepointName &a, std::size_t Distance) {
+          if (Distance == a.Distance)
+            return a.Name < GetName();
+          return a.Distance < Distance;
+        });
+    if (It == Matches.end() && Matches.size() == MaxMatchesCount)
+      return false;
+
+    MatchForCodepointName M{GetName(), Distance, Value};
+    Matches.insert(It, std::move(M));
+    // Drop the worst candidate once the limit is exceeded.
+    if (Matches.size() > MaxMatchesCount)
+      Matches.pop_back();
+    return true;
+  };
+
+  // We ignore case, space, hyphens, etc,
+  // in both the search pattern and the prospective names.
+  auto Normalize = [](StringRef Name) {
+    std::string Out;
+    Out.reserve(Name.size());
+    for (char C : Name) {
+      if (isAlnum(C))
+        Out.push_back(toUpper(C));
+    }
+    return Out;
+  };
+  std::string NormalizedName = Normalize(Pattern);
+
+  // Allocate a matrix big enough for longest names.
+  const std::size_t Columns =
+      std::min(NormalizedName.size(), UnicodeNameToCodepointLargestNameSize) +
+      1;
+
+  // Only referenced by the assertion in Get.
+  LLVM_ATTRIBUTE_UNUSED static std::size_t Rows =
+      UnicodeNameToCodepointLargestNameSize + 1;
+
+  std::vector<char> Distances(
+      Columns * (UnicodeNameToCodepointLargestNameSize + 1), 0);
+
+  // Cell accessor: one column per pattern character (plus one), one row per
+  // name character (plus one).
+  auto Get = [&Distances, Columns](size_t Column, std::size_t Row) -> char & {
+    assert(Column < Columns);
+    assert(Row < Rows);
+    return Distances[Row * Columns + Column];
+  };
+
+  // Row 0: distance between the empty name and each prefix of the pattern.
+  for (std::size_t I = 0; I < Columns; I++)
+    Get(I, 0) = I;
+
+  // Visit the childrens,
+  // Filling (and overriding) the matrix for the name fragment of each node
+  // iteratively. Row is the first matrix row that N's fragment writes to, so
+  // the rows computed for the ancestors are shared by all their descendants.
+  auto VisitNode = [&](const Node &N, std::size_t Row,
+                       auto &VisitNode) -> void {
+    std::size_t J = 0;
+    for (; J < N.Name.size(); J++) {
+      // Characters that are not alphanumeric do not take part in the
+      // comparison.
+      if (!isAlnum(N.Name[J]))
+        continue;
+
+      Get(0, Row) = Row;
+
+      for (std::size_t I = 1; I < Columns; I++) {
+        const int DeleteCost = Get(I - 1, Row) + 1;
+        const int InsertCost = Get(I, Row - 1) + 1;
+
+        const int ReplaceCost =
+            Get(I - 1, Row - 1) + (NormalizedName[I - 1] != N.Name[J] ? 1 : 0);
+
+        Get(I, Row) = std::min(InsertCost, std::min(DeleteCost, ReplaceCost));
+      }
+
+      Row++;
+    }
+
+    // Distance between the whole pattern and the name accumulated so far.
+    unsigned Cost = Get(Columns - 1, Row - 1);
+    if (N.Value != 0xFFFFFFFF) {
+      Insert(N, Cost, N.Value);
+    }
+
+    if (N.hasChildren()) {
+      auto ChildOffset = N.ChildrenOffset;
+      for (;;) {
+        Node C = readNode(ChildOffset, &N);
+        ChildOffset += C.Size;
+        if (!C.isValid())
+          break;
+        VisitNode(C, Row, VisitNode);
+        if (!C.HasSibling)
+          break;
+      }
+    }
+  };
+
+  Node Root = createRoot();
+  VisitNode(Root, 1, VisitNode);
+  return Matches;
+}
+
+} // namespace unicode
+
+} // namespace sys
+} // namespace llvm
--- a/llvm/lib/Support/UnicodeNameToCodepointGenerated.cpp
+++ b/llvm/lib/Support/UnicodeNameToCodepointGenerated.cpp
--- a/llvm/unittests/Support/UnicodeTest.cpp
+++ b/llvm/unittests/Support/UnicodeTest.cpp
@ -7,7 +7,10 @@
 //===----------------------------------------------------------------------===//

 #include "llvm/Support/Unicode.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/ADT/edit_distance.h"
 #include "llvm/Support/ConvertUTF.h"
+#include "gmock/gmock.h"
 #include "gtest/gtest.h"

 namespace llvm {
@ -101,6 +104,318 @@ TEST(Unicode, isPrintable) {
  }
 }

+// Exercises nameToCodepointStrict(): names must be found only when they are
+// spelled exactly.
+TEST(Unicode, nameToCodepointStrict) {
+  // Failed lookups are mapped to 0xFFFFFFFF so that hits and misses can be
+  // checked with the same assertion.
+  auto map = [](StringRef Str) {
+    return nameToCodepointStrict(Str).getValueOr(0xFFFF'FFFF);
+  };
+
+  // generated codepoints
+  EXPECT_EQ(0x03400u, map("CJK UNIFIED IDEOGRAPH-3400"));
+  EXPECT_EQ(0x04DBFu, map("CJK UNIFIED IDEOGRAPH-4DBF"));
+  EXPECT_EQ(0x04E00u, map("CJK UNIFIED IDEOGRAPH-4E00"));
+  EXPECT_EQ(0x09FFCu, map("CJK UNIFIED IDEOGRAPH-9FFC"));
+  EXPECT_EQ(0x20000u, map("CJK UNIFIED IDEOGRAPH-20000"));
+  EXPECT_EQ(0x2A6DDu, map("CJK UNIFIED IDEOGRAPH-2A6DD"));
+  EXPECT_EQ(0x2A700u, map("CJK UNIFIED IDEOGRAPH-2A700"));
+  EXPECT_EQ(0x2B740u, map("CJK UNIFIED IDEOGRAPH-2B740"));
+  EXPECT_EQ(0x2B81Du, map("CJK UNIFIED IDEOGRAPH-2B81D"));
+  EXPECT_EQ(0x2B820u, map("CJK UNIFIED IDEOGRAPH-2B820"));
+  EXPECT_EQ(0x2CEA1u, map("CJK UNIFIED IDEOGRAPH-2CEA1"));
+  EXPECT_EQ(0x2CEB0u, map("CJK UNIFIED IDEOGRAPH-2CEB0"));
+  EXPECT_EQ(0x2EBE0u, map("CJK UNIFIED IDEOGRAPH-2EBE0"));
+  EXPECT_EQ(0x30000u, map("CJK UNIFIED IDEOGRAPH-30000"));
+  EXPECT_EQ(0x3134Au, map("CJK UNIFIED IDEOGRAPH-3134A"));
+  EXPECT_EQ(0x17000u, map("TANGUT IDEOGRAPH-17000"));
+  EXPECT_EQ(0x187F7u, map("TANGUT IDEOGRAPH-187F7"));
+  EXPECT_EQ(0x18D00u, map("TANGUT IDEOGRAPH-18D00"));
+  EXPECT_EQ(0x18D08u, map("TANGUT IDEOGRAPH-18D08"));
+  EXPECT_EQ(0x18B00u, map("KHITAN SMALL SCRIPT CHARACTER-18B00"));
+  EXPECT_EQ(0x18CD5u, map("KHITAN SMALL SCRIPT CHARACTER-18CD5"));
+  EXPECT_EQ(0x1B170u, map("NUSHU CHARACTER-1B170"));
+  EXPECT_EQ(0x1B2FBu, map("NUSHU CHARACTER-1B2FB"));
+  EXPECT_EQ(0x0F900u, map("CJK COMPATIBILITY IDEOGRAPH-F900"));
+  EXPECT_EQ(0x0FA6Du, map("CJK COMPATIBILITY IDEOGRAPH-FA6D"));
+  EXPECT_EQ(0x0FA70u, map("CJK COMPATIBILITY IDEOGRAPH-FA70"));
+  EXPECT_EQ(0x0FAD9u, map("CJK COMPATIBILITY IDEOGRAPH-FAD9"));
+  EXPECT_EQ(0x2F800u, map("CJK COMPATIBILITY IDEOGRAPH-2F800"));
+  EXPECT_EQ(0x2FA1Du, map("CJK COMPATIBILITY IDEOGRAPH-2FA1D"));
+
+  // Hangul syllables, whose names are composed from table fragments
+  EXPECT_EQ(0xAC00u, map("HANGUL SYLLABLE GA"));
+  EXPECT_EQ(0xAC14u, map("HANGUL SYLLABLE GASS"));
+  EXPECT_EQ(0xAC2Bu, map("HANGUL SYLLABLE GAELH"));
+  EXPECT_EQ(0xAC7Bu, map("HANGUL SYLLABLE GEOLB"));
+  EXPECT_EQ(0xC640u, map("HANGUL SYLLABLE WA"));
+  EXPECT_EQ(0xC544u, map("HANGUL SYLLABLE A"));
+  EXPECT_EQ(0xC5D0u, map("HANGUL SYLLABLE E"));
+  EXPECT_EQ(0xC774u, map("HANGUL SYLLABLE I"));
+
+  // Names looked up in the generated name data
+  EXPECT_EQ(0x1F984u, map("UNICORN FACE"));
+  EXPECT_EQ(0x00640u, map("ARABIC TATWEEL"));
+  EXPECT_EQ(0x02C05u, map("GLAGOLITIC CAPITAL LETTER YESTU"));
+  EXPECT_EQ(0x13000u, map("EGYPTIAN HIEROGLYPH A001"));
+  EXPECT_EQ(0x02235u, map("BECAUSE"));
+  EXPECT_EQ(0x1F514u, map("BELL"));
+  EXPECT_EQ(0x1F9A9u, map("FLAMINGO"));
+  EXPECT_EQ(0x1F402u, map("OX")); // 2 characters
+  EXPECT_EQ(0x0FBF9u, map("ARABIC LIGATURE UIGHUR KIRGHIZ YEH WITH HAMZA "
+                          "ABOVE WITH ALEF MAKSURA ISOLATED FORM"));
+
+  // Aliases
+  EXPECT_EQ(0x0000u, map("NULL"));
+  EXPECT_EQ(0x0007u, map("ALERT"));
+  EXPECT_EQ(0x0009u, map("HORIZONTAL TABULATION"));
+  EXPECT_EQ(0x0009u, map("CHARACTER TABULATION"));
+  EXPECT_EQ(0x000Au, map("LINE FEED"));
+  EXPECT_EQ(0x000Au, map("NEW LINE"));
+  EXPECT_EQ(0x0089u, map("CHARACTER TABULATION WITH JUSTIFICATION"));
+  EXPECT_EQ(0x0089u, map("HORIZONTAL TABULATION WITH JUSTIFICATION"));
+  EXPECT_EQ(0x2118u,
+            map("WEIERSTRASS ELLIPTIC FUNCTION"));      // correction
+  EXPECT_EQ(0x2118u, map("SCRIPT CAPITAL P"));          // correction
+  EXPECT_EQ(0xFEFFu, map("BYTE ORDER MARK"));           // alternate
+  EXPECT_EQ(0xFEFFu, map("ZERO WIDTH NO-BREAK SPACE")); // alternate
+
+  // Should perform exact case match
+  EXPECT_EQ(0xFFFFFFFFu, map(""));
+  EXPECT_EQ(0xFFFFFFFFu, map("NOT A UNICODE CHARACTER"));
+  EXPECT_EQ(0xFFFFFFFFu, map("unicorn face"));
+  EXPECT_EQ(0xFFFFFFFFu, map("UNICORN FaCE"));
+  EXPECT_EQ(0xFFFFFFFFu, map("UNICORNFaCE"));
+  EXPECT_EQ(0xFFFFFFFFu, map("UNICORN"));
+  EXPECT_EQ(0xFFFFFFFFu, map("HANGUL SYLLABLE i"));
+  EXPECT_EQ(0xFFFFFFFFu, map("hANGUL SYLLABLE i"));
+  EXPECT_EQ(0xFFFFFFFFu, map("HANGULSYLLABLEI"));
+  EXPECT_EQ(0xFFFFFFFFu, map("HANGUL SYLLABLE"));
+  EXPECT_EQ(0xFFFFFFFFu, map("cJK COMPATIBILITY IDEOGRAPH-2FA1D"));
+  EXPECT_EQ(0xFFFFFFFFu, map("CJK COMPATIBILITY IDEOGRAPH-2FA1d"));
+  EXPECT_EQ(0xFFFFFFFFu, map("CJK COMPATIBILITY IDEOGRAPH 2FA1D"));
+  EXPECT_EQ(0xFFFFFFFF, map("CJK COMPATIBILITY IDEOGRAPH-NOTANUMBER"));
+  EXPECT_EQ(0xFFFFFFFFu, map("CJK COMPATIBILITY IDEOGRAPH-1"));
+  EXPECT_EQ(0xFFFFFFFFu, map("ZERO WIDTH NO BREAK SPACE"));
+
+  // Should not support abbreviations or figments
+  EXPECT_EQ(0xFFFFFFFFu, map("FVS1"));
+  EXPECT_EQ(0xFFFFFFFFu, map("HIGH OCTET PRESET"));
+  EXPECT_EQ(0xFFFFFFFFu, map("BEL"));
+}
+
+// Exercises nameToCodepointLooseMatching(): exact spellings must still be
+// found, and so must spellings that differ only in case, spaces, underscores
+// or medial hyphens.
+TEST(Unicode, nameToCodepointLoose) {
+  // Failed lookups are mapped to 0xFFFFFFFF so that hits and misses can be
+  // checked with the same assertion.
+  auto map = [](StringRef Str) {
+    auto Opt = nameToCodepointLooseMatching(Str);
+    if (!Opt)
+      return char32_t(0xFFFF'FFFF);
+    return Opt->CodePoint;
+  };
+
+  // generated codepoints
+  EXPECT_EQ(0x04DBFu, map("CJK UNIFIED IDEOGRAPH-4DBF"));
+  EXPECT_EQ(0x04E00u, map("CJK UNIFIED IDEOGRAPH-4E00"));
+  EXPECT_EQ(0x09FFCu, map("CJK UNIFIED IDEOGRAPH-9FFC"));
+  EXPECT_EQ(0x20000u, map("CJK UNIFIED IDEOGRAPH-20000"));
+  EXPECT_EQ(0x2A6DDu, map("CJK UNIFIED IDEOGRAPH-2A6DD"));
+  EXPECT_EQ(0x2A700u, map("CJK UNIFIED IDEOGRAPH-2A700"));
+  EXPECT_EQ(0x2B740u, map("CJK UNIFIED IDEOGRAPH-2B740"));
+  EXPECT_EQ(0x03400u, map("CJK UNIFIED IDEOGRAPH-3400"));
+  EXPECT_EQ(0x2B81Du, map("CJK UNIFIED IDEOGRAPH-2B81D"));
+  EXPECT_EQ(0x2B820u, map("CJK UNIFIED IDEOGRAPH-2B820"));
+  EXPECT_EQ(0x2CEA1u, map("CJK UNIFIED IDEOGRAPH-2CEA1"));
+  EXPECT_EQ(0x2CEB0u, map("CJK UNIFIED IDEOGRAPH-2CEB0"));
+  EXPECT_EQ(0x2EBE0u, map("CJK UNIFIED IDEOGRAPH-2EBE0"));
+  EXPECT_EQ(0x30000u, map("CJK UNIFIED IDEOGRAPH-30000"));
+  EXPECT_EQ(0x3134Au, map("CJK UNIFIED IDEOGRAPH-3134A"));
+  EXPECT_EQ(0x17000u, map("TANGUT IDEOGRAPH-17000"));
+  EXPECT_EQ(0x187F7u, map("TANGUT IDEOGRAPH-187F7"));
+  EXPECT_EQ(0x18D00u, map("TANGUT IDEOGRAPH-18D00"));
+  EXPECT_EQ(0x18D08u, map("TANGUT IDEOGRAPH-18D08"));
+  EXPECT_EQ(0x18B00u, map("KHITAN SMALL SCRIPT CHARACTER-18B00"));
+  EXPECT_EQ(0x18CD5u, map("KHITAN SMALL SCRIPT CHARACTER-18CD5"));
+  EXPECT_EQ(0x1B170u, map("NUSHU CHARACTER-1B170"));
+  EXPECT_EQ(0x1B2FBu, map("NUSHU CHARACTER-1B2FB"));
+  EXPECT_EQ(0x0F900u, map("CJK COMPATIBILITY IDEOGRAPH-F900"));
+  EXPECT_EQ(0x0FA6Du, map("CJK COMPATIBILITY IDEOGRAPH-FA6D"));
+  EXPECT_EQ(0x0FA70u, map("CJK COMPATIBILITY IDEOGRAPH-FA70"));
+  EXPECT_EQ(0x0FAD9u, map("CJK COMPATIBILITY IDEOGRAPH-FAD9"));
+  EXPECT_EQ(0x2F800u, map("CJK COMPATIBILITY IDEOGRAPH-2F800"));
+  EXPECT_EQ(0x2FA1Du, map("CJK COMPATIBILITY IDEOGRAPH-2FA1D"));
+
+  // Hangul syllables, whose names are composed from table fragments
+  EXPECT_EQ(0xAC00u, map("HANGUL SYLLABLE GA"));
+  EXPECT_EQ(0xAC14u, map("HANGUL SYLLABLE GASS"));
+  EXPECT_EQ(0xAC2Bu, map("HANGUL SYLLABLE GAELH"));
+  EXPECT_EQ(0xAC7Bu, map("HANGUL SYLLABLE GEOLB"));
+  EXPECT_EQ(0xC640u, map("HANGUL SYLLABLE WA"));
+  EXPECT_EQ(0xC544u, map("HANGUL SYLLABLE A"));
+  EXPECT_EQ(0xC5D0u, map("HANGUL SYLLABLE E"));
+  EXPECT_EQ(0xC774u, map("HANGUL SYLLABLE I"));
+
+  // Names looked up in the generated name data
+  EXPECT_EQ(0x1F984u, map("UNICORN FACE"));
+  EXPECT_EQ(0x00640u, map("ARABIC TATWEEL"));
+  EXPECT_EQ(0x02C05u, map("GLAGOLITIC CAPITAL LETTER YESTU"));
+  EXPECT_EQ(0x13000u, map("EGYPTIAN HIEROGLYPH A001"));
+  EXPECT_EQ(0x02235u, map("BECAUSE"));
+  EXPECT_EQ(0x1F514u, map("BELL"));
+  EXPECT_EQ(0x1F9A9u, map("FLAMINGO"));
+  EXPECT_EQ(0x1F402u, map("OX")); // 2 characters
+  EXPECT_EQ(0x0FBF9u, map("ARABIC LIGATURE UIGHUR KIRGHIZ YEH WITH HAMZA "
+                          "ABOVE WITH ALEF MAKSURA ISOLATED FORM"));
+
+  // Aliases
+  EXPECT_EQ(0x0000u, map("NULL"));
+  EXPECT_EQ(0x0007u, map("ALERT"));
+  EXPECT_EQ(0x0009u, map("HORIZONTAL TABULATION"));
+  EXPECT_EQ(0x0009u, map("CHARACTER TABULATION"));
+  EXPECT_EQ(0x000Au, map("LINE FEED"));
+  EXPECT_EQ(0x000Au, map("NEW LINE"));
+  EXPECT_EQ(0x0089u, map("CHARACTER TABULATION WITH JUSTIFICATION"));
+  EXPECT_EQ(0x0089u, map("HORIZONTAL TABULATION WITH JUSTIFICATION"));
+  EXPECT_EQ(0x2118u,
+            map("WEIERSTRASS ELLIPTIC FUNCTION"));      // correction
+  EXPECT_EQ(0x2118u, map("SCRIPT CAPITAL P"));          // correction
+  EXPECT_EQ(0xFEFFu, map("BYTE ORDER MARK"));           // alternate
+  EXPECT_EQ(0xFEFFu, map("ZERO WIDTH NO-BREAK SPACE")); // alternate
+  EXPECT_EQ(0xFEFFu, map("ZERO WIDTH NO BREAK SPACE")); // alternate
+
+  // Should perform loose matching
+  EXPECT_EQ(0xFFFFFFFFu, map(""));
+  EXPECT_EQ(0xFFFFFFFFu, map("NOT A UNICODE CHARACTER"));
+  EXPECT_EQ(0x0001F984u, map("unicorn face"));
+  EXPECT_EQ(0x0001F984u, map("UNICORN FaCE"));
+  EXPECT_EQ(0x0001F984u, map("UNICORNFaCE"));
+  EXPECT_EQ(0xFFFFFFFFu, map("UNICORN"));
+  EXPECT_EQ(0xC774u, map("HANGUL SYLLABLE i"));
+  EXPECT_EQ(0xC774u, map("hANGUL SYLLABLE i"));
+  EXPECT_EQ(0xC774u, map("HANGULSYLLABLEI"));
+  EXPECT_EQ(0xFFFFFFFFu, map("HANGUL SYLLABLE"));
+
+  EXPECT_EQ(0x2FA1Du, map("cJK COMPATIBILITY IDEOGRAPH-2FA1D"));
+  EXPECT_EQ(0x2FA1Du, map("CJK COMPATIBILITY IDEOGRAPH-2FA1d"));
+  EXPECT_EQ(0x2FA1Du, map("CJK COMPATIBILITY IDEOGRAPH 2FA1D"));
+
+  EXPECT_EQ(0xFFFFFFFFu, map("CJK COMPATIBILITY IDEOGRAPH-NOTANUMBER"));
+  EXPECT_EQ(0xFFFFFFFFu, map("CJK COMPATIBILITY IDEOGRAPH-1"));
+
+  // https://unicode.org/reports/tr44/#Matching_Names
+  // UAX44-LM2: Medial hyphens are ignored, non medial hyphens are not
+  EXPECT_EQ(0x1FBC5u, map("S-T-I-C-K-F-I-G-U-R-E"));
+  EXPECT_EQ(0xFFFFFFFFu, map("-STICK FIGURE"));
+  EXPECT_EQ(0xFFFFFFFFu, map("STICK FIGURE-"));
+  EXPECT_EQ(0xFFFFFFFFu, map("STICK FIGURE -"));
+  EXPECT_EQ(0xFFFFFFFFu, map("STICK FIGURE --"));
+  EXPECT_EQ(0xFFFFFFFFu, map("STICK--FIGURE"));
+
+  // A hyphen that follows a space is not medial and therefore significant.
+  EXPECT_EQ(0x0F68u, map("TIBETAN LETTER A"));
+  EXPECT_EQ(0x0F68u, map("TIBETAN LETTERA"));
+  EXPECT_EQ(0x0F68u, map("TIBETAN LETTER-A"));
+  EXPECT_EQ(0x0F60u, map("TIBETAN LETTER -A"));
+  EXPECT_EQ(0x0F60u, map("TIBETAN LETTER  -A"));
+  ;
+
+  // special case
+  EXPECT_EQ(0x1180u, map("HANGUL JUNGSEONG O-E"));
+  EXPECT_EQ(0x116Cu, map("HANGUL JUNGSEONG OE"));
+
+  // names that are prefix to existing characters should not match
+  EXPECT_FALSE(nameToCodepointLooseMatching("B"));
+  EXPECT_FALSE(nameToCodepointLooseMatching("BE"));
+  EXPECT_FALSE(nameToCodepointLooseMatching("BEE"));
+  EXPECT_FALSE(nameToCodepointLooseMatching("BEET"));
+  EXPECT_FALSE(nameToCodepointLooseMatching("BEETL"));
+  EXPECT_TRUE(nameToCodepointLooseMatching("BEETLE"));
+}
+
+} // namespace
+
+// Field-wise equality for MatchForCodepointName: two matches are equal when
+// their name, edit distance and code point are all equal. Used by the
+// ElementsAre expectations of the test below.
+bool operator==(MatchForCodepointName a, MatchForCodepointName b) {
+  if (!(a.Name == b.Name))
+    return false;
+  if (!(a.Distance == b.Distance))
+    return false;
+  return a.Value == b.Value;
+}
+
+namespace {
+
+// Checks nearestMatchesForCodepointName(): for a set of queries, the three
+// returned candidates (name, edit distance, code point) must match the
+// expected lists exactly and in order.
+TEST(Unicode, nearestMatchesForCodepointName) {
+  // Reduces a name to the characters this test compares on: everything that
+  // is not alphanumeric is dropped and the rest is upper-cased.
+  auto Normalize = [](StringRef Name) {
+    std::string Out;
+    Out.reserve(Name.size());
+    for (char C : Name) {
+      if (isAlnum(C))
+        Out.push_back(toUpper(C));
+    }
+    return Out;
+  };
+
+  // Runs the query (the second argument, 3, is presumably the number of
+  // matches requested) and cross-checks every reported Distance against the
+  // edit distance between the normalized candidate and the normalized query.
+  // Returns the matches so that the caller can compare them with ElementsAre.
+  auto L = [&](StringRef name) {
+    auto v = nearestMatchesForCodepointName(name, 3);
+    for (auto &r : v) {
+      auto A = Normalize(r.Name);
+      auto B = Normalize(name);
+      EXPECT_EQ(StringRef(A).edit_distance(B, true), r.Distance);
+    }
+    return v;
+  };
+  using ::testing::ElementsAre;
+  using M = MatchForCodepointName;
+
+  // Each M below is {name, distance, code point}.
+
+  // empty query: the expected results are the shortest names
+  ASSERT_THAT(L(""), ElementsAre(M{"OX", 2, 0x1F402}, M{"ANT", 3, 0x1F41C},
+                                 M{"ARC", 3, 0x2312}));
+  // shortest name
+  ASSERT_THAT(L("OX"), ElementsAre(M{"OX", 0, 0x1F402}, M{"AXE", 2, 0x1FA93},
+                                   M{"BOY", 2, 0x1F466}));
+
+  // longest name
+  ASSERT_THAT(L("ARABIC LIGATURE UIGHUR KIRGHIZ YEH WITH HAMZA ABOVE WITH ALEF "
+                "MAKSURA INITIAL FORM"),
+              ElementsAre(M{"ARABIC LIGATURE UIGHUR KIRGHIZ YEH WITH HAMZA "
+                            "ABOVE WITH ALEF MAKSURA INITIAL FORM",
+                            0, 0xFBFB},
+                          M{"ARABIC LIGATURE UIGHUR KIRGHIZ YEH WITH HAMZA "
+                            "ABOVE WITH ALEF MAKSURA FINAL FORM",
+                            4, 0xFBFA},
+                          M{"ARABIC LIGATURE UIGHUR KIRGHIZ YEH WITH HAMZA "
+                            "ABOVE WITH ALEF MAKSURA ISOLATED FORM",
+                            7, 0xFBF9}));
+
+  // same result with underscore, spaces, etc
+  ASSERT_THAT(L("______ARABICLIGATUREUIGHUR KIRGHIZ YEH with HAMZA ABOVE WITH "
+                "ALEF MAKsURAINITIAL form_"),
+              ElementsAre(M{"ARABIC LIGATURE UIGHUR KIRGHIZ YEH WITH HAMZA "
+                            "ABOVE WITH ALEF MAKSURA INITIAL FORM",
+                            0, 0xFBFB},
+                          M{"ARABIC LIGATURE UIGHUR KIRGHIZ YEH WITH HAMZA "
+                            "ABOVE WITH ALEF MAKSURA FINAL FORM",
+                            4, 0xFBFA},
+                          M{"ARABIC LIGATURE UIGHUR KIRGHIZ YEH WITH HAMZA "
+                            "ABOVE WITH ALEF MAKSURA ISOLATED FORM",
+                            7, 0xFBF9}));
+
+  // common misspelling: the Unicode name is LAMDA, not LAMBDA
+  ASSERT_THAT(L("GREEK CAPITAL LETTER LAMBDA"),
+              ElementsAre(M{"GREEK CAPITAL LETTER LAMDA", 1, 0x39B},
+                          M{"GREEK CAPITAL LETTER GAMMA", 3, 0x0393},
+                          M{"GREEK CAPITAL LETTER ALPHA", 4, 0x0391}));
+
+  // same query without spaces, lower-cased and with a hyphen
+  ASSERT_THAT(L("greekcapitalletter-lambda"),
+              ElementsAre(M{"GREEK CAPITAL LETTER LAMDA", 1, 0x39B},
+                          M{"GREEK CAPITAL LETTER GAMMA", 3, 0x0393},
+                          M{"GREEK CAPITAL LETTER ALPHA", 4, 0x0391}));
+
+  // typo http://www.unicode.org/notes/tn27/tn27-5.html
+  ASSERT_THAT(
+      L("PRESENTATION FORM FOR VERTICAL RIGHT WHITE LENTICULAR BRAKCET"),
+      ElementsAre(
+          M{"PRESENTATION FORM FOR VERTICAL RIGHT WHITE LENTICULAR BRAKCET", 0,
+            0xFE18}, // typo
+          M{"PRESENTATION FORM FOR VERTICAL RIGHT WHITE LENTICULAR BRACKET", 2,
+            0xFE18}, // correction
+          M{"PRESENTATION FORM FOR VERTICAL LEFT WHITE LENTICULAR BRACKET", 6,
+            0xFE17}));
+
+  // typo http://www.unicode.org/notes/tn27/tn27-5.html
+  ASSERT_THAT(
+      L("BYZANTINE MUSICAL SYMBOL FHTORA SKLIRON CHROMA VASIS"),
+      ElementsAre(
+          M{"BYZANTINE MUSICAL SYMBOL FHTORA SKLIRON CHROMA VASIS", 0, 0x1D0C5},
+          M{"BYZANTINE MUSICAL SYMBOL FTHORA SKLIRON CHROMA VASIS", 2, 0x1D0C5},
+          M{"BYZANTINE MUSICAL SYMBOL FTHORA SKLIRON CHROMA SYNAFI", 7,
+            0x1D0C6}));
+}
+
 } // namespace
 } // namespace unicode
 } // namespace sys
--- a/llvm/utils/UnicodeData/CMakeLists.txt
+++ b/llvm/utils/UnicodeData/CMakeLists.txt
@ -0,0 +1,5 @@
+# Utility that generates the Unicode name -> codepoint mapping tables.
+# It links only against the LLVM Support component.
+set(LLVM_LINK_COMPONENTS Support)
+
+add_llvm_utility(UnicodeNameMappingGenerator
+    UnicodeNameMappingGenerator.cpp
+)
--- a/llvm/utils/UnicodeData/UnicodeNameMappingGenerator.cpp
+++ b/llvm/utils/UnicodeData/UnicodeNameMappingGenerator.cpp
@ -0,0 +1,486 @@
+//===--- UnicodeNameMappingGenerator.cpp - Unicode name data generator ---===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file is used to generate lib/Support/UnicodeNameToCodepointGenerated.cpp
+// using UnicodeData.txt and NameAliases.txt available at
+// https://unicode.org/Public/14.0.0/ucd/
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ADT/Optional.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/ADT/StringRef.h"
+#include <algorithm>
+#include <array>
+#include <deque>
+#include <fstream>
+#include <memory>
+#include <set>
+#include <string>
+#include <unordered_map>
+#include <utility>
+#include <vector>
+
+// The characters accepted in a Unicode character name.
+// main() drops any name containing a character that is not listed here.
+// The position of a character in this string is the index Trie::letter()
+// writes for single-character nodes, and Trie::serialize() starts the
+// dictionary with this string, so reordering it changes the generated data.
+static const llvm::StringRef Letters =
+    " _-ABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789";
+
+// Collect names from UnicodeData.txt and NameAliases.txt.
+// There may be multiple names per code point.
+//
+// Both files are read line by line. A line is used only if it starts with a
+// hexadecimal digit and contains at least two ';' separators: the first field
+// is the code point in hexadecimal, the second field is the name. For the
+// aliases file, the remainder of the line after the second ';' is the alias
+// kind, and only "control", "correction" and "alternate" aliases are kept.
+//
+// \param NamesFile   path to UnicodeData.txt.
+// \param AliasesFile path to NameAliases.txt.
+// \returns every distinct (code point, name) pair found in the two files.
+//          A file that cannot be opened is reported on stderr and
+//          contributes no entries.
+static std::unordered_multimap<char32_t, std::string>
+loadDataFiles(const std::string &NamesFile, const std::string &AliasesFile) {
+  std::unordered_multimap<char32_t, std::string> CollectedCharacters;
+
+  // Records Name for CP unless that exact pair is already present.
+  // equal_range() yields all the entries of CP, whichever one find() would
+  // have returned.
+  auto InsertUnique = [&](char32_t CP, std::string Name) {
+    auto Range = CollectedCharacters.equal_range(CP);
+    for (auto It = Range.first; It != Range.second; ++It) {
+      if (It->second == Name)
+        return;
+    }
+    CollectedCharacters.insert({CP, std::move(Name)});
+  };
+
+  auto FromFile = [&](const std::string &File, bool IsAliasFile = false) {
+    std::ifstream InputFile(File);
+    if (!InputFile) {
+      fprintf(stderr, "Could not open %s\n", File.c_str());
+      return;
+    }
+    for (std::string Line; getline(InputFile, Line);) {
+      // Tolerate CRLF line endings: a trailing '\r' would otherwise end up in
+      // the last field and make every alias kind comparison fail.
+      if (!Line.empty() && Line.back() == '\r')
+        Line.pop_back();
+      // Skip blank lines and comments. The cast avoids passing a negative
+      // value to isxdigit when char is signed.
+      if (Line.empty() || !isxdigit(static_cast<unsigned char>(Line[0])))
+        continue;
+      auto FirstSemiPos = Line.find(';');
+      if (FirstSemiPos == std::string::npos)
+        continue;
+      auto SecondSemiPos = Line.find(';', FirstSemiPos + 1);
+      if (SecondSemiPos == std::string::npos)
+        continue;
+      unsigned long long CodePoint;
+      // getAsUnsignedInteger returns true on failure.
+      if (llvm::getAsUnsignedInteger(
+              llvm::StringRef(Line.c_str(), FirstSemiPos), 16, CodePoint)) {
+        continue;
+      }
+
+      std::string Name =
+          Line.substr(FirstSemiPos + 1, SecondSemiPos - FirstSemiPos - 1);
+
+      if (!Name.empty() && Name[0] == '<') {
+        // Ignore ranges of characters, as their name is either absent or
+        // generated.
+        continue;
+      }
+
+      // Some aliases are ignored for compatibility with C++
+      if (IsAliasFile) {
+        std::string Kind = Line.substr(SecondSemiPos + 1);
+        if (Kind != "control" && Kind != "correction" && Kind != "alternate")
+          continue;
+      }
+
+      InsertUnique(static_cast<char32_t>(CodePoint), std::move(Name));
+    }
+  };
+
+  FromFile(NamesFile);
+  FromFile(AliasesFile, true);
+  return CollectedCharacters;
+}
+
+// A trie of Unicode character names, used to produce the serialized
+// name -> code point index.
+// Expected usage: insert() every name, call compact() once, then serialize().
+class Trie {
+  struct Node;
+
+public:
+  // When inserting a named codepoint
+  // we create a node per character in the name.
+  // SPARKLE becomes S <- P <- A <- R <- K <- L <- E
+  // The node of the last character stores the code point; inserting the same
+  // name again overwrites the stored value.
+  // Once all characters are inserted, the tree is compacted.
+  void insert(llvm::StringRef Name, char32_t Codepoint) {
+    Node *N = Root.get();
+    for (auto Ch : Name) {
+      std::string Label(1, Ch);
+      auto It = std::find_if(N->Children.begin(), N->Children.end(),
+                             [&](const auto &C) { return C->Name == Label; });
+      if (It == N->Children.end()) {
+        It = N->Children.insert(It, std::make_unique<Node>(Label, N));
+      }
+      N = It->get();
+    }
+    N->Value = Codepoint;
+  }
+
+  // Merges chains of single-child nodes, starting from the root.
+  void compact() { compact(Root.get()); }
+
+  // This creates 2 arrays of bytes from the tree:
+  // A serialized dictionary of node labels,
+  // And the nodes themselves.
+  // The name of each label is found by indexing into the dictionary.
+  // The longest names are inserted first into the dictionary,
+  // in the hope it will contain shorter labels as substring,
+  // thereby reducing duplication.
+  // We could theoretically be more clever by trying to minimize the size
+  // of the dictionary.
+  // Exits the process if the dictionary cannot be indexed with 16 bits.
+  std::pair<std::string, std::vector<uint8_t>> serialize() {
+    std::set<std::string> Names = this->getNameFragments();
+    std::vector<std::string> Sorted(Names.begin(), Names.end());
+    // Stable sort: labels of equal length keep the lexicographic order of the
+    // set, so the generated dictionary does not depend on how a particular
+    // standard library orders equivalent elements.
+    std::stable_sort(
+        Sorted.begin(), Sorted.end(),
+        [](const auto &a, const auto &b) { return a.size() > b.size(); });
+    // The dictionary starts with the alphabet; single-character labels are
+    // never added since they are encoded directly in the node.
+    std::string Dict(Letters.begin(), Letters.end());
+    Dict.reserve(50000);
+    for (const std::string &Name : Sorted) {
+      if (Name.size() <= 1)
+        continue;
+      if (Dict.find(Name) != std::string::npos)
+        continue;
+      Dict += Name;
+    }
+
+    if (Dict.size() >= std::numeric_limits<uint16_t>::max()) {
+      fprintf(stderr, "Dictionary too big  to be serialized");
+      exit(1);
+    }
+
+    auto Bytes = dumpIndex(Dict);
+    return {Dict, Bytes};
+  }
+
+  // Returns the set of all node labels in the tree (including the root's).
+  std::set<std::string> getNameFragments() {
+    std::set<std::string> Keys;
+    collectKeys(Root.get(), Keys);
+    return Keys;
+  }
+
+  // Maps a valid char in an Unicode character name
+  // To a 6 bits index (its position in Letters).
+  static uint8_t letter(char C) {
+    auto Pos = Letters.find(C);
+    assert(Pos != std::string::npos &&
+           "Invalid letter in Unicode character name");
+    return Pos;
+  }
+
+  // clang-format off
+  // +================+============+======================+=============+========+===+==============+===============+
+  // | 0          | 1             | 2-7 (6)              | 8-23        | 24-44  |    | 46           | 47            |
+  // +================+============+======================+=============+========+===+==============+===============+
+  // | Has Value |  Has Long Name | Letter OR Name Size  | Dict Index  | Value  |    | Has Sibling  | Has Children  |
+  // +----------------+------------+----------------------+-------------+--------+---+--------------+---------------+
+  // clang-format on
+
+  // Serializes the nodes breadth first, each node being encoded as:
+  //  - 1 byte: 0x80 if the node has a value, 0x40 if its label is longer than
+  //    one character, and in the low 6 bits either the index of the single
+  //    letter or the size of the label;
+  //  - for labels longer than one character, 2 bytes (high byte first) giving
+  //    the position of the label in Dict;
+  //  - if the node has a value: 3 bytes holding the value shifted left by 3,
+  //    with 0x01 (has sibling) and 0x02 (has children) in the last byte,
+  //    followed, when the node has children, by the 3 bytes offset of its
+  //    first child;
+  //  - otherwise: 1 byte with 0x80 (has sibling) and 0x40 (has children),
+  //    followed, when the node has children, by 2 more bytes; the low 6 bits
+  //    of the flag byte and these 2 bytes hold the offset of the first child.
+  // The first byte of the result is reserved and the result ends with
+  // 6 bytes of padding.
+  std::vector<uint8_t> dumpIndex(const std::string &Dict) {
+    struct ChildrenOffset {
+      Node *FirstChild;
+      std::size_t Offset;
+      bool HasValue;
+    };
+
+    // Keep track of the start of each node
+    // position in the serialized data.
+    std::unordered_map<Node *, int32_t> Offsets;
+
+    // Keep track of where to write the index
+    // of the first children
+    std::vector<ChildrenOffset> ChildrenOffsets;
+    // Contains the nodes which are followed by a sibling.
+    std::unordered_map<Node *, bool> SiblingTracker;
+    // Nodes waiting to be serialized, in breadth first order.
+    std::deque<Node *> AllNodes;
+    std::vector<uint8_t> Bytes;
+    Bytes.reserve(250'000);
+    // This leading byte is used by the reading code to detect the root node.
+    Bytes.push_back(0);
+
+    // Queues all the children of a node; every child but the last one is
+    // recorded as having a sibling.
+    auto CollectChildren = [&SiblingTracker, &AllNodes](const auto &Children) {
+      for (std::size_t Index = 0; Index < Children.size(); Index++) {
+        const std::unique_ptr<Node> &Child = Children[Index];
+        AllNodes.push_back(Child.get());
+        if (Index != Children.size() - 1)
+          SiblingTracker[Child.get()] = true;
+      }
+    };
+    CollectChildren(Root->Children);
+
+    while (!AllNodes.empty()) {
+      const std::size_t Offset = Bytes.size();
+      Node *const N = AllNodes.front();
+      AllNodes.pop_front();
+
+      assert(!N->Name.empty());
+      Offsets[N] = Offset;
+
+      uint8_t FirstByte = (!!N->Value) ? 0x80 : 0;
+      // Single letter node are indexed in 6 bits
+      if (N->Name.size() == 1) {
+        FirstByte |= letter(N->Name[0]);
+        Bytes.push_back(FirstByte);
+      } else {
+        // Otherwise we use a 16 bits index
+        FirstByte = FirstByte | uint8_t(N->Name.size()) | 0x40;
+        Bytes.push_back(FirstByte);
+        auto PosInDict = Dict.find(N->Name);
+        assert(PosInDict != std::string::npos);
+        uint8_t Low = PosInDict;
+        uint8_t High = ((PosInDict >> 8) & 0xFF);
+        Bytes.push_back(High);
+        Bytes.push_back(Low);
+      }
+
+      const bool HasSibling = SiblingTracker.count(N) != 0;
+      const bool HasChildren = N->Children.size() != 0;
+
+      if (!!N->Value) {
+        // The value is shifted to leave room for the flags in the low bits.
+        uint32_t Value = (*(N->Value) << 3);
+        uint8_t H = ((Value >> 16) & 0xFF);
+        uint8_t M = ((Value >> 8) & 0xFF);
+        uint8_t L = (Value & 0xFF) | uint8_t(HasSibling ? 0x01 : 0) |
+                    uint8_t(HasChildren ? 0x02 : 0);
+
+        Bytes.push_back(H);
+        Bytes.push_back(M);
+        Bytes.push_back(L);
+
+        if (HasChildren) {
+          ChildrenOffsets.push_back(
+              ChildrenOffset{N->Children[0].get(), Bytes.size(), true});
+          // index of the first children
+          Bytes.push_back(0x00);
+          Bytes.push_back(0x00);
+          Bytes.push_back(0x00);
+        }
+      } else {
+        // When there is no value (that's most intermediate nodes)
+        // Dispense of the 3 values bytes, and only store
+        // 1 byte to track whether the node has sibling and children
+        // + 2 bytes for the index of the first children if necessary.
+        // That index also uses the low 6 bits of the flag byte.
+        uint8_t Byte =
+            uint8_t(HasSibling ? 0x80 : 0) | uint8_t(HasChildren ? 0x40 : 0);
+        Bytes.push_back(Byte);
+        if (HasChildren) {
+          // The recorded offset is the one of the flag byte itself.
+          ChildrenOffsets.emplace_back(
+              ChildrenOffset{N->Children[0].get(), Bytes.size() - 1, false});
+          Bytes.push_back(0x00);
+          Bytes.push_back(0x00);
+        }
+      }
+      CollectChildren(N->Children);
+    }
+
+    // Once all the nodes are in the index
+    // Fill the bytes we left to indicate the position
+    // of the children
+    for (const ChildrenOffset &Parent : ChildrenOffsets) {
+      const auto It = Offsets.find(Parent.FirstChild);
+      assert(It != Offsets.end());
+      std::size_t Pos = It->second;
+      if (Parent.HasValue) {
+        Bytes[Parent.Offset] = ((Pos >> 16) & 0xFF);
+      } else {
+        // Preserve the sibling/children flags stored in the same byte.
+        Bytes[Parent.Offset] =
+            Bytes[Parent.Offset] | uint8_t((Pos >> 16) & 0xFF);
+      }
+      Bytes[Parent.Offset + 1] = ((Pos >> 8) & 0xFF);
+      Bytes[Parent.Offset + 2] = Pos & 0xFF;
+    }
+
+    // Add some padding so that the deserialization code
+    // doesn't try to read past the end of the array.
+    Bytes.push_back(0);
+    Bytes.push_back(0);
+    Bytes.push_back(0);
+    Bytes.push_back(0);
+    Bytes.push_back(0);
+    Bytes.push_back(0);
+
+    return Bytes;
+  }
+
+private:
+  // Adds the label of N and of all its descendants to Keys.
+  void collectKeys(Node *N, std::set<std::string> &Keys) {
+    Keys.insert(N->Name);
+    for (const std::unique_ptr<Node> &Child : N->Children) {
+      collectKeys(Child.get(), Keys);
+    }
+  }
+
+  // Merge sequences of 1-character nodes
+  // This greatly reduces the total number of nodes,
+  // and therefore the size of the index.
+  // When the tree gets serialized, we only have 6 bits to store the
+  // size of a name. Overlong names (>32 characters) are therefore
+  // kept into separate nodes.
+  //
+  // The subtree of N is compacted first; N is then merged into its parent
+  // when it is the only child of a parent which has no value. In that case
+  // N is destroyed by this call.
+  void compact(Node *N) {
+    // Compacting a child may merge it into N, which replaces N->Children.
+    // Iterate over a snapshot of the children so that the loop never walks a
+    // vector which is being reassigned. A merge can only happen when N has a
+    // single child, so no pointer of the snapshot is used after its node is
+    // destroyed.
+    std::vector<Node *> Kids;
+    Kids.reserve(N->Children.size());
+    for (const std::unique_ptr<Node> &Child : N->Children)
+      Kids.push_back(Child.get());
+    for (Node *Kid : Kids)
+      compact(Kid);
+
+    Node *const P = N->Parent;
+    if (P && P->Children.size() == 1 && !P->Value &&
+        (P->Name.size() + N->Name.size() <= 32)) {
+      P->Value = N->Value;
+      P->Name += N->Name;
+      // Detach the children of N and reparent them before replacing
+      // P->Children: that assignment destroys N (which P owns), so N must
+      // not be accessed afterwards.
+      std::vector<std::unique_ptr<Node>> Adopted = std::move(N->Children);
+      for (std::unique_ptr<Node> &C : Adopted) {
+        C->Parent = P;
+      }
+      P->Children = std::move(Adopted);
+    }
+  }
+
+  // A node of the trie. Name is the label of the node (one character until
+  // the tree is compacted), Value is set if a name ends on this node.
+  struct Node {
+    Node(std::string Name, Node *Parent = nullptr)
+        : Name(std::move(Name)), Parent(Parent) {}
+
+    std::vector<std::unique_ptr<Node>> Children;
+    std::string Name;
+    Node *Parent = nullptr;
+    llvm::Optional<char32_t> Value;
+  };
+
+  // The root has an empty label and no parent.
+  std::unique_ptr<Node> Root = std::make_unique<Node>("");
+};
+
+// Defined at the end of this file; main() copies it into the generated file.
+extern const char *UnicodeLicense;
+
+// Generates the Unicode name -> codepoint mapping source file.
+//
+// Expects exactly 3 arguments: the path of UnicodeData.txt, the path of
+// NameAliases.txt and the path of the file to generate. The usage text is
+// always printed. Returns EXIT_FAILURE if the argument count is wrong or if
+// the output file cannot be created.
+//
+// Every loaded name made only of characters of Letters is printed to stdout
+// and inserted in a Trie, which is then compacted and serialized. The
+// generated file defines the dictionary string, the index bytes, the size of
+// the index and the largest number of alphanumeric characters in a name.
+int main(int argc, char **argv) {
+  printf("Unicode name -> codepoint mapping generator\n"
+         "Usage: %s UnicodeData.txt NameAliases.txt output\n\n",
+         argv[0]);
+  printf("NameAliases.txt can be found at "
+         "https://unicode.org/Public/14.0.0/ucd/NameAliases.txt\n"
+         "UnicodeData.txt can be found at "
+         "https://unicode.org/Public/14.0.0/ucd/UnicodeData.txt\n\n");
+
+  if (argc != 4)
+    return EXIT_FAILURE;
+
+  FILE *Out = fopen(argv[3], "w");
+  if (!Out) {
+    printf("Error creating output file.\n");
+    return EXIT_FAILURE;
+  }
+
+  Trie T;
+  uint32_t NameCount = 0;
+  std::size_t LongestName = 0;
+  auto Entries = loadDataFiles(argv[1], argv[2]);
+  for (const std::pair<const char32_t, std::string> &Entry : Entries) {
+    char32_t Codepoint = Entry.first;
+    const std::string &Name = Entry.second;
+    // Ignore names which are not valid.
+    if (Name.empty() || !std::all_of(Name.begin(), Name.end(), [](char C) {
+          return llvm::is_contained(Letters, C);
+        })) {
+      continue;
+    }
+    // %x expects an unsigned int.
+    printf("%06x: %s\n", static_cast<unsigned>(Codepoint), Name.c_str());
+    T.insert(Name, Codepoint);
+    // Only alphanumeric characters count towards the size of a name.
+    LongestName =
+        std::max(LongestName, std::size_t(llvm::count_if(Name, [](char c) {
+                   return llvm::isAlnum(c);
+                 })));
+    NameCount++;
+  }
+  T.compact();
+
+  std::pair<std::string, std::vector<uint8_t>> Data = T.serialize();
+  const std::string &Dict = Data.first;
+  const std::vector<uint8_t> &Tree = Data.second;
+
+  fprintf(Out, R"(
+//===------------- Support/UnicodeNameToCodepointGenerated.cpp ------------===//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements mapping the name of a unicode code point to its value.
+//
+// This file was generated using %s.
+// Do not edit manually.
+//
+//===----------------------------------------------------------------------===//
+%s
+
+
+
+#include "llvm/Support/Compiler.h"
+#include <cstddef>
+#include <cstdint>
+)",
+          argv[0], UnicodeLicense);
+
+  fprintf(Out,
+          "namespace llvm { namespace sys { namespace unicode { \n"
+          "extern const char *UnicodeNameToCodepointDict;\n"
+          "extern const uint8_t *UnicodeNameToCodepointIndex;\n"
+          "extern const std::size_t UnicodeNameToCodepointIndexSize;\n"
+          "extern const std::size_t UnicodeNameToCodepointLargestNameSize;\n");
+
+  fprintf(Out, "const char* UnicodeNameToCodepointDict = \"%s\";\n",
+          Dict.c_str());
+
+  // The sizes below are std::size_t values and must be printed with %zu:
+  // %lu does not match std::size_t on targets where long is 32 bits and
+  // std::size_t is 64 bits.
+  // The array gets one more element than Tree: a final 0 is appended below.
+  fprintf(Out, "uint8_t UnicodeNameToCodepointIndex_[%zu] = {\n",
+          Tree.size() + 1);
+
+  for (auto Byte : Tree) {
+    fprintf(Out, "0x%02x,", Byte);
+  }
+
+  fprintf(Out, "0};");
+  fprintf(Out, "const uint8_t* UnicodeNameToCodepointIndex = "
+               "UnicodeNameToCodepointIndex_; \n");
+  fprintf(Out, "const std::size_t UnicodeNameToCodepointIndexSize = %zu;\n",
+          Tree.size() + 1);
+  fprintf(Out,
+          "const std::size_t UnicodeNameToCodepointLargestNameSize = %zu;\n",
+          LongestName);
+  fprintf(Out, "\n}}}\n");
+  fclose(Out);
+  printf("Generated %s: %u Files.\nIndex: %f kB, Dictionary: %f kB.\nDone\n\n",
+         argv[3], NameCount, Tree.size() / 1024.0, Dict.size() / 1024.0);
+}
+
+// License of the Unicode data files. main() writes this text, which is
+// already formatted as a C++ block comment, at the top of the generated file.
+const char *UnicodeLicense = R"(
+/*
+UNICODE, INC. LICENSE AGREEMENT - DATA FILES AND SOFTWARE
+
+See Terms of Use <https://www.unicode.org/copyright.html>
+for definitions of Unicode Inc.’s Data Files and Software.
+
+NOTICE TO USER: Carefully read the following legal agreement.
+BY DOWNLOADING, INSTALLING, COPYING OR OTHERWISE USING UNICODE INC.'S
+DATA FILES ("DATA FILES"), AND/OR SOFTWARE ("SOFTWARE"),
+YOU UNEQUIVOCALLY ACCEPT, AND AGREE TO BE BOUND BY, ALL OF THE
+TERMS AND CONDITIONS OF THIS AGREEMENT.
+IF YOU DO NOT AGREE, DO NOT DOWNLOAD, INSTALL, COPY, DISTRIBUTE OR USE
+THE DATA FILES OR SOFTWARE.
+
+COPYRIGHT AND PERMISSION NOTICE
+
+Copyright © 1991-2022 Unicode, Inc. All rights reserved.
+Distributed under the Terms of Use in https://www.unicode.org/copyright.html.
+
+Permission is hereby granted, free of charge, to any person obtaining
+a copy of the Unicode data files and any associated documentation
+(the "Data Files") or Unicode software and any associated documentation
+(the "Software") to deal in the Data Files or Software
+without restriction, including without limitation the rights to use,
+copy, modify, merge, publish, distribute, and/or sell copies of
+the Data Files or Software, and to permit persons to whom the Data Files
+or Software are furnished to do so, provided that either
+(a) this copyright and permission notice appear with all copies
+of the Data Files or Software, or
+(b) this copyright and permission notice appear in associated
+Documentation.
+
+THE DATA FILES AND SOFTWARE ARE PROVIDED "AS IS", WITHOUT WARRANTY OF
+ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
+WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+NONINFRINGEMENT OF THIRD PARTY RIGHTS.
+IN NO EVENT SHALL THE COPYRIGHT HOLDER OR HOLDERS INCLUDED IN THIS
+NOTICE BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL
+DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE,
+DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER
+TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
+PERFORMANCE OF THE DATA FILES OR SOFTWARE.
+
+Except as contained in this notice, the name of a copyright holder
+shall not be used in advertising or otherwise to promote the sale,
+use or other dealings in these Data Files or Software without prior
+written authorization of the copyright holder.
+*/
+)";