mirror of
https://github.com/llvm/llvm-project.git
synced 2025-04-17 18:16:42 +00:00
[Clang][C++23] P2071 Named universal character escapes
Implements [[ https://wg21.link/p2071r1 | P2071 Named Universal Character Escapes ]] as an extension in all language modes; a follow-up patch to stop warning in C++23 mode will be done later, once this paper is plenary approved (in July). We add * A code generator that transforms `UnicodeData.txt` and `NameAliases.txt` into a space-efficient data structure that can be queried in `O(NameLength)` * A set of functions in `Unicode.h` to query that data, including * A function to find an exact match of a given Unicode character name * A function to perform a loose (ignoring case, space, underscore, medial hyphen) matching * A function returning the best matching codepoint for a given string per edit distance * Support of `\N{}` escape sequences in string and character literals, with loose-matching and typo diagnostics/fixits * Support of `\N{}` as UCN with loose-matching diagnostics/fixits. Loose matching is considered an error, to closely match the semantics of P2071. The generated data contributes about 280kB of data to the binaries. `UnicodeData.txt` and `NameAliases.txt` are not committed to the repository in this patch, and regenerating the data is a manual process. Reviewed By: tahonermann Differential Revision: https://reviews.llvm.org/D123064
This commit is contained in:
parent
f8c1c9afd3
commit
c92056d038
@ -128,7 +128,7 @@ def warn_utf8_symbol_zero_width : Warning<
|
||||
"some environments">, InGroup<DiagGroup<"unicode-zero-width">>;
|
||||
|
||||
def ext_delimited_escape_sequence : Extension<
|
||||
"delimited escape sequences are a Clang extension">,
|
||||
"%select{delimited|named}0 escape sequences are a Clang extension">,
|
||||
InGroup<DiagGroup<"delimited-escape-sequence-extension">>;
|
||||
def err_delimited_escape_empty : Error<
|
||||
"delimited escape sequence cannot be empty">;
|
||||
@ -138,6 +138,13 @@ def err_delimited_escape_invalid : Error<
|
||||
"invalid digit '%0' in escape sequence">;
|
||||
def err_hex_escape_no_digits : Error<
|
||||
"\\%0 used with no following hex digits">;
|
||||
def err_invalid_ucn_name : Error<
|
||||
"'%0' is not a valid Unicode character name">;
|
||||
def note_invalid_ucn_name_loose_matching : Note<
|
||||
"characters names in Unicode escape sequences are sensitive to case and whitespaces">;
|
||||
def note_invalid_ucn_name_candidate : Note<
|
||||
"did you mean %0 ('%2' U+%1)?">;
|
||||
|
||||
def warn_ucn_escape_no_digits : Warning<
|
||||
"\\%0 used with no following hex digits; "
|
||||
"treating as '\\' followed by identifier">, InGroup<Unicode>;
|
||||
@ -145,10 +152,10 @@ def err_ucn_escape_incomplete : Error<
|
||||
"incomplete universal character name">;
|
||||
def warn_delimited_ucn_incomplete : Warning<
|
||||
"incomplete delimited universal character name; "
|
||||
"treating as '\\' 'u' '{' identifier">, InGroup<Unicode>;
|
||||
"treating as '\\' '%0' '{' identifier">, InGroup<Unicode>;
|
||||
def warn_delimited_ucn_empty : Warning<
|
||||
"empty delimited universal character name; "
|
||||
"treating as '\\' 'u' '{' '}'">, InGroup<Unicode>;
|
||||
"treating as '\\' '%0' '{' '}'">, InGroup<Unicode>;
|
||||
def warn_ucn_escape_incomplete : Warning<
|
||||
"incomplete universal character name; "
|
||||
"treating as '\\' followed by identifier">, InGroup<Unicode>;
|
||||
|
@ -769,6 +769,11 @@ private:
|
||||
void codeCompleteIncludedFile(const char *PathStart,
|
||||
const char *CompletionPoint, bool IsAngled);
|
||||
|
||||
llvm::Optional<uint32_t>
|
||||
tryReadNumericUCN(const char *&StartPtr, const char *SlashLoc, Token *Result);
|
||||
llvm::Optional<uint32_t> tryReadNamedUCN(const char *&StartPtr,
|
||||
Token *Result);
|
||||
|
||||
/// Read a universal character name.
|
||||
///
|
||||
/// \param StartPtr The position in the source buffer after the initial '\'.
|
||||
|
@ -37,6 +37,7 @@
|
||||
#include "llvm/Support/MathExtras.h"
|
||||
#include "llvm/Support/MemoryBufferRef.h"
|
||||
#include "llvm/Support/NativeFormatting.h"
|
||||
#include "llvm/Support/Unicode.h"
|
||||
#include "llvm/Support/UnicodeCharRanges.h"
|
||||
#include <algorithm>
|
||||
#include <cassert>
|
||||
@ -3119,27 +3120,28 @@ bool Lexer::isCodeCompletionPoint(const char *CurPtr) const {
|
||||
return false;
|
||||
}
|
||||
|
||||
uint32_t Lexer::tryReadUCN(const char *&StartPtr, const char *SlashLoc,
|
||||
Token *Result) {
|
||||
llvm::Optional<uint32_t> Lexer::tryReadNumericUCN(const char *&StartPtr,
|
||||
const char *SlashLoc,
|
||||
Token *Result) {
|
||||
unsigned CharSize;
|
||||
char Kind = getCharAndSize(StartPtr, CharSize);
|
||||
bool Delimited = false;
|
||||
bool FoundEndDelimiter = false;
|
||||
unsigned Count = 0;
|
||||
bool Diagnose = Result && !isLexingRawMode();
|
||||
assert((Kind == 'u' || Kind == 'U') && "expected a UCN");
|
||||
|
||||
unsigned NumHexDigits;
|
||||
if (Kind == 'u')
|
||||
NumHexDigits = 4;
|
||||
else if (Kind == 'U')
|
||||
NumHexDigits = 8;
|
||||
else
|
||||
return 0;
|
||||
|
||||
bool Delimited = false;
|
||||
bool FoundEndDelimiter = false;
|
||||
unsigned Count = 0;
|
||||
bool Diagnose = Result && !isLexingRawMode();
|
||||
|
||||
if (!LangOpts.CPlusPlus && !LangOpts.C99) {
|
||||
if (Diagnose)
|
||||
Diag(SlashLoc, diag::warn_ucn_not_valid_in_c89);
|
||||
return 0;
|
||||
return llvm::None;
|
||||
}
|
||||
|
||||
const char *CurPtr = StartPtr + CharSize;
|
||||
@ -3166,14 +3168,14 @@ uint32_t Lexer::tryReadUCN(const char *&StartPtr, const char *SlashLoc,
|
||||
break;
|
||||
if (Diagnose)
|
||||
Diag(BufferPtr, diag::warn_delimited_ucn_incomplete)
|
||||
<< StringRef(&C, 1);
|
||||
return 0;
|
||||
<< StringRef(KindLoc, 1);
|
||||
return llvm::None;
|
||||
}
|
||||
|
||||
if (CodePoint & 0xF000'0000) {
|
||||
if (Diagnose)
|
||||
Diag(KindLoc, diag::err_escape_too_large) << 0;
|
||||
return 0;
|
||||
return llvm::None;
|
||||
}
|
||||
|
||||
CodePoint <<= 4;
|
||||
@ -3187,7 +3189,13 @@ uint32_t Lexer::tryReadUCN(const char *&StartPtr, const char *SlashLoc,
|
||||
Diag(StartPtr, FoundEndDelimiter ? diag::warn_delimited_ucn_empty
|
||||
: diag::warn_ucn_escape_no_digits)
|
||||
<< StringRef(KindLoc, 1);
|
||||
return 0;
|
||||
return llvm::None;
|
||||
}
|
||||
|
||||
if (Delimited && Kind == 'U') {
|
||||
if (Diagnose)
|
||||
Diag(StartPtr, diag::err_hex_escape_no_digits) << StringRef(KindLoc, 1);
|
||||
return llvm::None;
|
||||
}
|
||||
|
||||
if (!Delimited && Count != NumHexDigits) {
|
||||
@ -3200,11 +3208,11 @@ uint32_t Lexer::tryReadUCN(const char *&StartPtr, const char *SlashLoc,
|
||||
<< FixItHint::CreateReplacement(URange, "u");
|
||||
}
|
||||
}
|
||||
return 0;
|
||||
return llvm::None;
|
||||
}
|
||||
|
||||
if (Delimited && PP) {
|
||||
Diag(BufferPtr, diag::ext_delimited_escape_sequence);
|
||||
Diag(BufferPtr, diag::ext_delimited_escape_sequence) << /*delimited*/ 0;
|
||||
}
|
||||
|
||||
if (Result) {
|
||||
@ -3217,6 +3225,110 @@ uint32_t Lexer::tryReadUCN(const char *&StartPtr, const char *SlashLoc,
|
||||
} else {
|
||||
StartPtr = CurPtr;
|
||||
}
|
||||
return CodePoint;
|
||||
}
|
||||
|
||||
/// Try to read a named universal character name of the form \N{NAME}.
///
/// \param StartPtr Points at the 'N' that follows the backslash. When a code
///                 point is returned it is advanced past the closing '}';
///                 on every failure path it is left untouched.
/// \param Result   If non-null, the token being lexed: it receives the
///                 Token::HasUCN flag and is passed to getAndAdvanceChar when
///                 the escape has to be re-read character by character. If
///                 null, the "incomplete"/"empty"/"extension" diagnostics are
///                 suppressed.
///
/// \returns The code point named by the escape, or llvm::None if the escape
///          is malformed or the name is unknown. A name that only matches
///          under loose matching is diagnosed as an error, but the matched
///          code point is still returned so that lexing can recover.
llvm::Optional<uint32_t> Lexer::tryReadNamedUCN(const char *&StartPtr,
                                                Token *Result) {
  unsigned CharSize;
  bool Diagnose = Result && !isLexingRawMode();

  char C = getCharAndSize(StartPtr, CharSize);
  assert(C == 'N' && "expected \\N{...}");

  const char *CurPtr = StartPtr + CharSize;
  const char *KindLoc = &CurPtr[-1];

  C = getCharAndSize(CurPtr, CharSize);
  if (C != '{') {
    if (Diagnose)
      Diag(StartPtr, diag::warn_ucn_escape_incomplete);
    return llvm::None;
  }
  CurPtr += CharSize;
  const char *StartName = CurPtr;
  bool FoundEndDelimiter = false;
  llvm::SmallVector<char, 30> Buffer;

  // Collect the name. Only alphanumerics, '_', '-' and ' ' are accepted
  // between the braces; anything else (including a NUL) ends the scan
  // without a closing delimiter.
  while (C) {
    C = getCharAndSize(CurPtr, CharSize);
    CurPtr += CharSize;
    if (C == '}') {
      FoundEndDelimiter = true;
      break;
    }

    if (!isAlphanumeric(C) && C != '_' && C != '-' && C != ' ')
      break;
    Buffer.push_back(C);
  }

  if (!FoundEndDelimiter || Buffer.empty()) {
    if (Diagnose)
      Diag(StartPtr, FoundEndDelimiter ? diag::warn_delimited_ucn_empty
                                       : diag::warn_delimited_ucn_incomplete)
          << StringRef(KindLoc, 1);
    return llvm::None;
  }

  StringRef Name(Buffer.data(), Buffer.size());
  llvm::Optional<char32_t> Res =
      llvm::sys::unicode::nameToCodepointStrict(Name);
  llvm::Optional<llvm::sys::unicode::LooseMatchingResult> LooseMatch;
  if (!Res) {
    // NOTE(review): unlike every other diagnostic in this function, this one
    // is gated on raw mode only and not on `Diagnose`, so it is also emitted
    // when Result is null. Confirm against the callers before changing it.
    if (!isLexingRawMode()) {
      Diag(StartPtr, diag::err_invalid_ucn_name) << Name;
      LooseMatch = llvm::sys::unicode::nameToCodepointLooseMatching(Name);
      if (LooseMatch) {
        // CurPtr is just past the '}', whose size is still in CharSize, so
        // the replaced range covers exactly the spelled name.
        Diag(StartName, diag::note_invalid_ucn_name_loose_matching)
            << FixItHint::CreateReplacement(
                   makeCharRange(*this, StartName, CurPtr - CharSize),
                   LooseMatch->Name);
      }
    }
    // When finding a match using Unicode loose matching rules
    // recover after having emitted a diagnostic.
    if (!LooseMatch)
      return llvm::None;
    // We do not offer misspelled character names suggestions here
    // as the set of what would be a valid suggestion depends on context,
    // and we should not make invalid suggestions.
  }

  if (Diagnose && PP && !LooseMatch)
    Diag(BufferPtr, diag::ext_delimited_escape_sequence) << /*named*/ 1;

  if (LooseMatch)
    Res = LooseMatch->CodePoint;

  if (Result) {
    Result->setFlag(Token::HasUCN);
    // A plainly spelled escape occupies 'N' + '{' + name + '}' bytes, i.e.
    // Buffer.size() + 3 counted from StartPtr. Anything longer means some
    // character took more than one byte of source, so the escape has to be
    // re-read through getAndAdvanceChar, which is handed the token.
    if (CurPtr - StartPtr == (ptrdiff_t)(Buffer.size() + 3))
      StartPtr = CurPtr;
    else
      while (StartPtr != CurPtr)
        (void)getAndAdvanceChar(StartPtr, *Result);
  } else {
    StartPtr = CurPtr;
  }
  return *Res;
}
|
||||
|
||||
uint32_t Lexer::tryReadUCN(const char *&StartPtr, const char *SlashLoc,
|
||||
Token *Result) {
|
||||
|
||||
unsigned CharSize;
|
||||
llvm::Optional<uint32_t> CodePointOpt;
|
||||
char Kind = getCharAndSize(StartPtr, CharSize);
|
||||
if (Kind == 'u' || Kind == 'U')
|
||||
CodePointOpt = tryReadNumericUCN(StartPtr, SlashLoc, Result);
|
||||
else if (Kind == 'N')
|
||||
CodePointOpt = tryReadNamedUCN(StartPtr, Result);
|
||||
|
||||
if (!CodePointOpt)
|
||||
return 0;
|
||||
|
||||
uint32_t CodePoint = *CodePointOpt;
|
||||
|
||||
// Don't apply C family restrictions to UCNs in assembly mode
|
||||
if (LangOpts.AsmPreprocessor)
|
||||
|
@ -27,6 +27,7 @@
|
||||
#include "llvm/Support/ConvertUTF.h"
|
||||
#include "llvm/Support/Error.h"
|
||||
#include "llvm/Support/ErrorHandling.h"
|
||||
#include "llvm/Support/Unicode.h"
|
||||
#include <algorithm>
|
||||
#include <cassert>
|
||||
#include <cstddef>
|
||||
@ -233,7 +234,8 @@ static unsigned ProcessCharEscape(const char *ThisTokBegin,
|
||||
HadError = true;
|
||||
if (Diags)
|
||||
Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf,
|
||||
diag::err_delimited_escape_missing_brace);
|
||||
diag::err_delimited_escape_missing_brace)
|
||||
<< "o";
|
||||
|
||||
break;
|
||||
}
|
||||
@ -309,7 +311,8 @@ static unsigned ProcessCharEscape(const char *ThisTokBegin,
|
||||
<< tok::r_brace;
|
||||
else if (!HadError) {
|
||||
Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf,
|
||||
diag::ext_delimited_escape_sequence);
|
||||
diag::ext_delimited_escape_sequence)
|
||||
<< /*delimited*/ 0;
|
||||
}
|
||||
}
|
||||
|
||||
@ -335,7 +338,7 @@ void clang::expandUCNs(SmallVectorImpl<char> &Buf, StringRef Input) {
|
||||
char Kind = *I;
|
||||
++I;
|
||||
|
||||
assert(Kind == 'u' || Kind == 'U');
|
||||
assert(Kind == 'u' || Kind == 'U' || Kind == 'N');
|
||||
uint32_t CodePoint = 0;
|
||||
|
||||
if (Kind == 'u' && *I == '{') {
|
||||
@ -349,6 +352,22 @@ void clang::expandUCNs(SmallVectorImpl<char> &Buf, StringRef Input) {
|
||||
continue;
|
||||
}
|
||||
|
||||
if (Kind == 'N') {
|
||||
assert(*I == '{');
|
||||
++I;
|
||||
auto Delim = std::find(I, Input.end(), '}');
|
||||
assert(Delim != Input.end());
|
||||
llvm::Optional<llvm::sys::unicode::LooseMatchingResult> Res =
|
||||
llvm::sys::unicode::nameToCodepointLooseMatching(
|
||||
StringRef(I, std::distance(I, Delim)));
|
||||
assert(Res);
|
||||
CodePoint = Res->CodePoint;
|
||||
assert(CodePoint != 0xFFFFFFFF);
|
||||
appendCodePoint(CodePoint, Buf);
|
||||
I = Delim;
|
||||
continue;
|
||||
}
|
||||
|
||||
unsigned NumHexDigits;
|
||||
if (Kind == 'u')
|
||||
NumHexDigits = 4;
|
||||
@ -370,23 +389,20 @@ void clang::expandUCNs(SmallVectorImpl<char> &Buf, StringRef Input) {
|
||||
}
|
||||
}
|
||||
|
||||
/// ProcessUCNEscape - Read the Universal Character Name, check constraints and
|
||||
/// return the UTF32.
|
||||
static bool ProcessUCNEscape(const char *ThisTokBegin, const char *&ThisTokBuf,
|
||||
const char *ThisTokEnd,
|
||||
uint32_t &UcnVal, unsigned short &UcnLen,
|
||||
FullSourceLoc Loc, DiagnosticsEngine *Diags,
|
||||
const LangOptions &Features,
|
||||
bool in_char_string_literal = false) {
|
||||
static bool ProcessNumericUCNEscape(const char *ThisTokBegin,
|
||||
const char *&ThisTokBuf,
|
||||
const char *ThisTokEnd, uint32_t &UcnVal,
|
||||
unsigned short &UcnLen, bool &Delimited,
|
||||
FullSourceLoc Loc, DiagnosticsEngine *Diags,
|
||||
const LangOptions &Features,
|
||||
bool in_char_string_literal = false) {
|
||||
const char *UcnBegin = ThisTokBuf;
|
||||
bool HasError = false;
|
||||
bool EndDelimiterFound = false;
|
||||
|
||||
// Skip the '\u' char's.
|
||||
ThisTokBuf += 2;
|
||||
|
||||
bool Delimited = false;
|
||||
bool EndDelimiterFound = false;
|
||||
bool HasError = false;
|
||||
|
||||
Delimited = false;
|
||||
if (UcnBegin[1] == 'u' && in_char_string_literal &&
|
||||
ThisTokBuf != ThisTokEnd && *ThisTokBuf == '{') {
|
||||
Delimited = true;
|
||||
@ -394,7 +410,8 @@ static bool ProcessUCNEscape(const char *ThisTokBegin, const char *&ThisTokBuf,
|
||||
} else if (ThisTokBuf == ThisTokEnd || !isHexDigit(*ThisTokBuf)) {
|
||||
if (Diags)
|
||||
Diag(Diags, Features, Loc, ThisTokBegin, UcnBegin, ThisTokBuf,
|
||||
diag::err_hex_escape_no_digits) << StringRef(&ThisTokBuf[-1], 1);
|
||||
diag::err_hex_escape_no_digits)
|
||||
<< StringRef(&ThisTokBuf[-1], 1);
|
||||
return false;
|
||||
}
|
||||
UcnLen = (ThisTokBuf[-1] == 'u' ? 4 : 8);
|
||||
@ -455,7 +472,136 @@ static bool ProcessUCNEscape(const char *ThisTokBegin, const char *&ThisTokBuf,
|
||||
: diag::err_ucn_escape_incomplete);
|
||||
return false;
|
||||
}
|
||||
return !HasError;
|
||||
}
|
||||
|
||||
/// Report \p Name as an invalid Unicode character name and attach notes that
/// help the user fix it.
///
/// If \p Name matches a character under loose matching, a single note with a
/// fix-it to the canonical spelling is emitted. Otherwise up to five nearest
/// names are requested and offered as "did you mean" notes, stopping at the
/// first candidate whose distance differs from the previous one by more
/// than 3.
///
/// \p TokRangeBegin and \p TokRangeEnd delimit the range that both the
/// diagnostics and the fix-it replacements are attached to.
static void DiagnoseInvalidUnicodeCharacterName(
    DiagnosticsEngine *Diags, const LangOptions &Features, FullSourceLoc Loc,
    const char *TokBegin, const char *TokRangeBegin, const char *TokRangeEnd,
    llvm::StringRef Name) {
  namespace u = llvm::sys::unicode;

  Diag(Diags, Features, Loc, TokBegin, TokRangeBegin, TokRangeEnd,
       diag::err_invalid_ucn_name)
      << Name;

  // A loose match is the best suggestion available: offer it alone.
  if (llvm::Optional<u::LooseMatchingResult> Loose =
          u::nameToCodepointLooseMatching(Name)) {
    Diag(Diags, Features, Loc, TokBegin, TokRangeBegin, TokRangeEnd,
         diag::note_invalid_ucn_name_loose_matching)
        << FixItHint::CreateReplacement(
               MakeCharSourceRange(Features, Loc, TokBegin, TokRangeBegin,
                                   TokRangeEnd),
               Loose->Name);
    return;
  }

  SmallVector<u::MatchForCodepointName> Candidates =
      u::nearestMatchesForCodepointName(Name, 5);
  assert(!Candidates.empty() && "No unicode characters found");

  // Distance of the previously reported candidate; 0 until one is seen.
  unsigned PreviousDistance = 0;
  for (const auto &Candidate : Candidates) {
    if (PreviousDistance == 0)
      PreviousDistance = Candidate.Distance;

    const unsigned Gap = PreviousDistance > Candidate.Distance
                             ? PreviousDistance - Candidate.Distance
                             : Candidate.Distance - PreviousDistance;
    if (Gap > 3)
      break;
    PreviousDistance = Candidate.Distance;

    // Render the candidate character itself as UTF-8 for the note text.
    llvm::UTF32 CodeUnit = Candidate.Value;
    std::string Rendered;
    LLVM_ATTRIBUTE_UNUSED bool Converted = llvm::convertUTF32ToUTF8String(
        llvm::ArrayRef<llvm::UTF32>(&CodeUnit, 1), Rendered);
    assert(Converted && "Found a match wich is not a unicode character");

    Diag(Diags, Features, Loc, TokBegin, TokRangeBegin, TokRangeEnd,
         diag::note_invalid_ucn_name_candidate)
        << Candidate.Name << llvm::utohexstr(Candidate.Value)
        << Rendered // FIXME: Fix the rendering of non printable characters
        << FixItHint::CreateReplacement(
               MakeCharSourceRange(Features, Loc, TokBegin, TokRangeBegin,
                                   TokRangeEnd),
               Candidate.Name);
  }
}
|
||||
|
||||
static bool ProcessNamedUCNEscape(const char *ThisTokBegin,
|
||||
const char *&ThisTokBuf,
|
||||
const char *ThisTokEnd, uint32_t &UcnVal,
|
||||
unsigned short &UcnLen, FullSourceLoc Loc,
|
||||
DiagnosticsEngine *Diags,
|
||||
const LangOptions &Features) {
|
||||
const char *UcnBegin = ThisTokBuf;
|
||||
assert(UcnBegin[0] == '\\' && UcnBegin[1] == 'N');
|
||||
ThisTokBuf += 2;
|
||||
if (ThisTokBuf == ThisTokEnd || *ThisTokBuf != '{') {
|
||||
if (Diags) {
|
||||
Diag(Diags, Features, Loc, ThisTokBegin, UcnBegin, ThisTokBuf,
|
||||
diag::err_delimited_escape_missing_brace)
|
||||
<< StringRef(&ThisTokBuf[-1], 1);
|
||||
}
|
||||
ThisTokBuf++;
|
||||
return false;
|
||||
}
|
||||
ThisTokBuf++;
|
||||
const char *ClosingBrace =
|
||||
std::find_if_not(ThisTokBuf, ThisTokEnd, [](char C) {
|
||||
return llvm::isAlnum(C) || llvm::isSpace(C) || C == '_' || C == '-';
|
||||
});
|
||||
bool Incomplete = ClosingBrace == ThisTokEnd || *ClosingBrace != '}';
|
||||
bool Empty = ClosingBrace == ThisTokBuf;
|
||||
if (Incomplete || Empty) {
|
||||
if (Diags) {
|
||||
Diag(Diags, Features, Loc, ThisTokBegin, UcnBegin, ThisTokBuf,
|
||||
Incomplete ? diag::err_ucn_escape_incomplete
|
||||
: diag::err_delimited_escape_empty)
|
||||
<< StringRef(&UcnBegin[1], 1);
|
||||
}
|
||||
ThisTokBuf = ClosingBrace == ThisTokEnd ? ClosingBrace : ClosingBrace + 1;
|
||||
return false;
|
||||
}
|
||||
StringRef Name(ThisTokBuf, ClosingBrace - ThisTokBuf);
|
||||
ThisTokBuf = ClosingBrace + 1;
|
||||
llvm::Optional<char32_t> Res =
|
||||
llvm::sys::unicode::nameToCodepointStrict(Name);
|
||||
if (!Res) {
|
||||
if (Diags)
|
||||
DiagnoseInvalidUnicodeCharacterName(Diags, Features, Loc, ThisTokBegin,
|
||||
&UcnBegin[3], ClosingBrace, Name);
|
||||
return false;
|
||||
}
|
||||
UcnVal = *Res;
|
||||
UcnLen = UcnVal > 0xFFFF ? 8 : 4;
|
||||
return true;
|
||||
}
|
||||
|
||||
/// ProcessUCNEscape - Read the Universal Character Name, check constraints and
|
||||
/// return the UTF32.
|
||||
static bool ProcessUCNEscape(const char *ThisTokBegin, const char *&ThisTokBuf,
|
||||
const char *ThisTokEnd, uint32_t &UcnVal,
|
||||
unsigned short &UcnLen, FullSourceLoc Loc,
|
||||
DiagnosticsEngine *Diags,
|
||||
const LangOptions &Features,
|
||||
bool in_char_string_literal = false) {
|
||||
|
||||
bool HasError;
|
||||
const char *UcnBegin = ThisTokBuf;
|
||||
bool IsDelimitedEscapeSequence = false;
|
||||
bool IsNamedEscapeSequence = false;
|
||||
if (ThisTokBuf[1] == 'N') {
|
||||
IsNamedEscapeSequence = true;
|
||||
HasError = !ProcessNamedUCNEscape(ThisTokBegin, ThisTokBuf, ThisTokEnd,
|
||||
UcnVal, UcnLen, Loc, Diags, Features);
|
||||
} else {
|
||||
HasError =
|
||||
!ProcessNumericUCNEscape(ThisTokBegin, ThisTokBuf, ThisTokEnd, UcnVal,
|
||||
UcnLen, IsDelimitedEscapeSequence, Loc, Diags,
|
||||
Features, in_char_string_literal);
|
||||
}
|
||||
if (HasError)
|
||||
return false;
|
||||
|
||||
@ -493,9 +639,10 @@ static bool ProcessUCNEscape(const char *ThisTokBegin, const char *&ThisTokBuf,
|
||||
Diag(Diags, Features, Loc, ThisTokBegin, UcnBegin, ThisTokBuf,
|
||||
diag::warn_ucn_not_valid_in_c89_literal);
|
||||
|
||||
if (Delimited && Diags)
|
||||
if ((IsDelimitedEscapeSequence || IsNamedEscapeSequence) && Diags)
|
||||
Diag(Diags, Features, Loc, ThisTokBegin, UcnBegin, ThisTokBuf,
|
||||
diag::ext_delimited_escape_sequence);
|
||||
diag::ext_delimited_escape_sequence)
|
||||
<< (IsNamedEscapeSequence ? 1 : 0);
|
||||
|
||||
return true;
|
||||
}
|
||||
@ -1559,7 +1706,7 @@ CharLiteralParser::CharLiteralParser(const char *begin, const char *end,
|
||||
continue;
|
||||
}
|
||||
// Is this a Universal Character Name escape?
|
||||
if (begin[1] == 'u' || begin[1] == 'U') {
|
||||
if (begin[1] == 'u' || begin[1] == 'U' || begin[1] == 'N') {
|
||||
unsigned short UcnLen = 0;
|
||||
if (!ProcessUCNEscape(TokBegin, begin, end, *buffer_begin, UcnLen,
|
||||
FullSourceLoc(Loc, PP.getSourceManager()),
|
||||
@ -1919,7 +2066,8 @@ void StringLiteralParser::init(ArrayRef<Token> StringToks){
|
||||
continue;
|
||||
}
|
||||
// Is this a Universal Character Name escape?
|
||||
if (ThisTokBuf[1] == 'u' || ThisTokBuf[1] == 'U') {
|
||||
if (ThisTokBuf[1] == 'u' || ThisTokBuf[1] == 'U' ||
|
||||
ThisTokBuf[1] == 'N') {
|
||||
EncodeUCNEscape(ThisTokBegin, ThisTokBuf, ThisTokEnd,
|
||||
ResultPtr, hadError,
|
||||
FullSourceLoc(StringToks[i].getLocation(), SM),
|
||||
@ -2112,7 +2260,8 @@ unsigned StringLiteralParser::getOffsetOfStringByte(const Token &Tok,
|
||||
|
||||
// Otherwise, this is an escape character. Advance over it.
|
||||
bool HadError = false;
|
||||
if (SpellingPtr[1] == 'u' || SpellingPtr[1] == 'U') {
|
||||
if (SpellingPtr[1] == 'u' || SpellingPtr[1] == 'U' ||
|
||||
SpellingPtr[1] == 'N') {
|
||||
const char *EscapePtr = SpellingPtr;
|
||||
unsigned Len = MeasureUCNEscape(SpellingStart, SpellingPtr, SpellingEnd,
|
||||
1, Features, HadError);
|
||||
|
29
clang/test/FixIt/fixit-unicode-named-escape-sequences.c
Normal file
29
clang/test/FixIt/fixit-unicode-named-escape-sequences.c
Normal file
@ -0,0 +1,29 @@
|
||||
// RUN: not %clang_cc1 -fsyntax-only -fdiagnostics-parseable-fixits %s 2>&1 | FileCheck -check-prefix=CHECK-MACHINE %s
|
||||
const char*
|
||||
\N{GREEK_SMALL_LETTER-OMICRON} = // expected-error {{'GREEK_SMALL_LETTER-OMICRON' is not a valid Unicode character name}} \
|
||||
// expected-note {{sensitive to case and whitespaces}}
|
||||
// CHECK-MACHINE: fix-it:"{{.*}}":{[[@LINE-2]]:4-[[@LINE-2]]:30}:"GREEK SMALL LETTER OMICRON"
|
||||
|
||||
"\N{zero width no break space}" // expected-error {{'zero width no break space' is not a valid Unicode character name}} \
|
||||
// expected-note {{sensitive to case and whitespaces}}
|
||||
// CHECK-MACHINE: fix-it:"{{.*}}":{[[@LINE-2]]:5-[[@LINE-2]]:30}:"ZERO WIDTH NO-BREAK SPACE"
|
||||
|
||||
"abc\N{MAN IN A BUSINESS SUIT LEVITATING}" // expected-error {{'MAN IN A BUSINESS SUIT LEVITATING' is not a valid Unicode character name}} \
|
||||
// expected-note {{did you mean MAN IN BUSINESS SUIT LEVITATING ('🕴' U+1F574)?}}
|
||||
// CHECK-MACHINE: fix-it:"{{.*}}":{[[@LINE-2]]:8-[[@LINE-2]]:41}:"MAN IN BUSINESS SUIT LEVITATING"
|
||||
|
||||
"\N{AAA}" // expected-error {{'AAA' is not a valid Unicode character name}} \
|
||||
// expected-note 5{{did you mean}}
|
||||
// CHECK-MACHINE: fix-it:"{{.*}}":{[[@LINE-2]]:5-[[@LINE-2]]:8}:"ANT"
|
||||
// CHECK-MACHINE: fix-it:"{{.*}}":{[[@LINE-3]]:5-[[@LINE-3]]:8}:"ARC"
|
||||
// CHECK-MACHINE: fix-it:"{{.*}}":{[[@LINE-4]]:5-[[@LINE-4]]:8}:"AXE"
|
||||
// CHECK-MACHINE: fix-it:"{{.*}}":{[[@LINE-5]]:5-[[@LINE-5]]:8}:"BAT"
|
||||
// CHECK-MACHINE: fix-it:"{{.*}}":{[[@LINE-6]]:5-[[@LINE-6]]:8}:"CAT"
|
||||
|
||||
"\N{BLACKCHESSBISHOP}" // expected-error {{'BLACKCHESSBISHOP' is not a valid Unicode character name}} \
|
||||
// expected-note {{sensitive to case and whitespaces}}
|
||||
// CHECK-MACHINE: fix-it:"{{.*}}":{[[@LINE-2]]:5-[[@LINE-2]]:21}:"BLACK CHESS BISHOP"
|
||||
|
||||
;
|
||||
|
||||
|
@ -2,17 +2,20 @@
|
||||
// RUN: %clang_cc1 -x c -std=gnu11 -fsyntax-only -pedantic -verify %s
|
||||
// RUN: %clang_cc1 -x c++ -std=gnu++11 -fwchar-type=short -fno-signed-wchar -fsyntax-only -pedantic -verify %s
|
||||
// RUN: %clang_cc1 -x c -std=gnu11 -fwchar-type=short -fno-signed-wchar -fsyntax-only -pedantic -verify %s
|
||||
// RUN: %clang_cc1 -x c++ -std=c++17 -ftrigraphs -fsyntax-only -pedantic -verify -DTRIGRAPHS=1 %s
|
||||
|
||||
const char *errors =
|
||||
"\u{}" //expected-error {{delimited escape sequence cannot be empty}}
|
||||
"\u{" //expected-error {{expected '}'}}
|
||||
"\u{h}" //expected-error {{invalid digit 'h' in escape sequence}}
|
||||
"\x{}" //expected-error {{delimited escape sequence cannot be empty}}
|
||||
"\x{" //expected-error {{expected '}'}}
|
||||
"\x{h}" //expected-error {{invalid digit 'h' in escape sequence}}
|
||||
"\o{}" //expected-error {{delimited escape sequence cannot be empty}}
|
||||
"\o{" //expected-error {{expected '}'}}
|
||||
"\o{8}" //expected-error {{invalid digit '8' in escape sequence}}
|
||||
"\u{}" // expected-error {{delimited escape sequence cannot be empty}}
|
||||
"\u{" // expected-error {{expected '}'}}
|
||||
"\u{h}" // expected-error {{invalid digit 'h' in escape sequence}}
|
||||
"\x{}" // expected-error {{delimited escape sequence cannot be empty}}
|
||||
"\x{" // expected-error {{expected '}'}}
|
||||
"\x{h}" // expected-error {{invalid digit 'h' in escape sequence}}
|
||||
"\o{}" // expected-error {{delimited escape sequence cannot be empty}}
|
||||
"\o{" // expected-error {{expected '}'}}
|
||||
"\o" // expected-error {{expected '{' after '\o' escape sequence}}
|
||||
"\o{8}" // expected-error {{invalid digit '8' in escape sequence}}
|
||||
"\U{8}" // expected-error {{\U used with no following hex digits}}
|
||||
;
|
||||
|
||||
void ucn(void) {
|
||||
@ -70,6 +73,30 @@ void concat(void) {
|
||||
(void)"\o{12" "}"; // expected-error {{expected '}'}}
|
||||
}
|
||||
|
||||
void named(void) {
|
||||
char a = '\N{LOTUS}'; // expected-error{{character too large for enclosing character literal type}} \
|
||||
// expected-warning {{extension}}
|
||||
|
||||
char b = '\N{DOLLAR SIGN}'; // expected-warning {{extension}}
|
||||
char b_ = '\N{ DOL-LAR _SIGN }'; // expected-error {{' DOL-LAR _SIGN ' is not a valid Unicode character name}} \
|
||||
// expected-note {{characters names in Unicode escape sequences are sensitive to case and whitespaces}}
|
||||
|
||||
char c = '\N{NOTATHING}'; // expected-error {{'NOTATHING' is not a valid Unicode character name}} \
|
||||
// expected-note 5{{did you mean}}
|
||||
char d = '\N{}'; // expected-error {{delimited escape sequence cannot be empty}}
|
||||
char e = '\N{'; // expected-error {{incomplete universal character name}}
|
||||
|
||||
unsigned f = L'\N{GREEK CAPITAL LETTER DELTA}'; // expected-warning {{extension}}
|
||||
|
||||
unsigned g = u'\N{LOTUS}'; // expected-error {{character too large for enclosing character literal type}} \
|
||||
// expected-warning {{extension}}
|
||||
|
||||
unsigned h = U'\N{LOTUS}'; // expected-warning {{extension}}
|
||||
unsigned i = u'\N{GREEK CAPITAL LETTER DELTA}'; // expected-warning {{extension}}
|
||||
char j = '\NN'; // expected-error {{expected '{' after '\N' escape sequence}}
|
||||
unsigned k = u'\N{LOTUS'; // expected-error {{incomplete universal character name}}
|
||||
}
|
||||
|
||||
void separators(void) {
|
||||
(void)"\x{12'3}"; // expected-error {{invalid digit ''' in escape sequence}}
|
||||
(void)"\u{12'3}"; // expected-error {{invalid digit ''' in escape sequence}}
|
||||
@ -79,3 +106,12 @@ void separators(void) {
|
||||
// expected-error@-1 2{{expected ';'}}
|
||||
// expected-warning@-2 3{{expression result unused}}
|
||||
}
|
||||
|
||||
#if L'\N{GREEK CAPITAL LETTER GAMMA}' != L'Γ' // expected-warning {{extension}}
|
||||
#error "oh no!"
|
||||
#endif
|
||||
|
||||
#ifdef TRIGRAPHS
|
||||
static_assert('\N??<DOLLAR SIGN??>' == '$'); // expected-warning 2{{trigraph converted}} \
|
||||
// expected-warning {{named escape sequences are a Clang extension}}
|
||||
#endif
|
||||
|
@ -39,9 +39,14 @@ extern int 𐠈;
|
||||
extern int ꙮ;
|
||||
extern int \u1B4C; // BALINESE LETTER ARCHAIC JNYA - Added in Unicode 14
|
||||
extern int \U00016AA2; // TANGSA LETTER GA - Added in Unicode 14
|
||||
extern int _\N{TANGSA LETTER GA};
|
||||
extern int _\N{TANGSALETTERGA}; // expected-error {{'TANGSALETTERGA' is not a valid Unicode character name}} \
|
||||
// expected-note {{characters names in Unicode escape sequences are sensitive to case and whitespace}}
|
||||
|
||||
|
||||
|
||||
// This character doesn't have the XID_Start property
|
||||
extern int \U00016AC0; // TANGSA DIGIT ZERO // expected-error {{expected unqualified-id}}
|
||||
extern int _\U00016AC0; // TANGSA DIGIT ZERO
|
||||
|
||||
extern int 🌹; // expected-error {{unexpected character <U+1F339>}} \
|
||||
expected-warning {{declaration does not declare anything}}
|
||||
|
@ -131,6 +131,7 @@ int operator""_\u212e""_\U0000212e""_℮""(const char*, size_t);
|
||||
int operator""_\U0000212e""_℮""_\u212e""(const char*, size_t);
|
||||
|
||||
int operator""_\u{212f}(char);
|
||||
int operator""_\N{SCRIPT SMALL E}(char);
|
||||
|
||||
int mix_ucn_utf8 = ""_℮""_\u212e""_\U0000212e"";
|
||||
|
||||
|
@ -1,5 +1,6 @@
|
||||
// RUN: %clang_cc1 %s -fsyntax-only -std=c99 -pedantic -verify -Wundef
|
||||
// RUN: %clang_cc1 %s -fsyntax-only -x c++ -pedantic -verify -Wundef
|
||||
// RUN: %clang_cc1 %s -fsyntax-only -x c++ -pedantic -verify -Wundef -ftrigraphs -DTRIGRAPHS=1
|
||||
// RUN: not %clang_cc1 %s -fsyntax-only -std=c99 -pedantic -Wundef 2>&1 | FileCheck -strict-whitespace %s
|
||||
|
||||
#define \u00FC
|
||||
@ -29,9 +30,14 @@
|
||||
|
||||
// Make sure we reject disallowed UCNs
|
||||
#define \ufffe // expected-error {{macro name must be an identifier}}
|
||||
#define \U10000000 // expected-error {{macro name must be an identifier}}
|
||||
#define \u0061 // expected-error {{character 'a' cannot be specified by a universal character name}} expected-error {{macro name must be an identifier}}
|
||||
#define \u{fffe} // expected-error {{macro name must be an identifier}} expected-warning {{Clang extension}}
|
||||
#define \U10000000 // expected-error {{macro name must be an identifier}}
|
||||
#define \u0061 // expected-error {{character 'a' cannot be specified by a universal character name}} expected-error {{macro name must be an identifier}}
|
||||
#define \u{fffe} // expected-error {{macro name must be an identifier}} expected-warning {{Clang extension}}
|
||||
#define \N{ALERT} // expected-error {{universal character name refers to a control character}} \
|
||||
// expected-error {{macro name must be an identifier}} \
|
||||
// expected-warning {{Clang extension}}
|
||||
#define \N{WASTEBASKET} // expected-error {{macro name must be an identifier}} \
|
||||
// expected-warning {{Clang extension}}
|
||||
|
||||
#define a\u0024
|
||||
|
||||
@ -113,3 +119,20 @@ C 1
|
||||
#define \u{123456789} // expected-error {{hex escape sequence out of range}} expected-error {{macro name must be an identifier}}
|
||||
#define \u{ // expected-warning {{incomplete delimited universal character name; treating as '\' 'u' '{' identifier}} expected-error {{macro name must be an identifier}}
|
||||
#define \u{fgh} // expected-warning {{incomplete delimited universal character name; treating as '\' 'u' '{' identifier}} expected-error {{macro name must be an identifier}}
|
||||
#define \N{ // expected-warning {{incomplete delimited universal character name; treating as '\' 'N' '{' identifier}} expected-error {{macro name must be an identifier}}
|
||||
#define \N{} // expected-warning {{empty delimited universal character name; treating as '\' 'N' '{' '}'}} expected-error {{macro name must be an identifier}}
|
||||
#define \N{NOTATHING} // expected-error {{'NOTATHING' is not a valid Unicode character name}} \
|
||||
// expected-error {{macro name must be an identifier}}
|
||||
#define \NN // expected-warning {{incomplete universal character name; treating as '\' followed by identifier}} expected-error {{macro name must be an identifier}}
|
||||
#define \N{GREEK_SMALL-LETTERALPHA} // expected-error {{'GREEK_SMALL-LETTERALPHA' is not a valid Unicode character name}} \
|
||||
// expected-note {{characters names in Unicode escape sequences are sensitive to case and whitespaces}}
|
||||
|
||||
#define CONCAT(A, B) A##B
|
||||
int CONCAT(\N{GREEK, CAPITALLETTERALPHA}); // expected-error{{expected}} \
|
||||
// expected-warning {{incomplete delimited universal character name}}
|
||||
|
||||
#ifdef TRIGRAPHS
|
||||
int \N??<GREEK CAPITAL LETTER ALPHA??> = 0; // expected-warning{{amed escape sequences are a Clang extension}} \
|
||||
// expected-warning 2{{trigraph converted}}
|
||||
|
||||
#endif
|
||||
|
@ -18,6 +18,7 @@ void goodCalls(void) {
|
||||
über(2);
|
||||
\U000000FCber(3);
|
||||
\u{FC}ber(4); // expected-warning {{Clang extension}}
|
||||
\N{LATIN SMALL LETTER U WITH DIAERESIS}ber(4); // expected-warning {{Clang extension}}
|
||||
}
|
||||
|
||||
void badCalls(void) {
|
||||
|
@ -1107,6 +1107,7 @@ if( LLVM_INCLUDE_UTILS )
|
||||
add_subdirectory(utils/PerfectShuffle)
|
||||
add_subdirectory(utils/count)
|
||||
add_subdirectory(utils/not)
|
||||
add_subdirectory(utils/UnicodeData)
|
||||
add_subdirectory(utils/yaml-bench)
|
||||
else()
|
||||
if ( LLVM_INCLUDE_TESTS )
|
||||
|
@ -14,6 +14,10 @@
|
||||
#ifndef LLVM_SUPPORT_UNICODE_H
|
||||
#define LLVM_SUPPORT_UNICODE_H
|
||||
|
||||
#include "llvm/ADT/Optional.h"
|
||||
#include "llvm/ADT/SmallString.h"
|
||||
#include <string>
|
||||
|
||||
namespace llvm {
|
||||
class StringRef;
|
||||
|
||||
@ -63,6 +67,30 @@ int columnWidthUTF8(StringRef Text);
|
||||
/// rules.
|
||||
int foldCharSimple(int C);
|
||||
|
||||
/// Maps the name or the alias of a Unicode character to its associated
|
||||
/// codepoint.
|
||||
/// The names and aliases are derived from UnicodeData.txt and NameAliases.txt.
|
||||
/// For compatibility with the semantics of named character escape sequences in
|
||||
/// C++, this mapping does an exact match sensitive to casing and spacing.
|
||||
/// \return The codepoint of the corresponding character, if any.
|
||||
Optional<char32_t> nameToCodepointStrict(StringRef Name);
|
||||
|
||||
/// Result of a successful loose name lookup.
struct LooseMatchingResult {
|
||||
/// Codepoint the loosely matched name maps to.
char32_t CodePoint;
|
||||
/// Name of that character as rebuilt by the lookup, which callers can offer
/// as the exact spelling.
SmallString<64> Name;
|
||||
};
|
||||
|
||||
/// Maps the name or alias of a Unicode character to its codepoint using loose
/// matching (the implementation ignores case, spaces, underscores and medial
/// hyphens).
/// \return The codepoint and the rebuilt name, or None when nothing matches.
Optional<LooseMatchingResult> nameToCodepointLooseMatching(StringRef Name);
|
||||
|
||||
/// One candidate returned by nearestMatchesForCodepointName.
struct MatchForCodepointName {
|
||||
/// Full name of the candidate character.
std::string Name;
|
||||
/// Edit distance between the normalized pattern and the normalized name.
uint32_t Distance = 0;
|
||||
/// Codepoint of the candidate character.
char32_t Value = 0;
|
||||
};
|
||||
|
||||
/// Finds the character names closest to \p Pattern by edit distance.
/// \param Pattern name to look for; it is normalized (alphanumerics only,
/// upper-cased) before comparison.
/// \param MaxMatchesCount upper bound on the number of matches returned.
/// \return Matches ordered by increasing distance, then by name.
SmallVector<MatchForCodepointName>
|
||||
nearestMatchesForCodepointName(StringRef Pattern, std::size_t MaxMatchesCount);
|
||||
|
||||
} // namespace unicode
|
||||
} // namespace sys
|
||||
} // namespace llvm
|
||||
|
@ -221,6 +221,8 @@ add_llvm_component_library(LLVMSupport
|
||||
TypeSize.cpp
|
||||
Unicode.cpp
|
||||
UnicodeCaseFold.cpp
|
||||
UnicodeNameToCodepoint.cpp
|
||||
UnicodeNameToCodepointGenerated.cpp
|
||||
VersionTuple.cpp
|
||||
VirtualFileSystem.cpp
|
||||
WithColor.cpp
|
||||
|
551
llvm/lib/Support/UnicodeNameToCodepoint.cpp
Normal file
551
llvm/lib/Support/UnicodeNameToCodepoint.cpp
Normal file
@ -0,0 +1,551 @@
|
||||
//===- llvm/Support/UnicodeNameToCodepoint.cpp - Unicode character properties
|
||||
//-*- C++ -*-===//
|
||||
//
|
||||
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|
||||
// See https://llvm.org/LICENSE.txt for license information.
|
||||
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
//
|
||||
// This file implements functions to map the name or alias of a unicode
|
||||
// character to its codepoint.
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
#include "llvm/ADT/STLExtras.h"
|
||||
#include "llvm/ADT/StringExtras.h"
|
||||
#include "llvm/ADT/StringRef.h"
|
||||
#include "llvm/Support/Unicode.h"
|
||||
|
||||
namespace llvm {
|
||||
namespace sys {
|
||||
namespace unicode {
|
||||
|
||||
extern const char *UnicodeNameToCodepointDict;
|
||||
extern const uint8_t *UnicodeNameToCodepointIndex;
|
||||
extern const std::size_t UnicodeNameToCodepointIndexSize;
|
||||
extern const std::size_t UnicodeNameToCodepointLargestNameSize;
|
||||
|
||||
using BufferType = SmallString<64>;
|
||||
|
||||
struct Node {
|
||||
bool IsRoot = false;
|
||||
char32_t Value = 0xFFFFFFFF;
|
||||
uint32_t ChildrenOffset = 0;
|
||||
bool HasSibling = false;
|
||||
uint32_t Size = 0;
|
||||
StringRef Name;
|
||||
const Node *Parent = nullptr;
|
||||
|
||||
constexpr bool isValid() const {
|
||||
return !Name.empty() || Value == 0xFFFFFFFF;
|
||||
}
|
||||
constexpr bool hasChildren() const { return ChildrenOffset != 0 || IsRoot; }
|
||||
|
||||
std::string fullName() const {
|
||||
std::string S;
|
||||
// Reserve enough space for most unicode code points.
|
||||
// The chosen value represent the 99th percentile of name size as of
|
||||
// Unicode 14.
|
||||
S.reserve(46);
|
||||
const Node *N = this;
|
||||
while (N) {
|
||||
std::reverse_copy(N->Name.begin(), N->Name.end(), std::back_inserter(S));
|
||||
N = N->Parent;
|
||||
}
|
||||
std::reverse(S.begin(), S.end());
|
||||
return S;
|
||||
}
|
||||
};
|
||||
|
||||
/// Builds the synthetic root node: flagged as root, one index byte wide, with
/// its children starting at index offset 1.
static Node createRoot() {
  Node Root;
  Root.Size = 1;
  Root.ChildrenOffset = 1;
  Root.IsRoot = true;
  return Root;
}
|
||||
|
||||
/// Decodes the index entry located at \p Offset.
///
/// \param Offset byte offset into UnicodeNameToCodepointIndex; 0 designates
/// the synthetic root.
/// \param Parent node recorded as the parent of the decoded node.
/// \return The decoded node. When fewer than 8 bytes remain in the index from
/// \p Offset, a default node is returned (empty name, no value, Size 0) with
/// only Parent set.
static Node readNode(uint32_t Offset, const Node *Parent = nullptr) {
  if (Offset == 0)
    return createRoot();

  const uint32_t Origin = Offset;
  Node N;
  N.Parent = Parent;

  // Validate the offset *before* touching the index. This is the same
  // condition as checking `Offset + 6` after the first byte was consumed, but
  // it no longer reads out of bounds when Offset is past the end.
  if (std::size_t(Offset) + 7 >= UnicodeNameToCodepointIndexSize)
    return N;

  // Header byte: 0x80 = node carries a value, 0x40 = long name,
  // low 6 bits = size field.
  uint8_t NameInfo = UnicodeNameToCodepointIndex[Offset++];
  bool LongName = NameInfo & 0x40;
  bool HasValue = NameInfo & 0x80;
  std::size_t Size = NameInfo & ~0xC0;
  if (LongName) {
    // Two big-endian bytes give the dictionary offset; Size is the length.
    uint32_t NameOffset = (UnicodeNameToCodepointIndex[Offset++] << 8);
    NameOffset |= UnicodeNameToCodepointIndex[Offset++];
    N.Name = StringRef(UnicodeNameToCodepointDict + NameOffset, Size);
  } else {
    // Short form: the size field is itself the dictionary offset of a
    // one-character name.
    N.Name = StringRef(UnicodeNameToCodepointDict + Size, 1);
  }
  if (HasValue) {
    // Three bytes: the value sits above the 3 low bits; bit 1 of the last
    // byte flags children and bit 0 flags a sibling.
    uint8_t H = UnicodeNameToCodepointIndex[Offset++];
    uint8_t M = UnicodeNameToCodepointIndex[Offset++];
    uint8_t L = UnicodeNameToCodepointIndex[Offset++];
    N.Value = ((H << 16) | (M << 8) | L) >> 3;

    bool HasChildren = L & 0x02;
    N.HasSibling = L & 0x01;

    if (HasChildren) {
      // Children offset stored as three big-endian bytes.
      N.ChildrenOffset = UnicodeNameToCodepointIndex[Offset++] << 16;
      N.ChildrenOffset |= UnicodeNameToCodepointIndex[Offset++] << 8;
      N.ChildrenOffset |= UnicodeNameToCodepointIndex[Offset++];
    }
  } else {
    // One flag byte: 0x80 = sibling, 0x40 = children; its low 6 bits are the
    // top bits of the children offset, followed by two more bytes.
    uint8_t H = UnicodeNameToCodepointIndex[Offset++];
    N.HasSibling = H & 0x80;
    bool HasChildren = H & 0x40;
    H &= ~0xC0;
    if (HasChildren) {
      N.ChildrenOffset = (H << 16);
      N.ChildrenOffset |=
          (uint32_t(UnicodeNameToCodepointIndex[Offset++]) << 8);
      N.ChildrenOffset |= UnicodeNameToCodepointIndex[Offset++];
    }
  }
  N.Size = Offset - Origin;
  return N;
}
|
||||
|
||||
/// Tests whether \p Name begins with \p Needle.
///
/// In strict mode this is an exact prefix comparison. Otherwise the comparison
/// is case-insensitive and skips spaces, underscores and medial hyphens on
/// both sides (https://unicode.org/reports/tr44/#UAX44-LM2).
///
/// \param Consummed receives the number of characters of \p Name consumed.
/// \param PreviousCharInName last character examined in the name so far; used
/// to decide whether a hyphen is medial. Updated on success, restored on
/// failure.
/// \param PreviousCharInNeedle same as above, for the needle.
/// \param IsPrefix when true, a hyphen ending \p Needle (and following an
/// alphanumeric character) is also skipped.
/// \return true when all of \p Needle was matched.
static bool startsWith(StringRef Name, StringRef Needle, bool Strict,
                       std::size_t &Consummed, char &PreviousCharInName,
                       char &PreviousCharInNeedle, bool IsPrefix = false) {
  Consummed = 0;
  if (Strict) {
    const bool IsExactPrefix = Name.startswith(Needle);
    if (IsExactPrefix)
      Consummed = Needle.size();
    return IsExactPrefix;
  }
  if (Needle.empty())
    return true;

  // Remember the incoming context so that a failed match leaves it untouched.
  const char SavedNamePrev = PreviousCharInName;
  const char SavedNeedlePrev = PreviousCharInNeedle;

  // Advances past ignorable characters and returns the first position that
  // has to be compared. Prev always ends up holding the last character
  // looked at, including the one that stopped the scan.
  auto SkipIgnorable = [](const char *Cur, const char *End, char &Prev,
                          bool TrailingHyphenIsMedial = false) {
    for (; Cur != End; ++Cur) {
      const char C = *Cur;
      const char *Following = Cur + 1;
      bool Skippable = C == ' ' || C == '_';
      if (!Skippable && C == '-' && isAlnum(Prev))
        Skippable =
            Following != End ? isAlnum(*Following) : TrailingHyphenIsMedial;
      Prev = C;
      if (!Skippable)
        break;
    }
    return Cur;
  };

  const char *NameIt = Name.begin();
  const char *NeedleIt = Needle.begin();
  for (;;) {
    NameIt = SkipIgnorable(NameIt, Name.end(), PreviousCharInName);
    NeedleIt =
        SkipIgnorable(NeedleIt, Needle.end(), PreviousCharInNeedle, IsPrefix);
    if (NeedleIt == Needle.end() || NameIt == Name.end() ||
        toUpper(*NeedleIt) != toUpper(*NameIt))
      break;
    ++NeedleIt;
    ++NameIt;
  }

  Consummed = std::distance(Name.begin(), NameIt);
  const bool NeedleExhausted = NeedleIt == Needle.end();
  if (!NeedleExhausted) {
    PreviousCharInName = SavedNamePrev;
    PreviousCharInNeedle = SavedNeedlePrev;
  }
  return NeedleExhausted;
}
|
||||
|
||||
static std::tuple<Node, bool, uint32_t>
|
||||
compareNode(uint32_t Offset, StringRef Name, bool Strict,
|
||||
char PreviousCharInName, char PreviousCharInNeedle,
|
||||
BufferType &Buffer, const Node *Parent = nullptr) {
|
||||
Node N = readNode(Offset, Parent);
|
||||
std::size_t Consummed = 0;
|
||||
bool DoesStartWith =
|
||||
N.IsRoot || startsWith(Name, N.Name, Strict, Consummed,
|
||||
PreviousCharInName, PreviousCharInNeedle);
|
||||
if (!DoesStartWith)
|
||||
return {N, false, 0};
|
||||
|
||||
if (Name.size() - Consummed == 0 && N.Value != 0xFFFFFFFF)
|
||||
return {N, true, N.Value};
|
||||
|
||||
if (N.hasChildren()) {
|
||||
uint32_t ChildOffset = N.ChildrenOffset;
|
||||
for (;;) {
|
||||
Node C;
|
||||
bool Matches;
|
||||
uint32_t Value;
|
||||
std::tie(C, Matches, Value) =
|
||||
compareNode(ChildOffset, Name.substr(Consummed), Strict,
|
||||
PreviousCharInName, PreviousCharInNeedle, Buffer, &N);
|
||||
if (Matches) {
|
||||
std::reverse_copy(C.Name.begin(), C.Name.end(),
|
||||
std::back_inserter(Buffer));
|
||||
return {N, true, Value};
|
||||
}
|
||||
ChildOffset += C.Size;
|
||||
if (!C.HasSibling)
|
||||
break;
|
||||
}
|
||||
}
|
||||
return {N, false, 0};
|
||||
}
|
||||
|
||||
static std::tuple<Node, bool, uint32_t>
|
||||
compareNode(uint32_t Offset, StringRef Name, bool Strict, BufferType &Buffer) {
|
||||
return compareNode(Offset, Name, Strict, 0, 0, Buffer);
|
||||
}
|
||||
|
||||
// Name fragments used to compose and decompose Hangul syllable names.
// Each column is an independent list: column 0 holds the LCount "L" parts,
// column 1 the VCount "V" parts and column 2 the TCount "T" parts. Columns
// shorter than the table are padded with nullptr; these padding entries must
// never be read.
// clang-format off
constexpr const char *const HangulSyllables[][3] = {
    { "G",     "A",     ""   },
    { "GG",    "AE",    "G"  },
    { "N",     "YA",    "GG" },
    { "D",     "YAE",   "GS" },
    { "DD",    "EO",    "N", },
    { "R",     "E",     "NJ" },
    { "M",     "YEO",   "NH" },
    { "B",     "YE",    "D"  },
    { "BB",    "O",     "L"  },
    { "S",     "WA",    "LG" },
    { "SS",    "WAE",   "LM" },
    { "",      "OE",    "LB" },
    { "J",     "YO",    "LS" },
    { "JJ",    "U",     "LT" },
    { "C",     "WEO",   "LP" },
    { "K",     "WE",    "LH" },
    { "T",     "WI",    "M"  },
    { "P",     "YU",    "B"  },
    { "H",     "EU",    "BS" },
    { nullptr, "YI",    "S"  },
    { nullptr, "I",     "SS" },
    { nullptr, nullptr, "NG" },
    { nullptr, nullptr, "J"  },
    { nullptr, nullptr, "C"  },
    { nullptr, nullptr, "K"  },
    { nullptr, nullptr, "T"  },
    { nullptr, nullptr, "P"  },
    { nullptr, nullptr, "H"  }
};
// clang-format on

// Unicode 14.0
// 3.12 Conjoining Jamo Behavior Common constants
constexpr char32_t SBase = 0xAC00;
constexpr uint32_t LCount = 19;
constexpr uint32_t VCount = 21;
constexpr uint32_t TCount = 28;

// The table must provide exactly one row per entry of its longest column.
static_assert(sizeof(HangulSyllables) / sizeof(HangulSyllables[0]) == TCount,
              "HangulSyllables must have TCount rows");
static_assert(LCount <= TCount && VCount <= TCount,
              "every column must fit in the table");
|
||||
|
||||
/// Finds the entry of column \p Column of HangulSyllables that matches the
/// start of \p Name, preferring the longer entry when several match.
///
/// \param PreviousInName hyphen-matching context for the name (see
/// startsWith); updated only when an entry matched.
/// \param Pos receives the row of the selected entry; untouched otherwise.
/// \param Column 0, 1 or 2.
/// \return Number of characters of \p Name consumed by the selected entry, or
/// 0 when no entry matched.
static std::size_t findSyllable(StringRef Name, bool Strict,
                                char &PreviousInName, int &Pos, int Column) {
  assert(Column == 0 || Column == 1 || Column == 2);
  static std::size_t CountPerColumn[] = {LCount, VCount, TCount};
  const std::size_t RowCount = CountPerColumn[Column];

  int BestLen = -1;
  int BestPrev = PreviousInName;
  // Deliberately shared by all attempts, like the name context it is only
  // rolled back by startsWith() when an attempt fails.
  char NeedlePrev = 0;
  for (std::size_t Row = 0; Row < RowCount; ++Row) {
    const StringRef Candidate(HangulSyllables[Row][Column]);
    // Only candidates longer than what was already consumed can improve the
    // current selection.
    if (int(Candidate.size()) > BestLen) {
      std::size_t Used = 0;
      char NamePrev = PreviousInName;
      if (startsWith(Name, Candidate, Strict, Used, NamePrev, NeedlePrev)) {
        BestLen = Used;
        Pos = Row;
        BestPrev = NamePrev;
      }
    }
  }
  if (BestLen == -1)
    return 0;
  PreviousInName = BestPrev;
  return size_t(BestLen);
}
|
||||
|
||||
/// Maps a "HANGUL SYLLABLE xxx" name to its code point by decomposing the
/// syllable into its three parts (Unicode 3.12, Hangul syllable names).
///
/// \param Buffer always cleared; in loose mode it receives the rebuilt name
/// on success.
/// \return The code point, or None when \p Name is not a Hangul syllable
/// name.
static llvm::Optional<char32_t>
nameToHangulCodePoint(StringRef Name, bool Strict, BufferType &Buffer) {
  Buffer.clear();
  // Hangul Syllable Decomposition
  std::size_t Consummed = 0;
  char NameStart = 0, NeedleStart = 0;
  if (!startsWith(Name, "HANGUL SYLLABLE ", Strict, Consummed, NameStart,
                  NeedleStart))
    return None;
  Name = Name.substr(Consummed);

  // Peel off the three parts in order; each call records the selected table
  // row and drops what it consumed.
  int L = -1, V = -1, T = -1;
  Name = Name.substr(findSyllable(Name, Strict, NameStart, L, 0));
  Name = Name.substr(findSyllable(Name, Strict, NameStart, V, 1));
  Name = Name.substr(findSyllable(Name, Strict, NameStart, T, 2));

  // Every part must have been identified and nothing may be left over;
  // otherwise it's an illegal syllable name.
  if (L == -1 || V == -1 || T == -1 || !Name.empty())
    return None;

  if (!Strict) {
    Buffer.append("HANGUL SYLLABLE ");
    Buffer.append(HangulSyllables[L][0]);
    Buffer.append(HangulSyllables[V][1]);
    Buffer.append(HangulSyllables[T][2]);
  }
  return SBase + (std::uint32_t(L) * VCount + std::uint32_t(V)) * TCount +
         std::uint32_t(T);
}
|
||||
|
||||
struct GeneratedNamesData {
|
||||
StringRef Prefix;
|
||||
uint32_t Start;
|
||||
uint32_t End;
|
||||
};
|
||||
|
||||
// Unicode 14.0 Table 4-8. Name Derivation Rule Prefix Strings
|
||||
// This needs to be kept in sync with
|
||||
// llvm/utils/UnicodeData/UnicodeNameMappingGenerator.cpp
|
||||
static const GeneratedNamesData GeneratedNamesDataTable[] = {
|
||||
{"CJK UNIFIED IDEOGRAPH-", 0x3400, 0x4DBF},
|
||||
{"CJK UNIFIED IDEOGRAPH-", 0x4E00, 0x9FFC},
|
||||
{"CJK UNIFIED IDEOGRAPH-", 0x20000, 0x2A6DD},
|
||||
{"CJK UNIFIED IDEOGRAPH-", 0x2A700, 0x2B734},
|
||||
{"CJK UNIFIED IDEOGRAPH-", 0x2B740, 0x2B81D},
|
||||
{"CJK UNIFIED IDEOGRAPH-", 0x2B820, 0x2CEA1},
|
||||
{"CJK UNIFIED IDEOGRAPH-", 0x2CEB0, 0x2EBE0},
|
||||
{"CJK UNIFIED IDEOGRAPH-", 0x30000, 0x3134A},
|
||||
{"TANGUT IDEOGRAPH-", 0x17000, 0x187F7},
|
||||
{"TANGUT IDEOGRAPH-", 0x18D00, 0x18D08},
|
||||
{"KHITAN SMALL SCRIPT CHARACTER-", 0x18B00, 0x18CD5},
|
||||
{"NUSHU CHARACTER-", 0x1B170, 0x1B2FB},
|
||||
{"CJK COMPATIBILITY IDEOGRAPH-", 0xF900, 0xFA6D},
|
||||
{"CJK COMPATIBILITY IDEOGRAPH-", 0xFA70, 0xFAD9},
|
||||
{"CJK COMPATIBILITY IDEOGRAPH-", 0x2F800, 0x2FA1D},
|
||||
};
|
||||
|
||||
/// Maps a name derived from its code point value ("<prefix><hex digits>") to
/// that code point, using GeneratedNamesDataTable.
///
/// \param Buffer cleared for every table entry tried; in loose mode it
/// receives the exact name on success.
/// \return The code point, or None when \p Name is not such a name.
static llvm::Optional<char32_t>
nameToGeneratedCodePoint(StringRef Name, bool Strict, BufferType &Buffer) {
  for (auto &&Item : GeneratedNamesDataTable) {
    Buffer.clear();
    std::size_t Consummed = 0;
    char NameStart = 0, NeedleStart = 0;
    bool DoesStartWith = startsWith(Name, Item.Prefix, Strict, Consummed,
                                    NameStart, NeedleStart, /*IsPrefix=*/true);
    if (!DoesStartWith)
      continue;
    auto Number = Name.substr(Consummed);
    unsigned long long V = 0;
    // A strict match must be the exact name: upper-case hexadecimal digits
    // and no leading zero (every range of the table starts at or above
    // 0x3400, so a valid number never begins with '0').
    if (Strict &&
        (Number.startswith("0") ||
         llvm::any_of(Number, [](char C) { return C >= 'a' && C <= 'f'; })))
      return None;
    if (getAsUnsignedInteger(Number, 16, V) || V < Item.Start || V > Item.End)
      continue;
    if (!Strict) {
      Buffer.append(Item.Prefix);
      // The second argument of utohexstr() selects *lower* case; the name
      // handed back to the caller must use upper-case digits, otherwise it
      // would itself be rejected by strict matching.
      Buffer.append(utohexstr(V, /*LowerCase=*/false));
    }
    return V;
  }
  return None;
}
|
||||
|
||||
/// Shared implementation of the strict and loose lookups: tries Hangul
/// syllable names, then names derived from the code point, then the name
/// index.
///
/// \param Buffer scratch storage; after a successful lookup in the name index
/// it holds the name rebuilt from the matched nodes.
/// \return The code point, or None when nothing matches or \p Name is empty.
static llvm::Optional<char32_t> nameToCodepoint(StringRef Name, bool Strict,
                                                BufferType &Buffer) {
  if (Name.empty())
    return None;

  if (llvm::Optional<char32_t> Hangul =
          nameToHangulCodePoint(Name, Strict, Buffer))
    return *Hangul;
  if (llvm::Optional<char32_t> Generated =
          nameToGeneratedCodePoint(Name, Strict, Buffer))
    return *Generated;

  Buffer.clear();
  const std::tuple<Node, bool, uint32_t> Lookup =
      compareNode(0, Name, Strict, Buffer);
  if (!std::get<1>(Lookup))
    return None;

  uint32_t Value = std::get<2>(Lookup);
  // compareNode() collected the name back to front.
  std::reverse(Buffer.begin(), Buffer.end());
  // UAX44-LM2. Ignore case, whitespace, underscore ('_'), and all medial
  // hyphens except the hyphen in U+1180 HANGUL JUNGSEONG O-E.
  const bool IsHyphenatedOE = !Strict && Value == 0x116c &&
                              Name.find_insensitive("O-E") != StringRef::npos;
  if (IsHyphenatedOE) {
    Buffer = "HANGUL JUNGSEONG O-E";
    Value = 0x1180;
  }
  return Value;
}
|
||||
|
||||
/// Exact lookup of \p Name; the scratch buffer the shared implementation
/// needs is discarded.
llvm::Optional<char32_t> nameToCodepointStrict(StringRef Name) {
  BufferType Scratch;
  return nameToCodepoint(Name, /*Strict=*/true, Scratch);
}
|
||||
|
||||
/// Loose lookup of \p Name; on success also hands back the name rebuilt by
/// the lookup.
llvm::Optional<LooseMatchingResult>
nameToCodepointLooseMatching(StringRef Name) {
  BufferType RebuiltName;
  if (auto CodePoint = nameToCodepoint(Name, /*Strict=*/false, RebuiltName))
    return LooseMatchingResult{*CodePoint, RebuiltName};
  return None;
}
|
||||
|
||||
// Find the unicode character whose editing distance to Pattern
|
||||
// is shortest, using the Wagner–Fischer algorithm.
|
||||
llvm::SmallVector<MatchForCodepointName>
|
||||
nearestMatchesForCodepointName(StringRef Pattern, std::size_t MaxMatchesCount) {
|
||||
// We maintain a fixed size vector of matches,
|
||||
// sorted by distance
|
||||
// The worst match (with the biggest distance) are discarded when new elements
|
||||
// are added.
|
||||
std::size_t LargestEditDistance = 0;
|
||||
llvm::SmallVector<MatchForCodepointName> Matches;
|
||||
Matches.reserve(MaxMatchesCount + 1);
|
||||
|
||||
auto Insert = [&](const Node &Node, uint32_t Distance,
|
||||
char32_t Value) -> bool {
|
||||
if (Distance > LargestEditDistance) {
|
||||
if (Matches.size() == MaxMatchesCount)
|
||||
return false;
|
||||
LargestEditDistance = Distance;
|
||||
}
|
||||
// To avoid allocations, the creation of the name is delayed
|
||||
// as much as possible.
|
||||
std::string Name;
|
||||
auto GetName = [&] {
|
||||
if (Name.empty())
|
||||
Name = Node.fullName();
|
||||
return Name;
|
||||
};
|
||||
|
||||
auto It = std::lower_bound(
|
||||
Matches.begin(), Matches.end(), Distance,
|
||||
[&](const MatchForCodepointName &a, std::size_t Distance) {
|
||||
if (Distance == a.Distance)
|
||||
return a.Name < GetName();
|
||||
return a.Distance < Distance;
|
||||
});
|
||||
if (It == Matches.end() && Matches.size() == MaxMatchesCount)
|
||||
return false;
|
||||
|
||||
MatchForCodepointName M{GetName(), Distance, Value};
|
||||
Matches.insert(It, std::move(M));
|
||||
if (Matches.size() > MaxMatchesCount)
|
||||
Matches.pop_back();
|
||||
return true;
|
||||
};
|
||||
|
||||
// We ignore case, space, hyphens, etc,
|
||||
// in both the search pattern and the prospective names.
|
||||
auto Normalize = [](StringRef Name) {
|
||||
std::string Out;
|
||||
Out.reserve(Name.size());
|
||||
for (char C : Name) {
|
||||
if (isAlnum(C))
|
||||
Out.push_back(toUpper(C));
|
||||
}
|
||||
return Out;
|
||||
};
|
||||
std::string NormalizedName = Normalize(Pattern);
|
||||
|
||||
// Allocate a matrix big enough for longest names.
|
||||
const std::size_t Columns =
|
||||
std::min(NormalizedName.size(), UnicodeNameToCodepointLargestNameSize) +
|
||||
1;
|
||||
|
||||
LLVM_ATTRIBUTE_UNUSED static std::size_t Rows =
|
||||
UnicodeNameToCodepointLargestNameSize + 1;
|
||||
|
||||
std::vector<char> Distances(
|
||||
Columns * (UnicodeNameToCodepointLargestNameSize + 1), 0);
|
||||
|
||||
auto Get = [&Distances, Columns](size_t Column, std::size_t Row) -> char & {
|
||||
assert(Column < Columns);
|
||||
assert(Row < Rows);
|
||||
return Distances[Row * Columns + Column];
|
||||
};
|
||||
|
||||
for (std::size_t I = 0; I < Columns; I++)
|
||||
Get(I, 0) = I;
|
||||
|
||||
// Visit the childrens,
|
||||
// Filling (and overriding) the matrix for the name fragment of each node
|
||||
// iteratively. CompleteName is used to collect the actual name of potential
|
||||
// match, respecting case and spacing.
|
||||
auto VisitNode = [&](const Node &N, std::size_t Row,
|
||||
auto &VisitNode) -> void {
|
||||
std::size_t J = 0;
|
||||
for (; J < N.Name.size(); J++) {
|
||||
if (!isAlnum(N.Name[J]))
|
||||
continue;
|
||||
|
||||
Get(0, Row) = Row;
|
||||
|
||||
for (std::size_t I = 1; I < Columns; I++) {
|
||||
const int Delete = Get(I - 1, Row) + 1;
|
||||
const int Insert = Get(I, Row - 1) + 1;
|
||||
|
||||
const int Replace =
|
||||
Get(I - 1, Row - 1) + (NormalizedName[I - 1] != N.Name[J] ? 1 : 0);
|
||||
|
||||
Get(I, Row) = std::min(Insert, std::min(Delete, Replace));
|
||||
}
|
||||
|
||||
Row++;
|
||||
}
|
||||
|
||||
unsigned Cost = Get(Columns - 1, Row - 1);
|
||||
if (N.Value != 0xFFFFFFFF) {
|
||||
Insert(N, Cost, N.Value);
|
||||
}
|
||||
|
||||
if (N.hasChildren()) {
|
||||
auto ChildOffset = N.ChildrenOffset;
|
||||
for (;;) {
|
||||
Node C = readNode(ChildOffset, &N);
|
||||
ChildOffset += C.Size;
|
||||
if (!C.isValid())
|
||||
break;
|
||||
VisitNode(C, Row, VisitNode);
|
||||
if (!C.HasSibling)
|
||||
break;
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
Node Root = createRoot();
|
||||
VisitNode(Root, 1, VisitNode);
|
||||
return Matches;
|
||||
}
|
||||
|
||||
} // namespace unicode
|
||||
|
||||
} // namespace sys
|
||||
} // namespace llvm
|
20911
llvm/lib/Support/UnicodeNameToCodepointGenerated.cpp
Normal file
20911
llvm/lib/Support/UnicodeNameToCodepointGenerated.cpp
Normal file
File diff suppressed because it is too large
Load Diff
@ -7,7 +7,10 @@
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
#include "llvm/Support/Unicode.h"
|
||||
#include "llvm/ADT/StringExtras.h"
|
||||
#include "llvm/ADT/edit_distance.h"
|
||||
#include "llvm/Support/ConvertUTF.h"
|
||||
#include "gmock/gmock.h"
|
||||
#include "gtest/gtest.h"
|
||||
|
||||
namespace llvm {
|
||||
@ -101,6 +104,318 @@ TEST(Unicode, isPrintable) {
|
||||
}
|
||||
}
|
||||
|
||||
// Checks the strict lookup: exact names and aliases map to their code point,
// anything else (including loosely equivalent spellings) maps to nothing.
TEST(Unicode, nameToCodepointStrict) {
  // 0xFFFFFFFF stands for "no match".
  auto map = [](StringRef Str) {
    return nameToCodepointStrict(Str).getValueOr(0xFFFF'FFFF);
  };

  // generated codepoints
  EXPECT_EQ(0x03400u, map("CJK UNIFIED IDEOGRAPH-3400"));
  EXPECT_EQ(0x04DBFu, map("CJK UNIFIED IDEOGRAPH-4DBF"));
  EXPECT_EQ(0x04E00u, map("CJK UNIFIED IDEOGRAPH-4E00"));
  EXPECT_EQ(0x09FFCu, map("CJK UNIFIED IDEOGRAPH-9FFC"));
  EXPECT_EQ(0x20000u, map("CJK UNIFIED IDEOGRAPH-20000"));
  EXPECT_EQ(0x2A6DDu, map("CJK UNIFIED IDEOGRAPH-2A6DD"));
  EXPECT_EQ(0x2A700u, map("CJK UNIFIED IDEOGRAPH-2A700"));
  EXPECT_EQ(0x2B740u, map("CJK UNIFIED IDEOGRAPH-2B740"));
  EXPECT_EQ(0x2B81Du, map("CJK UNIFIED IDEOGRAPH-2B81D"));
  EXPECT_EQ(0x2B820u, map("CJK UNIFIED IDEOGRAPH-2B820"));
  EXPECT_EQ(0x2CEA1u, map("CJK UNIFIED IDEOGRAPH-2CEA1"));
  EXPECT_EQ(0x2CEB0u, map("CJK UNIFIED IDEOGRAPH-2CEB0"));
  EXPECT_EQ(0x2EBE0u, map("CJK UNIFIED IDEOGRAPH-2EBE0"));
  EXPECT_EQ(0x30000u, map("CJK UNIFIED IDEOGRAPH-30000"));
  EXPECT_EQ(0x3134Au, map("CJK UNIFIED IDEOGRAPH-3134A"));
  EXPECT_EQ(0x17000u, map("TANGUT IDEOGRAPH-17000"));
  EXPECT_EQ(0x187F7u, map("TANGUT IDEOGRAPH-187F7"));
  EXPECT_EQ(0x18D00u, map("TANGUT IDEOGRAPH-18D00"));
  EXPECT_EQ(0x18D08u, map("TANGUT IDEOGRAPH-18D08"));
  EXPECT_EQ(0x18B00u, map("KHITAN SMALL SCRIPT CHARACTER-18B00"));
  EXPECT_EQ(0x18CD5u, map("KHITAN SMALL SCRIPT CHARACTER-18CD5"));
  EXPECT_EQ(0x1B170u, map("NUSHU CHARACTER-1B170"));
  EXPECT_EQ(0x1B2FBu, map("NUSHU CHARACTER-1B2FB"));
  EXPECT_EQ(0x0F900u, map("CJK COMPATIBILITY IDEOGRAPH-F900"));
  EXPECT_EQ(0x0FA6Du, map("CJK COMPATIBILITY IDEOGRAPH-FA6D"));
  EXPECT_EQ(0x0FA70u, map("CJK COMPATIBILITY IDEOGRAPH-FA70"));
  EXPECT_EQ(0x0FAD9u, map("CJK COMPATIBILITY IDEOGRAPH-FAD9"));
  EXPECT_EQ(0x2F800u, map("CJK COMPATIBILITY IDEOGRAPH-2F800"));
  EXPECT_EQ(0x2FA1Du, map("CJK COMPATIBILITY IDEOGRAPH-2FA1D"));

  // Hangul syllables
  EXPECT_EQ(0xAC00u, map("HANGUL SYLLABLE GA"));
  EXPECT_EQ(0xAC14u, map("HANGUL SYLLABLE GASS"));
  EXPECT_EQ(0xAC2Bu, map("HANGUL SYLLABLE GAELH"));
  EXPECT_EQ(0xAC7Bu, map("HANGUL SYLLABLE GEOLB"));
  EXPECT_EQ(0xC640u, map("HANGUL SYLLABLE WA"));
  EXPECT_EQ(0xC544u, map("HANGUL SYLLABLE A"));
  EXPECT_EQ(0xC5D0u, map("HANGUL SYLLABLE E"));
  EXPECT_EQ(0xC774u, map("HANGUL SYLLABLE I"));

  EXPECT_EQ(0x1F984u, map("UNICORN FACE"));
  EXPECT_EQ(0x00640u, map("ARABIC TATWEEL"));
  EXPECT_EQ(0x02C05u, map("GLAGOLITIC CAPITAL LETTER YESTU"));
  EXPECT_EQ(0x13000u, map("EGYPTIAN HIEROGLYPH A001"));
  EXPECT_EQ(0x02235u, map("BECAUSE"));
  EXPECT_EQ(0x1F514u, map("BELL"));
  EXPECT_EQ(0x1F9A9u, map("FLAMINGO"));
  EXPECT_EQ(0x1F402u, map("OX")); // 2 characters
  EXPECT_EQ(0x0FBF9u, map("ARABIC LIGATURE UIGHUR KIRGHIZ YEH WITH HAMZA "
                          "ABOVE WITH ALEF MAKSURA ISOLATED FORM"));

  // Aliases
  EXPECT_EQ(0x0000u, map("NULL"));
  EXPECT_EQ(0x0007u, map("ALERT"));
  EXPECT_EQ(0x0009u, map("HORIZONTAL TABULATION"));
  EXPECT_EQ(0x0009u, map("CHARACTER TABULATION"));
  EXPECT_EQ(0x000Au, map("LINE FEED"));
  EXPECT_EQ(0x000Au, map("NEW LINE"));
  EXPECT_EQ(0x0089u, map("CHARACTER TABULATION WITH JUSTIFICATION"));
  EXPECT_EQ(0x0089u, map("HORIZONTAL TABULATION WITH JUSTIFICATION"));
  EXPECT_EQ(0x2118u,
            map("WEIERSTRASS ELLIPTIC FUNCTION")); // correction
  EXPECT_EQ(0x2118u, map("SCRIPT CAPITAL P"));     // correction
  EXPECT_EQ(0xFEFFu, map("BYTE ORDER MARK"));      // alternate
  EXPECT_EQ(0xFEFFu, map("ZERO WIDTH NO-BREAK SPACE")); // alternate

  // Only exact spellings (case, spacing, hyphens) may match
  EXPECT_EQ(0xFFFFFFFFu, map(""));
  EXPECT_EQ(0xFFFFFFFFu, map("NOT A UNICODE CHARACTER"));
  EXPECT_EQ(0xFFFFFFFFu, map("unicorn face"));
  EXPECT_EQ(0xFFFFFFFFu, map("UNICORN FaCE"));
  EXPECT_EQ(0xFFFFFFFFu, map("UNICORNFaCE"));
  EXPECT_EQ(0xFFFFFFFFu, map("UNICORN"));
  EXPECT_EQ(0xFFFFFFFFu, map("HANGUL SYLLABLE i"));
  EXPECT_EQ(0xFFFFFFFFu, map("hANGUL SYLLABLE i"));
  EXPECT_EQ(0xFFFFFFFFu, map("HANGULSYLLABLEI"));
  EXPECT_EQ(0xFFFFFFFFu, map("HANGUL SYLLABLE"));
  EXPECT_EQ(0xFFFFFFFFu, map("cJK COMPATIBILITY IDEOGRAPH-2FA1D"));
  EXPECT_EQ(0xFFFFFFFFu, map("CJK COMPATIBILITY IDEOGRAPH-2FA1d"));
  EXPECT_EQ(0xFFFFFFFFu, map("CJK COMPATIBILITY IDEOGRAPH 2FA1D"));
  EXPECT_EQ(0xFFFFFFFFu, map("CJK COMPATIBILITY IDEOGRAPH-NOTANUMBER"));
  EXPECT_EQ(0xFFFFFFFFu, map("CJK COMPATIBILITY IDEOGRAPH-1"));
  EXPECT_EQ(0xFFFFFFFFu, map("ZERO WIDTH NO BREAK SPACE"));

  // Should not support abbreviations or figments
  EXPECT_EQ(0xFFFFFFFFu, map("FVS1"));
  EXPECT_EQ(0xFFFFFFFFu, map("HIGH OCTET PRESET"));
  EXPECT_EQ(0xFFFFFFFFu, map("BEL"));
}
|
||||
|
||||
TEST(Unicode, nameToCodepointLoose) {
|
||||
auto map = [](StringRef Str) {
|
||||
auto Opt = nameToCodepointLooseMatching(Str);
|
||||
if (!Opt)
|
||||
return char32_t(0xFFFF'FFFF);
|
||||
return Opt->CodePoint;
|
||||
};
|
||||
|
||||
// generated codepoints
|
||||
EXPECT_EQ(0x04DBFu, map("CJK UNIFIED IDEOGRAPH-4DBF"));
|
||||
EXPECT_EQ(0x04E00u, map("CJK UNIFIED IDEOGRAPH-4E00"));
|
||||
EXPECT_EQ(0x09FFCu, map("CJK UNIFIED IDEOGRAPH-9FFC"));
|
||||
EXPECT_EQ(0x20000u, map("CJK UNIFIED IDEOGRAPH-20000"));
|
||||
EXPECT_EQ(0x2A6DDu, map("CJK UNIFIED IDEOGRAPH-2A6DD"));
|
||||
EXPECT_EQ(0x2A700u, map("CJK UNIFIED IDEOGRAPH-2A700"));
|
||||
EXPECT_EQ(0x2B740u, map("CJK UNIFIED IDEOGRAPH-2B740"));
|
||||
EXPECT_EQ(0x03400u, map("CJK UNIFIED IDEOGRAPH-3400"));
|
||||
EXPECT_EQ(0x2B81Du, map("CJK UNIFIED IDEOGRAPH-2B81D"));
|
||||
EXPECT_EQ(0x2B820u, map("CJK UNIFIED IDEOGRAPH-2B820"));
|
||||
EXPECT_EQ(0x2CEA1u, map("CJK UNIFIED IDEOGRAPH-2CEA1"));
|
||||
EXPECT_EQ(0x2CEB0u, map("CJK UNIFIED IDEOGRAPH-2CEB0"));
|
||||
EXPECT_EQ(0x2EBE0u, map("CJK UNIFIED IDEOGRAPH-2EBE0"));
|
||||
EXPECT_EQ(0x30000u, map("CJK UNIFIED IDEOGRAPH-30000"));
|
||||
EXPECT_EQ(0x3134Au, map("CJK UNIFIED IDEOGRAPH-3134A"));
|
||||
EXPECT_EQ(0x17000u, map("TANGUT IDEOGRAPH-17000"));
|
||||
EXPECT_EQ(0x187F7u, map("TANGUT IDEOGRAPH-187F7"));
|
||||
EXPECT_EQ(0x18D00u, map("TANGUT IDEOGRAPH-18D00"));
|
||||
EXPECT_EQ(0x18D08u, map("TANGUT IDEOGRAPH-18D08"));
|
||||
EXPECT_EQ(0x18B00u, map("KHITAN SMALL SCRIPT CHARACTER-18B00"));
|
||||
EXPECT_EQ(0x18CD5u, map("KHITAN SMALL SCRIPT CHARACTER-18CD5"));
|
||||
EXPECT_EQ(0x1B170u, map("NUSHU CHARACTER-1B170"));
|
||||
EXPECT_EQ(0x1B2FBu, map("NUSHU CHARACTER-1B2FB"));
|
||||
EXPECT_EQ(0x0F900u, map("CJK COMPATIBILITY IDEOGRAPH-F900"));
|
||||
EXPECT_EQ(0x0FA6Du, map("CJK COMPATIBILITY IDEOGRAPH-FA6D"));
|
||||
EXPECT_EQ(0x0FA70u, map("CJK COMPATIBILITY IDEOGRAPH-FA70"));
|
||||
EXPECT_EQ(0x0FAD9u, map("CJK COMPATIBILITY IDEOGRAPH-FAD9"));
|
||||
EXPECT_EQ(0x2F800u, map("CJK COMPATIBILITY IDEOGRAPH-2F800"));
|
||||
EXPECT_EQ(0x2FA1Du, map("CJK COMPATIBILITY IDEOGRAPH-2FA1D"));
|
||||
|
||||
EXPECT_EQ(0xAC00u, map("HANGUL SYLLABLE GA"));
|
||||
EXPECT_EQ(0xAC14u, map("HANGUL SYLLABLE GASS"));
|
||||
EXPECT_EQ(0xAC2Bu, map("HANGUL SYLLABLE GAELH"));
|
||||
EXPECT_EQ(0xAC7Bu, map("HANGUL SYLLABLE GEOLB"));
|
||||
EXPECT_EQ(0xC640u, map("HANGUL SYLLABLE WA"));
|
||||
EXPECT_EQ(0xC544u, map("HANGUL SYLLABLE A"));
|
||||
EXPECT_EQ(0xC5D0u, map("HANGUL SYLLABLE E"));
|
||||
EXPECT_EQ(0xC774u, map("HANGUL SYLLABLE I"));
|
||||
|
||||
EXPECT_EQ(0x1F984u, map("UNICORN FACE"));
|
||||
EXPECT_EQ(0x00640u, map("ARABIC TATWEEL"));
|
||||
EXPECT_EQ(0x02C05u, map("GLAGOLITIC CAPITAL LETTER YESTU"));
|
||||
EXPECT_EQ(0x13000u, map("EGYPTIAN HIEROGLYPH A001"));
|
||||
EXPECT_EQ(0x02235u, map("BECAUSE"));
|
||||
EXPECT_EQ(0x1F514u, map("BELL"));
|
||||
EXPECT_EQ(0x1F9A9u, map("FLAMINGO"));
|
||||
EXPECT_EQ(0x1F402u, map("OX")); // 2 characters
|
||||
EXPECT_EQ(0x0FBF9u, map("ARABIC LIGATURE UIGHUR KIRGHIZ YEH WITH HAMZA "
|
||||
"ABOVE WITH ALEF MAKSURA ISOLATED FORM"));
|
||||
|
||||
// Aliases
|
||||
EXPECT_EQ(0x0000u, map("NULL"));
|
||||
EXPECT_EQ(0x0007u, map("ALERT"));
|
||||
EXPECT_EQ(0x0009u, map("HORIZONTAL TABULATION"));
|
||||
EXPECT_EQ(0x0009u, map("CHARACTER TABULATION"));
|
||||
EXPECT_EQ(0x000Au, map("LINE FEED"));
|
||||
EXPECT_EQ(0x000Au, map("NEW LINE"));
|
||||
EXPECT_EQ(0x0089u, map("CHARACTER TABULATION WITH JUSTIFICATION"));
|
||||
EXPECT_EQ(0x0089u, map("HORIZONTAL TABULATION WITH JUSTIFICATION"));
|
||||
EXPECT_EQ(0x2118u,
|
||||
map("WEIERSTRASS ELLIPTIC FUNCTION")); // correction
|
||||
EXPECT_EQ(0x2118u, map("SCRIPT CAPITAL P")); // correction
|
||||
EXPECT_EQ(0xFEFFu, map("BYTE ORDER MARK")); // alternate
|
||||
EXPECT_EQ(0xFEFFu, map("ZERO WIDTH NO-BREAK SPACE")); // alternate
|
||||
EXPECT_EQ(0xFEFFu, map("ZERO WIDTH NO BREAK SPACE")); // alternate
|
||||
|
||||
// Should perform loose matching
|
||||
EXPECT_EQ(0xFFFFFFFFu, map(""));
|
||||
EXPECT_EQ(0xFFFFFFFFu, map("NOT A UNICODE CHARACTER"));
|
||||
EXPECT_EQ(0x0001F984u, map("unicorn face"));
|
||||
EXPECT_EQ(0x0001F984u, map("UNICORN FaCE"));
|
||||
EXPECT_EQ(0x0001F984u, map("UNICORNFaCE"));
|
||||
EXPECT_EQ(0xFFFFFFFFu, map("UNICORN"));
|
||||
EXPECT_EQ(0xC774u, map("HANGUL SYLLABLE i"));
|
||||
EXPECT_EQ(0xC774u, map("hANGUL SYLLABLE i"));
|
||||
EXPECT_EQ(0xC774u, map("HANGULSYLLABLEI"));
|
||||
EXPECT_EQ(0xFFFFFFFFu, map("HANGUL SYLLABLE"));
|
||||
|
||||
EXPECT_EQ(0x2FA1Du, map("cJK COMPATIBILITY IDEOGRAPH-2FA1D"));
|
||||
EXPECT_EQ(0x2FA1Du, map("CJK COMPATIBILITY IDEOGRAPH-2FA1d"));
|
||||
EXPECT_EQ(0x2FA1Du, map("CJK COMPATIBILITY IDEOGRAPH 2FA1D"));
|
||||
|
||||
EXPECT_EQ(0xFFFFFFFFu, map("CJK COMPATIBILITY IDEOGRAPH-NOTANUMBER"));
|
||||
EXPECT_EQ(0xFFFFFFFFu, map("CJK COMPATIBILITY IDEOGRAPH-1"));
|
||||
|
||||
// https://unicode.org/reports/tr44/#Matching_Names
|
||||
// UAX44-LM2: Medial hyphens are ignored, non-medial hyphens are not
|
||||
EXPECT_EQ(0x1FBC5u, map("S-T-I-C-K-F-I-G-U-R-E"));
|
||||
EXPECT_EQ(0xFFFFFFFFu, map("-STICK FIGURE"));
|
||||
EXPECT_EQ(0xFFFFFFFFu, map("STICK FIGURE-"));
|
||||
EXPECT_EQ(0xFFFFFFFFu, map("STICK FIGURE -"));
|
||||
EXPECT_EQ(0xFFFFFFFFu, map("STICK FIGURE --"));
|
||||
EXPECT_EQ(0xFFFFFFFFu, map("STICK--FIGURE"));
|
||||
|
||||
EXPECT_EQ(0x0F68u, map("TIBETAN LETTER A"));
|
||||
EXPECT_EQ(0x0F68u, map("TIBETAN LETTERA"));
|
||||
EXPECT_EQ(0x0F68u, map("TIBETAN LETTER-A"));
|
||||
EXPECT_EQ(0x0F60u, map("TIBETAN LETTER -A"));
|
||||
EXPECT_EQ(0x0F60u, map("TIBETAN LETTER -A"));
|
||||
;
|
||||
|
||||
// special case
|
||||
EXPECT_EQ(0x1180u, map("HANGUL JUNGSEONG O-E"));
|
||||
EXPECT_EQ(0x116Cu, map("HANGUL JUNGSEONG OE"));
|
||||
|
||||
// names that are prefix to existing characters should not match
|
||||
EXPECT_FALSE(nameToCodepointLooseMatching("B"));
|
||||
EXPECT_FALSE(nameToCodepointLooseMatching("BE"));
|
||||
EXPECT_FALSE(nameToCodepointLooseMatching("BEE"));
|
||||
EXPECT_FALSE(nameToCodepointLooseMatching("BEET"));
|
||||
EXPECT_FALSE(nameToCodepointLooseMatching("BEETL"));
|
||||
EXPECT_TRUE(nameToCodepointLooseMatching("BEETLE"));
|
||||
}
|
||||
|
||||
} // namespace
|
||||
|
||||
// Member-wise equality for MatchForCodepointName: two matches are equal when
// their Name, Distance and Value all compare equal. The test below relies on
// it to compare expected and actual matches through ElementsAre.
bool operator==(MatchForCodepointName a, MatchForCodepointName b) {
  if (!(a.Name == b.Name))
    return false;
  if (!(a.Distance == b.Distance))
    return false;
  return a.Value == b.Value;
}
|
||||
|
||||
namespace {
|
||||
|
||||
// Checks nearestMatchesForCodepointName(): for each query the three returned
// candidates must be the expected names, in order, with the expected distance
// and code point.
TEST(Unicode, nearestMatchesForCodepointName) {
  // Keeps only the alphanumeric characters of Text, upper-cased.
  auto Canonical = [](StringRef Text) {
    std::string Result;
    Result.reserve(Text.size());
    for (std::size_t I = 0, E = Text.size(); I != E; ++I) {
      const char Ch = Text[I];
      if (isAlnum(Ch))
        Result.push_back(toUpper(Ch));
    }
    return Result;
  };

  // Queries the three nearest names for Query and, as a side check, verifies
  // that every reported Distance equals the edit distance between the
  // canonical forms of the query and of the returned name.
  auto Nearest = [&](StringRef Query) {
    auto Matches = nearestMatchesForCodepointName(Query, 3);
    const std::string Wanted = Canonical(Query);
    for (auto &Match : Matches) {
      const std::string Found = Canonical(Match.Name);
      EXPECT_EQ(StringRef(Found).edit_distance(Wanted, true), Match.Distance);
    }
    return Matches;
  };

  using ::testing::ElementsAre;
  using M = MatchForCodepointName;

  // Expectations shared by queries that only differ in case, spacing,
  // underscores or hyphens.
  const auto ArabicInitialFormMatches =
      ElementsAre(M{"ARABIC LIGATURE UIGHUR KIRGHIZ YEH WITH HAMZA "
                    "ABOVE WITH ALEF MAKSURA INITIAL FORM",
                    0, 0xFBFB},
                  M{"ARABIC LIGATURE UIGHUR KIRGHIZ YEH WITH HAMZA "
                    "ABOVE WITH ALEF MAKSURA FINAL FORM",
                    4, 0xFBFA},
                  M{"ARABIC LIGATURE UIGHUR KIRGHIZ YEH WITH HAMZA "
                    "ABOVE WITH ALEF MAKSURA ISOLATED FORM",
                    7, 0xFBF9});
  const auto GreekLambdaMatches =
      ElementsAre(M{"GREEK CAPITAL LETTER LAMDA", 1, 0x39B},
                  M{"GREEK CAPITAL LETTER GAMMA", 3, 0x0393},
                  M{"GREEK CAPITAL LETTER ALPHA", 4, 0x0391});

  // Empty query.
  ASSERT_THAT(Nearest(""),
              ElementsAre(M{"OX", 2, 0x1F402}, M{"ANT", 3, 0x1F41C},
                          M{"ARC", 3, 0x2312}));

  // Shortest name.
  ASSERT_THAT(Nearest("OX"),
              ElementsAre(M{"OX", 0, 0x1F402}, M{"AXE", 2, 0x1FA93},
                          M{"BOY", 2, 0x1F466}));

  // Longest name.
  ASSERT_THAT(
      Nearest("ARABIC LIGATURE UIGHUR KIRGHIZ YEH WITH HAMZA ABOVE WITH ALEF "
              "MAKSURA INITIAL FORM"),
      ArabicInitialFormMatches);

  // Same result with underscores, spaces, mixed case, etc.
  ASSERT_THAT(
      Nearest("______ARABICLIGATUREUIGHUR KIRGHIZ YEH with HAMZA ABOVE WITH "
              "ALEF MAKsURAINITIAL form_"),
      ArabicInitialFormMatches);

  ASSERT_THAT(Nearest("GREEK CAPITAL LETTER LAMBDA"), GreekLambdaMatches);

  ASSERT_THAT(Nearest("greekcapitalletter-lambda"), GreekLambdaMatches);

  // Name containing a known typo, see
  // http://www.unicode.org/notes/tn27/tn27-5.html
  ASSERT_THAT(
      Nearest("PRESENTATION FORM FOR VERTICAL RIGHT WHITE LENTICULAR BRAKCET"),
      ElementsAre(
          M{"PRESENTATION FORM FOR VERTICAL RIGHT WHITE LENTICULAR BRAKCET", 0,
            0xFE18}, // typo
          M{"PRESENTATION FORM FOR VERTICAL RIGHT WHITE LENTICULAR BRACKET", 2,
            0xFE18}, // correction
          M{"PRESENTATION FORM FOR VERTICAL LEFT WHITE LENTICULAR BRACKET", 6,
            0xFE17}));

  // Name containing a known typo, see
  // http://www.unicode.org/notes/tn27/tn27-5.html
  ASSERT_THAT(
      Nearest("BYZANTINE MUSICAL SYMBOL FHTORA SKLIRON CHROMA VASIS"),
      ElementsAre(
          M{"BYZANTINE MUSICAL SYMBOL FHTORA SKLIRON CHROMA VASIS", 0, 0x1D0C5},
          M{"BYZANTINE MUSICAL SYMBOL FTHORA SKLIRON CHROMA VASIS", 2, 0x1D0C5},
          M{"BYZANTINE MUSICAL SYMBOL FTHORA SKLIRON CHROMA SYNAFI", 7,
            0x1D0C6}));
}
|
||||
|
||||
} // namespace
|
||||
} // namespace unicode
|
||||
} // namespace sys
|
||||
|
5
llvm/utils/UnicodeData/CMakeLists.txt
Normal file
5
llvm/utils/UnicodeData/CMakeLists.txt
Normal file
@ -0,0 +1,5 @@
|
||||
# LLVM component libraries for this directory's targets (here: Support).
# NOTE(review): presumably consumed by add_llvm_utility below to choose what
# to link against -- confirm against LLVM's CMake modules.
set(LLVM_LINK_COMPONENTS Support)
|
||||
|
||||
# Declares the UnicodeNameMappingGenerator utility, built from the single
# source file UnicodeNameMappingGenerator.cpp.
add_llvm_utility(UnicodeNameMappingGenerator
|
||||
UnicodeNameMappingGenerator.cpp
|
||||
)
|
486
llvm/utils/UnicodeData/UnicodeNameMappingGenerator.cpp
Normal file
486
llvm/utils/UnicodeData/UnicodeNameMappingGenerator.cpp
Normal file
@ -0,0 +1,486 @@
|
||||
//===--- UnicodeNameMappingGenerator.cpp - Unicode name data generator ---===//
|
||||
//
|
||||
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|
||||
// See https://llvm.org/LICENSE.txt for license information.
|
||||
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
//
|
||||
// This file is used to generate lib/Support/UnicodeNameToCodepointGenerated.cpp
|
||||
// using UnicodeData.txt and NameAliases.txt available at
|
||||
// https://unicode.org/Public/14.0.0/ucd/
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
#include "llvm/ADT/Optional.h"
|
||||
#include "llvm/ADT/STLExtras.h"
|
||||
#include "llvm/ADT/StringExtras.h"
|
||||
#include "llvm/ADT/StringRef.h"
|
||||
#include <algorithm>
|
||||
#include <array>
|
||||
#include <deque>
|
||||
#include <fstream>
|
||||
#include <memory>
|
||||
#include <set>
|
||||
#include <string>
|
||||
#include <unordered_map>
|
||||
#include <utility>
|
||||
#include <vector>
|
||||
|
||||
static const llvm::StringRef Letters =
|
||||
" _-ABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789";
|
||||
|
||||
// Collect names UnicodeData.txt and AliasNames.txt
|
||||
// There may be multiple names per code points.
|
||||
static std::unordered_multimap<char32_t, std::string>
|
||||
loadDataFiles(const std::string &NamesFile, const std::string &AliasesFile) {
|
||||
std::unordered_multimap<char32_t, std::string> CollectedCharacters;
|
||||
auto FromFile = [&](const std::string &File, bool IsAliasFile = false) {
|
||||
std::ifstream InputFile(File);
|
||||
for (std::string Line; getline(InputFile, Line);) {
|
||||
if (Line.empty() || !isxdigit(Line[0]))
|
||||
continue;
|
||||
auto FirstSemiPos = Line.find(';');
|
||||
if (FirstSemiPos == std::string::npos)
|
||||
continue;
|
||||
auto SecondSemiPos = Line.find(';', FirstSemiPos + 1);
|
||||
if (FirstSemiPos == std::string::npos)
|
||||
continue;
|
||||
unsigned long long CodePoint;
|
||||
if (llvm::getAsUnsignedInteger(
|
||||
llvm::StringRef(Line.c_str(), FirstSemiPos), 16, CodePoint)) {
|
||||
continue;
|
||||
}
|
||||
|
||||
std::string Name =
|
||||
Line.substr(FirstSemiPos + 1, SecondSemiPos - FirstSemiPos - 1);
|
||||
|
||||
if (!Name.empty() && Name[0] == '<') {
|
||||
// Ignore ranges of characters, as their name is either absent or
|
||||
// generated.
|
||||
continue;
|
||||
}
|
||||
|
||||
// Some aliases are ignored for compatibility with C++
|
||||
if (IsAliasFile) {
|
||||
std::string Kind = Line.substr(SecondSemiPos + 1);
|
||||
if (Kind != "control" && Kind != "correction" && Kind != "alternate")
|
||||
continue;
|
||||
}
|
||||
|
||||
auto InsertUnique = [&](char32_t CP, std::string Name) {
|
||||
auto It = CollectedCharacters.find(CP);
|
||||
while (It != std::end(CollectedCharacters) && It->first == CP) {
|
||||
if (It->second == Name)
|
||||
return;
|
||||
++It;
|
||||
}
|
||||
CollectedCharacters.insert({CP, std::move(Name)});
|
||||
};
|
||||
InsertUnique(CodePoint, std::move(Name));
|
||||
}
|
||||
};
|
||||
|
||||
FromFile(NamesFile);
|
||||
FromFile(AliasesFile, true);
|
||||
return CollectedCharacters;
|
||||
}
|
||||
|
||||
class Trie {
|
||||
struct Node;
|
||||
|
||||
public:
|
||||
// When inserting named codepoint
|
||||
// We create a node per character in the name.
|
||||
// SPARKLE becomes S <- P <- A <- R <- K <- L <- E
|
||||
// Once all characters are inserted, the tree is compacted
|
||||
void insert(llvm::StringRef Name, char32_t Codepoint) {
|
||||
Node *N = Root.get();
|
||||
for (auto Ch : Name) {
|
||||
std::string Label(1, Ch);
|
||||
auto It = std::find_if(N->Children.begin(), N->Children.end(),
|
||||
[&](const auto &C) { return C->Name == Label; });
|
||||
if (It == N->Children.end()) {
|
||||
It = N->Children.insert(It, std::make_unique<Node>(Label, N));
|
||||
}
|
||||
N = It->get();
|
||||
}
|
||||
N->Value = Codepoint;
|
||||
}
|
||||
|
||||
void compact() { compact(Root.get()); }
|
||||
|
||||
// This creates 2 arrays of bytes from the tree:
|
||||
// A serialized dictionary of node labels,
|
||||
// And the nodes themselves.
|
||||
// The name of each label is found by indexing into the dictionary.
|
||||
// The longest names are inserted first into the dictionary,
|
||||
// in the hope it will contain shorter labels as substring,
|
||||
// thereby reducing duplication.
|
||||
// We could theorically be more clever by trying to minimizing the size
|
||||
// of the dictionary.
|
||||
std::pair<std::string, std::vector<uint8_t>> serialize() {
|
||||
std::set<std::string> Names = this->getNameFragments();
|
||||
std::vector<std::string> Sorted(Names.begin(), Names.end());
|
||||
std::sort(Sorted.begin(), Sorted.end(),
|
||||
[](const auto &a, const auto &b) { return a.size() > b.size(); });
|
||||
std::string Dict(Letters.begin(), Letters.end());
|
||||
Dict.reserve(50000);
|
||||
for (const std::string &Name : Sorted) {
|
||||
if (Name.size() <= 1)
|
||||
continue;
|
||||
if (Dict.find(Name) != std::string::npos)
|
||||
continue;
|
||||
Dict += Name;
|
||||
}
|
||||
|
||||
if (Dict.size() >= std::numeric_limits<uint16_t>::max()) {
|
||||
fprintf(stderr, "Dictionary too big to be serialized");
|
||||
exit(1);
|
||||
}
|
||||
|
||||
auto Bytes = dumpIndex(Dict);
|
||||
return {Dict, Bytes};
|
||||
}
|
||||
|
||||
std::set<std::string> getNameFragments() {
|
||||
std::set<std::string> Keys;
|
||||
collectKeys(Root.get(), Keys);
|
||||
return Keys;
|
||||
}
|
||||
|
||||
// Maps a valid char in an Unicode character name
|
||||
// To a 6 bits index.
|
||||
static uint8_t letter(char C) {
|
||||
auto Pos = Letters.find(C);
|
||||
assert(Pos != std::string::npos &&
|
||||
"Invalid letter in Unicode character name");
|
||||
return Pos;
|
||||
}
|
||||
|
||||
// clang-format off
|
||||
// +================+============+======================+=============+========+===+==============+===============+
|
||||
// | 0 | 1 | 2-7 (6) | 8-23 | 24-44 | | 46 | 47 |
|
||||
// +================+============+======================+=============+========+===+==============+===============+
|
||||
// | Has Value | Has Long Name | Letter OR Name Size | Dict Index | Value | | Has Sibling | Has Children |
|
||||
// +----------------+------------+----------------------+-------------+--------+---+--------------+---------------+
|
||||
// clang-format on
|
||||
|
||||
std::vector<uint8_t> dumpIndex(const std::string &Dict) {
|
||||
struct ChildrenOffset {
|
||||
Node *FirstChild;
|
||||
std::size_t Offset;
|
||||
bool HasValue;
|
||||
};
|
||||
|
||||
// Keep track of the start of each node
|
||||
// position in the serialized data.
|
||||
std::unordered_map<Node *, int32_t> Offsets;
|
||||
|
||||
// Keep track of where to write the index
|
||||
// of the first children
|
||||
std::vector<ChildrenOffset> ChildrenOffsets;
|
||||
std::unordered_map<Node *, bool> SiblingTracker;
|
||||
std::deque<Node *> AllNodes;
|
||||
std::vector<uint8_t> Bytes;
|
||||
Bytes.reserve(250'000);
|
||||
// This leading byte is used by the reading code to detect the root node.
|
||||
Bytes.push_back(0);
|
||||
|
||||
auto CollectChildren = [&SiblingTracker, &AllNodes](const auto &Children) {
|
||||
for (std::size_t Index = 0; Index < Children.size(); Index++) {
|
||||
const std::unique_ptr<Node> &Child = Children[Index];
|
||||
AllNodes.push_back(Child.get());
|
||||
if (Index != Children.size() - 1)
|
||||
SiblingTracker[Child.get()] = true;
|
||||
}
|
||||
};
|
||||
CollectChildren(Root->Children);
|
||||
|
||||
while (!AllNodes.empty()) {
|
||||
const std::size_t Offset = Bytes.size();
|
||||
Node *const N = AllNodes.front();
|
||||
AllNodes.pop_front();
|
||||
|
||||
assert(!N->Name.empty());
|
||||
Offsets[N] = Offset;
|
||||
|
||||
uint8_t FirstByte = (!!N->Value) ? 0x80 : 0;
|
||||
// Single letter node are indexed in 6 bits
|
||||
if (N->Name.size() == 1) {
|
||||
FirstByte |= letter(N->Name[0]);
|
||||
Bytes.push_back(FirstByte);
|
||||
} else {
|
||||
// Otherwise we use a 16 bits index
|
||||
FirstByte = FirstByte | uint8_t(N->Name.size()) | 0x40;
|
||||
Bytes.push_back(FirstByte);
|
||||
auto PosInDict = Dict.find(N->Name);
|
||||
assert(PosInDict != std::string::npos);
|
||||
uint8_t Low = PosInDict;
|
||||
uint8_t High = ((PosInDict >> 8) & 0xFF);
|
||||
Bytes.push_back(High);
|
||||
Bytes.push_back(Low);
|
||||
}
|
||||
|
||||
const bool HasSibling = SiblingTracker.count(N) != 0;
|
||||
const bool HasChildren = N->Children.size() != 0;
|
||||
|
||||
if (!!N->Value) {
|
||||
uint32_t Value = (*(N->Value) << 3);
|
||||
uint8_t H = ((Value >> 16) & 0xFF);
|
||||
uint8_t M = ((Value >> 8) & 0xFF);
|
||||
uint8_t L = (Value & 0xFF) | uint8_t(HasSibling ? 0x01 : 0) |
|
||||
uint8_t(HasChildren ? 0x02 : 0);
|
||||
|
||||
Bytes.push_back(H);
|
||||
Bytes.push_back(M);
|
||||
Bytes.push_back(L);
|
||||
|
||||
if (HasChildren) {
|
||||
ChildrenOffsets.push_back(
|
||||
ChildrenOffset{N->Children[0].get(), Bytes.size(), true});
|
||||
// index of the first children
|
||||
Bytes.push_back(0x00);
|
||||
Bytes.push_back(0x00);
|
||||
Bytes.push_back(0x00);
|
||||
}
|
||||
} else {
|
||||
// When there is no value (that's most intermediate nodes)
|
||||
// Dispense of the 3 values bytes, and only store
|
||||
// 1 byte to track whether the node has sibling and chidren
|
||||
// + 2 bytes for the index of the first children if necessary.
|
||||
// That index also uses bytes 0-6 of the previous byte.
|
||||
uint8_t Byte =
|
||||
uint8_t(HasSibling ? 0x80 : 0) | uint8_t(HasChildren ? 0x40 : 0);
|
||||
Bytes.push_back(Byte);
|
||||
if (HasChildren) {
|
||||
ChildrenOffsets.emplace_back(
|
||||
ChildrenOffset{N->Children[0].get(), Bytes.size() - 1, false});
|
||||
Bytes.push_back(0x00);
|
||||
Bytes.push_back(0x00);
|
||||
}
|
||||
}
|
||||
CollectChildren(N->Children);
|
||||
}
|
||||
|
||||
// Once all the nodes are in the inndex
|
||||
// Fill the bytes we left to indicate the position
|
||||
// of the children
|
||||
for (const ChildrenOffset &Parent : ChildrenOffsets) {
|
||||
const auto It = Offsets.find(Parent.FirstChild);
|
||||
assert(It != Offsets.end());
|
||||
std::size_t Pos = It->second;
|
||||
if (Parent.HasValue) {
|
||||
Bytes[Parent.Offset] = ((Pos >> 16) & 0xFF);
|
||||
} else {
|
||||
Bytes[Parent.Offset] =
|
||||
Bytes[Parent.Offset] | uint8_t((Pos >> 16) & 0xFF);
|
||||
}
|
||||
Bytes[Parent.Offset + 1] = ((Pos >> 8) & 0xFF);
|
||||
Bytes[Parent.Offset + 2] = Pos & 0xFF;
|
||||
}
|
||||
|
||||
// Add some padding so that the deserialization code
|
||||
// doesn't try to read past the enf of the array.
|
||||
Bytes.push_back(0);
|
||||
Bytes.push_back(0);
|
||||
Bytes.push_back(0);
|
||||
Bytes.push_back(0);
|
||||
Bytes.push_back(0);
|
||||
Bytes.push_back(0);
|
||||
|
||||
return Bytes;
|
||||
}
|
||||
|
||||
private:
|
||||
void collectKeys(Node *N, std::set<std::string> &Keys) {
|
||||
Keys.insert(N->Name);
|
||||
for (const std::unique_ptr<Node> &Child : N->Children) {
|
||||
collectKeys(Child.get(), Keys);
|
||||
}
|
||||
}
|
||||
|
||||
// Merge sequences of 1-character nodes
|
||||
// This greatly reduce the total number of nodes,
|
||||
// and therefore the size of the index.
|
||||
// When the tree gets serialized, we only have 5 bytes to store the
|
||||
// size of a name. Overlong names (>32 characters) are therefore
|
||||
// kep into separate nodes
|
||||
void compact(Node *N) {
|
||||
for (auto &&Child : N->Children) {
|
||||
compact(Child.get());
|
||||
}
|
||||
if (N->Parent && N->Parent->Children.size() == 1 && !N->Parent->Value &&
|
||||
(N->Parent->Name.size() + N->Name.size() <= 32)) {
|
||||
N->Parent->Value = N->Value;
|
||||
N->Parent->Name += N->Name;
|
||||
N->Parent->Children = std::move(N->Children);
|
||||
for (std::unique_ptr<Node> &c : N->Parent->Children) {
|
||||
c->Parent = N->Parent;
|
||||
}
|
||||
}
|
||||
}
|
||||
struct Node {
|
||||
Node(std::string Name, Node *Parent = nullptr)
|
||||
: Name(Name), Parent(Parent) {}
|
||||
|
||||
std::vector<std::unique_ptr<Node>> Children;
|
||||
std::string Name;
|
||||
Node *Parent = nullptr;
|
||||
llvm::Optional<char32_t> Value;
|
||||
};
|
||||
|
||||
std::unique_ptr<Node> Root = std::make_unique<Node>("");
|
||||
};
|
||||
|
||||
extern const char *UnicodeLicense;
|
||||
|
||||
int main(int argc, char **argv) {
|
||||
printf("Unicode name -> codepoint mapping generator\n"
|
||||
"Usage: %s UnicodeData.txt NameAliases.txt output\n\n",
|
||||
argv[0]);
|
||||
printf("NameAliases.txt can be found at "
|
||||
"https://unicode.org/Public/14.0.0/ucd/NameAliases.txt\n"
|
||||
"UnicodeData.txt can be found at "
|
||||
"https://unicode.org/Public/14.0.0/ucd/UnicodeData.txt\n\n");
|
||||
|
||||
if (argc != 4)
|
||||
return EXIT_FAILURE;
|
||||
|
||||
FILE *Out = fopen(argv[3], "w");
|
||||
if (!Out) {
|
||||
printf("Error creating output file.\n");
|
||||
return EXIT_FAILURE;
|
||||
}
|
||||
|
||||
Trie T;
|
||||
uint32_t NameCount = 0;
|
||||
std::size_t LongestName = 0;
|
||||
auto Entries = loadDataFiles(argv[1], argv[2]);
|
||||
for (const std::pair<const char32_t, std::string> &Entry : Entries) {
|
||||
char32_t Codepoint = Entry.first;
|
||||
const std::string &Name = Entry.second;
|
||||
// Ignore names which are not valid.
|
||||
if (Name.empty() || !std::all_of(Name.begin(), Name.end(), [](char C) {
|
||||
return llvm::is_contained(Letters, C);
|
||||
})) {
|
||||
continue;
|
||||
}
|
||||
printf("%06x: %s\n", Codepoint, Name.c_str());
|
||||
T.insert(Name, Codepoint);
|
||||
LongestName =
|
||||
std::max(LongestName, std::size_t(llvm::count_if(Name, [](char c) {
|
||||
return llvm::isAlnum(c);
|
||||
})));
|
||||
NameCount++;
|
||||
}
|
||||
T.compact();
|
||||
|
||||
std::pair<std::string, std::vector<uint8_t>> Data = T.serialize();
|
||||
const std::string &Dict = Data.first;
|
||||
const std::vector<uint8_t> &Tree = Data.second;
|
||||
|
||||
fprintf(Out, R"(
|
||||
//===------------- Support/UnicodeNameToCodepointGenerated.cpp ------------===//
|
||||
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|
||||
// See https://llvm.org/LICENSE.txt for license information.
|
||||
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
//
|
||||
// This file implements mapping the name of a unicode code point to its value.
|
||||
//
|
||||
// This file was generated using %s.
|
||||
// Do not edit manually.
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
%s
|
||||
|
||||
|
||||
|
||||
#include "llvm/Support/Compiler.h"
|
||||
#include <cstddef>
|
||||
#include <cstdint>
|
||||
)",
|
||||
argv[0], UnicodeLicense);
|
||||
|
||||
fprintf(Out,
|
||||
"namespace llvm { namespace sys { namespace unicode { \n"
|
||||
"extern const char *UnicodeNameToCodepointDict;\n"
|
||||
"extern const uint8_t *UnicodeNameToCodepointIndex;\n"
|
||||
"extern const std::size_t UnicodeNameToCodepointIndexSize;\n"
|
||||
"extern const std::size_t UnicodeNameToCodepointLargestNameSize;\n");
|
||||
|
||||
fprintf(Out, "const char* UnicodeNameToCodepointDict = \"%s\";\n",
|
||||
Dict.c_str());
|
||||
|
||||
fprintf(Out, "uint8_t UnicodeNameToCodepointIndex_[%lu] = {\n",
|
||||
Tree.size() + 1);
|
||||
|
||||
for (auto Byte : Tree) {
|
||||
fprintf(Out, "0x%02x,", Byte);
|
||||
}
|
||||
|
||||
fprintf(Out, "0};");
|
||||
fprintf(Out, "const uint8_t* UnicodeNameToCodepointIndex = "
|
||||
"UnicodeNameToCodepointIndex_; \n");
|
||||
fprintf(Out, "const std::size_t UnicodeNameToCodepointIndexSize = %lu;\n",
|
||||
Tree.size() + 1);
|
||||
fprintf(Out,
|
||||
"const std::size_t UnicodeNameToCodepointLargestNameSize = %lu;\n",
|
||||
LongestName);
|
||||
fprintf(Out, "\n}}}\n");
|
||||
fclose(Out);
|
||||
printf("Generated %s: %u Files.\nIndex: %f kB, Dictionary: %f kB.\nDone\n\n",
|
||||
argv[3], NameCount, Tree.size() / 1024.0, Dict.size() / 1024.0);
|
||||
}
|
||||
|
||||
const char *UnicodeLicense = R"(
|
||||
/*
|
||||
UNICODE, INC. LICENSE AGREEMENT - DATA FILES AND SOFTWARE
|
||||
|
||||
See Terms of Use <https://www.unicode.org/copyright.html>
|
||||
for definitions of Unicode Inc.’s Data Files and Software.
|
||||
|
||||
NOTICE TO USER: Carefully read the following legal agreement.
|
||||
BY DOWNLOADING, INSTALLING, COPYING OR OTHERWISE USING UNICODE INC.'S
|
||||
DATA FILES ("DATA FILES"), AND/OR SOFTWARE ("SOFTWARE"),
|
||||
YOU UNEQUIVOCALLY ACCEPT, AND AGREE TO BE BOUND BY, ALL OF THE
|
||||
TERMS AND CONDITIONS OF THIS AGREEMENT.
|
||||
IF YOU DO NOT AGREE, DO NOT DOWNLOAD, INSTALL, COPY, DISTRIBUTE OR USE
|
||||
THE DATA FILES OR SOFTWARE.
|
||||
|
||||
COPYRIGHT AND PERMISSION NOTICE
|
||||
|
||||
Copyright © 1991-2022 Unicode, Inc. All rights reserved.
|
||||
Distributed under the Terms of Use in https://www.unicode.org/copyright.html.
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining
|
||||
a copy of the Unicode data files and any associated documentation
|
||||
(the "Data Files") or Unicode software and any associated documentation
|
||||
(the "Software") to deal in the Data Files or Software
|
||||
without restriction, including without limitation the rights to use,
|
||||
copy, modify, merge, publish, distribute, and/or sell copies of
|
||||
the Data Files or Software, and to permit persons to whom the Data Files
|
||||
or Software are furnished to do so, provided that either
|
||||
(a) this copyright and permission notice appear with all copies
|
||||
of the Data Files or Software, or
|
||||
(b) this copyright and permission notice appear in associated
|
||||
Documentation.
|
||||
|
||||
THE DATA FILES AND SOFTWARE ARE PROVIDED "AS IS", WITHOUT WARRANTY OF
|
||||
ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
|
||||
WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
NONINFRINGEMENT OF THIRD PARTY RIGHTS.
|
||||
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR HOLDERS INCLUDED IN THIS
|
||||
NOTICE BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL
|
||||
DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE,
|
||||
DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER
|
||||
TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
|
||||
PERFORMANCE OF THE DATA FILES OR SOFTWARE.
|
||||
|
||||
Except as contained in this notice, the name of a copyright holder
|
||||
shall not be used in advertising or otherwise to promote the sale,
|
||||
use or other dealings in these Data Files or Software without prior
|
||||
written authorization of the copyright holder.
|
||||
*/
|
||||
)";
|
Loading…
x
Reference in New Issue
Block a user