[Clang][C++23] P2071 Named universal character escapes

Implements [[ https://wg21.link/p2071r1  | P2071 Named Universal Character Escapes ]] as an extension in all language modes. The change to stop warning in C++23 mode will be made later, once this paper is plenary approved (in July).

We add

 * A code generator that transforms `UnicodeData.txt` and `NameAliases.txt` to a space efficient data structure that can be queried in `O(NameLength)`
 * A set of functions in `Unicode.h` to query that data, including

   * A function to find an exact match of a given Unicode character name
   * A function to perform a loose (ignoring case, space, underscore, medial hyphen) matching
   * A function returning the best matching codepoint for a given string per edit distance

 * Support for `\N{}` escape sequences in string and character literals, with diagnostics/fix-its for loose matches and typos
 * Support of `\N{}` as UCN with loose matching diagnostics/fixits.

Loose matching is considered an error to match closely the semantics of P2071.

The generated data adds 280kB to the binaries.

`UnicodeData.txt` and `NameAliases.txt`  are not committed to the repository in this patch, and regenerating the data is a manual process.

Reviewed By: tahonermann

Differential Revision: https://reviews.llvm.org/D123064
This commit is contained in:
Corentin Jabot 2022-04-04 12:41:12 +02:00
parent f8c1c9afd3
commit c92056d038
18 changed files with 22720 additions and 53 deletions

View File

@ -128,7 +128,7 @@ def warn_utf8_symbol_zero_width : Warning<
"some environments">, InGroup<DiagGroup<"unicode-zero-width">>;
def ext_delimited_escape_sequence : Extension<
"delimited escape sequences are a Clang extension">,
"%select{delimited|named}0 escape sequences are a Clang extension">,
InGroup<DiagGroup<"delimited-escape-sequence-extension">>;
def err_delimited_escape_empty : Error<
"delimited escape sequence cannot be empty">;
@ -138,6 +138,13 @@ def err_delimited_escape_invalid : Error<
"invalid digit '%0' in escape sequence">;
def err_hex_escape_no_digits : Error<
"\\%0 used with no following hex digits">;
def err_invalid_ucn_name : Error<
"'%0' is not a valid Unicode character name">;
def note_invalid_ucn_name_loose_matching : Note<
"characters names in Unicode escape sequences are sensitive to case and whitespaces">;
def note_invalid_ucn_name_candidate : Note<
"did you mean %0 ('%2' U+%1)?">;
def warn_ucn_escape_no_digits : Warning<
"\\%0 used with no following hex digits; "
"treating as '\\' followed by identifier">, InGroup<Unicode>;
@ -145,10 +152,10 @@ def err_ucn_escape_incomplete : Error<
"incomplete universal character name">;
def warn_delimited_ucn_incomplete : Warning<
"incomplete delimited universal character name; "
"treating as '\\' 'u' '{' identifier">, InGroup<Unicode>;
"treating as '\\' '%0' '{' identifier">, InGroup<Unicode>;
def warn_delimited_ucn_empty : Warning<
"empty delimited universal character name; "
"treating as '\\' 'u' '{' '}'">, InGroup<Unicode>;
"treating as '\\' '%0' '{' '}'">, InGroup<Unicode>;
def warn_ucn_escape_incomplete : Warning<
"incomplete universal character name; "
"treating as '\\' followed by identifier">, InGroup<Unicode>;

View File

@ -769,6 +769,11 @@ private:
void codeCompleteIncludedFile(const char *PathStart,
const char *CompletionPoint, bool IsAngled);
llvm::Optional<uint32_t>
tryReadNumericUCN(const char *&StartPtr, const char *SlashLoc, Token *Result);
llvm::Optional<uint32_t> tryReadNamedUCN(const char *&StartPtr,
Token *Result);
/// Read a universal character name.
///
/// \param StartPtr The position in the source buffer after the initial '\'.

View File

@ -37,6 +37,7 @@
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/MemoryBufferRef.h"
#include "llvm/Support/NativeFormatting.h"
#include "llvm/Support/Unicode.h"
#include "llvm/Support/UnicodeCharRanges.h"
#include <algorithm>
#include <cassert>
@ -3119,27 +3120,28 @@ bool Lexer::isCodeCompletionPoint(const char *CurPtr) const {
return false;
}
uint32_t Lexer::tryReadUCN(const char *&StartPtr, const char *SlashLoc,
Token *Result) {
llvm::Optional<uint32_t> Lexer::tryReadNumericUCN(const char *&StartPtr,
const char *SlashLoc,
Token *Result) {
unsigned CharSize;
char Kind = getCharAndSize(StartPtr, CharSize);
bool Delimited = false;
bool FoundEndDelimiter = false;
unsigned Count = 0;
bool Diagnose = Result && !isLexingRawMode();
assert((Kind == 'u' || Kind == 'U') && "expected a UCN");
unsigned NumHexDigits;
if (Kind == 'u')
NumHexDigits = 4;
else if (Kind == 'U')
NumHexDigits = 8;
else
return 0;
bool Delimited = false;
bool FoundEndDelimiter = false;
unsigned Count = 0;
bool Diagnose = Result && !isLexingRawMode();
if (!LangOpts.CPlusPlus && !LangOpts.C99) {
if (Diagnose)
Diag(SlashLoc, diag::warn_ucn_not_valid_in_c89);
return 0;
return llvm::None;
}
const char *CurPtr = StartPtr + CharSize;
@ -3166,14 +3168,14 @@ uint32_t Lexer::tryReadUCN(const char *&StartPtr, const char *SlashLoc,
break;
if (Diagnose)
Diag(BufferPtr, diag::warn_delimited_ucn_incomplete)
<< StringRef(&C, 1);
return 0;
<< StringRef(KindLoc, 1);
return llvm::None;
}
if (CodePoint & 0xF000'0000) {
if (Diagnose)
Diag(KindLoc, diag::err_escape_too_large) << 0;
return 0;
return llvm::None;
}
CodePoint <<= 4;
@ -3187,7 +3189,13 @@ uint32_t Lexer::tryReadUCN(const char *&StartPtr, const char *SlashLoc,
Diag(StartPtr, FoundEndDelimiter ? diag::warn_delimited_ucn_empty
: diag::warn_ucn_escape_no_digits)
<< StringRef(KindLoc, 1);
return 0;
return llvm::None;
}
if (Delimited && Kind == 'U') {
if (Diagnose)
Diag(StartPtr, diag::err_hex_escape_no_digits) << StringRef(KindLoc, 1);
return llvm::None;
}
if (!Delimited && Count != NumHexDigits) {
@ -3200,11 +3208,11 @@ uint32_t Lexer::tryReadUCN(const char *&StartPtr, const char *SlashLoc,
<< FixItHint::CreateReplacement(URange, "u");
}
}
return 0;
return llvm::None;
}
if (Delimited && PP) {
Diag(BufferPtr, diag::ext_delimited_escape_sequence);
Diag(BufferPtr, diag::ext_delimited_escape_sequence) << /*delimited*/ 0;
}
if (Result) {
@ -3217,6 +3225,110 @@ uint32_t Lexer::tryReadUCN(const char *&StartPtr, const char *SlashLoc,
} else {
StartPtr = CurPtr;
}
return CodePoint;
}
/// Try to read a named universal character name of the form \N{NAME}.
///
/// \param StartPtr The position in the source buffer after the initial '\',
///        i.e. pointing at the 'N'. It is advanced past the closing '}' only
///        when a code point is returned; on failure it is left untouched.
/// \param Result The token being lexed, or null. When non-null the token is
///        flagged with Token::HasUCN on success; together with "not lexing in
///        raw mode" it also enables the warning/extension diagnostics.
///
/// \return The code point designated by NAME, the code point found through
///         loose matching after an error has been reported, or llvm::None if
///         no character could be determined.
llvm::Optional<uint32_t> Lexer::tryReadNamedUCN(const char *&StartPtr,
                                                Token *Result) {
  unsigned CharSize;
  bool Diagnose = Result && !isLexingRawMode();

  char C = getCharAndSize(StartPtr, CharSize);
  assert(C == 'N' && "expected \\N{...}");

  const char *CurPtr = StartPtr + CharSize;
  // Last source character of the 'N'; used to print the escape kind in
  // diagnostics.
  const char *KindLoc = &CurPtr[-1];

  C = getCharAndSize(CurPtr, CharSize);
  if (C != '{') {
    if (Diagnose)
      Diag(StartPtr, diag::warn_ucn_escape_incomplete);
    return llvm::None;
  }
  CurPtr += CharSize;
  const char *StartName = CurPtr;
  bool FoundEndDelimiter = false;
  llvm::SmallVector<char, 30> Buffer;

  // Collect the name. Only alphanumeric characters, '_', '-' and ' ' are
  // accepted; anything else (including the end of the buffer) stops the scan
  // without a closing delimiter.
  while (C) {
    C = getCharAndSize(CurPtr, CharSize);
    CurPtr += CharSize;
    if (C == '}') {
      FoundEndDelimiter = true;
      break;
    }

    if (!isAlphanumeric(C) && C != '_' && C != '-' && C != ' ')
      break;
    Buffer.push_back(C);
  }

  if (!FoundEndDelimiter || Buffer.empty()) {
    if (Diagnose)
      Diag(StartPtr, FoundEndDelimiter ? diag::warn_delimited_ucn_empty
                                       : diag::warn_delimited_ucn_incomplete)
          << StringRef(KindLoc, 1);
    return llvm::None;
  }

  StringRef Name(Buffer.data(), Buffer.size());
  llvm::Optional<char32_t> Res =
      llvm::sys::unicode::nameToCodepointStrict(Name);
  llvm::Optional<llvm::sys::unicode::LooseMatchingResult> LooseMatch;
  if (!Res) {
    if (!isLexingRawMode()) {
      Diag(StartPtr, diag::err_invalid_ucn_name)
          << StringRef(Buffer.data(), Buffer.size());
      LooseMatch = llvm::sys::unicode::nameToCodepointLooseMatching(Name);
      if (LooseMatch) {
        // CurPtr - CharSize is the position of the closing '}'.
        Diag(StartName, diag::note_invalid_ucn_name_loose_matching)
            << FixItHint::CreateReplacement(
                   makeCharRange(*this, StartName, CurPtr - CharSize),
                   LooseMatch->Name);
      }
    }
    // When finding a match using Unicode loose matching rules
    // recover after having emitted a diagnostic.
    if (!LooseMatch)
      return llvm::None;
    // We do not offer misspelled character names suggestions here
    // as the set of what would be a valid suggestion depends on context,
    // and we should not make invalid suggestions.
  }

  if (Diagnose && PP && !LooseMatch)
    Diag(BufferPtr, diag::ext_delimited_escape_sequence) << /*named*/ 1;

  if (LooseMatch)
    Res = LooseMatch->CodePoint;

  if (Result) {
    Result->setFlag(Token::HasUCN);
    // 'N', '{', the name and '}' take exactly Buffer.size() + 3 source
    // characters when every logical character is a single source character.
    // In that case the pointer can simply be moved; otherwise (e.g. a
    // trigraph or a line splice was involved) the characters are re-read one
    // by one through getAndAdvanceChar, which is handed the token.
    if (CurPtr - StartPtr == (ptrdiff_t)(Buffer.size() + 3))
      StartPtr = CurPtr;
    else
      while (StartPtr != CurPtr)
        (void)getAndAdvanceChar(StartPtr, *Result);
  } else {
    StartPtr = CurPtr;
  }
  return *Res;
}
uint32_t Lexer::tryReadUCN(const char *&StartPtr, const char *SlashLoc,
Token *Result) {
unsigned CharSize;
llvm::Optional<uint32_t> CodePointOpt;
char Kind = getCharAndSize(StartPtr, CharSize);
if (Kind == 'u' || Kind == 'U')
CodePointOpt = tryReadNumericUCN(StartPtr, SlashLoc, Result);
else if (Kind == 'N')
CodePointOpt = tryReadNamedUCN(StartPtr, Result);
if (!CodePointOpt)
return 0;
uint32_t CodePoint = *CodePointOpt;
// Don't apply C family restrictions to UCNs in assembly mode
if (LangOpts.AsmPreprocessor)

View File

@ -27,6 +27,7 @@
#include "llvm/Support/ConvertUTF.h"
#include "llvm/Support/Error.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/Unicode.h"
#include <algorithm>
#include <cassert>
#include <cstddef>
@ -233,7 +234,8 @@ static unsigned ProcessCharEscape(const char *ThisTokBegin,
HadError = true;
if (Diags)
Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf,
diag::err_delimited_escape_missing_brace);
diag::err_delimited_escape_missing_brace)
<< "o";
break;
}
@ -309,7 +311,8 @@ static unsigned ProcessCharEscape(const char *ThisTokBegin,
<< tok::r_brace;
else if (!HadError) {
Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf,
diag::ext_delimited_escape_sequence);
diag::ext_delimited_escape_sequence)
<< /*delimited*/ 0;
}
}
@ -335,7 +338,7 @@ void clang::expandUCNs(SmallVectorImpl<char> &Buf, StringRef Input) {
char Kind = *I;
++I;
assert(Kind == 'u' || Kind == 'U');
assert(Kind == 'u' || Kind == 'U' || Kind == 'N');
uint32_t CodePoint = 0;
if (Kind == 'u' && *I == '{') {
@ -349,6 +352,22 @@ void clang::expandUCNs(SmallVectorImpl<char> &Buf, StringRef Input) {
continue;
}
if (Kind == 'N') {
assert(*I == '{');
++I;
auto Delim = std::find(I, Input.end(), '}');
assert(Delim != Input.end());
llvm::Optional<llvm::sys::unicode::LooseMatchingResult> Res =
llvm::sys::unicode::nameToCodepointLooseMatching(
StringRef(I, std::distance(I, Delim)));
assert(Res);
CodePoint = Res->CodePoint;
assert(CodePoint != 0xFFFFFFFF);
appendCodePoint(CodePoint, Buf);
I = Delim;
continue;
}
unsigned NumHexDigits;
if (Kind == 'u')
NumHexDigits = 4;
@ -370,23 +389,20 @@ void clang::expandUCNs(SmallVectorImpl<char> &Buf, StringRef Input) {
}
}
/// ProcessUCNEscape - Read the Universal Character Name, check constraints and
/// return the UTF32.
static bool ProcessUCNEscape(const char *ThisTokBegin, const char *&ThisTokBuf,
const char *ThisTokEnd,
uint32_t &UcnVal, unsigned short &UcnLen,
FullSourceLoc Loc, DiagnosticsEngine *Diags,
const LangOptions &Features,
bool in_char_string_literal = false) {
static bool ProcessNumericUCNEscape(const char *ThisTokBegin,
const char *&ThisTokBuf,
const char *ThisTokEnd, uint32_t &UcnVal,
unsigned short &UcnLen, bool &Delimited,
FullSourceLoc Loc, DiagnosticsEngine *Diags,
const LangOptions &Features,
bool in_char_string_literal = false) {
const char *UcnBegin = ThisTokBuf;
bool HasError = false;
bool EndDelimiterFound = false;
// Skip the '\u' char's.
ThisTokBuf += 2;
bool Delimited = false;
bool EndDelimiterFound = false;
bool HasError = false;
Delimited = false;
if (UcnBegin[1] == 'u' && in_char_string_literal &&
ThisTokBuf != ThisTokEnd && *ThisTokBuf == '{') {
Delimited = true;
@ -394,7 +410,8 @@ static bool ProcessUCNEscape(const char *ThisTokBegin, const char *&ThisTokBuf,
} else if (ThisTokBuf == ThisTokEnd || !isHexDigit(*ThisTokBuf)) {
if (Diags)
Diag(Diags, Features, Loc, ThisTokBegin, UcnBegin, ThisTokBuf,
diag::err_hex_escape_no_digits) << StringRef(&ThisTokBuf[-1], 1);
diag::err_hex_escape_no_digits)
<< StringRef(&ThisTokBuf[-1], 1);
return false;
}
UcnLen = (ThisTokBuf[-1] == 'u' ? 4 : 8);
@ -455,7 +472,136 @@ static bool ProcessUCNEscape(const char *ThisTokBegin, const char *&ThisTokBuf,
: diag::err_ucn_escape_incomplete);
return false;
}
return !HasError;
}
/// Report that \p Name is not a valid Unicode character name and attach the
/// most helpful notes available.
///
/// If the name matches a character under loose matching, a single note with a
/// fix-it to the matched name is emitted. Otherwise up to five nearest
/// candidates are requested and one "did you mean" note with a fix-it is
/// emitted per candidate, stopping at the first candidate whose distance
/// differs from the previous one by more than 3.
///
/// \p TokRangeBegin and \p TokRangeEnd delimit the range that diagnostics and
/// fix-its refer to within the token starting at \p TokBegin.
static void DiagnoseInvalidUnicodeCharacterName(
    DiagnosticsEngine *Diags, const LangOptions &Features, FullSourceLoc Loc,
    const char *TokBegin, const char *TokRangeBegin, const char *TokRangeEnd,
    llvm::StringRef Name) {
  namespace u = llvm::sys::unicode;

  Diag(Diags, Features, Loc, TokBegin, TokRangeBegin, TokRangeEnd,
       diag::err_invalid_ucn_name)
      << Name;

  // A loose match is the best suggestion there is: report it and stop.
  llvm::Optional<u::LooseMatchingResult> Loose =
      u::nameToCodepointLooseMatching(Name);
  if (Loose) {
    Diag(Diags, Features, Loc, TokBegin, TokRangeBegin, TokRangeEnd,
         diag::note_invalid_ucn_name_loose_matching)
        << FixItHint::CreateReplacement(
               MakeCharSourceRange(Features, Loc, TokBegin, TokRangeBegin,
                                   TokRangeEnd),
               Loose->Name);
    return;
  }

  SmallVector<u::MatchForCodepointName> Candidates =
      u::nearestMatchesForCodepointName(Name, 5);
  assert(!Candidates.empty() && "No unicode characters found");

  // Distance of the previously reported candidate; 0 until the first one has
  // been seen.
  unsigned PreviousDistance = 0;
  for (const auto &Candidate : Candidates) {
    if (PreviousDistance == 0)
      PreviousDistance = Candidate.Distance;

    const unsigned Gap = PreviousDistance > Candidate.Distance
                             ? PreviousDistance - Candidate.Distance
                             : Candidate.Distance - PreviousDistance;
    if (Gap > 3)
      break;
    PreviousDistance = Candidate.Distance;

    // Render the candidate character itself as UTF-8 for the note.
    llvm::UTF32 CodeUnit = Candidate.Value;
    std::string Rendered;
    LLVM_ATTRIBUTE_UNUSED bool Converted = llvm::convertUTF32ToUTF8String(
        llvm::ArrayRef<llvm::UTF32>(&CodeUnit, 1), Rendered);
    assert(Converted && "Found a match wich is not a unicode character");

    Diag(Diags, Features, Loc, TokBegin, TokRangeBegin, TokRangeEnd,
         diag::note_invalid_ucn_name_candidate)
        << Candidate.Name << llvm::utohexstr(Candidate.Value)
        << Rendered // FIXME: Fix the rendering of non printable characters
        << FixItHint::CreateReplacement(
               MakeCharSourceRange(Features, Loc, TokBegin, TokRangeBegin,
                                   TokRangeEnd),
               Candidate.Name);
  }
}
/// ProcessNamedUCNEscape - Read a named universal character escape of the
/// form \N{NAME} and look the name up using strict matching.
///
/// \param ThisTokBegin Start of the token; used to compute diagnostic
///        locations.
/// \param ThisTokBuf On entry points at the backslash of the escape. On
///        return it has been moved past the part of the escape that was
///        consumed; it is never moved beyond \p ThisTokEnd.
/// \param ThisTokEnd End of the characters that may be read.
/// \param UcnVal Set to the code point on success.
/// \param UcnLen Set on success to 4 when the code point fits in 16 bits and
///        to 8 otherwise.
/// \param Diags Diagnostics engine; when null no diagnostics are emitted.
///
/// \return true if NAME designates a character, false otherwise.
static bool ProcessNamedUCNEscape(const char *ThisTokBegin,
                                  const char *&ThisTokBuf,
                                  const char *ThisTokEnd, uint32_t &UcnVal,
                                  unsigned short &UcnLen, FullSourceLoc Loc,
                                  DiagnosticsEngine *Diags,
                                  const LangOptions &Features) {
  const char *UcnBegin = ThisTokBuf;
  assert(UcnBegin[0] == '\\' && UcnBegin[1] == 'N');
  // Skip the '\N'.
  ThisTokBuf += 2;
  if (ThisTokBuf == ThisTokEnd || *ThisTokBuf != '{') {
    if (Diags) {
      Diag(Diags, Features, Loc, ThisTokBegin, UcnBegin, ThisTokBuf,
           diag::err_delimited_escape_missing_brace)
          << StringRef(&ThisTokBuf[-1], 1);
    }
    // Skip the unexpected character, but do not step past the end of the
    // token when the escape was the very last thing in it.
    if (ThisTokBuf != ThisTokEnd)
      ThisTokBuf++;
    return false;
  }
  // Skip the '{'.
  ThisTokBuf++;

  // Find the first character that cannot be part of a name; for a well-formed
  // escape this is the closing brace.
  const char *ClosingBrace =
      std::find_if_not(ThisTokBuf, ThisTokEnd, [](char C) {
        return llvm::isAlnum(C) || llvm::isSpace(C) || C == '_' || C == '-';
      });
  bool Incomplete = ClosingBrace == ThisTokEnd || *ClosingBrace != '}';
  bool Empty = ClosingBrace == ThisTokBuf;
  if (Incomplete || Empty) {
    if (Diags) {
      Diag(Diags, Features, Loc, ThisTokBegin, UcnBegin, ThisTokBuf,
           Incomplete ? diag::err_ucn_escape_incomplete
                      : diag::err_delimited_escape_empty)
          << StringRef(&UcnBegin[1], 1);
    }
    // Resume after the character that ended the scan, clamped to the end of
    // the token.
    ThisTokBuf = ClosingBrace == ThisTokEnd ? ClosingBrace : ClosingBrace + 1;
    return false;
  }

  StringRef Name(ThisTokBuf, ClosingBrace - ThisTokBuf);
  ThisTokBuf = ClosingBrace + 1;
  llvm::Optional<char32_t> Res =
      llvm::sys::unicode::nameToCodepointStrict(Name);
  if (!Res) {
    // &UcnBegin[3] is the first character of the name, right after '\N{'.
    if (Diags)
      DiagnoseInvalidUnicodeCharacterName(Diags, Features, Loc, ThisTokBegin,
                                          &UcnBegin[3], ClosingBrace, Name);
    return false;
  }
  UcnVal = *Res;
  UcnLen = UcnVal > 0xFFFF ? 8 : 4;
  return true;
}
/// ProcessUCNEscape - Read the Universal Character Name, check constraints and
/// return the UTF32.
static bool ProcessUCNEscape(const char *ThisTokBegin, const char *&ThisTokBuf,
const char *ThisTokEnd, uint32_t &UcnVal,
unsigned short &UcnLen, FullSourceLoc Loc,
DiagnosticsEngine *Diags,
const LangOptions &Features,
bool in_char_string_literal = false) {
bool HasError;
const char *UcnBegin = ThisTokBuf;
bool IsDelimitedEscapeSequence = false;
bool IsNamedEscapeSequence = false;
if (ThisTokBuf[1] == 'N') {
IsNamedEscapeSequence = true;
HasError = !ProcessNamedUCNEscape(ThisTokBegin, ThisTokBuf, ThisTokEnd,
UcnVal, UcnLen, Loc, Diags, Features);
} else {
HasError =
!ProcessNumericUCNEscape(ThisTokBegin, ThisTokBuf, ThisTokEnd, UcnVal,
UcnLen, IsDelimitedEscapeSequence, Loc, Diags,
Features, in_char_string_literal);
}
if (HasError)
return false;
@ -493,9 +639,10 @@ static bool ProcessUCNEscape(const char *ThisTokBegin, const char *&ThisTokBuf,
Diag(Diags, Features, Loc, ThisTokBegin, UcnBegin, ThisTokBuf,
diag::warn_ucn_not_valid_in_c89_literal);
if (Delimited && Diags)
if ((IsDelimitedEscapeSequence || IsNamedEscapeSequence) && Diags)
Diag(Diags, Features, Loc, ThisTokBegin, UcnBegin, ThisTokBuf,
diag::ext_delimited_escape_sequence);
diag::ext_delimited_escape_sequence)
<< (IsNamedEscapeSequence ? 1 : 0);
return true;
}
@ -1559,7 +1706,7 @@ CharLiteralParser::CharLiteralParser(const char *begin, const char *end,
continue;
}
// Is this a Universal Character Name escape?
if (begin[1] == 'u' || begin[1] == 'U') {
if (begin[1] == 'u' || begin[1] == 'U' || begin[1] == 'N') {
unsigned short UcnLen = 0;
if (!ProcessUCNEscape(TokBegin, begin, end, *buffer_begin, UcnLen,
FullSourceLoc(Loc, PP.getSourceManager()),
@ -1919,7 +2066,8 @@ void StringLiteralParser::init(ArrayRef<Token> StringToks){
continue;
}
// Is this a Universal Character Name escape?
if (ThisTokBuf[1] == 'u' || ThisTokBuf[1] == 'U') {
if (ThisTokBuf[1] == 'u' || ThisTokBuf[1] == 'U' ||
ThisTokBuf[1] == 'N') {
EncodeUCNEscape(ThisTokBegin, ThisTokBuf, ThisTokEnd,
ResultPtr, hadError,
FullSourceLoc(StringToks[i].getLocation(), SM),
@ -2112,7 +2260,8 @@ unsigned StringLiteralParser::getOffsetOfStringByte(const Token &Tok,
// Otherwise, this is an escape character. Advance over it.
bool HadError = false;
if (SpellingPtr[1] == 'u' || SpellingPtr[1] == 'U') {
if (SpellingPtr[1] == 'u' || SpellingPtr[1] == 'U' ||
SpellingPtr[1] == 'N') {
const char *EscapePtr = SpellingPtr;
unsigned Len = MeasureUCNEscape(SpellingStart, SpellingPtr, SpellingEnd,
1, Features, HadError);

View File

@ -0,0 +1,29 @@
// RUN: not %clang_cc1 -fsyntax-only -fdiagnostics-parseable-fixits %s 2>&1 | FileCheck -check-prefix=CHECK-MACHINE %s
const char*
\N{GREEK_SMALL_LETTER-OMICRON} = // expected-error {{'GREEK_SMALL_LETTER-OMICRON' is not a valid Unicode character name}} \
// expected-note {{sensitive to case and whitespaces}}
// CHECK-MACHINE: fix-it:"{{.*}}":{[[@LINE-2]]:4-[[@LINE-2]]:30}:"GREEK SMALL LETTER OMICRON"
"\N{zero width no break space}" // expected-error {{'zero width no break space' is not a valid Unicode character name}} \
// expected-note {{sensitive to case and whitespaces}}
// CHECK-MACHINE: fix-it:"{{.*}}":{[[@LINE-2]]:5-[[@LINE-2]]:30}:"ZERO WIDTH NO-BREAK SPACE"
"abc\N{MAN IN A BUSINESS SUIT LEVITATING}" // expected-error {{'MAN IN A BUSINESS SUIT LEVITATING' is not a valid Unicode character name}} \
// expected-note {{did you mean MAN IN BUSINESS SUIT LEVITATING ('🕴' U+1F574)?}}
// CHECK-MACHINE: fix-it:"{{.*}}":{[[@LINE-2]]:8-[[@LINE-2]]:41}:"MAN IN BUSINESS SUIT LEVITATING"
"\N{AAA}" // expected-error {{'AAA' is not a valid Unicode character name}} \
// expected-note 5{{did you mean}}
// CHECK-MACHINE: fix-it:"{{.*}}":{[[@LINE-2]]:5-[[@LINE-2]]:8}:"ANT"
// CHECK-MACHINE: fix-it:"{{.*}}":{[[@LINE-3]]:5-[[@LINE-3]]:8}:"ARC"
// CHECK-MACHINE: fix-it:"{{.*}}":{[[@LINE-4]]:5-[[@LINE-4]]:8}:"AXE"
// CHECK-MACHINE: fix-it:"{{.*}}":{[[@LINE-5]]:5-[[@LINE-5]]:8}:"BAT"
// CHECK-MACHINE: fix-it:"{{.*}}":{[[@LINE-6]]:5-[[@LINE-6]]:8}:"CAT"
"\N{BLACKCHESSBISHOP}" // expected-error {{'BLACKCHESSBISHOP' is not a valid Unicode character name}} \
// expected-note {{sensitive to case and whitespaces}}
// CHECK-MACHINE: fix-it:"{{.*}}":{[[@LINE-2]]:5-[[@LINE-2]]:21}:"BLACK CHESS BISHOP"
;

View File

@ -2,17 +2,20 @@
// RUN: %clang_cc1 -x c -std=gnu11 -fsyntax-only -pedantic -verify %s
// RUN: %clang_cc1 -x c++ -std=gnu++11 -fwchar-type=short -fno-signed-wchar -fsyntax-only -pedantic -verify %s
// RUN: %clang_cc1 -x c -std=gnu11 -fwchar-type=short -fno-signed-wchar -fsyntax-only -pedantic -verify %s
// RUN: %clang_cc1 -x c++ -std=c++17 -ftrigraphs -fsyntax-only -pedantic -verify -DTRIGRAPHS=1 %s
const char *errors =
"\u{}" //expected-error {{delimited escape sequence cannot be empty}}
"\u{" //expected-error {{expected '}'}}
"\u{h}" //expected-error {{invalid digit 'h' in escape sequence}}
"\x{}" //expected-error {{delimited escape sequence cannot be empty}}
"\x{" //expected-error {{expected '}'}}
"\x{h}" //expected-error {{invalid digit 'h' in escape sequence}}
"\o{}" //expected-error {{delimited escape sequence cannot be empty}}
"\o{" //expected-error {{expected '}'}}
"\o{8}" //expected-error {{invalid digit '8' in escape sequence}}
"\u{}" // expected-error {{delimited escape sequence cannot be empty}}
"\u{" // expected-error {{expected '}'}}
"\u{h}" // expected-error {{invalid digit 'h' in escape sequence}}
"\x{}" // expected-error {{delimited escape sequence cannot be empty}}
"\x{" // expected-error {{expected '}'}}
"\x{h}" // expected-error {{invalid digit 'h' in escape sequence}}
"\o{}" // expected-error {{delimited escape sequence cannot be empty}}
"\o{" // expected-error {{expected '}'}}
"\o" // expected-error {{expected '{' after '\o' escape sequence}}
"\o{8}" // expected-error {{invalid digit '8' in escape sequence}}
"\U{8}" // expected-error {{\U used with no following hex digits}}
;
void ucn(void) {
@ -70,6 +73,30 @@ void concat(void) {
(void)"\o{12" "}"; // expected-error {{expected '}'}}
}
void named(void) {
char a = '\N{LOTUS}'; // expected-error{{character too large for enclosing character literal type}} \
// expected-warning {{extension}}
char b = '\N{DOLLAR SIGN}'; // expected-warning {{extension}}
char b_ = '\N{ DOL-LAR _SIGN }'; // expected-error {{' DOL-LAR _SIGN ' is not a valid Unicode character name}} \
// expected-note {{characters names in Unicode escape sequences are sensitive to case and whitespaces}}
char c = '\N{NOTATHING}'; // expected-error {{'NOTATHING' is not a valid Unicode character name}} \
// expected-note 5{{did you mean}}
char d = '\N{}'; // expected-error {{delimited escape sequence cannot be empty}}
char e = '\N{'; // expected-error {{incomplete universal character name}}
unsigned f = L'\N{GREEK CAPITAL LETTER DELTA}'; // expected-warning {{extension}}
unsigned g = u'\N{LOTUS}'; // expected-error {{character too large for enclosing character literal type}} \
// expected-warning {{extension}}
unsigned h = U'\N{LOTUS}'; // expected-warning {{extension}}
unsigned i = u'\N{GREEK CAPITAL LETTER DELTA}'; // expected-warning {{extension}}
char j = '\NN'; // expected-error {{expected '{' after '\N' escape sequence}}
unsigned k = u'\N{LOTUS'; // expected-error {{incomplete universal character name}}
}
void separators(void) {
(void)"\x{12'3}"; // expected-error {{invalid digit ''' in escape sequence}}
(void)"\u{12'3}"; // expected-error {{invalid digit ''' in escape sequence}}
@ -79,3 +106,12 @@ void separators(void) {
// expected-error@-1 2{{expected ';'}}
// expected-warning@-2 3{{expression result unused}}
}
#if L'\N{GREEK CAPITAL LETTER GAMMA}' != L'Γ' // expected-warning {{extension}}
#error "oh no!"
#endif
#ifdef TRIGRAPHS
static_assert('\N??<DOLLAR SIGN??>' == '$'); // expected-warning 2{{trigraph converted}} \
// expected-warning {{named escape sequences are a Clang extension}}
#endif

View File

@ -39,9 +39,14 @@ extern int 𐠈;
extern int ;
extern int \u1B4C; // BALINESE LETTER ARCHAIC JNYA - Added in Unicode 14
extern int \U00016AA2; // TANGSA LETTER GA - Added in Unicode 14
extern int _\N{TANGSA LETTER GA};
extern int _\N{TANGSALETTERGA}; // expected-error {{'TANGSALETTERGA' is not a valid Unicode character name}} \
// expected-note {{characters names in Unicode escape sequences are sensitive to case and whitespace}}
// This character doesn't have the XID_Start property
extern int \U00016AC0; // TANGSA DIGIT ZERO // expected-error {{expected unqualified-id}}
extern int _\U00016AC0; // TANGSA DIGIT ZERO
extern int 🌹; // expected-error {{unexpected character <U+1F339>}} \
expected-warning {{declaration does not declare anything}}

View File

@ -131,6 +131,7 @@ int operator""_\u212e""_\U0000212e""_""(const char*, size_t);
int operator""_\U0000212e""_""_\u212e""(const char*, size_t);
int operator""_\u{212f}(char);
int operator""_\N{SCRIPT SMALL E}(char);
int mix_ucn_utf8 = ""_""_\u212e""_\U0000212e"";

View File

@ -1,5 +1,6 @@
// RUN: %clang_cc1 %s -fsyntax-only -std=c99 -pedantic -verify -Wundef
// RUN: %clang_cc1 %s -fsyntax-only -x c++ -pedantic -verify -Wundef
// RUN: %clang_cc1 %s -fsyntax-only -x c++ -pedantic -verify -Wundef -ftrigraphs -DTRIGRAPHS=1
// RUN: not %clang_cc1 %s -fsyntax-only -std=c99 -pedantic -Wundef 2>&1 | FileCheck -strict-whitespace %s
#define \u00FC
@ -29,9 +30,14 @@
// Make sure we reject disallowed UCNs
#define \ufffe // expected-error {{macro name must be an identifier}}
#define \U10000000 // expected-error {{macro name must be an identifier}}
#define \u0061 // expected-error {{character 'a' cannot be specified by a universal character name}} expected-error {{macro name must be an identifier}}
#define \u{fffe} // expected-error {{macro name must be an identifier}} expected-warning {{Clang extension}}
#define \U10000000 // expected-error {{macro name must be an identifier}}
#define \u0061 // expected-error {{character 'a' cannot be specified by a universal character name}} expected-error {{macro name must be an identifier}}
#define \u{fffe} // expected-error {{macro name must be an identifier}} expected-warning {{Clang extension}}
#define \N{ALERT} // expected-error {{universal character name refers to a control character}} \
// expected-error {{macro name must be an identifier}} \
// expected-warning {{Clang extension}}
#define \N{WASTEBASKET} // expected-error {{macro name must be an identifier}} \
// expected-warning {{Clang extension}}
#define a\u0024
@ -113,3 +119,20 @@ C 1
#define \u{123456789} // expected-error {{hex escape sequence out of range}} expected-error {{macro name must be an identifier}}
#define \u{ // expected-warning {{incomplete delimited universal character name; treating as '\' 'u' '{' identifier}} expected-error {{macro name must be an identifier}}
#define \u{fgh} // expected-warning {{incomplete delimited universal character name; treating as '\' 'u' '{' identifier}} expected-error {{macro name must be an identifier}}
#define \N{ // expected-warning {{incomplete delimited universal character name; treating as '\' 'N' '{' identifier}} expected-error {{macro name must be an identifier}}
#define \N{} // expected-warning {{empty delimited universal character name; treating as '\' 'N' '{' '}'}} expected-error {{macro name must be an identifier}}
#define \N{NOTATHING} // expected-error {{'NOTATHING' is not a valid Unicode character name}} \
// expected-error {{macro name must be an identifier}}
#define \NN // expected-warning {{incomplete universal character name; treating as '\' followed by identifier}} expected-error {{macro name must be an identifier}}
#define \N{GREEK_SMALL-LETTERALPHA} // expected-error {{'GREEK_SMALL-LETTERALPHA' is not a valid Unicode character name}} \
// expected-note {{characters names in Unicode escape sequences are sensitive to case and whitespaces}}
#define CONCAT(A, B) A##B
int CONCAT(\N{GREEK, CAPITALLETTERALPHA}); // expected-error{{expected}} \
// expected-warning {{incomplete delimited universal character name}}
#ifdef TRIGRAPHS
int \N??<GREEK CAPITAL LETTER ALPHA??> = 0; // expected-warning{{amed escape sequences are a Clang extension}} \
// expected-warning 2{{trigraph converted}}
#endif

View File

@ -18,6 +18,7 @@ void goodCalls(void) {
über(2);
\U000000FCber(3);
\u{FC}ber(4); // expected-warning {{Clang extension}}
\N{LATIN SMALL LETTER U WITH DIAERESIS}ber(4); // expected-warning {{Clang extension}}
}
void badCalls(void) {

View File

@ -1107,6 +1107,7 @@ if( LLVM_INCLUDE_UTILS )
add_subdirectory(utils/PerfectShuffle)
add_subdirectory(utils/count)
add_subdirectory(utils/not)
add_subdirectory(utils/UnicodeData)
add_subdirectory(utils/yaml-bench)
else()
if ( LLVM_INCLUDE_TESTS )

View File

@ -14,6 +14,10 @@
#ifndef LLVM_SUPPORT_UNICODE_H
#define LLVM_SUPPORT_UNICODE_H
#include "llvm/ADT/Optional.h"
#include "llvm/ADT/SmallString.h"
#include <string>
namespace llvm {
class StringRef;
@ -63,6 +67,30 @@ int columnWidthUTF8(StringRef Text);
/// rules.
int foldCharSimple(int C);
/// Maps the name or the alias of a Unicode character to its associated
/// codepoints.
/// The names and aliases are derived from UnicodeData.txt and NameAliases.txt
/// For compatibility with the semantics of named character escape sequences in
/// C++, this mapping does an exact match sensitive to casing and spacing.
/// \return The codepoint of the corresponding character, if any.
Optional<char32_t> nameToCodepointStrict(StringRef Name);
/// Result of a successful loose character-name lookup.
struct LooseMatchingResult {
  /// Code point of the character that was matched.
  char32_t CodePoint;
  /// Name associated with the match.
  /// NOTE(review): callers use this as fix-it replacement text, so it is
  /// presumably the exact name of the matched character - confirm against
  /// the implementation.
  SmallString<64> Name;
};
/// Maps the name of a Unicode character to its code point using loose
/// matching (ignoring case, spaces, underscores and medial hyphens).
/// \return The matched code point and name, if any.
Optional<LooseMatchingResult> nameToCodepointLooseMatching(StringRef Name);
/// One candidate produced by nearestMatchesForCodepointName.
struct MatchForCodepointName {
  /// Name of the candidate character.
  std::string Name;
  /// Distance between the searched pattern and Name.
  /// NOTE(review): presumably an edit distance where smaller means closer -
  /// confirm against the implementation.
  uint32_t Distance = 0;
  /// Code point of the candidate character.
  char32_t Value = 0;
};
/// Finds the character names that best match \p Pattern.
/// NOTE(review): \p MaxMatchesCount presumably caps the number of returned
/// candidates - confirm against the implementation.
SmallVector<MatchForCodepointName>
nearestMatchesForCodepointName(StringRef Pattern, std::size_t MaxMatchesCount);
} // namespace unicode
} // namespace sys
} // namespace llvm

View File

@ -221,6 +221,8 @@ add_llvm_component_library(LLVMSupport
TypeSize.cpp
Unicode.cpp
UnicodeCaseFold.cpp
UnicodeNameToCodepoint.cpp
UnicodeNameToCodepointGenerated.cpp
VersionTuple.cpp
VirtualFileSystem.cpp
WithColor.cpp

View File

@ -0,0 +1,551 @@
//===- llvm/Support/UnicodeNameToCodepoint.cpp - Unicode character properties
//-*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements functions to map the name or alias of a unicode
// character to its codepoint.
//
//===----------------------------------------------------------------------===//
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/Support/Unicode.h"
namespace llvm {
namespace sys {
namespace unicode {
extern const char *UnicodeNameToCodepointDict;
extern const uint8_t *UnicodeNameToCodepointIndex;
extern const std::size_t UnicodeNameToCodepointIndexSize;
extern const std::size_t UnicodeNameToCodepointLargestNameSize;
using BufferType = SmallString<64>;
/// A node decoded from the name-to-codepoint index. The full name of a
/// character is the concatenation of the Name fragments found along the
/// Parent chain, outermost ancestor first.
struct Node {
  /// True only for the synthetic root node.
  bool IsRoot = false;
  /// Code point attached to this node; 0xFFFFFFFF when none was read.
  char32_t Value = 0xFFFFFFFF;
  /// Offset of this node's children in the index; 0 when none was read.
  uint32_t ChildrenOffset = 0;
  /// Sibling flag as read from the index.
  bool HasSibling = false;
  /// Number of index bytes consumed while decoding this node.
  uint32_t Size = 0;
  /// Fragment of the character name contributed by this node.
  StringRef Name;
  /// Node this one was reached from, if any.
  const Node *Parent = nullptr;

  constexpr bool isValid() const {
    return Value == 0xFFFFFFFF || !Name.empty();
  }

  constexpr bool hasChildren() const { return IsRoot || ChildrenOffset != 0; }

  /// Builds the complete name by walking up the Parent chain and placing each
  /// ancestor's fragment in front of what has been collected so far.
  std::string fullName() const {
    std::string Result;
    // Reserve enough space for most unicode code points.
    // The chosen value represent the 99th percentile of name size as of
    // Unicode 14.
    Result.reserve(46);
    for (const Node *Cur = this; Cur; Cur = Cur->Parent)
      Result.insert(Result.begin(), Cur->Name.begin(), Cur->Name.end());
    return Result;
  }
};
/// Builds the synthetic root node: flagged as root, one byte in size, with
/// its children starting at index offset 1.
static Node createRoot() {
  Node Root;
  Root.Size = 1;
  Root.ChildrenOffset = 1;
  Root.IsRoot = true;
  return Root;
}
/// Decodes the node stored at \p Offset in UnicodeNameToCodepointIndex.
///
/// Layout, as decoded below:
///  - 1 byte: bit 7 = node carries a value, bit 6 = long name,
///    bits 0-5 = Size.
///  - long name: 2 bytes (big endian) giving the offset of the name in
///    UnicodeNameToCodepointDict, the name being Size characters long;
///    otherwise the name is the single dictionary character at offset Size.
///  - with a value: 3 bytes (big endian); the upper 21 bits are the code
///    point, bit 1 = has children, bit 0 = has sibling; when it has children
///    a 3-byte (big endian) children offset follows.
///  - without a value: 1 byte with bit 7 = has sibling, bit 6 = has children
///    and, when it has children, bits 0-5 as the top bits of the children
///    offset, completed by 2 more bytes.
///
/// \param Offset Position of the node in the index; 0 designates the root.
/// \param Parent Node the returned node is attached to.
/// \return The decoded node. If the node cannot lie within the index, a
///         default-initialised node (with only Parent set) is returned.
static Node readNode(uint32_t Offset, const Node *Parent = nullptr) {
  if (Offset == 0)
    return createRoot();

  uint32_t Origin = Offset;
  Node N;
  N.Parent = Parent;

  // Validate the offset before reading anything from the index. This is the
  // same condition as checking "Offset + 6" after the first byte has been
  // consumed, but it does not dereference an out-of-range offset.
  if (Offset + 7 >= UnicodeNameToCodepointIndexSize)
    return N;

  uint8_t NameInfo = UnicodeNameToCodepointIndex[Offset++];

  bool LongName = NameInfo & 0x40;
  bool HasValue = NameInfo & 0x80;
  std::size_t Size = NameInfo & ~0xC0;
  if (LongName) {
    uint32_t NameOffset = (UnicodeNameToCodepointIndex[Offset++] << 8);
    NameOffset |= UnicodeNameToCodepointIndex[Offset++];
    N.Name = StringRef(UnicodeNameToCodepointDict + NameOffset, Size);
  } else {
    N.Name = StringRef(UnicodeNameToCodepointDict + Size, 1);
  }
  if (HasValue) {
    uint8_t H = UnicodeNameToCodepointIndex[Offset++];
    uint8_t M = UnicodeNameToCodepointIndex[Offset++];
    uint8_t L = UnicodeNameToCodepointIndex[Offset++];
    // The three low bits of the 24-bit field are not part of the code point.
    N.Value = ((H << 16) | (M << 8) | L) >> 3;

    bool HasChildren = L & 0x02;
    N.HasSibling = L & 0x01;

    if (HasChildren) {
      N.ChildrenOffset = UnicodeNameToCodepointIndex[Offset++] << 16;
      N.ChildrenOffset |= UnicodeNameToCodepointIndex[Offset++] << 8;
      N.ChildrenOffset |= UnicodeNameToCodepointIndex[Offset++];
    }
  } else {
    uint8_t H = UnicodeNameToCodepointIndex[Offset++];
    N.HasSibling = H & 0x80;
    bool HasChildren = H & 0x40;
    // Keep only the bits that belong to the children offset.
    H &= ~0xC0;
    if (HasChildren) {
      N.ChildrenOffset = (H << 16);
      N.ChildrenOffset |=
          (uint32_t(UnicodeNameToCodepointIndex[Offset++]) << 8);
      N.ChildrenOffset |= UnicodeNameToCodepointIndex[Offset++];
    }
  }
  N.Size = Offset - Origin;
  return N;
}
// Checks whether Name begins with Needle and stores in Consummed how many
// characters of Name were walked over (this is written on failure too).
//
// In Strict mode this is a plain, case-sensitive prefix comparison. Otherwise
// the comparison ignores case and skips spaces, underscores and medial hyphens
// on both sides (UAX44-LM2).
//
// PreviousCharInName / PreviousCharInNeedle hold the last character seen on
// each side. They are in/out so that a hyphen at the start of a fragment can
// still be recognised as medial across successive calls; both are restored to
// their incoming values when the needle was not fully matched.
// IsPrefix additionally lets a hyphen that ends Needle (after an alphanumeric
// character) be skipped.
static bool startsWith(StringRef Name, StringRef Needle, bool Strict,
std::size_t &Consummed, char &PreviousCharInName,
char &PreviousCharInNeedle, bool IsPrefix = false) {
Consummed = 0;
if (Strict) {
if (!Name.startswith(Needle))
return false;
Consummed = Needle.size();
return true;
}
// An empty needle matches without consuming anything.
if (Needle.empty())
return true;
auto NamePos = Name.begin();
auto NeedlePos = Needle.begin();
// Saved so the in/out state can be rolled back on a failed match.
char PreviousCharInNameOrigin = PreviousCharInName;
char PreviousCharInNeedleOrigin = PreviousCharInNeedle;
// Advances It past every ignorable character and returns the position of the
// first significant one (or End). PreviousChar is updated with each character
// examined, including the significant one the scan stops on.
auto IgnoreSpaces = [](auto It, auto End, char &PreviousChar,
bool IgnoreEnd = false) {
while (It != End) {
const auto Next = std::next(It);
// Ignore spaces, underscore, medial hyphens
// https://unicode.org/reports/tr44/#UAX44-LM2.
bool Ignore =
*It == ' ' || *It == '_' ||
(*It == '-' && isAlnum(PreviousChar) &&
((Next != End && isAlnum(*Next)) || (Next == End && IgnoreEnd)));
PreviousChar = *It;
if (!Ignore)
break;
++It;
}
return It;
};
// Compare the significant characters of both strings pairwise until the
// needle is exhausted (match), the name is exhausted, or a mismatch is found.
while (true) {
NamePos = IgnoreSpaces(NamePos, Name.end(), PreviousCharInName);
NeedlePos =
IgnoreSpaces(NeedlePos, Needle.end(), PreviousCharInNeedle, IsPrefix);
if (NeedlePos == Needle.end())
break;
if (NamePos == Name.end())
break;
if (toUpper(*NeedlePos) != toUpper(*NamePos))
break;
NeedlePos++;
NamePos++;
}
Consummed = std::distance(Name.begin(), NamePos);
if (NeedlePos != Needle.end()) {
PreviousCharInName = PreviousCharInNameOrigin;
PreviousCharInNeedle = PreviousCharInNeedleOrigin;
}
return NeedlePos == Needle.end();
}
// Recursively matches Name against the trie node stored at Offset and its
// descendants.
//
// Returns {node read at Offset, whether a full match was found, matched code
// point}. A match requires the whole of Name to be consumed on a node that
// carries a value.
// PreviousCharInName / PreviousCharInNeedle are taken by value, so every
// sibling tried at one level starts from the same state.
// On a successful match through a child, that child's name fragment is
// appended to Buffer in reverse; Buffer therefore ends up holding the matched
// name reversed, leaf fragment first.
static std::tuple<Node, bool, uint32_t>
compareNode(uint32_t Offset, StringRef Name, bool Strict,
char PreviousCharInName, char PreviousCharInNeedle,
BufferType &Buffer, const Node *Parent = nullptr) {
Node N = readNode(Offset, Parent);
std::size_t Consummed = 0;
// The root has no name fragment of its own, so it always "matches".
bool DoesStartWith =
N.IsRoot || startsWith(Name, N.Name, Strict, Consummed,
PreviousCharInName, PreviousCharInNeedle);
if (!DoesStartWith)
return {N, false, 0};
// Everything consumed and this node carries a code point: exact hit.
if (Name.size() - Consummed == 0 && N.Value != 0xFFFFFFFF)
return {N, true, N.Value};
if (N.hasChildren()) {
uint32_t ChildOffset = N.ChildrenOffset;
// Siblings are stored back to back; try each one with the unconsumed rest of
// the name.
for (;;) {
Node C;
bool Matches;
uint32_t Value;
std::tie(C, Matches, Value) =
compareNode(ChildOffset, Name.substr(Consummed), Strict,
PreviousCharInName, PreviousCharInNeedle, Buffer, &N);
if (Matches) {
std::reverse_copy(C.Name.begin(), C.Name.end(),
std::back_inserter(Buffer));
return {N, true, Value};
}
ChildOffset += C.Size;
if (!C.HasSibling)
break;
}
}
return {N, false, 0};
}
// Convenience overload that starts a match with no "previous character" on
// either side and no parent node.
static std::tuple<Node, bool, uint32_t>
compareNode(uint32_t Offset, StringRef Name, bool Strict, BufferType &Buffer) {
  return compareNode(Offset, Name, Strict, /*PreviousCharInName=*/0,
                     /*PreviousCharInNeedle=*/0, Buffer);
}
// Name parts used to compose and decompose Hangul syllable names.
// Columns 0, 1 and 2 hold the parts for the L, V and T positions respectively;
// only the first LCount / VCount / TCount rows of each column are meaningful,
// the remaining cells are filled with 0.
// clang-format off
constexpr const char *const HangulSyllables[][3] = {
{ "G", "A", "" },
{ "GG", "AE", "G" },
{ "N", "YA", "GG" },
{ "D", "YAE", "GS" },
{ "DD", "EO", "N", },
{ "R", "E", "NJ" },
{ "M", "YEO", "NH" },
{ "B", "YE", "D" },
{ "BB", "O", "L" },
{ "S", "WA", "LG" },
{ "SS", "WAE", "LM" },
{ "", "OE", "LB" },
{ "J", "YO", "LS" },
{ "JJ", "U", "LT" },
{ "C", "WEO", "LP" },
{ "K", "WE", "LH" },
{ "T", "WI", "M" },
{ "P", "YU", "B" },
{ "H", "EU", "BS" },
{ 0, "YI", "S" },
{ 0, "I", "SS" },
{ 0, 0, "NG" },
{ 0, 0, "J" },
{ 0, 0, "C" },
{ 0, 0, "K" },
{ 0, 0, "T" },
{ 0, 0, "P" },
{ 0, 0, "H" }
};
// clang-format on
// Unicode 14.0
// 3.12 Conjoining Jamo Behavior Common constants
// SBase is the code point of the first syllable; LCount, VCount and TCount are
// the number of valid rows in columns 0, 1 and 2 of HangulSyllables.
constexpr const char32_t SBase = 0xAC00;
constexpr const uint32_t LCount = 19;
constexpr const uint32_t VCount = 21;
constexpr const uint32_t TCount = 28;
// Finds the entry of the given HangulSyllables column that matches the start
// of Name while consuming the most characters.
//
// On success Pos receives the row of the selected entry, PreviousInName is
// updated to the state left by that match, and the number of characters of
// Name that were consumed is returned. When nothing matches, 0 is returned and
// both Pos and PreviousInName are left untouched.
static std::size_t findSyllable(StringRef Name, bool Strict,
                                char &PreviousInName, int &Pos, int Column) {
  assert(Column == 0 || Column == 1 || Column == 2);
  static std::size_t CountPerColumn[] = {LCount, VCount, TCount};
  const std::size_t RowCount = CountPerColumn[Column];

  char NeedleStart = 0;
  int BestLength = -1;
  int BestPrevious = PreviousInName;
  for (std::size_t Row = 0; Row != RowCount; ++Row) {
    StringRef Candidate(HangulSyllables[Row][Column]);
    // A candidate is only worth trying when it is longer than what the best
    // match so far has consumed.
    if (int(Candidate.size()) > BestLength) {
      std::size_t Used = 0;
      // Work on a copy so that a rejected candidate leaves the state alone.
      char PreviousCopy = PreviousInName;
      if (startsWith(Name, Candidate, Strict, Used, PreviousCopy,
                     NeedleStart)) {
        BestLength = Used;
        Pos = Row;
        BestPrevious = PreviousCopy;
      }
    }
  }

  if (BestLength < 0)
    return 0;
  PreviousInName = BestPrevious;
  return size_t(BestLength);
}
// Resolves names of the form "HANGUL SYLLABLE <L><V><T>" arithmetically.
//
// Buffer is always cleared first. When the name is recognised and Strict is
// false, Buffer receives the canonical spelling of the name. Returns None when
// Name lacks the prefix or when the remainder cannot be fully decomposed into
// one entry of each HangulSyllables column.
static llvm::Optional<char32_t>
nameToHangulCodePoint(StringRef Name, bool Strict, BufferType &Buffer) {
  Buffer.clear();

  // Hangul Syllable Decomposition
  std::size_t PrefixLength = 0;
  char NameStart = 0, NeedleStart = 0;
  if (!startsWith(Name, "HANGUL SYLLABLE ", Strict, PrefixLength, NameStart,
                  NeedleStart))
    return None;
  Name = Name.substr(PrefixLength);

  // Peel off the three parts in order; each index stays at -1 when its column
  // has no matching entry.
  int L = -1, V = -1, T = -1;
  Name = Name.substr(findSyllable(Name, Strict, NameStart, L, 0));
  Name = Name.substr(findSyllable(Name, Strict, NameStart, V, 1));
  Name = Name.substr(findSyllable(Name, Strict, NameStart, T, 2));

  const bool FullyDecomposed = L != -1 && V != -1 && T != -1 && Name.empty();
  if (!FullyDecomposed) {
    // Otherwise, it's an illegal syllable name.
    return None;
  }

  if (!Strict) {
    Buffer.append("HANGUL SYLLABLE ");
    Buffer.append(HangulSyllables[L][0]);
    Buffer.append(HangulSyllables[V][1]);
    Buffer.append(HangulSyllables[T][2]);
  }
  return SBase + (std::uint32_t(L) * VCount + std::uint32_t(V)) * TCount +
         std::uint32_t(T);
}
// One range of code points whose names are derived as Prefix followed by the
// code point written in hexadecimal. Start and End are both inclusive.
struct GeneratedNamesData {
StringRef Prefix;
uint32_t Start;
uint32_t End;
};
// Unicode 14.0 Table 4-8. Name Derivation Rule Prefix Strings
// This needs to be kept in sync with
// llvm/utils/UnicodeData/UnicodeNameMappingGenerator.cpp
// NOTE(review): some range ends (0x9FFC, 0x2A6DD, 0x2B734) look like they may
// predate Unicode 14.0 - TODO confirm against the referenced table.
static const GeneratedNamesData GeneratedNamesDataTable[] = {
{"CJK UNIFIED IDEOGRAPH-", 0x3400, 0x4DBF},
{"CJK UNIFIED IDEOGRAPH-", 0x4E00, 0x9FFC},
{"CJK UNIFIED IDEOGRAPH-", 0x20000, 0x2A6DD},
{"CJK UNIFIED IDEOGRAPH-", 0x2A700, 0x2B734},
{"CJK UNIFIED IDEOGRAPH-", 0x2B740, 0x2B81D},
{"CJK UNIFIED IDEOGRAPH-", 0x2B820, 0x2CEA1},
{"CJK UNIFIED IDEOGRAPH-", 0x2CEB0, 0x2EBE0},
{"CJK UNIFIED IDEOGRAPH-", 0x30000, 0x3134A},
{"TANGUT IDEOGRAPH-", 0x17000, 0x187F7},
{"TANGUT IDEOGRAPH-", 0x18D00, 0x18D08},
{"KHITAN SMALL SCRIPT CHARACTER-", 0x18B00, 0x18CD5},
{"NUSHU CHARACTER-", 0x1B170, 0x1B2FB},
{"CJK COMPATIBILITY IDEOGRAPH-", 0xF900, 0xFA6D},
{"CJK COMPATIBILITY IDEOGRAPH-", 0xFA70, 0xFAD9},
{"CJK COMPATIBILITY IDEOGRAPH-", 0x2F800, 0x2FA1D},
};
// Resolves names that are derived from the code point itself, i.e. one of the
// prefixes of GeneratedNamesDataTable followed by a hexadecimal number lying
// inside the matching range.
//
// When Strict is false and a match is found, Buffer receives the canonical
// spelling (prefix + upper-case hexadecimal value). Returns None when no table
// entry applies.
static llvm::Optional<char32_t>
nameToGeneratedCodePoint(StringRef Name, bool Strict, BufferType &Buffer) {
  const auto IsLowerCaseHexLetter = [](char C) { return C >= 'a' && C <= 'f'; };

  for (const GeneratedNamesData &Range : GeneratedNamesDataTable) {
    Buffer.clear();
    std::size_t PrefixLength = 0;
    char NameStart = 0, NeedleStart = 0;
    if (!startsWith(Name, Range.Prefix, Strict, PrefixLength, NameStart,
                    NeedleStart, /*IsPrefix=*/true))
      continue;

    StringRef Digits = Name.substr(PrefixLength);
    // Be consistent about mandating upper casing.
    if (Strict && llvm::any_of(Digits, IsLowerCaseHexLetter))
      return None;

    unsigned long long CodePoint = 0;
    // getAsUnsignedInteger() returns true on failure.
    const bool ParseFailed = getAsUnsignedInteger(Digits, 16, CodePoint);
    if (ParseFailed || CodePoint < Range.Start || CodePoint > Range.End)
      continue;

    if (!Strict) {
      Buffer.append(Range.Prefix);
      Buffer.append(utohexstr(CodePoint, true));
    }
    return CodePoint;
  }
  return None;
}
// Maps a character name to its code point.
//
// Algorithmic names (Hangul syllables, then the generated ranges) are tried
// first; the trie is searched afterwards. In loose mode (Strict == false)
// Buffer receives the canonical spelling of the matched name. Returns None for
// an empty or unknown name.
static llvm::Optional<char32_t> nameToCodepoint(StringRef Name, bool Strict,
                                                BufferType &Buffer) {
  if (Name.empty())
    return None;

  if (llvm::Optional<char32_t> Hangul =
          nameToHangulCodePoint(Name, Strict, Buffer))
    return *Hangul;
  if (llvm::Optional<char32_t> Generated =
          nameToGeneratedCodePoint(Name, Strict, Buffer))
    return *Generated;

  Buffer.clear();
  Node Root;
  bool Found;
  uint32_t CodePoint;
  std::tie(Root, Found, CodePoint) = compareNode(0, Name, Strict, Buffer);
  if (!Found)
    return None;

  // compareNode() accumulates the matched name backwards.
  std::reverse(Buffer.begin(), Buffer.end());
  // UAX44-LM2. Ignore case, whitespace, underscore ('_'), and all medial
  // hyphens except the hyphen in U+1180 HANGUL JUNGSEONG O-E.
  const bool IsLooseOE = !Strict && CodePoint == 0x116c;
  if (IsLooseOE && Name.find_insensitive("O-E") != StringRef::npos) {
    Buffer = "HANGUL JUNGSEONG O-E";
    CodePoint = 0x1180;
  }
  return CodePoint;
}
// Returns the code point whose name is exactly Name (strict matching), or None
// when there is no such character.
llvm::Optional<char32_t> nameToCodepointStrict(StringRef Name) {
  BufferType Scratch;
  return nameToCodepoint(Name, /*Strict=*/true, Scratch);
}
// Looks Name up using loose matching and, on success, returns the code point
// together with the name rebuilt by the lookup. Returns None when nothing
// matches.
llvm::Optional<LooseMatchingResult>
nameToCodepointLooseMatching(StringRef Name) {
  BufferType CanonicalName;
  if (llvm::Optional<char32_t> CodePoint =
          nameToCodepoint(Name, /*Strict=*/false, CanonicalName))
    return LooseMatchingResult{*CodePoint, CanonicalName};
  return None;
}
// Find the unicode characters whose edit distance to Pattern is shortest,
// using the Wagner-Fischer algorithm.
// At most MaxMatchesCount results are returned, ordered by increasing distance
// and then by name. Both Pattern and the candidate names are compared with
// everything but alphanumeric characters removed, and case-insensitively.
llvm::SmallVector<MatchForCodepointName>
nearestMatchesForCodepointName(StringRef Pattern, std::size_t MaxMatchesCount) {
// We maintain a fixed size vector of matches,
// sorted by distance
// The worst match (with the biggest distance) are discarded when new elements
// are added.
std::size_t LargestEditDistance = 0;
llvm::SmallVector<MatchForCodepointName> Matches;
Matches.reserve(MaxMatchesCount + 1);
// Offers a candidate to the result list; returns false when it is rejected
// because the list is full and the candidate does not sort before its end.
auto Insert = [&](const Node &Node, uint32_t Distance,
char32_t Value) -> bool {
if (Distance > LargestEditDistance) {
if (Matches.size() == MaxMatchesCount)
return false;
LargestEditDistance = Distance;
}
// To avoid allocations, the creation of the name is delayed
// as much as possible.
std::string Name;
auto GetName = [&] {
if (Name.empty())
Name = Node.fullName();
return Name;
};
// Find the insertion point: ordered by distance first, then by name for
// candidates at the same distance.
auto It = std::lower_bound(
Matches.begin(), Matches.end(), Distance,
[&](const MatchForCodepointName &a, std::size_t Distance) {
if (Distance == a.Distance)
return a.Name < GetName();
return a.Distance < Distance;
});
if (It == Matches.end() && Matches.size() == MaxMatchesCount)
return false;
MatchForCodepointName M{GetName(), Distance, Value};
Matches.insert(It, std::move(M));
// Drop the worst match once the list exceeds its capacity.
if (Matches.size() > MaxMatchesCount)
Matches.pop_back();
return true;
};
// We ignore case, space, hyphens, etc,
// in both the search pattern and the prospective names.
auto Normalize = [](StringRef Name) {
std::string Out;
Out.reserve(Name.size());
for (char C : Name) {
if (isAlnum(C))
Out.push_back(toUpper(C));
}
return Out;
};
std::string NormalizedName = Normalize(Pattern);
// Allocate a matrix big enough for longest names.
// One column per character of the (truncated) pattern plus one, one row per
// character of a candidate name plus one.
const std::size_t Columns =
std::min(NormalizedName.size(), UnicodeNameToCodepointLargestNameSize) +
1;
// Rows is only read by the assertion in Get.
LLVM_ATTRIBUTE_UNUSED static std::size_t Rows =
UnicodeNameToCodepointLargestNameSize + 1;
std::vector<char> Distances(
Columns * (UnicodeNameToCodepointLargestNameSize + 1), 0);
// Row-major accessor into the distance matrix.
auto Get = [&Distances, Columns](size_t Column, std::size_t Row) -> char & {
assert(Column < Columns);
assert(Row < Rows);
return Distances[Row * Columns + Column];
};
// Row 0: distance between the empty name and each prefix of the pattern.
for (std::size_t I = 0; I < Columns; I++)
Get(I, 0) = I;
// Visit the children,
// Filling (and overriding) the matrix for the name fragment of each node
// iteratively. Rows already computed for the ancestors of a node are reused
// by all of its descendants.
// Row is the first matrix row to fill for N; the lambda receives itself as
// its last argument so that it can recurse.
auto VisitNode = [&](const Node &N, std::size_t Row,
auto &VisitNode) -> void {
std::size_t J = 0;
for (; J < N.Name.size(); J++) {
// Non alphanumeric characters of the name do not take part in the distance.
if (!isAlnum(N.Name[J]))
continue;
Get(0, Row) = Row;
for (std::size_t I = 1; I < Columns; I++) {
const int Delete = Get(I - 1, Row) + 1;
const int Insert = Get(I, Row - 1) + 1;
const int Replace =
Get(I - 1, Row - 1) + (NormalizedName[I - 1] != N.Name[J] ? 1 : 0);
Get(I, Row) = std::min(Insert, std::min(Delete, Replace));
}
Row++;
}
// The last cell of the last filled row is the distance between the whole
// pattern and the name spelled by the path down to N.
unsigned Cost = Get(Columns - 1, Row - 1);
if (N.Value != 0xFFFFFFFF) {
Insert(N, Cost, N.Value);
}
if (N.hasChildren()) {
auto ChildOffset = N.ChildrenOffset;
for (;;) {
Node C = readNode(ChildOffset, &N);
ChildOffset += C.Size;
if (!C.isValid())
break;
VisitNode(C, Row, VisitNode);
if (!C.HasSibling)
break;
}
}
};
Node Root = createRoot();
VisitNode(Root, 1, VisitNode);
return Matches;
}
} // namespace unicode
} // namespace sys
} // namespace llvm

File diff suppressed because it is too large Load Diff

View File

@ -7,7 +7,10 @@
//===----------------------------------------------------------------------===//
#include "llvm/Support/Unicode.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/ADT/edit_distance.h"
#include "llvm/Support/ConvertUTF.h"
#include "gmock/gmock.h"
#include "gtest/gtest.h"
namespace llvm {
@ -101,6 +104,318 @@ TEST(Unicode, isPrintable) {
}
}
// Checks nameToCodepointStrict(): only exact, correctly cased names (including
// algorithmically generated names and the supported aliases) resolve to a code
// point. The helper maps a failed lookup to 0xFFFFFFFF.
TEST(Unicode, nameToCodepointStrict) {
  auto map = [](StringRef Str) {
    return nameToCodepointStrict(Str).getValueOr(0xFFFF'FFFF);
  };

  // generated codepoints
  EXPECT_EQ(0x03400u, map("CJK UNIFIED IDEOGRAPH-3400"));
  EXPECT_EQ(0x04DBFu, map("CJK UNIFIED IDEOGRAPH-4DBF"));
  EXPECT_EQ(0x04E00u, map("CJK UNIFIED IDEOGRAPH-4E00"));
  EXPECT_EQ(0x09FFCu, map("CJK UNIFIED IDEOGRAPH-9FFC"));
  EXPECT_EQ(0x20000u, map("CJK UNIFIED IDEOGRAPH-20000"));
  EXPECT_EQ(0x2A6DDu, map("CJK UNIFIED IDEOGRAPH-2A6DD"));
  EXPECT_EQ(0x2A700u, map("CJK UNIFIED IDEOGRAPH-2A700"));
  EXPECT_EQ(0x2B740u, map("CJK UNIFIED IDEOGRAPH-2B740"));
  EXPECT_EQ(0x2B81Du, map("CJK UNIFIED IDEOGRAPH-2B81D"));
  EXPECT_EQ(0x2B820u, map("CJK UNIFIED IDEOGRAPH-2B820"));
  EXPECT_EQ(0x2CEA1u, map("CJK UNIFIED IDEOGRAPH-2CEA1"));
  EXPECT_EQ(0x2CEB0u, map("CJK UNIFIED IDEOGRAPH-2CEB0"));
  EXPECT_EQ(0x2EBE0u, map("CJK UNIFIED IDEOGRAPH-2EBE0"));
  EXPECT_EQ(0x30000u, map("CJK UNIFIED IDEOGRAPH-30000"));
  EXPECT_EQ(0x3134Au, map("CJK UNIFIED IDEOGRAPH-3134A"));
  EXPECT_EQ(0x17000u, map("TANGUT IDEOGRAPH-17000"));
  EXPECT_EQ(0x187F7u, map("TANGUT IDEOGRAPH-187F7"));
  EXPECT_EQ(0x18D00u, map("TANGUT IDEOGRAPH-18D00"));
  EXPECT_EQ(0x18D08u, map("TANGUT IDEOGRAPH-18D08"));
  EXPECT_EQ(0x18B00u, map("KHITAN SMALL SCRIPT CHARACTER-18B00"));
  EXPECT_EQ(0x18CD5u, map("KHITAN SMALL SCRIPT CHARACTER-18CD5"));
  EXPECT_EQ(0x1B170u, map("NUSHU CHARACTER-1B170"));
  EXPECT_EQ(0x1B2FBu, map("NUSHU CHARACTER-1B2FB"));
  EXPECT_EQ(0x0F900u, map("CJK COMPATIBILITY IDEOGRAPH-F900"));
  EXPECT_EQ(0x0FA6Du, map("CJK COMPATIBILITY IDEOGRAPH-FA6D"));
  EXPECT_EQ(0x0FA70u, map("CJK COMPATIBILITY IDEOGRAPH-FA70"));
  EXPECT_EQ(0x0FAD9u, map("CJK COMPATIBILITY IDEOGRAPH-FAD9"));
  EXPECT_EQ(0x2F800u, map("CJK COMPATIBILITY IDEOGRAPH-2F800"));
  EXPECT_EQ(0x2FA1Du, map("CJK COMPATIBILITY IDEOGRAPH-2FA1D"));

  EXPECT_EQ(0xAC00u, map("HANGUL SYLLABLE GA"));
  EXPECT_EQ(0xAC14u, map("HANGUL SYLLABLE GASS"));
  EXPECT_EQ(0xAC2Bu, map("HANGUL SYLLABLE GAELH"));
  EXPECT_EQ(0xAC7Bu, map("HANGUL SYLLABLE GEOLB"));
  EXPECT_EQ(0xC640u, map("HANGUL SYLLABLE WA"));
  EXPECT_EQ(0xC544u, map("HANGUL SYLLABLE A"));
  EXPECT_EQ(0xC5D0u, map("HANGUL SYLLABLE E"));
  EXPECT_EQ(0xC774u, map("HANGUL SYLLABLE I"));

  EXPECT_EQ(0x1F984u, map("UNICORN FACE"));
  EXPECT_EQ(0x00640u, map("ARABIC TATWEEL"));
  EXPECT_EQ(0x02C05u, map("GLAGOLITIC CAPITAL LETTER YESTU"));
  EXPECT_EQ(0x13000u, map("EGYPTIAN HIEROGLYPH A001"));
  EXPECT_EQ(0x02235u, map("BECAUSE"));
  EXPECT_EQ(0x1F514u, map("BELL"));
  EXPECT_EQ(0x1F9A9u, map("FLAMINGO"));
  EXPECT_EQ(0x1F402u, map("OX")); // 2 characters
  EXPECT_EQ(0x0FBF9u, map("ARABIC LIGATURE UIGHUR KIRGHIZ YEH WITH HAMZA "
                          "ABOVE WITH ALEF MAKSURA ISOLATED FORM"));

  // Aliases
  EXPECT_EQ(0x0000u, map("NULL"));
  EXPECT_EQ(0x0007u, map("ALERT"));
  EXPECT_EQ(0x0009u, map("HORIZONTAL TABULATION"));
  EXPECT_EQ(0x0009u, map("CHARACTER TABULATION"));
  EXPECT_EQ(0x000Au, map("LINE FEED"));
  EXPECT_EQ(0x000Au, map("NEW LINE"));
  EXPECT_EQ(0x0089u, map("CHARACTER TABULATION WITH JUSTIFICATION"));
  EXPECT_EQ(0x0089u, map("HORIZONTAL TABULATION WITH JUSTIFICATION"));
  EXPECT_EQ(0x2118u,
            map("WEIERSTRASS ELLIPTIC FUNCTION"));      // correction
  EXPECT_EQ(0x2118u, map("SCRIPT CAPITAL P"));          // correction
  EXPECT_EQ(0xFEFFu, map("BYTE ORDER MARK"));           // alternate
  EXPECT_EQ(0xFEFFu, map("ZERO WIDTH NO-BREAK SPACE")); // alternate

  // Should perform exact case match
  EXPECT_EQ(0xFFFFFFFFu, map(""));
  EXPECT_EQ(0xFFFFFFFFu, map("NOT A UNICODE CHARACTER"));
  EXPECT_EQ(0xFFFFFFFFu, map("unicorn face"));
  EXPECT_EQ(0xFFFFFFFFu, map("UNICORN FaCE"));
  EXPECT_EQ(0xFFFFFFFFu, map("UNICORNFaCE"));
  EXPECT_EQ(0xFFFFFFFFu, map("UNICORN"));
  EXPECT_EQ(0xFFFFFFFFu, map("HANGUL SYLLABLE i"));
  EXPECT_EQ(0xFFFFFFFFu, map("hANGUL SYLLABLE i"));
  EXPECT_EQ(0xFFFFFFFFu, map("HANGULSYLLABLEI"));
  EXPECT_EQ(0xFFFFFFFFu, map("HANGUL SYLLABLE"));
  EXPECT_EQ(0xFFFFFFFFu, map("cJK COMPATIBILITY IDEOGRAPH-2FA1D"));
  EXPECT_EQ(0xFFFFFFFFu, map("CJK COMPATIBILITY IDEOGRAPH-2FA1d"));
  EXPECT_EQ(0xFFFFFFFFu, map("CJK COMPATIBILITY IDEOGRAPH 2FA1D"));
  EXPECT_EQ(0xFFFFFFFFu, map("CJK COMPATIBILITY IDEOGRAPH-NOTANUMBER"));
  EXPECT_EQ(0xFFFFFFFFu, map("CJK COMPATIBILITY IDEOGRAPH-1"));
  EXPECT_EQ(0xFFFFFFFFu, map("ZERO WIDTH NO BREAK SPACE"));

  // Should not support abbreviations or figments
  EXPECT_EQ(0xFFFFFFFFu, map("FVS1"));
  EXPECT_EQ(0xFFFFFFFFu, map("HIGH OCTET PRESET"));
  EXPECT_EQ(0xFFFFFFFFu, map("BEL"));
}
// Checks nameToCodepointLooseMatching(): case, spaces, underscores and medial
// hyphens are ignored, while non-medial hyphens and incomplete names are not
// accepted. The helper maps a failed lookup to 0xFFFFFFFF.
TEST(Unicode, nameToCodepointLoose) {
  auto map = [](StringRef Str) {
    auto Opt = nameToCodepointLooseMatching(Str);
    if (!Opt)
      return char32_t(0xFFFF'FFFF);
    return Opt->CodePoint;
  };

  // generated codepoints
  EXPECT_EQ(0x04DBFu, map("CJK UNIFIED IDEOGRAPH-4DBF"));
  EXPECT_EQ(0x04E00u, map("CJK UNIFIED IDEOGRAPH-4E00"));
  EXPECT_EQ(0x09FFCu, map("CJK UNIFIED IDEOGRAPH-9FFC"));
  EXPECT_EQ(0x20000u, map("CJK UNIFIED IDEOGRAPH-20000"));
  EXPECT_EQ(0x2A6DDu, map("CJK UNIFIED IDEOGRAPH-2A6DD"));
  EXPECT_EQ(0x2A700u, map("CJK UNIFIED IDEOGRAPH-2A700"));
  EXPECT_EQ(0x2B740u, map("CJK UNIFIED IDEOGRAPH-2B740"));
  EXPECT_EQ(0x03400u, map("CJK UNIFIED IDEOGRAPH-3400"));
  EXPECT_EQ(0x2B81Du, map("CJK UNIFIED IDEOGRAPH-2B81D"));
  EXPECT_EQ(0x2B820u, map("CJK UNIFIED IDEOGRAPH-2B820"));
  EXPECT_EQ(0x2CEA1u, map("CJK UNIFIED IDEOGRAPH-2CEA1"));
  EXPECT_EQ(0x2CEB0u, map("CJK UNIFIED IDEOGRAPH-2CEB0"));
  EXPECT_EQ(0x2EBE0u, map("CJK UNIFIED IDEOGRAPH-2EBE0"));
  EXPECT_EQ(0x30000u, map("CJK UNIFIED IDEOGRAPH-30000"));
  EXPECT_EQ(0x3134Au, map("CJK UNIFIED IDEOGRAPH-3134A"));
  EXPECT_EQ(0x17000u, map("TANGUT IDEOGRAPH-17000"));
  EXPECT_EQ(0x187F7u, map("TANGUT IDEOGRAPH-187F7"));
  EXPECT_EQ(0x18D00u, map("TANGUT IDEOGRAPH-18D00"));
  EXPECT_EQ(0x18D08u, map("TANGUT IDEOGRAPH-18D08"));
  EXPECT_EQ(0x18B00u, map("KHITAN SMALL SCRIPT CHARACTER-18B00"));
  EXPECT_EQ(0x18CD5u, map("KHITAN SMALL SCRIPT CHARACTER-18CD5"));
  EXPECT_EQ(0x1B170u, map("NUSHU CHARACTER-1B170"));
  EXPECT_EQ(0x1B2FBu, map("NUSHU CHARACTER-1B2FB"));
  EXPECT_EQ(0x0F900u, map("CJK COMPATIBILITY IDEOGRAPH-F900"));
  EXPECT_EQ(0x0FA6Du, map("CJK COMPATIBILITY IDEOGRAPH-FA6D"));
  EXPECT_EQ(0x0FA70u, map("CJK COMPATIBILITY IDEOGRAPH-FA70"));
  EXPECT_EQ(0x0FAD9u, map("CJK COMPATIBILITY IDEOGRAPH-FAD9"));
  EXPECT_EQ(0x2F800u, map("CJK COMPATIBILITY IDEOGRAPH-2F800"));
  EXPECT_EQ(0x2FA1Du, map("CJK COMPATIBILITY IDEOGRAPH-2FA1D"));

  EXPECT_EQ(0xAC00u, map("HANGUL SYLLABLE GA"));
  EXPECT_EQ(0xAC14u, map("HANGUL SYLLABLE GASS"));
  EXPECT_EQ(0xAC2Bu, map("HANGUL SYLLABLE GAELH"));
  EXPECT_EQ(0xAC7Bu, map("HANGUL SYLLABLE GEOLB"));
  EXPECT_EQ(0xC640u, map("HANGUL SYLLABLE WA"));
  EXPECT_EQ(0xC544u, map("HANGUL SYLLABLE A"));
  EXPECT_EQ(0xC5D0u, map("HANGUL SYLLABLE E"));
  EXPECT_EQ(0xC774u, map("HANGUL SYLLABLE I"));

  EXPECT_EQ(0x1F984u, map("UNICORN FACE"));
  EXPECT_EQ(0x00640u, map("ARABIC TATWEEL"));
  EXPECT_EQ(0x02C05u, map("GLAGOLITIC CAPITAL LETTER YESTU"));
  EXPECT_EQ(0x13000u, map("EGYPTIAN HIEROGLYPH A001"));
  EXPECT_EQ(0x02235u, map("BECAUSE"));
  EXPECT_EQ(0x1F514u, map("BELL"));
  EXPECT_EQ(0x1F9A9u, map("FLAMINGO"));
  EXPECT_EQ(0x1F402u, map("OX")); // 2 characters
  EXPECT_EQ(0x0FBF9u, map("ARABIC LIGATURE UIGHUR KIRGHIZ YEH WITH HAMZA "
                          "ABOVE WITH ALEF MAKSURA ISOLATED FORM"));

  // Aliases
  EXPECT_EQ(0x0000u, map("NULL"));
  EXPECT_EQ(0x0007u, map("ALERT"));
  EXPECT_EQ(0x0009u, map("HORIZONTAL TABULATION"));
  EXPECT_EQ(0x0009u, map("CHARACTER TABULATION"));
  EXPECT_EQ(0x000Au, map("LINE FEED"));
  EXPECT_EQ(0x000Au, map("NEW LINE"));
  EXPECT_EQ(0x0089u, map("CHARACTER TABULATION WITH JUSTIFICATION"));
  EXPECT_EQ(0x0089u, map("HORIZONTAL TABULATION WITH JUSTIFICATION"));
  EXPECT_EQ(0x2118u,
            map("WEIERSTRASS ELLIPTIC FUNCTION"));      // correction
  EXPECT_EQ(0x2118u, map("SCRIPT CAPITAL P"));          // correction
  EXPECT_EQ(0xFEFFu, map("BYTE ORDER MARK"));           // alternate
  EXPECT_EQ(0xFEFFu, map("ZERO WIDTH NO-BREAK SPACE")); // alternate
  EXPECT_EQ(0xFEFFu, map("ZERO WIDTH NO BREAK SPACE")); // alternate

  // Should perform loose matching
  EXPECT_EQ(0xFFFFFFFFu, map(""));
  EXPECT_EQ(0xFFFFFFFFu, map("NOT A UNICODE CHARACTER"));
  EXPECT_EQ(0x0001F984u, map("unicorn face"));
  EXPECT_EQ(0x0001F984u, map("UNICORN FaCE"));
  EXPECT_EQ(0x0001F984u, map("UNICORNFaCE"));
  EXPECT_EQ(0xFFFFFFFFu, map("UNICORN"));
  EXPECT_EQ(0xC774u, map("HANGUL SYLLABLE i"));
  EXPECT_EQ(0xC774u, map("hANGUL SYLLABLE i"));
  EXPECT_EQ(0xC774u, map("HANGULSYLLABLEI"));
  EXPECT_EQ(0xFFFFFFFFu, map("HANGUL SYLLABLE"));
  EXPECT_EQ(0x2FA1Du, map("cJK COMPATIBILITY IDEOGRAPH-2FA1D"));
  EXPECT_EQ(0x2FA1Du, map("CJK COMPATIBILITY IDEOGRAPH-2FA1d"));
  EXPECT_EQ(0x2FA1Du, map("CJK COMPATIBILITY IDEOGRAPH 2FA1D"));
  EXPECT_EQ(0xFFFFFFFFu, map("CJK COMPATIBILITY IDEOGRAPH-NOTANUMBER"));
  EXPECT_EQ(0xFFFFFFFFu, map("CJK COMPATIBILITY IDEOGRAPH-1"));

  // https://unicode.org/reports/tr44/#Matching_Names
  // UAX44-LM2: Medial hyphens are ignored, non medial hyphens are not
  EXPECT_EQ(0x1FBC5u, map("S-T-I-C-K-F-I-G-U-R-E"));
  EXPECT_EQ(0xFFFFFFFFu, map("-STICK FIGURE"));
  EXPECT_EQ(0xFFFFFFFFu, map("STICK FIGURE-"));
  EXPECT_EQ(0xFFFFFFFFu, map("STICK FIGURE -"));
  EXPECT_EQ(0xFFFFFFFFu, map("STICK FIGURE --"));
  EXPECT_EQ(0xFFFFFFFFu, map("STICK--FIGURE"));

  EXPECT_EQ(0x0F68u, map("TIBETAN LETTER A"));
  EXPECT_EQ(0x0F68u, map("TIBETAN LETTERA"));
  EXPECT_EQ(0x0F68u, map("TIBETAN LETTER-A"));
  EXPECT_EQ(0x0F60u, map("TIBETAN LETTER -A"));
  // Consecutive spaces before a non-medial hyphen must not make it medial.
  EXPECT_EQ(0x0F60u, map("TIBETAN LETTER  -A"));

  // special case
  EXPECT_EQ(0x1180u, map("HANGUL JUNGSEONG O-E"));
  EXPECT_EQ(0x116Cu, map("HANGUL JUNGSEONG OE"));

  // names that are prefix to existing characters should not match
  EXPECT_FALSE(nameToCodepointLooseMatching("B"));
  EXPECT_FALSE(nameToCodepointLooseMatching("BE"));
  EXPECT_FALSE(nameToCodepointLooseMatching("BEE"));
  EXPECT_FALSE(nameToCodepointLooseMatching("BEET"));
  EXPECT_FALSE(nameToCodepointLooseMatching("BEETL"));
  EXPECT_TRUE(nameToCodepointLooseMatching("BEETLE"));
}
} // namespace
bool operator==(MatchForCodepointName a, MatchForCodepointName b) {
return a.Name == b.Name && a.Distance == b.Distance && a.Value == b.Value;
}
namespace {
// Checks nearestMatchesForCodepointName(): the three best suggestions for a
// query and, for every suggestion, that the reported distance equals the edit
// distance between the normalized query and the normalized suggested name.
TEST(Unicode, nearestMatchesForCodepointName) {
  // Same normalization as the implementation: keep alphanumerics, upper-cased.
  auto Normalize = [](StringRef Name) {
    std::string Out;
    Out.reserve(Name.size());
    for (char C : Name) {
      if (isAlnum(C))
        Out.push_back(toUpper(C));
    }
    return Out;
  };

  auto L = [&](StringRef name) {
    auto v = nearestMatchesForCodepointName(name, 3);
    // The normalized query does not depend on the match; compute it once.
    const std::string B = Normalize(name);
    for (auto &r : v) {
      auto A = Normalize(r.Name);
      EXPECT_EQ(StringRef(A).edit_distance(B, true), r.Distance);
    }
    return v;
  };
  using ::testing::ElementsAre;
  using M = MatchForCodepointName;

  ASSERT_THAT(L(""), ElementsAre(M{"OX", 2, 0x1F402}, M{"ANT", 3, 0x1F41C},
                                 M{"ARC", 3, 0x2312}));
  // shortest name
  ASSERT_THAT(L("OX"), ElementsAre(M{"OX", 0, 0x1F402}, M{"AXE", 2, 0x1FA93},
                                   M{"BOY", 2, 0x1F466}));

  // longest name
  ASSERT_THAT(L("ARABIC LIGATURE UIGHUR KIRGHIZ YEH WITH HAMZA ABOVE WITH ALEF "
                "MAKSURA INITIAL FORM"),
              ElementsAre(M{"ARABIC LIGATURE UIGHUR KIRGHIZ YEH WITH HAMZA "
                            "ABOVE WITH ALEF MAKSURA INITIAL FORM",
                            0, 0xFBFB},
                          M{"ARABIC LIGATURE UIGHUR KIRGHIZ YEH WITH HAMZA "
                            "ABOVE WITH ALEF MAKSURA FINAL FORM",
                            4, 0xFBFA},
                          M{"ARABIC LIGATURE UIGHUR KIRGHIZ YEH WITH HAMZA "
                            "ABOVE WITH ALEF MAKSURA ISOLATED FORM",
                            7, 0xFBF9}));

  // same result with underscore, spaces, etc
  ASSERT_THAT(L("______ARABICLIGATUREUIGHUR KIRGHIZ YEH with HAMZA ABOVE WITH "
                "ALEF MAKsURAINITIAL form_"),
              ElementsAre(M{"ARABIC LIGATURE UIGHUR KIRGHIZ YEH WITH HAMZA "
                            "ABOVE WITH ALEF MAKSURA INITIAL FORM",
                            0, 0xFBFB},
                          M{"ARABIC LIGATURE UIGHUR KIRGHIZ YEH WITH HAMZA "
                            "ABOVE WITH ALEF MAKSURA FINAL FORM",
                            4, 0xFBFA},
                          M{"ARABIC LIGATURE UIGHUR KIRGHIZ YEH WITH HAMZA "
                            "ABOVE WITH ALEF MAKSURA ISOLATED FORM",
                            7, 0xFBF9}));

  ASSERT_THAT(L("GREEK CAPITAL LETTER LAMBDA"),
              ElementsAre(M{"GREEK CAPITAL LETTER LAMDA", 1, 0x39B},
                          M{"GREEK CAPITAL LETTER GAMMA", 3, 0x0393},
                          M{"GREEK CAPITAL LETTER ALPHA", 4, 0x0391}));

  ASSERT_THAT(L("greekcapitalletter-lambda"),
              ElementsAre(M{"GREEK CAPITAL LETTER LAMDA", 1, 0x39B},
                          M{"GREEK CAPITAL LETTER GAMMA", 3, 0x0393},
                          M{"GREEK CAPITAL LETTER ALPHA", 4, 0x0391}));

  // typo http://www.unicode.org/notes/tn27/tn27-5.html
  ASSERT_THAT(
      L("PRESENTATION FORM FOR VERTICAL RIGHT WHITE LENTICULAR BRAKCET"),
      ElementsAre(
          M{"PRESENTATION FORM FOR VERTICAL RIGHT WHITE LENTICULAR BRAKCET", 0,
            0xFE18}, // typo
          M{"PRESENTATION FORM FOR VERTICAL RIGHT WHITE LENTICULAR BRACKET", 2,
            0xFE18}, // correction
          M{"PRESENTATION FORM FOR VERTICAL LEFT WHITE LENTICULAR BRACKET", 6,
            0xFE17}));

  // typo http://www.unicode.org/notes/tn27/tn27-5.html
  ASSERT_THAT(
      L("BYZANTINE MUSICAL SYMBOL FHTORA SKLIRON CHROMA VASIS"),
      ElementsAre(
          M{"BYZANTINE MUSICAL SYMBOL FHTORA SKLIRON CHROMA VASIS", 0, 0x1D0C5},
          M{"BYZANTINE MUSICAL SYMBOL FTHORA SKLIRON CHROMA VASIS", 2, 0x1D0C5},
          M{"BYZANTINE MUSICAL SYMBOL FTHORA SKLIRON CHROMA SYNAFI", 7,
            0x1D0C6}));
}
} // namespace
} // namespace unicode
} // namespace sys

View File

@ -0,0 +1,5 @@
# Standalone utility built from UnicodeNameMappingGenerator.cpp; it is linked
# against LLVM's Support component only.
set(LLVM_LINK_COMPONENTS Support)
add_llvm_utility(UnicodeNameMappingGenerator
UnicodeNameMappingGenerator.cpp
)

View File

@ -0,0 +1,486 @@
//===--- UnicodeNameMappingGenerator.cpp - Unicode name data generator ---===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file is used to generate lib/Support/UnicodeNameToCodepointGenerated.cpp
// using UnicodeData.txt and NameAliases.txt available at
// https://unicode.org/Public/14.0.0/ucd/
//===----------------------------------------------------------------------===//
#include "llvm/ADT/Optional.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/ADT/StringRef.h"
#include <algorithm>
#include <array>
#include <deque>
#include <fstream>
#include <memory>
#include <set>
#include <string>
#include <unordered_map>
#include <utility>
#include <vector>
// Every character that may appear in a Unicode character name. The position of
// a character in this string is the index Trie::letter() returns for it, and
// the string is also used as the start of the serialized dictionary.
static const llvm::StringRef Letters =
" _-ABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789";
// Collects character names from UnicodeData.txt (NamesFile) and
// NameAliases.txt (AliasesFile).
//
// Both files are read line by line as ';'-separated records whose first field
// is a hexadecimal code point and whose second field is a name. Lines that do
// not start with a hexadecimal digit, that have fewer than three fields, or
// whose code point cannot be parsed are skipped. Names starting with '<' are
// skipped as well, and only aliases of kind "control", "correction" or
// "alternate" are kept.
//
// Returns a multimap from code point to name; there may be multiple names per
// code point, but each (code point, name) pair is stored once.
static std::unordered_multimap<char32_t, std::string>
loadDataFiles(const std::string &NamesFile, const std::string &AliasesFile) {
  std::unordered_multimap<char32_t, std::string> CollectedCharacters;

  // Adds (CP, Name) unless that exact pair is already present.
  auto InsertUnique = [&](char32_t CP, std::string Name) {
    auto Range = CollectedCharacters.equal_range(CP);
    for (auto It = Range.first; It != Range.second; ++It) {
      if (It->second == Name)
        return;
    }
    CollectedCharacters.insert({CP, std::move(Name)});
  };

  auto FromFile = [&](const std::string &File, bool IsAliasFile = false) {
    std::ifstream InputFile(File);
    for (std::string Line; getline(InputFile, Line);) {
      // The cast keeps isxdigit() well defined for bytes outside of ASCII.
      if (Line.empty() || !isxdigit(static_cast<unsigned char>(Line[0])))
        continue;
      auto FirstSemiPos = Line.find(';');
      if (FirstSemiPos == std::string::npos)
        continue;
      // The name field must be terminated by a second separator.
      auto SecondSemiPos = Line.find(';', FirstSemiPos + 1);
      if (SecondSemiPos == std::string::npos)
        continue;
      unsigned long long CodePoint = 0;
      // getAsUnsignedInteger() returns true on failure.
      if (llvm::getAsUnsignedInteger(
              llvm::StringRef(Line.c_str(), FirstSemiPos), 16, CodePoint)) {
        continue;
      }

      std::string Name =
          Line.substr(FirstSemiPos + 1, SecondSemiPos - FirstSemiPos - 1);

      if (!Name.empty() && Name[0] == '<') {
        // Ignore ranges of characters, as their name is either absent or
        // generated.
        continue;
      }

      // Some aliases are ignored for compatibility with C++
      if (IsAliasFile) {
        std::string Kind = Line.substr(SecondSemiPos + 1);
        if (Kind != "control" && Kind != "correction" && Kind != "alternate")
          continue;
      }

      InsertUnique(CodePoint, std::move(Name));
    }
  };

  FromFile(NamesFile);
  FromFile(AliasesFile, true);
  return CollectedCharacters;
}
class Trie {
struct Node;
public:
// Adds a named code point to the trie.
// One node is created per character of the name:
// SPARKLE becomes S <- P <- A <- R <- K <- L <- E
// Nodes shared with previously inserted names are reused. The tree is only
// compacted later, once every name has been inserted.
void insert(llvm::StringRef Name, char32_t Codepoint) {
  Node *Current = Root.get();
  for (char Ch : Name) {
    const std::string Label(1, Ch);
    auto &Siblings = Current->Children;
    auto Found = std::find_if(
        Siblings.begin(), Siblings.end(),
        [&Label](const auto &Child) { return Child->Name == Label; });
    // No child carries this character yet: append a new one.
    if (Found == Siblings.end())
      Found = Siblings.insert(Found, std::make_unique<Node>(Label, Current));
    Current = Found->get();
  }
  // The node reached by the last character carries the value.
  Current->Value = Codepoint;
}
// Compacts the whole tree by running the node-level compact() overload on the
// root.
void compact() { compact(Root.get()); }
// This creates 2 arrays of bytes from the tree:
// A serialized dictionary of node labels,
// And the nodes themselves.
// The name of each label is found by indexing into the dictionary.
// The longest names are inserted first into the dictionary,
// in the hope it will contain shorter labels as substring,
// thereby reducing duplication.
// We could theorically be more clever by trying to minimizing the size
// of the dictionary.
std::pair<std::string, std::vector<uint8_t>> serialize() {
std::set<std::string> Names = this->getNameFragments();
std::vector<std::string> Sorted(Names.begin(), Names.end());
std::sort(Sorted.begin(), Sorted.end(),
[](const auto &a, const auto &b) { return a.size() > b.size(); });
std::string Dict(Letters.begin(), Letters.end());
Dict.reserve(50000);
for (const std::string &Name : Sorted) {
if (Name.size() <= 1)
continue;
if (Dict.find(Name) != std::string::npos)
continue;
Dict += Name;
}
if (Dict.size() >= std::numeric_limits<uint16_t>::max()) {
fprintf(stderr, "Dictionary too big to be serialized");
exit(1);
}
auto Bytes = dumpIndex(Dict);
return {Dict, Bytes};
}
// Returns the set of keys gathered by collectKeys() starting from the root
// (presumably the label of every node in the tree).
std::set<std::string> getNameFragments() {
  std::set<std::string> Fragments;
  collectKeys(Root.get(), Fragments);
  return Fragments;
}
// Maps a character that is valid in a Unicode character name to its position
// in Letters, which fits in 6 bits. Asserts when C is not part of Letters.
static uint8_t letter(char C) {
  const auto Index = Letters.find(C);
  assert(Index != std::string::npos &&
         "Invalid letter in Unicode character name");
  return Index;
}
// Serializes the trie into a flat byte array and returns it.
//
// Dict is the string in which every multi-character node name must occur as
// a substring; such nodes store the position of their name inside Dict rather
// than the name itself.
//
// Nodes are written in breadth-first order, so the children of a node are
// laid out consecutively. A node therefore only records the offset of its
// first child plus a "has sibling" flag.
//
// The table below shows a node that has a long name, a value and children.
// Bit 0 is the most significant bit of the node's first byte. The 16-bit
// dictionary index is only present for multi-character names, and nodes
// without a value use a shorter encoding (see the comments in the body).
// clang-format off
// +================+===============+======================+=============+========+====+===============+==============+
// | 0              | 1             | 2-7 (6)              | 8-23        | 24-44  | 45 | 46            | 47           |
// +================+===============+======================+=============+========+====+===============+==============+
// | Has Value      | Has Long Name | Letter OR Name Size  | Dict Index  | Value  |    | Has Children  | Has Sibling  |
// +----------------+---------------+----------------------+-------------+--------+----+---------------+--------------+
// clang-format on
std::vector<uint8_t> dumpIndex(const std::string &Dict) {
// Records a placeholder that must later be patched with the offset of
// FirstChild. Offset is the position of the placeholder in Bytes; HasValue
// selects which of the two node encodings the placeholder belongs to.
struct ChildrenOffset {
Node *FirstChild;
std::size_t Offset;
bool HasValue;
};
// Keep track of the start of each node
// position in the serialized data.
std::unordered_map<Node *, int32_t> Offsets;
// Keep track of where to write the index
// of the first children
std::vector<ChildrenOffset> ChildrenOffsets;
// Contains every node that is followed by another child of the same parent.
std::unordered_map<Node *, bool> SiblingTracker;
// Breadth-first work queue of nodes still to be serialized.
std::deque<Node *> AllNodes;
std::vector<uint8_t> Bytes;
Bytes.reserve(250'000);
// This leading byte is used by the reading code to detect the root node.
Bytes.push_back(0);
// Queues all children of a node and marks every child except the last one
// as having a sibling.
auto CollectChildren = [&SiblingTracker, &AllNodes](const auto &Children) {
for (std::size_t Index = 0; Index < Children.size(); Index++) {
const std::unique_ptr<Node> &Child = Children[Index];
AllNodes.push_back(Child.get());
if (Index != Children.size() - 1)
SiblingTracker[Child.get()] = true;
}
};
// The root itself is not serialized; start with its children.
CollectChildren(Root->Children);
while (!AllNodes.empty()) {
const std::size_t Offset = Bytes.size();
Node *const N = AllNodes.front();
AllNodes.pop_front();
assert(!N->Name.empty());
Offsets[N] = Offset;
// Top bit of the first byte: the node carries a code point.
uint8_t FirstByte = (!!N->Value) ? 0x80 : 0;
// Single letter node are indexed in 6 bits
if (N->Name.size() == 1) {
FirstByte |= letter(N->Name[0]);
Bytes.push_back(FirstByte);
} else {
// Otherwise we use a 16 bits index
// 0x40 flags a long name; the low 6 bits hold the name length.
// NOTE(review): the length is not masked, so it presumably never exceeds
// 63 (compact() caps merged names at 32) - confirm.
FirstByte = FirstByte | uint8_t(N->Name.size()) | 0x40;
Bytes.push_back(FirstByte);
auto PosInDict = Dict.find(N->Name);
assert(PosInDict != std::string::npos);
// Only the low 16 bits of the position are stored, high byte first.
// NOTE(review): presumably Dict is always shorter than 64 KiB - confirm.
uint8_t Low = PosInDict;
uint8_t High = ((PosInDict >> 8) & 0xFF);
Bytes.push_back(High);
Bytes.push_back(Low);
}
const bool HasSibling = SiblingTracker.count(N) != 0;
const bool HasChildren = N->Children.size() != 0;
if (!!N->Value) {
// The code point is shifted left by 3 and stored in 3 bytes, most
// significant byte first. The freed low bits of the last byte hold the
// flags: 0x01 = has sibling, 0x02 = has children.
uint32_t Value = (*(N->Value) << 3);
uint8_t H = ((Value >> 16) & 0xFF);
uint8_t M = ((Value >> 8) & 0xFF);
uint8_t L = (Value & 0xFF) | uint8_t(HasSibling ? 0x01 : 0) |
uint8_t(HasChildren ? 0x02 : 0);
Bytes.push_back(H);
Bytes.push_back(M);
Bytes.push_back(L);
if (HasChildren) {
// Remember where the 3-byte placeholder starts so it can be patched once
// the offset of the first child is known.
ChildrenOffsets.push_back(
ChildrenOffset{N->Children[0].get(), Bytes.size(), true});
// index of the first children
Bytes.push_back(0x00);
Bytes.push_back(0x00);
Bytes.push_back(0x00);
}
} else {
// When there is no value (that's most intermediate nodes)
// Dispense of the 3 values bytes, and only store
// 1 byte to track whether the node has sibling and children
// + 2 bytes for the index of the first children if necessary.
// That index also uses bits 0-5 of the flags byte (0x80 and 0x40 are
// taken by the sibling and children flags).
uint8_t Byte =
uint8_t(HasSibling ? 0x80 : 0) | uint8_t(HasChildren ? 0x40 : 0);
Bytes.push_back(Byte);
if (HasChildren) {
// The recorded offset points at the flags byte just written, because the
// high bits of the child offset are OR'ed into it later.
ChildrenOffsets.emplace_back(
ChildrenOffset{N->Children[0].get(), Bytes.size() - 1, false});
Bytes.push_back(0x00);
Bytes.push_back(0x00);
}
}
CollectChildren(N->Children);
}
// Once all the nodes are in the index
// Fill the bytes we left to indicate the position
// of the children
for (const ChildrenOffset &Parent : ChildrenOffsets) {
const auto It = Offsets.find(Parent.FirstChild);
assert(It != Offsets.end());
std::size_t Pos = It->second;
if (Parent.HasValue) {
// Dedicated 3-byte placeholder: overwrite the high byte.
Bytes[Parent.Offset] = ((Pos >> 16) & 0xFF);
} else {
// Shared flags byte: merge the high bits in and keep the flags.
// NOTE(review): this assumes Pos fits in 22 bits, otherwise the flag
// bits would be corrupted - confirm.
Bytes[Parent.Offset] =
Bytes[Parent.Offset] | uint8_t((Pos >> 16) & 0xFF);
}
Bytes[Parent.Offset + 1] = ((Pos >> 8) & 0xFF);
Bytes[Parent.Offset + 2] = Pos & 0xFF;
}
// Add some padding so that the deserialization code
// doesn't try to read past the end of the array.
Bytes.push_back(0);
Bytes.push_back(0);
Bytes.push_back(0);
Bytes.push_back(0);
Bytes.push_back(0);
Bytes.push_back(0);
return Bytes;
}
private:
// Adds the name of N and of every node below it to Keys.
// The subtree is walked with an explicit worklist; since Keys is a set, the
// visiting order has no influence on the result.
void collectKeys(Node *N, std::set<std::string> &Keys) {
  std::vector<Node *> Pending;
  Pending.push_back(N);
  while (!Pending.empty()) {
    Node *const Current = Pending.back();
    Pending.pop_back();
    Keys.insert(Current->Name);
    for (const std::unique_ptr<Node> &Descendant : Current->Children)
      Pending.push_back(Descendant.get());
  }
}
// Merge sequences of 1-character nodes.
// This greatly reduces the total number of nodes,
// and therefore the size of the index.
// When the tree gets serialized, only a few bits of a node's first byte are
// available to store the size of a name. Overlong names (>32 characters) are
// therefore kept in separate nodes.
//
// The subtree rooted at N is processed bottom-up. Afterwards N is folded
// into its parent when it is the parent's only child, the parent carries no
// value and the combined name is at most 32 characters long.
// When that happens N is destroyed, so callers must not use N afterwards.
void compact(Node *N) {
  // Work on a snapshot of the child pointers: compacting an only child can
  // replace N->Children (and destroy that child), which would invalidate an
  // iteration running directly over the vector. A child can only be folded
  // into N when it is N's sole child, so no pointer in the snapshot is used
  // after its node has been destroyed.
  std::vector<Node *> Kids;
  Kids.reserve(N->Children.size());
  for (const std::unique_ptr<Node> &Child : N->Children)
    Kids.push_back(Child.get());
  for (Node *Kid : Kids)
    compact(Kid);

  Node *const Parent = N->Parent;
  if (!Parent || Parent->Children.size() != 1 || Parent->Value ||
      Parent->Name.size() + N->Name.size() > 32)
    return;

  Parent->Value = N->Value;
  Parent->Name += N->Name;
  // N is owned by Parent->Children, so overwriting that vector destroys N.
  // Detach N's children first so the assignment never reads from a node that
  // is being destroyed, and do not touch N afterwards.
  std::vector<std::unique_ptr<Node>> Adopted = std::move(N->Children);
  Parent->Children = std::move(Adopted);
  for (std::unique_ptr<Node> &Child : Parent->Children)
    Child->Parent = Parent;
}
// A node of the trie.
struct Node {
  // Name is taken by value and moved into place so that callers passing a
  // temporary do not pay for an extra string copy.
  Node(std::string Name, Node *Parent = nullptr)
      : Name(std::move(Name)), Parent(Parent) {}

  // Child nodes, owned by this node.
  std::vector<std::unique_ptr<Node>> Children;
  // Fragment of a character name stored in this node.
  std::string Name;
  // Non-owning pointer to the parent; null for the root.
  Node *Parent = nullptr;
  // Code point attached to this node, if any.
  llvm::Optional<char32_t> Value;
};

// Root of the trie; it has an empty name and no parent.
std::unique_ptr<Node> Root = std::make_unique<Node>("");
};
// Defined at the end of this file.
extern const char *UnicodeLicense;

// Entry point of the generator.
//
// Expects three arguments: the path to UnicodeData.txt, the path to
// NameAliases.txt and the path of the C++ file to generate. The usage text
// is always printed. Every usable name is inserted into a trie, which is
// then compacted, serialized and written to the output file as a dictionary
// string plus a byte array.
//
// Returns EXIT_FAILURE when the argument count is wrong or the output file
// cannot be created or written, EXIT_SUCCESS otherwise.
int main(int argc, char **argv) {
  printf("Unicode name -> codepoint mapping generator\n"
         "Usage: %s UnicodeData.txt NameAliases.txt output\n\n",
         argv[0]);
  printf("NameAliases.txt can be found at "
         "https://unicode.org/Public/14.0.0/ucd/NameAliases.txt\n"
         "UnicodeData.txt can be found at "
         "https://unicode.org/Public/14.0.0/ucd/UnicodeData.txt\n\n");

  if (argc != 4)
    return EXIT_FAILURE;

  FILE *Out = fopen(argv[3], "w");
  if (!Out) {
    printf("Error creating output file.\n");
    return EXIT_FAILURE;
  }

  Trie T;
  uint32_t NameCount = 0;
  std::size_t LongestName = 0;
  auto Entries = loadDataFiles(argv[1], argv[2]);
  for (const auto &Entry : Entries) {
    const char32_t Codepoint = Entry.first;
    const std::string &Name = Entry.second;
    // Ignore names which are not valid.
    if (Name.empty() || !std::all_of(Name.begin(), Name.end(), [](char C) {
          return llvm::is_contained(Letters, C);
        })) {
      continue;
    }
    // char32_t is not a type printf knows about; convert explicitly.
    printf("%06x: %s\n", static_cast<unsigned>(Codepoint), Name.c_str());
    T.insert(Name, Codepoint);
    // Only alphanumeric characters count towards the longest name.
    LongestName =
        std::max(LongestName, std::size_t(llvm::count_if(Name, [](char c) {
                   return llvm::isAlnum(c);
                 })));
    NameCount++;
  }
  T.compact();

  std::pair<std::string, std::vector<uint8_t>> Data = T.serialize();
  const std::string &Dict = Data.first;
  const std::vector<uint8_t> &Tree = Data.second;

  fprintf(Out, R"(
//===------------- Support/UnicodeNameToCodepointGenerated.cpp ------------===//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements mapping the name of a unicode code point to its value.
//
// This file was generated using %s.
// Do not edit manually.
//
//===----------------------------------------------------------------------===//
%s
#include "llvm/Support/Compiler.h"
#include <cstddef>
#include <cstdint>
)",
          argv[0], UnicodeLicense);

  fprintf(Out,
          "namespace llvm { namespace sys { namespace unicode { \n"
          "extern const char *UnicodeNameToCodepointDict;\n"
          "extern const uint8_t *UnicodeNameToCodepointIndex;\n"
          "extern const std::size_t UnicodeNameToCodepointIndexSize;\n"
          "extern const std::size_t UnicodeNameToCodepointLargestNameSize;\n");

  fprintf(Out, "const char* UnicodeNameToCodepointDict = \"%s\";\n",
          Dict.c_str());

  // std::size_t must be printed with %zu; %lu is wrong on targets where
  // unsigned long is narrower than std::size_t (e.g. 64-bit Windows).
  // The array holds one extra trailing zero, hence the + 1.
  fprintf(Out, "uint8_t UnicodeNameToCodepointIndex_[%zu] = {\n",
          Tree.size() + 1);
  for (auto Byte : Tree) {
    fprintf(Out, "0x%02x,", Byte);
  }
  fprintf(Out, "0};");
  fprintf(Out, "const uint8_t* UnicodeNameToCodepointIndex = "
               "UnicodeNameToCodepointIndex_; \n");
  fprintf(Out, "const std::size_t UnicodeNameToCodepointIndexSize = %zu;\n",
          Tree.size() + 1);
  fprintf(Out,
          "const std::size_t UnicodeNameToCodepointLargestNameSize = %zu;\n",
          LongestName);
  fprintf(Out, "\n}}}\n");

  // A truncated generated file would be worse than no file at all, so make
  // sure both the writes and the final flush succeeded.
  const bool WriteFailed = ferror(Out) != 0;
  const bool CloseFailed = fclose(Out) != 0;
  if (WriteFailed || CloseFailed) {
    printf("Error writing output file.\n");
    return EXIT_FAILURE;
  }

  printf("Generated %s: %u Files.\nIndex: %f kB, Dictionary: %f kB.\nDone\n\n",
         argv[3], NameCount, Tree.size() / 1024.0, Dict.size() / 1024.0);
  return EXIT_SUCCESS;
}
// Text of the Unicode license for data files and software.
// main() writes this string verbatim into the generated source file, right
// after the LLVM file header. The text is wrapped in a C block comment so it
// is inert in the generated file.
// NOTE(review): "Unicode Inc.s" below looks like it lost an apostrophe -
// confirm against the official license text before changing it.
const char *UnicodeLicense = R"(
/*
UNICODE, INC. LICENSE AGREEMENT - DATA FILES AND SOFTWARE
See Terms of Use <https://www.unicode.org/copyright.html>
for definitions of Unicode Inc.s Data Files and Software.
NOTICE TO USER: Carefully read the following legal agreement.
BY DOWNLOADING, INSTALLING, COPYING OR OTHERWISE USING UNICODE INC.'S
DATA FILES ("DATA FILES"), AND/OR SOFTWARE ("SOFTWARE"),
YOU UNEQUIVOCALLY ACCEPT, AND AGREE TO BE BOUND BY, ALL OF THE
TERMS AND CONDITIONS OF THIS AGREEMENT.
IF YOU DO NOT AGREE, DO NOT DOWNLOAD, INSTALL, COPY, DISTRIBUTE OR USE
THE DATA FILES OR SOFTWARE.
COPYRIGHT AND PERMISSION NOTICE
Copyright © 1991-2022 Unicode, Inc. All rights reserved.
Distributed under the Terms of Use in https://www.unicode.org/copyright.html.
Permission is hereby granted, free of charge, to any person obtaining
a copy of the Unicode data files and any associated documentation
(the "Data Files") or Unicode software and any associated documentation
(the "Software") to deal in the Data Files or Software
without restriction, including without limitation the rights to use,
copy, modify, merge, publish, distribute, and/or sell copies of
the Data Files or Software, and to permit persons to whom the Data Files
or Software are furnished to do so, provided that either
(a) this copyright and permission notice appear with all copies
of the Data Files or Software, or
(b) this copyright and permission notice appear in associated
Documentation.
THE DATA FILES AND SOFTWARE ARE PROVIDED "AS IS", WITHOUT WARRANTY OF
ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
NONINFRINGEMENT OF THIRD PARTY RIGHTS.
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR HOLDERS INCLUDED IN THIS
NOTICE BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL
DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE,
DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER
TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
PERFORMANCE OF THE DATA FILES OR SOFTWARE.
Except as contained in this notice, the name of a copyright holder
shall not be used in advertising or otherwise to promote the sale,
use or other dealings in these Data Files or Software without prior
written authorization of the copyright holder.
*/
)";