Mirror of https://github.com/llvm/llvm-project.git, synced 2025-04-25 10:56:06 +00:00

HTML starting tags that span multiple lines were previously not allowed (or rather, only the starting line was lexed as HTML). Doxygen allows those tags, so this PR lets a starting tag span multiple lines. A tag can't span multiple (C-)comments, though; that is most likely a user error. Multiple BCPL comments are fine, since each one is a single line (shown below). Example:

```c
/// <a
/// href="foo"
/// >Aaa</a>b
int Test;
```

Fixes #28321.
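For contrast, a minimal sketch (hypothetical declaration, not taken from the patch) of the case that remains unsupported, where the start tag spans several separate C comments and is therefore lexed as plain text:

```c
/** <a          */
/**  href="foo" */
/**  >Aaa</a>   */
int Test;
```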
920 lines
27 KiB
C++
//===--- CommentLexer.cpp -------------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include "clang/AST/CommentLexer.h"
#include "clang/AST/CommentCommandTraits.h"
#include "clang/Basic/CharInfo.h"
#include "clang/Basic/DiagnosticComment.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/ADT/StringSwitch.h"
#include "llvm/Support/ConvertUTF.h"
#include "llvm/Support/ErrorHandling.h"

namespace clang {
namespace comments {

void Token::dump(const Lexer &L, const SourceManager &SM) const {
  llvm::errs() << "comments::Token Kind=" << Kind << " ";
  Loc.print(llvm::errs(), SM);
  llvm::errs() << " " << Length << " \"" << L.getSpelling(*this, SM) << "\"\n";
}

static inline bool isHTMLNamedCharacterReferenceCharacter(char C) {
  return isLetter(C);
}

static inline bool isHTMLDecimalCharacterReferenceCharacter(char C) {
  return isDigit(C);
}

static inline bool isHTMLHexCharacterReferenceCharacter(char C) {
  return isHexDigit(C);
}

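/// Encode \p CodePoint as UTF-8, allocating the storage from \p Allocator.
/// Returns an empty StringRef if the code point cannot be encoded.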
static inline StringRef convertCodePointToUTF8(
                                      llvm::BumpPtrAllocator &Allocator,
                                      unsigned CodePoint) {
  char *Resolved = Allocator.Allocate<char>(UNI_MAX_UTF8_BYTES_PER_CODE_POINT);
  char *ResolvedPtr = Resolved;
  if (llvm::ConvertCodePointToUTF8(CodePoint, ResolvedPtr))
    return StringRef(Resolved, ResolvedPtr - Resolved);
  else
    return StringRef();
}

namespace {

#include "clang/AST/CommentHTMLTags.inc"
#include "clang/AST/CommentHTMLNamedCharacterReferences.inc"

} // end anonymous namespace

StringRef Lexer::resolveHTMLNamedCharacterReference(StringRef Name) const {
  // Fast path: first check a few of the most widely used named character
  // references.
  return llvm::StringSwitch<StringRef>(Name)
      .Case("amp", "&")
      .Case("lt", "<")
      .Case("gt", ">")
      .Case("quot", "\"")
      .Case("apos", "\'")
      // Slow path.
      .Default(translateHTMLNamedCharacterReferenceToUTF8(Name));
}

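// Resolve the digits of a decimal character reference to UTF-8; for example,
// the "65" in "&#65;" resolves to "A".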
StringRef Lexer::resolveHTMLDecimalCharacterReference(StringRef Name) const {
  unsigned CodePoint = 0;
  for (unsigned i = 0, e = Name.size(); i != e; ++i) {
    assert(isHTMLDecimalCharacterReferenceCharacter(Name[i]));
    CodePoint *= 10;
    CodePoint += Name[i] - '0';
  }
  return convertCodePointToUTF8(Allocator, CodePoint);
}

StringRef Lexer::resolveHTMLHexCharacterReference(StringRef Name) const {
  unsigned CodePoint = 0;
  for (unsigned i = 0, e = Name.size(); i != e; ++i) {
    CodePoint *= 16;
    const char C = Name[i];
    assert(isHTMLHexCharacterReferenceCharacter(C));
    CodePoint += llvm::hexDigitValue(C);
  }
  return convertCodePointToUTF8(Allocator, CodePoint);
}

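// Skip the decoration at the start of a C comment line: any leading
// horizontal whitespace followed by a single '*', e.g. the " * " prefix
// commonly written inside /* ... */ blocks.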
void Lexer::skipLineStartingDecorations() {
  // This function should be called only for C comments
  assert(CommentState == LCS_InsideCComment);

  if (BufferPtr == CommentEnd)
    return;

  const char *NewBufferPtr = BufferPtr;
  while (isHorizontalWhitespace(*NewBufferPtr))
    if (++NewBufferPtr == CommentEnd)
      return;
  if (*NewBufferPtr == '*')
    BufferPtr = NewBufferPtr + 1;
}

namespace {
/// Returns pointer to the first newline character in the string.
const char *findNewline(const char *BufferPtr, const char *BufferEnd) {
  for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
    if (isVerticalWhitespace(*BufferPtr))
      return BufferPtr;
  }
  return BufferEnd;
}

const char *skipNewline(const char *BufferPtr, const char *BufferEnd) {
  if (BufferPtr == BufferEnd)
    return BufferPtr;

  if (*BufferPtr == '\n')
    BufferPtr++;
  else {
    assert(*BufferPtr == '\r');
    BufferPtr++;
    if (BufferPtr != BufferEnd && *BufferPtr == '\n')
      BufferPtr++;
  }
  return BufferPtr;
}

const char *skipNamedCharacterReference(const char *BufferPtr,
                                        const char *BufferEnd) {
  for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
    if (!isHTMLNamedCharacterReferenceCharacter(*BufferPtr))
      return BufferPtr;
  }
  return BufferEnd;
}

const char *skipDecimalCharacterReference(const char *BufferPtr,
                                          const char *BufferEnd) {
  for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
    if (!isHTMLDecimalCharacterReferenceCharacter(*BufferPtr))
      return BufferPtr;
  }
  return BufferEnd;
}

const char *skipHexCharacterReference(const char *BufferPtr,
                                      const char *BufferEnd) {
  for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
    if (!isHTMLHexCharacterReferenceCharacter(*BufferPtr))
      return BufferPtr;
  }
  return BufferEnd;
}

bool isHTMLIdentifierStartingCharacter(char C) {
  return isLetter(C);
}

bool isHTMLIdentifierCharacter(char C) {
  return isAlphanumeric(C);
}

const char *skipHTMLIdentifier(const char *BufferPtr, const char *BufferEnd) {
  for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
    if (!isHTMLIdentifierCharacter(*BufferPtr))
      return BufferPtr;
  }
  return BufferEnd;
}

/// Skip an HTML string quoted in single or double quotes. Escaped quotes
/// inside the string are allowed.
///
/// Returns pointer to closing quote.
const char *skipHTMLQuotedString(const char *BufferPtr, const char *BufferEnd)
{
  const char Quote = *BufferPtr;
  assert(Quote == '\"' || Quote == '\'');

  BufferPtr++;
  for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
    const char C = *BufferPtr;
    if (C == Quote && BufferPtr[-1] != '\\')
      return BufferPtr;
  }
  return BufferEnd;
}

const char *skipWhitespace(const char *BufferPtr, const char *BufferEnd) {
  for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
    if (!isWhitespace(*BufferPtr))
      return BufferPtr;
  }
  return BufferEnd;
}

const char *skipHorizontalWhitespace(const char *BufferPtr,
                                     const char *BufferEnd) {
  for (; BufferPtr != BufferEnd; ++BufferPtr) {
    if (!isHorizontalWhitespace(*BufferPtr))
      return BufferPtr;
  }
  return BufferEnd;
}

bool isWhitespace(const char *BufferPtr, const char *BufferEnd) {
  return skipWhitespace(BufferPtr, BufferEnd) == BufferEnd;
}

bool isCommandNameStartCharacter(char C) {
  return isLetter(C);
}

bool isCommandNameCharacter(char C) {
  return isAlphanumeric(C);
}

const char *skipCommandName(const char *BufferPtr, const char *BufferEnd) {
  for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
    if (!isCommandNameCharacter(*BufferPtr))
      return BufferPtr;
  }
  return BufferEnd;
}

/// Return the one past end pointer for BCPL comments.
/// Handles newlines escaped with a backslash or with the trigraph for
/// backslash (??/).
const char *findBCPLCommentEnd(const char *BufferPtr, const char *BufferEnd) {
  const char *CurPtr = BufferPtr;
  while (CurPtr != BufferEnd) {
    while (!isVerticalWhitespace(*CurPtr)) {
      CurPtr++;
      if (CurPtr == BufferEnd)
        return BufferEnd;
    }
    // We found a newline, check if it is escaped.
    const char *EscapePtr = CurPtr - 1;
    while (isHorizontalWhitespace(*EscapePtr))
      EscapePtr--;

    if (*EscapePtr == '\\' ||
        (EscapePtr - 2 >= BufferPtr && EscapePtr[0] == '/' &&
         EscapePtr[-1] == '?' && EscapePtr[-2] == '?')) {
      // We found an escaped newline.
      CurPtr = skipNewline(CurPtr, BufferEnd);
    } else
      return CurPtr; // Not an escaped newline.
  }
  return BufferEnd;
}

/// Return the one past end pointer for C comments.
/// Very dumb, does not handle escaped newlines or trigraphs.
const char *findCCommentEnd(const char *BufferPtr, const char *BufferEnd) {
  for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
    if (*BufferPtr == '*') {
      assert(BufferPtr + 1 != BufferEnd);
      if (*(BufferPtr + 1) == '/')
        return BufferPtr;
    }
  }
  llvm_unreachable("buffer end hit before '*/' was seen");
}

} // end anonymous namespace

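// Form a token of kind \p Kind covering [BufferPtr, TokEnd) and advance
// BufferPtr past it.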
void Lexer::formTokenWithChars(Token &Result, const char *TokEnd,
                               tok::TokenKind Kind) {
  const unsigned TokLen = TokEnd - BufferPtr;
  Result.setLocation(getSourceLocation(BufferPtr));
  Result.setKind(Kind);
  Result.setLength(TokLen);
#ifndef NDEBUG
  Result.TextPtr = "<UNSET>";
  Result.IntVal = 7;
#endif
  BufferPtr = TokEnd;
}

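// Return a pointer just past the current text token. The token ends at a
// newline or, when ParseCommands is set, at a character that could begin a
// command, HTML entity, or tag; a one-line double-quoted string is treated as
// plain text (see below).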
const char *Lexer::skipTextToken() {
  const char *TokenPtr = BufferPtr;
  assert(TokenPtr < CommentEnd);
  StringRef TokStartSymbols = ParseCommands ? "\n\r\\@\"&<" : "\n\r";

again:
  size_t End =
      StringRef(TokenPtr, CommentEnd - TokenPtr).find_first_of(TokStartSymbols);
  if (End == StringRef::npos)
    return CommentEnd;

  // Doxygen doesn't recognize any commands in a one-line double quotation.
  // If we don't find an ending quotation mark, we pretend it never began.
  if (*(TokenPtr + End) == '\"') {
    TokenPtr += End + 1;
    End = StringRef(TokenPtr, CommentEnd - TokenPtr).find_first_of("\n\r\"");
    if (End != StringRef::npos && *(TokenPtr + End) == '\"')
      TokenPtr += End + 1;
    goto again;
  }
  return TokenPtr + End;
}

void Lexer::lexCommentText(Token &T) {
  assert(CommentState == LCS_InsideBCPLComment ||
         CommentState == LCS_InsideCComment);

  // Handles lexing non-command text, i.e. text and newline.
  auto HandleNonCommandToken = [&]() -> void {
    assert(State == LS_Normal);

    const char *TokenPtr = BufferPtr;
    assert(TokenPtr < CommentEnd);
    switch (*TokenPtr) {
    case '\n':
    case '\r':
      TokenPtr = skipNewline(TokenPtr, CommentEnd);
      formTokenWithChars(T, TokenPtr, tok::newline);

      if (CommentState == LCS_InsideCComment)
        skipLineStartingDecorations();
      return;

    default:
      return formTextToken(T, skipTextToken());
    }
  };

  if (!ParseCommands)
    return HandleNonCommandToken();

  switch (State) {
  case LS_Normal:
    break;
  case LS_VerbatimBlockFirstLine:
    lexVerbatimBlockFirstLine(T);
    return;
  case LS_VerbatimBlockBody:
    lexVerbatimBlockBody(T);
    return;
  case LS_VerbatimLineText:
    lexVerbatimLineText(T);
    return;
  case LS_HTMLStartTag:
    lexHTMLStartTag(T);
    return;
  case LS_HTMLEndTag:
    lexHTMLEndTag(T);
    return;
  }

  assert(State == LS_Normal);
  const char *TokenPtr = BufferPtr;
  assert(TokenPtr < CommentEnd);
  switch(*TokenPtr) {
    case '\\':
    case '@': {
      // Commands that start with a backslash and commands that start with
      // 'at' have equivalent semantics. But we keep information about the
      // exact syntax in AST for comments.
      tok::TokenKind CommandKind =
          (*TokenPtr == '@') ? tok::at_command : tok::backslash_command;
      TokenPtr++;
      if (TokenPtr == CommentEnd) {
        formTextToken(T, TokenPtr);
        return;
      }
      char C = *TokenPtr;
      switch (C) {
      default:
        break;

      case '\\': case '@': case '&': case '$':
      case '#': case '<': case '>': case '%':
      case '\"': case '.': case ':':
        // This is one of \\ \@ \& \$ etc escape sequences.
        TokenPtr++;
        if (C == ':' && TokenPtr != CommentEnd && *TokenPtr == ':') {
          // This is the \:: escape sequence.
          TokenPtr++;
        }
        StringRef UnescapedText(BufferPtr + 1, TokenPtr - (BufferPtr + 1));
        formTokenWithChars(T, TokenPtr, tok::text);
        T.setText(UnescapedText);
        return;
      }

      // Don't make zero-length commands.
      if (!isCommandNameStartCharacter(*TokenPtr)) {
        formTextToken(T, TokenPtr);
        return;
      }

      TokenPtr = skipCommandName(TokenPtr, CommentEnd);
      unsigned Length = TokenPtr - (BufferPtr + 1);

      // Hardcoded support for lexing LaTeX formula commands
      // \f$ \f( \f) \f[ \f] \f{ \f} as a single command.
      if (Length == 1 && TokenPtr[-1] == 'f' && TokenPtr != CommentEnd) {
        C = *TokenPtr;
        if (C == '$' || C == '(' || C == ')' || C == '[' || C == ']' ||
            C == '{' || C == '}') {
          TokenPtr++;
          Length++;
        }
      }

      StringRef CommandName(BufferPtr + 1, Length);

      const CommandInfo *Info = Traits.getCommandInfoOrNULL(CommandName);
      if (!Info) {
        if ((Info = Traits.getTypoCorrectCommandInfo(CommandName))) {
          StringRef CorrectedName = Info->Name;
          SourceLocation Loc = getSourceLocation(BufferPtr);
          SourceLocation EndLoc = getSourceLocation(TokenPtr);
          SourceRange FullRange = SourceRange(Loc, EndLoc);
          SourceRange CommandRange(Loc.getLocWithOffset(1), EndLoc);
          Diag(Loc, diag::warn_correct_comment_command_name)
              << FullRange << CommandName << CorrectedName
              << FixItHint::CreateReplacement(CommandRange, CorrectedName);
        } else {
          formTokenWithChars(T, TokenPtr, tok::unknown_command);
          T.setUnknownCommandName(CommandName);
          Diag(T.getLocation(), diag::warn_unknown_comment_command_name)
              << SourceRange(T.getLocation(), T.getEndLocation());
          return;
        }
      }
      if (Info->IsVerbatimBlockCommand) {
        setupAndLexVerbatimBlock(T, TokenPtr, *BufferPtr, Info);
        return;
      }
      if (Info->IsVerbatimLineCommand) {
        setupAndLexVerbatimLine(T, TokenPtr, Info);
        return;
      }
      formTokenWithChars(T, TokenPtr, CommandKind);
      T.setCommandID(Info->getID());
      return;
    }

    case '&':
      lexHTMLCharacterReference(T);
      return;

    case '<': {
      TokenPtr++;
      if (TokenPtr == CommentEnd) {
        formTextToken(T, TokenPtr);
        return;
      }
      const char C = *TokenPtr;
      if (isHTMLIdentifierStartingCharacter(C))
        setupAndLexHTMLStartTag(T);
      else if (C == '/')
        setupAndLexHTMLEndTag(T);
      else
        formTextToken(T, TokenPtr);
      return;
    }

    default:
      return HandleNonCommandToken();
  }
}

void Lexer::setupAndLexVerbatimBlock(Token &T,
                                     const char *TextBegin,
                                     char Marker, const CommandInfo *Info) {
  assert(Info->IsVerbatimBlockCommand);

  VerbatimBlockEndCommandName.clear();
  VerbatimBlockEndCommandName.append(Marker == '\\' ? "\\" : "@");
  VerbatimBlockEndCommandName.append(Info->EndCommandName);

  formTokenWithChars(T, TextBegin, tok::verbatim_block_begin);
  T.setVerbatimBlockID(Info->getID());

  // If there is a newline following the verbatim opening command, skip the
  // newline so that we don't create a tok::verbatim_block_line with empty
  // text content.
  if (BufferPtr != CommentEnd &&
      isVerticalWhitespace(*BufferPtr)) {
    BufferPtr = skipNewline(BufferPtr, CommentEnd);
    State = LS_VerbatimBlockBody;
    return;
  }

  State = LS_VerbatimBlockFirstLine;
}

void Lexer::lexVerbatimBlockFirstLine(Token &T) {
again:
  assert(BufferPtr < CommentEnd);

  // FIXME: It would be better to scan the text once, finding either the block
  // end command or newline.
  //
  // Extract current line.
  const char *Newline = findNewline(BufferPtr, CommentEnd);
  StringRef Line(BufferPtr, Newline - BufferPtr);

  // Look for end command in current line.
  size_t Pos = Line.find(VerbatimBlockEndCommandName);
  const char *TextEnd;
  const char *NextLine;
  if (Pos == StringRef::npos) {
    // Current line is completely verbatim.
    TextEnd = Newline;
    NextLine = skipNewline(Newline, CommentEnd);
  } else if (Pos == 0) {
    // Current line contains just an end command.
    const char *End = BufferPtr + VerbatimBlockEndCommandName.size();
    StringRef Name(BufferPtr + 1, End - (BufferPtr + 1));
    formTokenWithChars(T, End, tok::verbatim_block_end);
    T.setVerbatimBlockID(Traits.getCommandInfo(Name)->getID());
    State = LS_Normal;
    return;
  } else {
    // There is some text, followed by end command. Extract text first.
    TextEnd = BufferPtr + Pos;
    NextLine = TextEnd;
    // If there is only whitespace before end command, skip whitespace.
    if (isWhitespace(BufferPtr, TextEnd)) {
      BufferPtr = TextEnd;
      goto again;
    }
  }

  StringRef Text(BufferPtr, TextEnd - BufferPtr);
  formTokenWithChars(T, NextLine, tok::verbatim_block_line);
  T.setVerbatimBlockText(Text);

  State = LS_VerbatimBlockBody;
}

void Lexer::lexVerbatimBlockBody(Token &T) {
  assert(State == LS_VerbatimBlockBody);

  if (CommentState == LCS_InsideCComment)
    skipLineStartingDecorations();

  if (BufferPtr == CommentEnd) {
    formTokenWithChars(T, BufferPtr, tok::verbatim_block_line);
    T.setVerbatimBlockText("");
    return;
  }

  lexVerbatimBlockFirstLine(T);
}

void Lexer::setupAndLexVerbatimLine(Token &T, const char *TextBegin,
                                    const CommandInfo *Info) {
  assert(Info->IsVerbatimLineCommand);
  formTokenWithChars(T, TextBegin, tok::verbatim_line_name);
  T.setVerbatimLineID(Info->getID());

  State = LS_VerbatimLineText;
}

void Lexer::lexVerbatimLineText(Token &T) {
  assert(State == LS_VerbatimLineText);

  // Extract current line.
  const char *Newline = findNewline(BufferPtr, CommentEnd);
  StringRef Text(BufferPtr, Newline - BufferPtr);
  formTokenWithChars(T, Newline, tok::verbatim_line_text);
  T.setVerbatimLineText(Text);

  State = LS_Normal;
}

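// Lex an HTML character reference: named (&amp;), decimal (&#65;) or
// hexadecimal (&#x41;). A malformed or unresolvable reference is lexed as
// plain text instead.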
void Lexer::lexHTMLCharacterReference(Token &T) {
  const char *TokenPtr = BufferPtr;
  assert(*TokenPtr == '&');
  TokenPtr++;
  if (TokenPtr == CommentEnd) {
    formTextToken(T, TokenPtr);
    return;
  }
  const char *NamePtr;
  bool isNamed = false;
  bool isDecimal = false;
  char C = *TokenPtr;
  if (isHTMLNamedCharacterReferenceCharacter(C)) {
    NamePtr = TokenPtr;
    TokenPtr = skipNamedCharacterReference(TokenPtr, CommentEnd);
    isNamed = true;
  } else if (C == '#') {
    TokenPtr++;
    if (TokenPtr == CommentEnd) {
      formTextToken(T, TokenPtr);
      return;
    }
    C = *TokenPtr;
    if (isHTMLDecimalCharacterReferenceCharacter(C)) {
      NamePtr = TokenPtr;
      TokenPtr = skipDecimalCharacterReference(TokenPtr, CommentEnd);
      isDecimal = true;
    } else if (C == 'x' || C == 'X') {
      TokenPtr++;
      NamePtr = TokenPtr;
      TokenPtr = skipHexCharacterReference(TokenPtr, CommentEnd);
    } else {
      formTextToken(T, TokenPtr);
      return;
    }
  } else {
    formTextToken(T, TokenPtr);
    return;
  }
  if (NamePtr == TokenPtr || TokenPtr == CommentEnd ||
      *TokenPtr != ';') {
    formTextToken(T, TokenPtr);
    return;
  }
  StringRef Name(NamePtr, TokenPtr - NamePtr);
  TokenPtr++; // Skip semicolon.
  StringRef Resolved;
  if (isNamed)
    Resolved = resolveHTMLNamedCharacterReference(Name);
  else if (isDecimal)
    Resolved = resolveHTMLDecimalCharacterReference(Name);
  else
    Resolved = resolveHTMLHexCharacterReference(Name);

  if (Resolved.empty()) {
    formTextToken(T, TokenPtr);
    return;
  }
  formTokenWithChars(T, TokenPtr, tok::text);
  T.setText(Resolved);
}

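// Lex the "<tag" part of an HTML start tag. If more of the tag may follow
// (attributes, '>', '/>', or a newline before them), switch to LS_HTMLStartTag
// so that lexHTMLStartTag continues with the rest of the tag.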
void Lexer::setupAndLexHTMLStartTag(Token &T) {
  assert(BufferPtr[0] == '<' &&
         isHTMLIdentifierStartingCharacter(BufferPtr[1]));
  const char *TagNameEnd = skipHTMLIdentifier(BufferPtr + 2, CommentEnd);
  StringRef Name(BufferPtr + 1, TagNameEnd - (BufferPtr + 1));
  if (!isHTMLTagName(Name)) {
    formTextToken(T, TagNameEnd);
    return;
  }

  formTokenWithChars(T, TagNameEnd, tok::html_start_tag);
  T.setHTMLTagStartName(Name);

  BufferPtr = skipHorizontalWhitespace(BufferPtr, CommentEnd);
  if (BufferPtr == CommentEnd) { // in BCPL comments
    State = LS_HTMLStartTag;
    return;
  }

  const char C = *BufferPtr;
  if (BufferPtr != CommentEnd &&
      (C == '>' || C == '/' || isVerticalWhitespace(C) ||
       isHTMLIdentifierStartingCharacter(C)))
    State = LS_HTMLStartTag;
}

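// Lex the remainder of an HTML start tag: attribute names, '=', quoted values,
// and the closing '>' or '/>'. Newlines and C comment decorations before the
// next part of the tag are skipped, which is what lets a start tag span
// multiple lines of the same comment.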
void Lexer::lexHTMLStartTag(Token &T) {
  assert(State == LS_HTMLStartTag);

  // Skip leading whitespace and comment decorations
  while (isVerticalWhitespace(*BufferPtr)) {
    BufferPtr = skipNewline(BufferPtr, CommentEnd);

    if (CommentState == LCS_InsideCComment)
      skipLineStartingDecorations();

    BufferPtr = skipHorizontalWhitespace(BufferPtr, CommentEnd);
    if (BufferPtr == CommentEnd) {
      // HTML starting tags must be defined in a single comment block.
      // It's likely a user-error where they forgot to terminate the comment.
      State = LS_Normal;
      // Since at least one newline was skipped and one token needs to be lexed,
      // return a newline.
      formTokenWithChars(T, BufferPtr, tok::newline);
      return;
    }
  }

  const char *TokenPtr = BufferPtr;
  char C = *TokenPtr;
  if (isHTMLIdentifierCharacter(C)) {
    TokenPtr = skipHTMLIdentifier(TokenPtr, CommentEnd);
    StringRef Ident(BufferPtr, TokenPtr - BufferPtr);
    formTokenWithChars(T, TokenPtr, tok::html_ident);
    T.setHTMLIdent(Ident);
  } else {
    switch (C) {
    case '=':
      TokenPtr++;
      formTokenWithChars(T, TokenPtr, tok::html_equals);
      break;
    case '\"':
    case '\'': {
      const char *OpenQuote = TokenPtr;
      TokenPtr = skipHTMLQuotedString(TokenPtr, CommentEnd);
      const char *ClosingQuote = TokenPtr;
      if (TokenPtr != CommentEnd) // Skip closing quote.
        TokenPtr++;
      formTokenWithChars(T, TokenPtr, tok::html_quoted_string);
      T.setHTMLQuotedString(StringRef(OpenQuote + 1,
                                      ClosingQuote - (OpenQuote + 1)));
      break;
    }
    case '>':
      TokenPtr++;
      formTokenWithChars(T, TokenPtr, tok::html_greater);
      State = LS_Normal;
      return;
    case '/':
      TokenPtr++;
      if (TokenPtr != CommentEnd && *TokenPtr == '>') {
        TokenPtr++;
        formTokenWithChars(T, TokenPtr, tok::html_slash_greater);
      } else
        formTextToken(T, TokenPtr);

      State = LS_Normal;
      return;
    }
  }

  // Now look ahead and return to normal state if we don't see any HTML tokens
  // ahead.
  BufferPtr = skipHorizontalWhitespace(BufferPtr, CommentEnd);
  if (BufferPtr == CommentEnd) {
    return;
  }

  C = *BufferPtr;
  if (!isHTMLIdentifierStartingCharacter(C) && !isVerticalWhitespace(C) &&
      C != '=' && C != '\"' && C != '\'' && C != '>' && C != '/') {
    State = LS_Normal;
    return;
  }
}

void Lexer::setupAndLexHTMLEndTag(Token &T) {
  assert(BufferPtr[0] == '<' && BufferPtr[1] == '/');

  const char *TagNameBegin = skipWhitespace(BufferPtr + 2, CommentEnd);
  const char *TagNameEnd = skipHTMLIdentifier(TagNameBegin, CommentEnd);
  StringRef Name(TagNameBegin, TagNameEnd - TagNameBegin);
  if (!isHTMLTagName(Name)) {
    formTextToken(T, TagNameEnd);
    return;
  }

  const char *End = skipWhitespace(TagNameEnd, CommentEnd);

  formTokenWithChars(T, End, tok::html_end_tag);
  T.setHTMLTagEndName(Name);

  if (BufferPtr != CommentEnd && *BufferPtr == '>')
    State = LS_HTMLEndTag;
}

void Lexer::lexHTMLEndTag(Token &T) {
  assert(BufferPtr != CommentEnd && *BufferPtr == '>');

  formTokenWithChars(T, BufferPtr + 1, tok::html_greater);
  State = LS_Normal;
}

Lexer::Lexer(llvm::BumpPtrAllocator &Allocator, DiagnosticsEngine &Diags,
             const CommandTraits &Traits, SourceLocation FileLoc,
             const char *BufferStart, const char *BufferEnd, bool ParseCommands)
    : Allocator(Allocator), Diags(Diags), Traits(Traits),
      BufferStart(BufferStart), BufferEnd(BufferEnd), BufferPtr(BufferStart),
      FileLoc(FileLoc), ParseCommands(ParseCommands),
      CommentState(LCS_BeforeComment), State(LS_Normal) {}

void Lexer::lex(Token &T) {
again:
  switch (CommentState) {
  case LCS_BeforeComment:
    if (BufferPtr == BufferEnd) {
      formTokenWithChars(T, BufferPtr, tok::eof);
      return;
    }

    assert(*BufferPtr == '/');
    BufferPtr++; // Skip first slash.
    switch(*BufferPtr) {
    case '/': { // BCPL comment.
      BufferPtr++; // Skip second slash.

      if (BufferPtr != BufferEnd) {
        // Skip Doxygen magic marker, if it is present.
        // It might be missing because of a typo //< or /*<, or because we
        // merged this non-Doxygen comment into a bunch of Doxygen comments
        // around it: /** ... */ /* ... */ /** ... */
        const char C = *BufferPtr;
        if (C == '/' || C == '!')
          BufferPtr++;
      }

      // Skip less-than symbol that marks trailing comments.
      // Skip it even if the comment is not a Doxygen one, because //< and /*<
      // are frequent typos.
      if (BufferPtr != BufferEnd && *BufferPtr == '<')
        BufferPtr++;

      CommentState = LCS_InsideBCPLComment;
      switch (State) {
      case LS_VerbatimBlockFirstLine:
      case LS_VerbatimBlockBody:
        break;
      case LS_HTMLStartTag:
        BufferPtr = skipHorizontalWhitespace(BufferPtr, BufferEnd);
        break;
      default:
        State = LS_Normal;
        break;
      }
      CommentEnd = findBCPLCommentEnd(BufferPtr, BufferEnd);
      goto again;
    }
    case '*': { // C comment.
      BufferPtr++; // Skip star.

      // Skip Doxygen magic marker.
      const char C = *BufferPtr;
      if ((C == '*' && *(BufferPtr + 1) != '/') || C == '!')
        BufferPtr++;

      // Skip less-than symbol that marks trailing comments.
      if (BufferPtr != BufferEnd && *BufferPtr == '<')
        BufferPtr++;

      CommentState = LCS_InsideCComment;
      State = LS_Normal;
      CommentEnd = findCCommentEnd(BufferPtr, BufferEnd);
      goto again;
    }
    default:
      llvm_unreachable("second character of comment should be '/' or '*'");
    }

  case LCS_BetweenComments: {
    // Consecutive comments are extracted only if there is only whitespace
    // between them. So we can search for the start of the next comment.
    const char *EndWhitespace = BufferPtr;
    while(EndWhitespace != BufferEnd && *EndWhitespace != '/')
      EndWhitespace++;

    // When lexing the start of an HTML tag (i.e. going through the attributes)
    // there won't be any newlines generated.
    if (State == LS_HTMLStartTag && EndWhitespace != BufferEnd) {
      CommentState = LCS_BeforeComment;
      BufferPtr = EndWhitespace;
      goto again;
    }

    // Turn any whitespace between comments (and there is only whitespace
    // between them -- guaranteed by comment extraction) into a newline. We
    // have two newlines between C comments in total (first one was synthesized
    // after a comment).
    formTokenWithChars(T, EndWhitespace, tok::newline);

    CommentState = LCS_BeforeComment;
    break;
  }

  case LCS_InsideBCPLComment:
  case LCS_InsideCComment:
    if (BufferPtr != CommentEnd) {
      lexCommentText(T);
      break;
    } else {
      // Skip C comment closing sequence.
      if (CommentState == LCS_InsideCComment) {
        assert(BufferPtr[0] == '*' && BufferPtr[1] == '/');
        BufferPtr += 2;
        assert(BufferPtr <= BufferEnd);

        // When lexing the start of an HTML tag (i.e. going through the
        // attributes) there won't be any newlines generated - whitespace still
        // needs to be skipped.
        if (State == LS_HTMLStartTag && BufferPtr != BufferEnd) {
          CommentState = LCS_BetweenComments;
          goto again;
        }

        // Synthesize a newline just after the C comment, regardless of whether
        // there is actually a newline.
        formTokenWithChars(T, BufferPtr, tok::newline);

        CommentState = LCS_BetweenComments;
        break;
      } else {
        // Don't synthesize a newline after a BCPL comment.
        CommentState = LCS_BetweenComments;
        goto again;
      }
    }
  }
}

StringRef Lexer::getSpelling(const Token &Tok,
                             const SourceManager &SourceMgr) const {
  SourceLocation Loc = Tok.getLocation();
  std::pair<FileID, unsigned> LocInfo = SourceMgr.getDecomposedLoc(Loc);

  bool InvalidTemp = false;
  StringRef File = SourceMgr.getBufferData(LocInfo.first, &InvalidTemp);
  if (InvalidTemp)
    return StringRef();

  const char *Begin = File.data() + LocInfo.second;
  return StringRef(Begin, Tok.getLength());
}

} // end namespace comments
} // end namespace clang