llvm-project/clang/lib/AST/CommentLexer.cpp
Aaron Puchert 196554d42d Comment parsing: Complete list of Doxygen commands
These should be all the commands from [1] except those that are marked
obsolete, and "link" / "endlink", as that conflicts with the existing
HeaderDoc pair "link" / "/link". For some commands we don't have the
ideal category, but it should work well enough for most cases.

There seems to be no existing test for most commands (except the ones
interpreted by -Wdocumentation), and to some extent such a test wouldn't
look very interesting. But I added a test for the correct parsing of
formulas, as they're a bit special. And I had to adapt
comment-lots-of-unknown-commands.c because typo correction was kicking
in and recognizing some of the commands.

This should fix a couple of reported bugs: PR17437, PR19581, PR24062
(partially, no diagnostic for matching cond/endcond), PR32909, PR37813,
PR44243 (partially, email@domain.com must be addressed separately).

[1] https://www.doxygen.nl/manual/commands.html

Reviewed By: gribozavr2

Differential Revision: https://reviews.llvm.org/D111190
2021-11-09 18:35:26 +01:00

868 lines
25 KiB
C++

//===--- CommentLexer.cpp -------------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
#include "clang/AST/CommentLexer.h"
#include "clang/AST/CommentCommandTraits.h"
#include "clang/AST/CommentDiagnostic.h"
#include "clang/Basic/CharInfo.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/ADT/StringSwitch.h"
#include "llvm/Support/ConvertUTF.h"
#include "llvm/Support/ErrorHandling.h"
namespace clang {
namespace comments {
/// Debugging aid: writes the token's kind, its source location, its length
/// and its spelling (as reported by \p L) to llvm::errs() on a single line.
void Token::dump(const Lexer &L, const SourceManager &SM) const {
  auto &OS = llvm::errs();
  OS << "comments::Token Kind=" << Kind << " ";
  Loc.print(OS, SM);
  OS << " " << Length << " \"" << L.getSpelling(*this, SM) << "\"\n";
}
/// Tells whether \p C may appear in the name of a named HTML character
/// reference; only characters accepted by isLetter() qualify.
static inline bool isHTMLNamedCharacterReferenceCharacter(char C) {
  const bool IsNameCharacter = isLetter(C);
  return IsNameCharacter;
}
/// Tells whether \p C may appear in the numeric part of a decimal HTML
/// character reference; only characters accepted by isDigit() qualify.
static inline bool isHTMLDecimalCharacterReferenceCharacter(char C) {
  const bool IsDecimalDigit = isDigit(C);
  return IsDecimalDigit;
}
/// Tells whether \p C may appear in the numeric part of a hexadecimal HTML
/// character reference; only characters accepted by isHexDigit() qualify.
static inline bool isHTMLHexCharacterReferenceCharacter(char C) {
  const bool IsHexadecimalDigit = isHexDigit(C);
  return IsHexadecimalDigit;
}
/// Encodes \p CodePoint as UTF-8 into storage obtained from \p Allocator.
///
/// \returns the encoded bytes, or an empty StringRef when
/// llvm::ConvertCodePointToUTF8 reports failure.
static inline StringRef
convertCodePointToUTF8(llvm::BumpPtrAllocator &Allocator, unsigned CodePoint) {
  // Reserve room for the longest possible encoding of one code point.
  char *const Storage =
      Allocator.Allocate<char>(UNI_MAX_UTF8_BYTES_PER_CODE_POINT);
  // The converter advances this cursor past the bytes it wrote.
  char *Cursor = Storage;
  if (!llvm::ConvertCodePointToUTF8(CodePoint, Cursor))
    return StringRef();
  return StringRef(Storage, Cursor - Storage);
}
namespace {
#include "clang/AST/CommentHTMLTags.inc"
#include "clang/AST/CommentHTMLNamedCharacterReferences.inc"
} // end anonymous namespace
/// Translates the name of a named HTML character reference (the text between
/// '&' and ';') into the text it stands for.
///
/// \returns the replacement text; for names outside the hard-coded list the
/// result is whatever translateHTMLNamedCharacterReferenceToUTF8() returns.
StringRef Lexer::resolveHTMLNamedCharacterReference(StringRef Name) const {
  // Fast path, first check a few most widely used named character references.
  // The table lookup must not be passed as the argument of .Default(): the
  // argument would be evaluated unconditionally, defeating the fast path.
  StringRef Common = llvm::StringSwitch<StringRef>(Name)
                         .Case("amp", "&")
                         .Case("lt", "<")
                         .Case("gt", ">")
                         .Case("quot", "\"")
                         .Case("apos", "\'")
                         .Default(StringRef());
  // Every fast-path replacement is non-empty, so an empty result means that
  // none of the cases above matched.
  if (!Common.empty())
    return Common;
  // Slow path.
  return translateHTMLNamedCharacterReferenceToUTF8(Name);
}
/// Resolves a decimal HTML character reference.
///
/// \param Name the digits between "&#" and ";"; every character must be a
/// decimal digit (asserted).
/// \returns the result of convertCodePointToUTF8() for the parsed value.
StringRef Lexer::resolveHTMLDecimalCharacterReference(StringRef Name) const {
  // One past the largest Unicode code point. Once the running value reaches
  // this bound it can never become valid again, so accumulation stops. This
  // keeps the unsigned arithmetic from wrapping around on very long digit
  // strings and accidentally producing a valid code point.
  const unsigned OutOfRange = 0x110000;
  unsigned CodePoint = 0;
  for (unsigned i = 0, e = Name.size(); i != e; ++i) {
    assert(isHTMLDecimalCharacterReferenceCharacter(Name[i]));
    if (CodePoint >= OutOfRange)
      continue;
    CodePoint = CodePoint * 10 + (Name[i] - '0');
  }
  // Out-of-range values are still handed to the converter, exactly as before.
  return convertCodePointToUTF8(Allocator, CodePoint);
}
/// Resolves a hexadecimal HTML character reference.
///
/// \param Name the digits between "&#x" and ";"; every character must be a
/// hexadecimal digit (asserted).
/// \returns the result of convertCodePointToUTF8() for the parsed value.
StringRef Lexer::resolveHTMLHexCharacterReference(StringRef Name) const {
  // One past the largest Unicode code point. Once the running value reaches
  // this bound it can never become valid again, so accumulation stops. This
  // keeps the unsigned arithmetic from wrapping around on very long digit
  // strings and accidentally producing a valid code point.
  const unsigned OutOfRange = 0x110000;
  unsigned CodePoint = 0;
  for (unsigned i = 0, e = Name.size(); i != e; ++i) {
    const char C = Name[i];
    assert(isHTMLHexCharacterReferenceCharacter(C));
    if (CodePoint >= OutOfRange)
      continue;
    CodePoint = CodePoint * 16 + llvm::hexDigitValue(C);
  }
  // Out-of-range values are still handed to the converter, exactly as before.
  return convertCodePointToUTF8(Allocator, CodePoint);
}
/// Skips the decoration at the beginning of a line inside a C comment:
/// optional horizontal whitespace followed by a '*'. If the leading
/// whitespace is not followed by a '*' (or the comment ends first), BufferPtr
/// is left untouched.
void Lexer::skipLineStartingDecorations() {
  // This function should be called only for C comments
  assert(CommentState == LCS_InsideCComment);

  if (BufferPtr == CommentEnd)
    return;

  const char First = *BufferPtr;
  if (First == '*') {
    ++BufferPtr;
    return;
  }
  if (First != ' ' && First != '\t' && First != '\f' && First != '\v')
    return;

  // Look past the run of horizontal whitespace without committing to it yet.
  const char *Cursor = BufferPtr + 1;
  while (Cursor != CommentEnd && isHorizontalWhitespace(*Cursor))
    ++Cursor;

  // Only consume the whitespace when it really leads up to a '*'.
  if (Cursor != CommentEnd && *Cursor == '*')
    BufferPtr = Cursor + 1;
}
namespace {
/// Finds the first vertical-whitespace character in [BufferPtr, BufferEnd).
///
/// \returns a pointer to that character, or BufferEnd if there is none.
const char *findNewline(const char *BufferPtr, const char *BufferEnd) {
  const char *Cursor = BufferPtr;
  while (Cursor != BufferEnd && !isVerticalWhitespace(*Cursor))
    ++Cursor;
  return Cursor;
}
/// Steps over one line terminator ("\n", "\r" or "\r\n") at BufferPtr.
///
/// BufferPtr must either equal BufferEnd (nothing is skipped) or point at
/// '\n' or '\r' (asserted).
/// \returns the position just after the terminator.
const char *skipNewline(const char *BufferPtr, const char *BufferEnd) {
  if (BufferPtr == BufferEnd)
    return BufferPtr;

  if (*BufferPtr == '\n')
    return BufferPtr + 1;

  assert(*BufferPtr == '\r');
  const char *AfterCR = BufferPtr + 1;
  // Treat "\r\n" as a single terminator.
  if (AfterCR != BufferEnd && *AfterCR == '\n')
    return AfterCR + 1;
  return AfterCR;
}
/// Advances over characters that may form the name of a named HTML character
/// reference.
///
/// \returns the first position whose character is not such a character, or
/// BufferEnd.
const char *skipNamedCharacterReference(const char *BufferPtr,
                                        const char *BufferEnd) {
  const char *Cursor = BufferPtr;
  while (Cursor != BufferEnd &&
         isHTMLNamedCharacterReferenceCharacter(*Cursor))
    ++Cursor;
  return Cursor;
}
/// Advances over the digits of a decimal HTML character reference.
///
/// \returns the first position whose character is not a decimal
/// character-reference character, or BufferEnd.
const char *skipDecimalCharacterReference(const char *BufferPtr,
                                          const char *BufferEnd) {
  const char *Cursor = BufferPtr;
  while (Cursor != BufferEnd &&
         isHTMLDecimalCharacterReferenceCharacter(*Cursor))
    ++Cursor;
  return Cursor;
}
/// Advances over the digits of a hexadecimal HTML character reference.
///
/// \returns the first position whose character is not a hexadecimal
/// character-reference character, or BufferEnd.
const char *skipHexCharacterReference(const char *BufferPtr,
                                      const char *BufferEnd) {
  const char *Cursor = BufferPtr;
  while (Cursor != BufferEnd && isHTMLHexCharacterReferenceCharacter(*Cursor))
    ++Cursor;
  return Cursor;
}
/// Tells whether \p C can begin an HTML identifier; only characters accepted
/// by isLetter() can.
bool isHTMLIdentifierStartingCharacter(char C) {
  const bool StartsIdentifier = isLetter(C);
  return StartsIdentifier;
}
/// Tells whether \p C can appear inside an HTML identifier; only characters
/// accepted by isAlphanumeric() can.
bool isHTMLIdentifierCharacter(char C) {
  const bool ContinuesIdentifier = isAlphanumeric(C);
  return ContinuesIdentifier;
}
/// Advances over HTML identifier characters.
///
/// \returns the first position whose character is not an HTML identifier
/// character, or BufferEnd.
const char *skipHTMLIdentifier(const char *BufferPtr, const char *BufferEnd) {
  const char *Cursor = BufferPtr;
  while (Cursor != BufferEnd && isHTMLIdentifierCharacter(*Cursor))
    ++Cursor;
  return Cursor;
}
/// Scans an HTML string delimited by single or double quotes. BufferPtr must
/// point at the opening quote (asserted). A quote character that directly
/// follows a backslash does not terminate the string.
///
/// \returns a pointer to the closing quote, or BufferEnd if the string is not
/// terminated.
const char *skipHTMLQuotedString(const char *BufferPtr, const char *BufferEnd)
{
  const char Quote = *BufferPtr;
  assert(Quote == '\"' || Quote == '\'');

  const char *Cursor = BufferPtr + 1;
  while (Cursor != BufferEnd) {
    // Cursor[-1] is always valid: it is at worst the opening quote.
    const bool PrecededByBackslash = Cursor[-1] == '\\';
    if (*Cursor == Quote && !PrecededByBackslash)
      return Cursor;
    ++Cursor;
  }
  return BufferEnd;
}
/// Advances over whitespace characters.
///
/// \returns the first position whose character is not whitespace, or
/// BufferEnd.
const char *skipWhitespace(const char *BufferPtr, const char *BufferEnd) {
  const char *Cursor = BufferPtr;
  while (Cursor != BufferEnd && isWhitespace(*Cursor))
    ++Cursor;
  return Cursor;
}
/// Tells whether the range [BufferPtr, BufferEnd) consists solely of
/// whitespace. An empty range counts as whitespace.
bool isWhitespace(const char *BufferPtr, const char *BufferEnd) {
  const char *FirstNonWhitespace = skipWhitespace(BufferPtr, BufferEnd);
  return FirstNonWhitespace == BufferEnd;
}
/// Tells whether \p C can begin a command name; only characters accepted by
/// isLetter() can.
bool isCommandNameStartCharacter(char C) {
  const bool StartsCommandName = isLetter(C);
  return StartsCommandName;
}
/// Tells whether \p C can appear inside a command name; only characters
/// accepted by isAlphanumeric() can.
bool isCommandNameCharacter(char C) {
  const bool ContinuesCommandName = isAlphanumeric(C);
  return ContinuesCommandName;
}
/// Advances over command-name characters.
///
/// \returns the first position whose character is not a command-name
/// character, or BufferEnd.
const char *skipCommandName(const char *BufferPtr, const char *BufferEnd) {
  const char *Cursor = BufferPtr;
  while (Cursor != BufferEnd && isCommandNameCharacter(*Cursor))
    ++Cursor;
  return Cursor;
}
/// Return the one past end pointer for BCPL comments.
/// Handles newlines escaped with backslash or trigraph for backslahs.
const char *findBCPLCommentEnd(const char *BufferPtr, const char *BufferEnd) {
const char *CurPtr = BufferPtr;
while (CurPtr != BufferEnd) {
while (!isVerticalWhitespace(*CurPtr)) {
CurPtr++;
if (CurPtr == BufferEnd)
return BufferEnd;
}
// We found a newline, check if it is escaped.
const char *EscapePtr = CurPtr - 1;
while(isHorizontalWhitespace(*EscapePtr))
EscapePtr--;
if (*EscapePtr == '\\' ||
(EscapePtr - 2 >= BufferPtr && EscapePtr[0] == '/' &&
EscapePtr[-1] == '?' && EscapePtr[-2] == '?')) {
// We found an escaped newline.
CurPtr = skipNewline(CurPtr, BufferEnd);
} else
return CurPtr; // Not an escaped newline.
}
return BufferEnd;
}
/// Return the one past end pointer for C comments, i.e. a pointer to the '*'
/// of the closing "*/".
/// Very dumb, does not handle escaped newlines or trigraphs. The closing
/// sequence must be present; running off the buffer is treated as
/// unreachable.
const char *findCCommentEnd(const char *BufferPtr, const char *BufferEnd) {
  const char *Cursor = BufferPtr;
  while (Cursor != BufferEnd) {
    if (*Cursor == '*') {
      assert(Cursor + 1 != BufferEnd);
      if (Cursor[1] == '/')
        return Cursor;
    }
    ++Cursor;
  }
  llvm_unreachable("buffer end hit before '*/' was seen");
}
} // end anonymous namespace
/// Fills \p Result with a token of kind \p Kind spanning [BufferPtr, TokEnd)
/// and then advances BufferPtr to \p TokEnd.
void Lexer::formTokenWithChars(Token &Result, const char *TokEnd,
                               tok::TokenKind Kind) {
  Result.setLocation(getSourceLocation(BufferPtr));
  Result.setKind(Kind);
  Result.setLength(static_cast<unsigned>(TokEnd - BufferPtr));
#ifndef NDEBUG
  // Placeholder payload for debug builds; callers overwrite it when the token
  // kind carries data.
  Result.TextPtr = "<UNSET>";
  Result.IntVal = 7;
#endif
  BufferPtr = TokEnd;
}
/// Lexes a single token from the body of the current comment, starting at
/// BufferPtr (which must be before CommentEnd).
///
/// When ParseCommands is false only newline and text tokens are produced.
/// Otherwise the work is dispatched on State: the verbatim and HTML states
/// are delegated to their dedicated lexing functions, and in LS_Normal the
/// first character selects between a command ('\\' or '@'), an HTML character
/// reference ('&'), an HTML tag ('<') and plain text / newline.
void Lexer::lexCommentText(Token &T) {
assert(CommentState == LCS_InsideBCPLComment ||
CommentState == LCS_InsideCComment);
// Handles lexing non-command text, i.e. text and newline.
auto HandleNonCommandToken = [&]() -> void {
assert(State == LS_Normal);
const char *TokenPtr = BufferPtr;
assert(TokenPtr < CommentEnd);
switch (*TokenPtr) {
case '\n':
case '\r':
TokenPtr = skipNewline(TokenPtr, CommentEnd);
formTokenWithChars(T, TokenPtr, tok::newline);
// Inside a C comment the next line may begin with a decorative '*'.
if (CommentState == LCS_InsideCComment)
skipLineStartingDecorations();
return;
default: {
// The text token extends up to the next character that could start a
// different kind of token (or to the end of the comment).
StringRef TokStartSymbols = ParseCommands ? "\n\r\\@&<" : "\n\r";
size_t End = StringRef(TokenPtr, CommentEnd - TokenPtr)
.find_first_of(TokStartSymbols);
if (End != StringRef::npos)
TokenPtr += End;
else
TokenPtr = CommentEnd;
formTextToken(T, TokenPtr);
return;
}
}
};
if (!ParseCommands)
return HandleNonCommandToken();
// Sub-lexers own every state except LS_Normal.
switch (State) {
case LS_Normal:
break;
case LS_VerbatimBlockFirstLine:
lexVerbatimBlockFirstLine(T);
return;
case LS_VerbatimBlockBody:
lexVerbatimBlockBody(T);
return;
case LS_VerbatimLineText:
lexVerbatimLineText(T);
return;
case LS_HTMLStartTag:
lexHTMLStartTag(T);
return;
case LS_HTMLEndTag:
lexHTMLEndTag(T);
return;
}
assert(State == LS_Normal);
const char *TokenPtr = BufferPtr;
assert(TokenPtr < CommentEnd);
switch(*TokenPtr) {
case '\\':
case '@': {
// Commands that start with a backslash and commands that start with
// 'at' have equivalent semantics. But we keep information about the
// exact syntax in AST for comments.
tok::TokenKind CommandKind =
(*TokenPtr == '@') ? tok::at_command : tok::backslash_command;
TokenPtr++;
// A lone marker at the very end of the comment is plain text.
if (TokenPtr == CommentEnd) {
formTextToken(T, TokenPtr);
return;
}
char C = *TokenPtr;
switch (C) {
default:
break;
case '\\': case '@': case '&': case '$':
case '#': case '<': case '>': case '%':
case '\"': case '.': case ':':
// This is one of \\ \@ \& \$ etc escape sequences.
TokenPtr++;
if (C == ':' && TokenPtr != CommentEnd && *TokenPtr == ':') {
// This is the \:: escape sequence.
TokenPtr++;
}
// The token covers the marker too, but its text is only the escaped part.
StringRef UnescapedText(BufferPtr + 1, TokenPtr - (BufferPtr + 1));
formTokenWithChars(T, TokenPtr, tok::text);
T.setText(UnescapedText);
return;
}
// Don't make zero-length commands.
if (!isCommandNameStartCharacter(*TokenPtr)) {
formTextToken(T, TokenPtr);
return;
}
TokenPtr = skipCommandName(TokenPtr, CommentEnd);
// Length of the command name, not counting the leading marker.
unsigned Length = TokenPtr - (BufferPtr + 1);
// Hardcoded support for lexing LaTeX formula commands
// \f$ \f( \f) \f[ \f] \f{ \f} as a single command.
if (Length == 1 && TokenPtr[-1] == 'f' && TokenPtr != CommentEnd) {
C = *TokenPtr;
if (C == '$' || C == '(' || C == ')' || C == '[' || C == ']' ||
C == '{' || C == '}') {
TokenPtr++;
Length++;
}
}
StringRef CommandName(BufferPtr + 1, Length);
const CommandInfo *Info = Traits.getCommandInfoOrNULL(CommandName);
if (!Info) {
// Unknown name: either a typo for a known command (diagnose with a fix-it
// and continue with the corrected command) or a genuinely unknown command.
if ((Info = Traits.getTypoCorrectCommandInfo(CommandName))) {
StringRef CorrectedName = Info->Name;
SourceLocation Loc = getSourceLocation(BufferPtr);
SourceLocation EndLoc = getSourceLocation(TokenPtr);
SourceRange FullRange = SourceRange(Loc, EndLoc);
// The replacement range excludes the leading marker character.
SourceRange CommandRange(Loc.getLocWithOffset(1), EndLoc);
Diag(Loc, diag::warn_correct_comment_command_name)
<< FullRange << CommandName << CorrectedName
<< FixItHint::CreateReplacement(CommandRange, CorrectedName);
} else {
formTokenWithChars(T, TokenPtr, tok::unknown_command);
T.setUnknownCommandName(CommandName);
Diag(T.getLocation(), diag::warn_unknown_comment_command_name)
<< SourceRange(T.getLocation(), T.getEndLocation());
return;
}
}
// Verbatim commands switch the lexer into a dedicated state.
if (Info->IsVerbatimBlockCommand) {
setupAndLexVerbatimBlock(T, TokenPtr, *BufferPtr, Info);
return;
}
if (Info->IsVerbatimLineCommand) {
setupAndLexVerbatimLine(T, TokenPtr, Info);
return;
}
formTokenWithChars(T, TokenPtr, CommandKind);
T.setCommandID(Info->getID());
return;
}
case '&':
lexHTMLCharacterReference(T);
return;
case '<': {
TokenPtr++;
if (TokenPtr == CommentEnd) {
formTextToken(T, TokenPtr);
return;
}
// "<name" opens a start tag, "</" an end tag; anything else is text.
const char C = *TokenPtr;
if (isHTMLIdentifierStartingCharacter(C))
setupAndLexHTMLStartTag(T);
else if (C == '/')
setupAndLexHTMLEndTag(T);
else
formTextToken(T, TokenPtr);
return;
}
default:
return HandleNonCommandToken();
}
}
/// Emits the tok::verbatim_block_begin token for the command ending at
/// \p TextBegin and prepares the lexer for the body of the verbatim block.
///
/// \param Marker the character that introduced the command ('\\' or '@');
/// the end command is expected to use the same marker.
/// \param Info description of the verbatim block command.
void Lexer::setupAndLexVerbatimBlock(Token &T, const char *TextBegin,
                                     char Marker, const CommandInfo *Info) {
  assert(Info->IsVerbatimBlockCommand);

  // Remember the exact spelling that terminates this block.
  VerbatimBlockEndCommandName.clear();
  VerbatimBlockEndCommandName.append(Marker == '\\' ? "\\" : "@");
  VerbatimBlockEndCommandName.append(Info->EndCommandName);

  formTokenWithChars(T, TextBegin, tok::verbatim_block_begin);
  T.setVerbatimBlockID(Info->getID());

  const bool NewlineFollows =
      BufferPtr != CommentEnd && isVerticalWhitespace(*BufferPtr);
  if (!NewlineFollows) {
    State = LS_VerbatimBlockFirstLine;
    return;
  }

  // If there is a newline following the verbatim opening command, skip the
  // newline so that we don't create an tok::verbatim_block_line with empty
  // text content.
  BufferPtr = skipNewline(BufferPtr, CommentEnd);
  State = LS_VerbatimBlockBody;
}
/// Lexes one token inside a verbatim block: either a line of verbatim text
/// (tok::verbatim_block_line) or the end command (tok::verbatim_block_end).
/// Whitespace that is the only thing in front of the end command on a line is
/// dropped.
void Lexer::lexVerbatimBlockFirstLine(Token &T) {
  for (;;) {
    assert(BufferPtr < CommentEnd);

    // FIXME: It would be better to scan the text once, finding either the
    // block end command or newline.
    //
    // Extract current line.
    const char *LineEnd = findNewline(BufferPtr, CommentEnd);
    StringRef CurrentLine(BufferPtr, LineEnd - BufferPtr);

    // Look for end command in current line.
    const size_t EndCommandPos = CurrentLine.find(VerbatimBlockEndCommandName);

    if (EndCommandPos == 0) {
      // Current line contains just an end command.
      const char *End = BufferPtr + VerbatimBlockEndCommandName.size();
      // The name excludes the leading marker character.
      StringRef Name(BufferPtr + 1, End - (BufferPtr + 1));
      formTokenWithChars(T, End, tok::verbatim_block_end);
      T.setVerbatimBlockID(Traits.getCommandInfo(Name)->getID());
      State = LS_Normal;
      return;
    }

    const char *TextEnd;
    const char *NextLine;
    if (EndCommandPos == StringRef::npos) {
      // Current line is completely verbatim.
      TextEnd = LineEnd;
      NextLine = skipNewline(LineEnd, CommentEnd);
    } else {
      // There is some text, followed by end command. Extract text first.
      TextEnd = BufferPtr + EndCommandPos;
      NextLine = TextEnd;
      // If there is only whitespace before end command, skip whitespace and
      // start over at the end command.
      if (isWhitespace(BufferPtr, TextEnd)) {
        BufferPtr = TextEnd;
        continue;
      }
    }

    StringRef Text(BufferPtr, TextEnd - BufferPtr);
    formTokenWithChars(T, NextLine, tok::verbatim_block_line);
    T.setVerbatimBlockText(Text);
    State = LS_VerbatimBlockBody;
    return;
  }
}
/// Lexes one token from a line in the body of a verbatim block. Inside a C
/// comment the line-starting decoration is skipped first. When the comment
/// ends here, an empty tok::verbatim_block_line token is produced.
void Lexer::lexVerbatimBlockBody(Token &T) {
  assert(State == LS_VerbatimBlockBody);

  if (CommentState == LCS_InsideCComment)
    skipLineStartingDecorations();

  if (BufferPtr != CommentEnd) {
    lexVerbatimBlockFirstLine(T);
    return;
  }

  formTokenWithChars(T, BufferPtr, tok::verbatim_block_line);
  T.setVerbatimBlockText("");
}
/// Emits the tok::verbatim_line_name token for a verbatim line command that
/// ends at \p TextBegin and switches the lexer to LS_VerbatimLineText so that
/// the next token is the text that follows the command.
void Lexer::setupAndLexVerbatimLine(Token &T, const char *TextBegin,
const CommandInfo *Info) {
assert(Info->IsVerbatimLineCommand);
// The ID must be set after the token is formed, not before.
formTokenWithChars(T, TextBegin, tok::verbatim_line_name);
T.setVerbatimLineID(Info->getID());
State = LS_VerbatimLineText;
}
/// Produces a tok::verbatim_line_text token holding everything from BufferPtr
/// up to (not including) the next newline or the end of the comment, then
/// returns the lexer to LS_Normal.
void Lexer::lexVerbatimLineText(Token &T) {
  assert(State == LS_VerbatimLineText);

  // Extract current line.
  const char *LineEnd = findNewline(BufferPtr, CommentEnd);
  StringRef LineText(BufferPtr, LineEnd - BufferPtr);

  formTokenWithChars(T, LineEnd, tok::verbatim_line_text);
  T.setVerbatimLineText(LineText);
  State = LS_Normal;
}
/// Lexes an HTML character reference starting at the '&' at BufferPtr.
///
/// Named ("&name;"), decimal ("&#123;") and hexadecimal ("&#x1F;" / "&#X1F;")
/// forms are recognised. A well-formed reference that resolves to non-empty
/// text becomes a tok::text token carrying the resolved text; everything else
/// is emitted verbatim through formTextToken().
void Lexer::lexHTMLCharacterReference(Token &T) {
  const char *Cursor = BufferPtr;
  assert(*Cursor == '&');
  ++Cursor;
  if (Cursor == CommentEnd) {
    formTextToken(T, Cursor);
    return;
  }

  const char *NameBegin = nullptr;
  bool Named = false;
  bool Decimal = false;

  if (isHTMLNamedCharacterReferenceCharacter(*Cursor)) {
    NameBegin = Cursor;
    Cursor = skipNamedCharacterReference(Cursor, CommentEnd);
    Named = true;
  } else {
    // Anything other than a name must be a numeric reference: "&#...".
    if (*Cursor != '#') {
      formTextToken(T, Cursor);
      return;
    }
    ++Cursor;
    if (Cursor == CommentEnd) {
      formTextToken(T, Cursor);
      return;
    }

    const char Lead = *Cursor;
    if (isHTMLDecimalCharacterReferenceCharacter(Lead)) {
      NameBegin = Cursor;
      Cursor = skipDecimalCharacterReference(Cursor, CommentEnd);
      Decimal = true;
    } else if (Lead == 'x' || Lead == 'X') {
      ++Cursor;
      NameBegin = Cursor;
      Cursor = skipHexCharacterReference(Cursor, CommentEnd);
    } else {
      formTextToken(T, Cursor);
      return;
    }
  }

  // The reference needs a non-empty body terminated by a semicolon.
  const bool WellFormed =
      NameBegin != Cursor && Cursor != CommentEnd && *Cursor == ';';
  if (!WellFormed) {
    formTextToken(T, Cursor);
    return;
  }

  StringRef Name(NameBegin, Cursor - NameBegin);
  ++Cursor; // Skip semicolon.

  StringRef Resolved = Named     ? resolveHTMLNamedCharacterReference(Name)
                       : Decimal ? resolveHTMLDecimalCharacterReference(Name)
                                 : resolveHTMLHexCharacterReference(Name);
  if (Resolved.empty()) {
    formTextToken(T, Cursor);
    return;
  }

  formTokenWithChars(T, Cursor, tok::text);
  T.setText(Resolved);
}
/// Lexes the "<name" part of an HTML start tag beginning at BufferPtr.
///
/// If the name is not a known HTML tag the characters are emitted as text.
/// Otherwise a tok::html_start_tag token is produced, following whitespace is
/// skipped, and the lexer enters LS_HTMLStartTag when the next character can
/// continue the tag ('>', '/' or the start of an attribute name).
void Lexer::setupAndLexHTMLStartTag(Token &T) {
  assert(BufferPtr[0] == '<' &&
         isHTMLIdentifierStartingCharacter(BufferPtr[1]));
  const char *TagNameEnd = skipHTMLIdentifier(BufferPtr + 2, CommentEnd);
  StringRef Name(BufferPtr + 1, TagNameEnd - (BufferPtr + 1));
  if (!isHTMLTagName(Name)) {
    formTextToken(T, TagNameEnd);
    return;
  }

  formTokenWithChars(T, TagNameEnd, tok::html_start_tag);
  T.setHTMLTagStartName(Name);

  BufferPtr = skipWhitespace(BufferPtr, CommentEnd);

  // Check the bound before looking at the character: at the end of the
  // comment there is nothing that BufferPtr may legitimately be read from.
  if (BufferPtr == CommentEnd)
    return;

  const char C = *BufferPtr;
  if (C == '>' || C == '/' || isHTMLIdentifierStartingCharacter(C))
    State = LS_HTMLStartTag;
}
/// Lexes one token inside an HTML start tag: an attribute identifier, '=',
/// a quoted string, '>' or "/>".
///
/// '>' and '/' always end the tag and return the lexer to LS_Normal. After
/// any other token, following whitespace is skipped and the lexer stays in
/// LS_HTMLStartTag only while the next character can continue the tag.
void Lexer::lexHTMLStartTag(Token &T) {
  assert(State == LS_HTMLStartTag);

  const char *TokenPtr = BufferPtr;
  char C = *TokenPtr;
  if (isHTMLIdentifierCharacter(C)) {
    TokenPtr = skipHTMLIdentifier(TokenPtr, CommentEnd);
    StringRef Ident(BufferPtr, TokenPtr - BufferPtr);
    formTokenWithChars(T, TokenPtr, tok::html_ident);
    T.setHTMLIdent(Ident);
  } else {
    switch (C) {
    case '=':
      TokenPtr++;
      formTokenWithChars(T, TokenPtr, tok::html_equals);
      break;
    case '\"':
    case '\'': {
      const char *OpenQuote = TokenPtr;
      TokenPtr = skipHTMLQuotedString(TokenPtr, CommentEnd);
      const char *ClosingQuote = TokenPtr;
      if (TokenPtr != CommentEnd) // Skip closing quote.
        TokenPtr++;
      formTokenWithChars(T, TokenPtr, tok::html_quoted_string);
      // The token text excludes both quotes.
      T.setHTMLQuotedString(StringRef(OpenQuote + 1,
                                      ClosingQuote - (OpenQuote + 1)));
      break;
    }
    case '>':
      TokenPtr++;
      formTokenWithChars(T, TokenPtr, tok::html_greater);
      State = LS_Normal;
      return;
    case '/':
      TokenPtr++;
      if (TokenPtr != CommentEnd && *TokenPtr == '>') {
        TokenPtr++;
        formTokenWithChars(T, TokenPtr, tok::html_slash_greater);
      } else
        formTextToken(T, TokenPtr);
      State = LS_Normal;
      return;
    }
  }

  // Now look ahead and return to normal state if we don't see any HTML tokens
  // ahead.
  BufferPtr = skipWhitespace(BufferPtr, CommentEnd);
  if (BufferPtr == CommentEnd) {
    State = LS_Normal;
    return;
  }

  // '/' must be accepted here as well: otherwise a self-closing tag with
  // attributes, such as <img src="x" />, leaves the tag state before "/>" is
  // seen and the "/>" is lexed as plain text.
  C = *BufferPtr;
  if (!isHTMLIdentifierStartingCharacter(C) &&
      C != '=' && C != '\"' && C != '\'' && C != '>' && C != '/') {
    State = LS_Normal;
    return;
  }
}
/// Lexes the "</name" part of an HTML end tag beginning at BufferPtr.
/// Whitespace is allowed between "</" and the name and after the name.
///
/// If the name is not a known HTML tag the characters up to the end of the
/// name are emitted as text. Otherwise a tok::html_end_tag token is produced
/// and, if a '>' follows, the lexer enters LS_HTMLEndTag to consume it.
void Lexer::setupAndLexHTMLEndTag(Token &T) {
  assert(BufferPtr[0] == '<' && BufferPtr[1] == '/');

  const char *NameBegin = skipWhitespace(BufferPtr + 2, CommentEnd);
  const char *NameEnd = skipHTMLIdentifier(NameBegin, CommentEnd);
  StringRef TagName(NameBegin, NameEnd - NameBegin);

  if (isHTMLTagName(TagName)) {
    const char *TokenEnd = skipWhitespace(NameEnd, CommentEnd);
    formTokenWithChars(T, TokenEnd, tok::html_end_tag);
    T.setHTMLTagEndName(TagName);

    if (BufferPtr != CommentEnd && *BufferPtr == '>')
      State = LS_HTMLEndTag;
    return;
  }

  formTextToken(T, NameEnd);
}
/// Consumes the '>' that closes an HTML end tag, emitting it as a
/// tok::html_greater token, and returns the lexer to LS_Normal.
void Lexer::lexHTMLEndTag(Token &T) {
  assert(BufferPtr != CommentEnd && *BufferPtr == '>');

  const char *AfterGreater = BufferPtr + 1;
  formTokenWithChars(T, AfterGreater, tok::html_greater);
  State = LS_Normal;
}
/// Creates a lexer over the buffer [BufferStart, BufferEnd).
///
/// Lexing starts at BufferStart, in the LCS_BeforeComment comment state and
/// the LS_Normal lexing state. The remaining arguments are stored for later
/// use: \p Allocator, \p Diags, \p Traits, \p FileLoc and \p ParseCommands
/// (when false, lexCommentText() produces only text and newline tokens).
Lexer::Lexer(llvm::BumpPtrAllocator &Allocator, DiagnosticsEngine &Diags,
const CommandTraits &Traits, SourceLocation FileLoc,
const char *BufferStart, const char *BufferEnd, bool ParseCommands)
: Allocator(Allocator), Diags(Diags), Traits(Traits),
BufferStart(BufferStart), BufferEnd(BufferEnd), BufferPtr(BufferStart),
FileLoc(FileLoc), ParseCommands(ParseCommands),
CommentState(LCS_BeforeComment), State(LS_Normal) {}
/// Produces the next token into \p T.
///
/// This drives the comment-level state machine in CommentState: it detects
/// the start of a BCPL ("//") or C ("/*") comment, skips Doxygen markers,
/// computes CommentEnd, hands the comment body to lexCommentText(), and turns
/// the gap between consecutive comments into a newline token. At the end of
/// the buffer a tok::eof token is produced.
void Lexer::lex(Token &T) {
again:
switch (CommentState) {
case LCS_BeforeComment:
if (BufferPtr == BufferEnd) {
formTokenWithChars(T, BufferPtr, tok::eof);
return;
}
assert(*BufferPtr == '/');
BufferPtr++; // Skip first slash.
// NOTE(review): *BufferPtr is read here without a comparison against
// BufferEnd; presumably callers only pass well-formed comments -- confirm.
switch(*BufferPtr) {
case '/': { // BCPL comment.
BufferPtr++; // Skip second slash.
if (BufferPtr != BufferEnd) {
// Skip Doxygen magic marker, if it is present.
// It might be missing because of a typo //< or /*<, or because we
// merged this non-Doxygen comment into a bunch of Doxygen comments
// around it: /** ... */ /* ... */ /** ... */
const char C = *BufferPtr;
if (C == '/' || C == '!')
BufferPtr++;
}
// Skip less-than symbol that marks trailing comments.
// Skip it even if the comment is not a Doxygen one, because //< and /*<
// are frequent typos.
if (BufferPtr != BufferEnd && *BufferPtr == '<')
BufferPtr++;
CommentState = LCS_InsideBCPLComment;
// A verbatim block may continue across consecutive BCPL comments, so the
// verbatim-block states are kept; every other state is reset.
if (State != LS_VerbatimBlockBody && State != LS_VerbatimBlockFirstLine)
State = LS_Normal;
CommentEnd = findBCPLCommentEnd(BufferPtr, BufferEnd);
goto again;
}
case '*': { // C comment.
BufferPtr++; // Skip star.
// Skip Doxygen magic marker.
// NOTE(review): BufferPtr[0] and BufferPtr[1] are read without a bounds
// check; presumably the closing "*/" guarantees they exist -- confirm.
const char C = *BufferPtr;
// A second '*' is a marker only if it does not begin the closing "*/".
if ((C == '*' && *(BufferPtr + 1) != '/') || C == '!')
BufferPtr++;
// Skip less-than symbol that marks trailing comments.
if (BufferPtr != BufferEnd && *BufferPtr == '<')
BufferPtr++;
CommentState = LCS_InsideCComment;
State = LS_Normal;
CommentEnd = findCCommentEnd(BufferPtr, BufferEnd);
goto again;
}
default:
llvm_unreachable("second character of comment should be '/' or '*'");
}
case LCS_BetweenComments: {
// Consecutive comments are extracted only if there is only whitespace
// between them. So we can search for the start of the next comment.
const char *EndWhitespace = BufferPtr;
while(EndWhitespace != BufferEnd && *EndWhitespace != '/')
EndWhitespace++;
// Turn any whitespace between comments (and there is only whitespace
// between them -- guaranteed by comment extraction) into a newline. We
// have two newlines between C comments in total (first one was synthesized
// after a comment).
formTokenWithChars(T, EndWhitespace, tok::newline);
CommentState = LCS_BeforeComment;
break;
}
case LCS_InsideBCPLComment:
case LCS_InsideCComment:
if (BufferPtr != CommentEnd) {
lexCommentText(T);
break;
} else {
// Skip C comment closing sequence.
if (CommentState == LCS_InsideCComment) {
assert(BufferPtr[0] == '*' && BufferPtr[1] == '/');
BufferPtr += 2;
assert(BufferPtr <= BufferEnd);
// Synthesize a newline just after the C comment, regardless of whether
// there is actually a newline.
formTokenWithChars(T, BufferPtr, tok::newline);
CommentState = LCS_BetweenComments;
break;
} else {
// Don't synthesize a newline after a BCPL comment.
CommentState = LCS_BetweenComments;
goto again;
}
}
}
}
/// Returns the source text that \p Tok covers, read from the buffer that
/// \p SourceMgr holds for the token's file.
///
/// \returns an empty StringRef if the buffer data is reported as invalid.
StringRef Lexer::getSpelling(const Token &Tok,
                             const SourceManager &SourceMgr) const {
  const std::pair<FileID, unsigned> Decomposed =
      SourceMgr.getDecomposedLoc(Tok.getLocation());

  bool Invalid = false;
  StringRef Buffer = SourceMgr.getBufferData(Decomposed.first, &Invalid);
  if (Invalid)
    return StringRef();

  // The second component is the offset of the token within the file buffer.
  return StringRef(Buffer.data() + Decomposed.second, Tok.getLength());
}
} // end namespace comments
} // end namespace clang