Implement a lexer for structured comments.

llvm-svn: 159223
This commit is contained in:
Dmitri Gribenko 2012-06-26 20:39:18 +00:00
parent add5e9e289
commit 5188c4b9cc
17 changed files with 2325 additions and 26 deletions

View File

@ -3200,6 +3200,12 @@ CINDEX_LINKAGE CXSourceRange clang_Cursor_getCommentRange(CXCursor C);
*/
CINDEX_LINKAGE CXString clang_Cursor_getRawCommentText(CXCursor C);
/**
* \brief Given a cursor that represents a declaration, return the associated
* \\brief paragraph; otherwise return the first paragraph.
*/
CINDEX_LINKAGE CXString clang_Cursor_getBriefCommentText(CXCursor C);
/**
* @}
*/

View File

@ -0,0 +1,49 @@
//===--- CommentBriefParser.h - Dumb comment parser -------------*- C++ -*-===//
//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This file defines a very simple Doxygen comment parser.
//
//===----------------------------------------------------------------------===//
#ifndef LLVM_CLANG_AST_BRIEF_COMMENT_PARSER_H
#define LLVM_CLANG_AST_BRIEF_COMMENT_PARSER_H
#include "clang/AST/CommentLexer.h"
namespace clang {
namespace comments {
/// A very simple comment parser that extracts just the brief description or
/// first paragraph.
///
/// Tokens are pulled from the supplied comments::Lexer one at a time, keeping
/// a single token of lookahead in \c Tok.
class BriefParser {
/// Lexer that supplies the token stream; held by reference.
Lexer &L;
/// Current lookahead token.
Token Tok;
/// Replace the lookahead token with the next token produced by the lexer.
///
/// \returns the location of the token that was the lookahead before the call.
SourceLocation ConsumeToken() {
SourceLocation Loc = Tok.getLocation();
L.lex(Tok);
return Loc;
}
public:
/// Create a parser that reads its tokens from \p L.
BriefParser(Lexer &L);
/// Return \\brief paragraph, if it exists; otherwise return the first
/// paragraph.
std::string Parse();
};
} // end namespace comments
} // end namespace clang
#endif

View File

@ -0,0 +1,352 @@
//===--- CommentLexer.h - Lexer for structured comments ---------*- C++ -*-===//
//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This file defines lexer for structured comments and supporting token class.
//
//===----------------------------------------------------------------------===//
#ifndef LLVM_CLANG_AST_COMMENT_LEXER_H
#define LLVM_CLANG_AST_COMMENT_LEXER_H
#include "clang/Basic/SourceManager.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/SmallString.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/Support/raw_ostream.h"
namespace clang {
namespace comments {
class Lexer;
namespace tok {
/// Kinds of tokens produced by the comment lexer.
enum TokenKind {
eof,
newline,
text,
command,
verbatim_block_begin,
verbatim_block_line,
verbatim_block_end,
verbatim_line,
html_tag_open, // <tag
html_ident, // attr
html_equals, // =
html_quoted_string, // "blah\"blah" or 'blah\'blah'
html_greater, // >
html_tag_close, // </tag>
// Markdown tokens (not supported yet).
ruler,
md_code_line, // Line indented at least by 4 spaces.
md_code_inline, // `code`
md_emph, // _text_ or *text*
md_strong, // __text__ or **text**
md_header // ### level 3 header ###
};
} // end namespace tok
/// Options passed to the comment lexer.
class CommentOptions {
public:
/// NOTE(review): presumably enables lexing of the Markdown token kinds; no
/// code visible here reads it.  There is no initializer, so a
/// default-initialized CommentOptions leaves it indeterminate -- confirm that
/// callers always value-initialize.
bool Markdown;
};
/// \brief Comment token.
///
/// A token records its source location, kind and spelling length, plus up to
/// two text values whose meaning depends on the kind.  Every typed accessor
/// below asserts that the token has the matching kind before touching the
/// text fields.
class Token {
friend class Lexer;
/// The location of the token.
SourceLocation Loc;
/// The actual kind of the token.
tok::TokenKind Kind;
/// Length of the token spelling in comment. Can be 0 for synthesized
/// tokens.
unsigned Length;
/// Primary text value associated with a token.  Depending on the kind this
/// is the text, a command name, a verbatim block name or line, the name of a
/// verbatim line command, or an HTML tag name, identifier or quoted string.
const char *TextPtr1;
unsigned TextLen1;
/// Secondary text value associated with a token.  Only the verbatim line
/// text accessors use it.
const char *TextPtr2;
unsigned TextLen2;
public:
SourceLocation getLocation() const LLVM_READONLY { return Loc; }
void setLocation(SourceLocation SL) { Loc = SL; }
tok::TokenKind getKind() const LLVM_READONLY { return Kind; }
void setKind(tok::TokenKind K) { Kind = K; }
bool is(tok::TokenKind K) const LLVM_READONLY { return Kind == K; }
bool isNot(tok::TokenKind K) const LLVM_READONLY { return Kind != K; }
unsigned getLength() const LLVM_READONLY { return Length; }
void setLength(unsigned L) { Length = L; }
// Accessors for tok::text.
StringRef getText() const LLVM_READONLY {
assert(is(tok::text));
return StringRef(TextPtr1, TextLen1);
}
void setText(StringRef Text) {
assert(is(tok::text));
TextPtr1 = Text.data();
TextLen1 = Text.size();
}
// Accessors for tok::command.
StringRef getCommandName() const LLVM_READONLY {
assert(is(tok::command));
return StringRef(TextPtr1, TextLen1);
}
void setCommandName(StringRef Name) {
assert(is(tok::command));
TextPtr1 = Name.data();
TextLen1 = Name.size();
}
// Accessors shared by tok::verbatim_block_begin and tok::verbatim_block_end.
StringRef getVerbatimBlockName() const LLVM_READONLY {
assert(is(tok::verbatim_block_begin) || is(tok::verbatim_block_end));
return StringRef(TextPtr1, TextLen1);
}
void setVerbatimBlockName(StringRef Name) {
assert(is(tok::verbatim_block_begin) || is(tok::verbatim_block_end));
TextPtr1 = Name.data();
TextLen1 = Name.size();
}
// Accessors for tok::verbatim_block_line.
StringRef getVerbatimBlockText() const LLVM_READONLY {
assert(is(tok::verbatim_block_line));
return StringRef(TextPtr1, TextLen1);
}
void setVerbatimBlockText(StringRef Text) {
assert(is(tok::verbatim_block_line));
TextPtr1 = Text.data();
TextLen1 = Text.size();
}
/// Returns the name of verbatim line command.
StringRef getVerbatimLineName() const LLVM_READONLY {
assert(is(tok::verbatim_line));
return StringRef(TextPtr1, TextLen1);
}
void setVerbatimLineName(StringRef Name) {
assert(is(tok::verbatim_line));
TextPtr1 = Name.data();
TextLen1 = Name.size();
}
/// Returns the text of a verbatim line token (stored in the secondary text
/// value, separately from the command name).
StringRef getVerbatimLineText() const LLVM_READONLY {
assert(is(tok::verbatim_line));
return StringRef(TextPtr2, TextLen2);
}
void setVerbatimLineText(StringRef Text) {
assert(is(tok::verbatim_line));
TextPtr2 = Text.data();
TextLen2 = Text.size();
}
// Accessors for tok::html_tag_open.
StringRef getHTMLTagOpenName() const LLVM_READONLY {
assert(is(tok::html_tag_open));
return StringRef(TextPtr1, TextLen1);
}
void setHTMLTagOpenName(StringRef Name) {
assert(is(tok::html_tag_open));
TextPtr1 = Name.data();
TextLen1 = Name.size();
}
// Accessors for tok::html_ident.
StringRef getHTMLIdent() const LLVM_READONLY {
assert(is(tok::html_ident));
return StringRef(TextPtr1, TextLen1);
}
void setHTMLIdent(StringRef Name) {
assert(is(tok::html_ident));
TextPtr1 = Name.data();
TextLen1 = Name.size();
}
// Accessors for tok::html_quoted_string.
StringRef getHTMLQuotedString() const LLVM_READONLY {
assert(is(tok::html_quoted_string));
return StringRef(TextPtr1, TextLen1);
}
void setHTMLQuotedString(StringRef Str) {
assert(is(tok::html_quoted_string));
TextPtr1 = Str.data();
TextLen1 = Str.size();
}
// Accessors for tok::html_tag_close.
StringRef getHTMLTagCloseName() const LLVM_READONLY {
assert(is(tok::html_tag_close));
return StringRef(TextPtr1, TextLen1);
}
void setHTMLTagCloseName(StringRef Name) {
assert(is(tok::html_tag_close));
TextPtr1 = Name.data();
TextLen1 = Name.size();
}
/// Print the token's kind, location, length and spelling to llvm::errs().
void dump(const Lexer &L, const SourceManager &SM) const;
};
/// \brief Comment lexer.
///
/// Lexes the character range [BufferStart, BufferEnd) into comments::Token
/// values.  The range is expected to start at a comment's opening '/' (see
/// the assertion in lex()).
class Lexer {
private:
Lexer(const Lexer&); // DO NOT IMPLEMENT
void operator=(const Lexer&); // DO NOT IMPLEMENT
/// Bounds of the text being lexed.
const char *const BufferStart;
const char *const BufferEnd;
/// Source location that corresponds to BufferStart.
SourceLocation FileLoc;
CommentOptions CommOpts;
/// Current position in the buffer; the next token starts here.
const char *BufferPtr;
/// One past end pointer for the current comment. For BCPL comments points
/// to newline or BufferEnd, for C comments points to star in '*/'.
const char *CommentEnd;
enum LexerCommentState {
LCS_BeforeComment,
LCS_InsideBCPLComment,
LCS_InsideCComment,
LCS_BetweenComments
};
/// Low-level lexer state, track if we are inside or outside of comment.
LexerCommentState CommentState;
enum LexerState {
/// Lexing normal comment text
LS_Normal,
/// Finished lexing verbatim block beginning command, will lex first body
/// line.
LS_VerbatimBlockFirstLine,
/// Lexing verbatim block body line-by-line, skipping line-starting
/// decorations.
LS_VerbatimBlockBody,
/// Finished lexing \verbatim <TAG \endverbatim part, lexing tag attributes.
LS_HTMLOpenTag
};
/// Current lexing mode.
LexerState State;
/// A verbatim-like block command eats every character (except line starting
/// decorations) until matching end command is seen or comment end is hit.
struct VerbatimBlockCommand {
StringRef BeginName;
StringRef EndName;
};
typedef SmallVector<VerbatimBlockCommand, 4> VerbatimBlockCommandVector;
/// Registered verbatim-like block commands.
VerbatimBlockCommandVector VerbatimBlockCommands;
/// If State is LS_VerbatimBlockFirstLine or LS_VerbatimBlockBody, contains
/// the name of the verbatim end command, including command marker.
SmallString<16> VerbatimBlockEndCommandName;
/// Returns true if \p BeginName starts a verbatim block; on success
/// \p EndName receives the name of the matching end command.
bool isVerbatimBlockCommand(StringRef BeginName, StringRef &EndName) const;
/// A verbatim-like line command eats everything until a newline is seen or
/// comment end is hit.
struct VerbatimLineCommand {
StringRef Name;
};
typedef SmallVector<VerbatimLineCommand, 4> VerbatimLineCommandVector;
/// Registered verbatim-like line commands.
VerbatimLineCommandVector VerbatimLineCommands;
/// Returns true if \p Name is a verbatim line command.
bool isVerbatimLineCommand(StringRef Name) const;
/// Fill \p Result with a token of kind \p Kind spanning [BufferPtr, TokEnd)
/// and advance BufferPtr to \p TokEnd.
///
/// In builds with assertions enabled the text fields are set to a
/// recognizable placeholder so that a forgotten setter call is easy to spot.
void formTokenWithChars(Token &Result, const char *TokEnd,
tok::TokenKind Kind) {
const unsigned TokLen = TokEnd - BufferPtr;
Result.setLocation(getSourceLocation(BufferPtr));
Result.setKind(Kind);
Result.setLength(TokLen);
#ifndef NDEBUG
Result.TextPtr1 = "<UNSET>";
Result.TextLen1 = 7;
Result.TextPtr2 = "<UNSET>";
Result.TextLen2 = 7;
#endif
BufferPtr = TokEnd;
}
/// Translate a pointer into the buffer to a SourceLocation by offsetting
/// FileLoc with the pointer's distance from BufferStart.
SourceLocation getSourceLocation(const char *Loc) const {
assert(Loc >= BufferStart && Loc <= BufferEnd &&
"Location out of range for this buffer!");
const unsigned CharNo = Loc - BufferStart;
return FileLoc.getLocWithOffset(CharNo);
}
/// Eat string matching regexp \code \s*\* \endcode.
void skipLineStartingDecorations();
/// Lex stuff inside comments. CommentEnd should be set correctly.
void lexCommentText(Token &T);
void setupAndLexVerbatimBlock(Token &T,
const char *TextBegin,
char Marker, StringRef EndName);
void lexVerbatimBlockFirstLine(Token &T);
void lexVerbatimBlockBody(Token &T);
void lexVerbatimLine(Token &T, const char *TextBegin);
void setupAndLexHTMLOpenTag(Token &T);
void lexHTMLOpenTag(Token &T);
void lexHTMLCloseTag(Token &T);
public:
Lexer(SourceLocation FileLoc, const CommentOptions &CommOpts,
const char *BufferStart, const char *BufferEnd);
/// Lex the next token into \p T.  Produces tok::eof once the buffer is
/// exhausted.
void lex(Token &T);
/// Return the spelling of \p Tok as it appears in the source buffer.  If the
/// source manager reports the buffer data as invalid, \p Invalid is set to
/// true and an empty string is returned.
StringRef getSpelling(const Token &Tok,
const SourceManager &SourceMgr,
bool *Invalid = NULL) const;
/// \brief Register a new verbatim block command.
void addVerbatimBlockCommand(StringRef BeginName, StringRef EndName);
/// \brief Register a new verbatim line command.
void addVerbatimLineCommand(StringRef Name);
};
} // end namespace comments
} // end namespace clang
#endif

View File

@ -15,6 +15,7 @@
namespace clang {
class ASTContext;
class ASTReader;
class RawComment {
@ -27,7 +28,7 @@ public:
CK_BCPLExcl, ///< \code //! stuff \endcode
CK_JavaDoc, ///< \code /** stuff */ \endcode
CK_Qt, ///< \code /*! stuff */ \endcode, also used by HeaderDoc
CK_Merged ///< Two or more Doxygen comments merged together
CK_Merged ///< Two or more documentation comments merged together
};
RawComment() : Kind(CK_Invalid), IsAlmostTrailingComment(false) { }
@ -53,7 +54,7 @@ public:
/// \code /**< stuff */ \endcode
/// \code /*!< stuff */ \endcode
bool isTrailingComment() const LLVM_READONLY {
assert(isDoxygen());
assert(isDocumentation());
return IsTrailingComment;
}
@ -64,13 +65,13 @@ public:
return IsAlmostTrailingComment;
}
/// Returns true if this comment is not a Doxygen comment.
/// Returns true if this comment is not a documentation comment.
bool isOrdinary() const LLVM_READONLY {
return (Kind == CK_OrdinaryBCPL) || (Kind == CK_OrdinaryC);
}
/// Returns true if this comment any kind of a Doxygen comment.
bool isDoxygen() const LLVM_READONLY {
/// Returns true if this comment any kind of a documentation comment.
bool isDocumentation() const LLVM_READONLY {
return !isInvalid() && !isOrdinary();
}
@ -91,11 +92,21 @@ public:
unsigned getBeginLine(const SourceManager &SM) const;
unsigned getEndLine(const SourceManager &SM) const;
StringRef getBriefText(const ASTContext &Context) const {
if (BriefTextValid)
return BriefText;
return extractBriefText(Context);
}
private:
SourceRange Range;
mutable StringRef RawText;
mutable bool RawTextValid : 1; ///< True if RawText is valid
mutable StringRef BriefText;
mutable bool RawTextValid : 1; ///< True if RawText is valid
mutable bool BriefTextValid : 1; ///< True if BriefText is valid
unsigned Kind : 3;
@ -118,6 +129,8 @@ private:
StringRef getRawTextSlow(const SourceManager &SourceMgr) const;
StringRef extractBriefText(const ASTContext &Context) const;
friend class ASTReader;
};

View File

@ -90,7 +90,7 @@ const RawComment *ASTContext::getRawCommentForDeclNoCache(const Decl *D) const {
// First check whether we have a trailing comment.
if (Comment != RawComments.end() &&
Comment->isDoxygen() && Comment->isTrailingComment() &&
Comment->isDocumentation() && Comment->isTrailingComment() &&
!isa<TagDecl>(D) && !isa<NamespaceDecl>(D)) {
std::pair<FileID, unsigned> CommentBeginDecomp
= SourceMgr.getDecomposedLoc(Comment->getSourceRange().getBegin());
@ -111,7 +111,7 @@ const RawComment *ASTContext::getRawCommentForDeclNoCache(const Decl *D) const {
--Comment;
// Check that we actually have a non-member Doxygen comment.
if (!Comment->isDoxygen() || Comment->isTrailingComment())
if (!Comment->isDocumentation() || Comment->isTrailingComment())
return NULL;
// Decompose the end of the comment.

View File

@ -8,6 +8,8 @@ add_clang_library(clangAST
ASTImporter.cpp
AttrImpl.cpp
CXXInheritance.cpp
CommentBriefParser.cpp
CommentLexer.cpp
Decl.cpp
DeclarationName.cpp
DeclBase.cpp

View File

@ -0,0 +1,76 @@
//===--- CommentBriefParser.cpp - Dumb comment parser ---------------------===//
//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
#include "clang/AST/CommentBriefParser.h"
namespace clang {
namespace comments {
/// Walk the whole token stream and collect two candidate results: the text of
/// the first paragraph and the text of the first \\brief paragraph.  A
/// paragraph ends at two consecutive newline tokens.  The \\brief paragraph is
/// returned when it is non-empty; otherwise the first paragraph is returned.
std::string BriefParser::Parse() {
  std::string FirstParagraph;
  std::string Brief;

  bool InFirstParagraph = true;
  bool InBrief = false;
  bool BriefDone = false;

  while (Tok.isNot(tok::eof)) {
    switch (Tok.getKind()) {
    case tok::text:
      if (InFirstParagraph)
        FirstParagraph += Tok.getText();
      if (InBrief)
        Brief += Tok.getText();
      ConsumeToken();
      break;

    case tok::command:
      // Only the first \brief paragraph counts; every other command is
      // simply dropped.
      if (!BriefDone && Tok.getCommandName() == "brief")
        InBrief = true;
      ConsumeToken();
      break;

    case tok::newline:
      if (InFirstParagraph)
        FirstParagraph += '\n';
      if (InBrief)
        Brief += '\n';
      ConsumeToken();

      // A second newline right after the first one ends the paragraph.
      if (Tok.is(tok::newline)) {
        ConsumeToken();
        InFirstParagraph = false;
        if (InBrief) {
          InBrief = false;
          BriefDone = true;
        }
      }
      break;

    default:
      // We didn't handle this token, so just drop it.
      ConsumeToken();
      break;
    }
  }

  return Brief.empty() ? FirstParagraph : Brief;
}
/// Bind the parser to \p L and prime the one-token lookahead.
BriefParser::BriefParser(Lexer &L) : L(L) {
  // Fetch the first token; there is no previous token location worth keeping.
  L.lex(Tok);
}
} // end namespace comments
} // end namespace clang

View File

@ -0,0 +1,676 @@
#include "clang/AST/CommentLexer.h"
#include "llvm/ADT/StringSwitch.h"
#include "llvm/Support/ErrorHandling.h"
namespace clang {
namespace comments {
void Token::dump(const Lexer &L, const SourceManager &SM) const {
llvm::errs() << "comments::Token Kind=" << Kind << " ";
Loc.dump(SM);
llvm::errs() << " " << Length << " \"" << L.getSpelling(*this, SM) << "\"\n";
}
/// Decide whether \p BeginName opens a verbatim block.
///
/// The built-in table is consulted first, then the commands registered with
/// addVerbatimBlockCommand().  On a match \p EndName is set to the name of the
/// command that closes the block and true is returned; otherwise \p EndName is
/// left untouched and false is returned.
bool Lexer::isVerbatimBlockCommand(StringRef BeginName,
                                   StringRef &EndName) const {
  const char *BuiltinEnd = llvm::StringSwitch<const char *>(BeginName)
      .Case("code", "endcode")
      .Case("verbatim", "endverbatim")
      .Case("htmlonly", "endhtmlonly")
      .Case("latexonly", "endlatexonly")
      .Case("xmlonly", "endxmlonly")
      .Case("manonly", "endmanonly")
      .Case("rtfonly", "endrtfonly")
      .Case("dot", "enddot")
      .Case("msc", "endmsc")
      .Case("f$", "f$") // Inline LaTeX formula
      .Case("f[", "f]") // Displayed LaTeX formula
      .Case("f{", "f}") // LaTeX environment
      .Default(NULL);

  if (BuiltinEnd != NULL) {
    EndName = BuiltinEnd;
    return true;
  }

  for (unsigned Idx = 0, Count = VerbatimBlockCommands.size();
       Idx != Count; ++Idx) {
    const VerbatimBlockCommand &Registered = VerbatimBlockCommands[Idx];
    if (Registered.BeginName != BeginName)
      continue;
    EndName = Registered.EndName;
    return true;
  }

  return false;
}
/// Decide whether \p Name is a verbatim line command, i.e. either one of the
/// built-in names below or a name registered with addVerbatimLineCommand().
bool Lexer::isVerbatimLineCommand(StringRef Name) const {
  const bool IsBuiltin = llvm::StringSwitch<bool>(Name)
      .Case("fn", true)
      .Case("var", true)
      .Case("property", true)
      .Case("typedef", true)
      .Case("overload", true)
      .Case("defgroup", true)
      .Case("ingroup", true)
      .Case("addtogroup", true)
      .Case("weakgroup", true)
      .Case("name", true)
      .Case("section", true)
      .Case("subsection", true)
      .Case("subsubsection", true)
      .Case("paragraph", true)
      .Case("mainpage", true)
      .Case("subpage", true)
      .Case("ref", true)
      .Default(false);

  if (IsBuiltin)
    return true;

  for (unsigned Idx = 0, Count = VerbatimLineCommands.size();
       Idx != Count; ++Idx) {
    if (VerbatimLineCommands[Idx].Name == Name)
      return true;
  }

  return false;
}
/// Skip the decoration that may start a line of a C comment: optional
/// horizontal whitespace followed by a single '*'.
///
/// BufferPtr is moved past the '*' only when one is found before CommentEnd;
/// whitespace that is not followed by a '*' is left in place.
void Lexer::skipLineStartingDecorations() {
  // This function should be called only for C comments
  assert(CommentState == LCS_InsideCComment);

  if (BufferPtr == CommentEnd)
    return;

  // A star in the very first column.
  if (*BufferPtr == '*') {
    BufferPtr++;
    return;
  }

  // Otherwise look for a star behind a run of horizontal whitespace.
  const char *Cursor = BufferPtr;
  while (*Cursor == ' ' || *Cursor == '\t' ||
         *Cursor == '\f' || *Cursor == '\v') {
    Cursor++;
    if (Cursor == CommentEnd)
      return;
  }

  // Cursor != BufferPtr means at least one whitespace character was seen.
  if (Cursor != BufferPtr && *Cursor == '*')
    BufferPtr = Cursor + 1;
}
namespace {
/// Return a pointer to the first '\n' or '\r' in [BufferPtr, BufferEnd), or
/// BufferEnd if the range contains neither.
const char *findNewline(const char *BufferPtr, const char *BufferEnd) {
  const char *Cursor = BufferPtr;
  while (Cursor != BufferEnd && *Cursor != '\n' && *Cursor != '\r')
    ++Cursor;
  return Cursor;
}
/// Skip one line terminator ("\n", "\r" or "\r\n") starting at BufferPtr and
/// return the position just past it.  Returns BufferPtr unchanged when it
/// already equals BufferEnd; otherwise BufferPtr must point at '\n' or '\r'.
const char *skipNewline(const char *BufferPtr, const char *BufferEnd) {
  if (BufferPtr == BufferEnd)
    return BufferPtr;

  if (*BufferPtr == '\n')
    return BufferPtr + 1;

  assert(*BufferPtr == '\r');
  const char *AfterCR = BufferPtr + 1;
  // Treat "\r\n" as a single terminator.
  const bool FollowedByLF = AfterCR != BufferEnd && *AfterCR == '\n';
  return FollowedByLF ? AfterCR + 1 : AfterCR;
}
/// Returns true for ASCII letters and digits, the characters accepted in HTML
/// tag and attribute names by this lexer.
bool isHTMLIdentifierCharacter(char C) {
  if (C >= '0' && C <= '9')
    return true;
  if (C >= 'A' && C <= 'Z')
    return true;
  return C >= 'a' && C <= 'z';
}
/// Return a pointer to the first character in [BufferPtr, BufferEnd) that is
/// not an HTML identifier character, or BufferEnd if there is none.
const char *skipHTMLIdentifier(const char *BufferPtr, const char *BufferEnd) {
  const char *Cursor = BufferPtr;
  while (Cursor != BufferEnd && isHTMLIdentifierCharacter(*Cursor))
    ++Cursor;
  return Cursor;
}
/// Skip an HTML string enclosed in single or double quotes.  A quote character
/// that directly follows a backslash does not terminate the string.
///
/// BufferPtr must point at the opening quote.  Returns a pointer to the
/// closing quote, or BufferEnd when the string is not terminated.
const char *skipHTMLQuotedString(const char *BufferPtr, const char *BufferEnd)
{
  const char Quote = *BufferPtr;
  assert(Quote == '\"' || Quote == '\'');

  const char *Cursor = BufferPtr + 1;
  while (Cursor != BufferEnd) {
    // Cursor[-1] is always inside the string: Cursor starts one past the
    // opening quote.
    const bool IsEscaped = Cursor[-1] == '\\';
    if (*Cursor == Quote && !IsEscaped)
      return Cursor;
    ++Cursor;
  }
  return BufferEnd;
}
/// Returns true for space, tab, form feed and vertical tab.
bool isHorizontalWhitespace(char C) {
  switch (C) {
  case ' ':
  case '\t':
  case '\f':
  case '\v':
    return true;
  default:
    return false;
  }
}
/// Returns true for space, tab, form feed, vertical tab, line feed and
/// carriage return.
bool isWhitespace(char C) {
  switch (C) {
  case ' ':
  case '\n':
  case '\r':
  case '\t':
  case '\f':
  case '\v':
    return true;
  default:
    return false;
  }
}
/// Return a pointer to the first non-whitespace character in
/// [BufferPtr, BufferEnd), or BufferEnd if the range is all whitespace.
const char *skipWhitespace(const char *BufferPtr, const char *BufferEnd) {
  const char *Cursor = BufferPtr;
  while (Cursor != BufferEnd && isWhitespace(*Cursor))
    ++Cursor;
  return Cursor;
}
/// Returns true for ASCII letters and digits, the characters that may form a
/// command name.
bool isCommandNameCharacter(char C) {
  const bool IsLower = C >= 'a' && C <= 'z';
  const bool IsUpper = C >= 'A' && C <= 'Z';
  const bool IsDigit = C >= '0' && C <= '9';
  return IsLower || IsUpper || IsDigit;
}
/// Return a pointer to the first character in [BufferPtr, BufferEnd) that
/// cannot be part of a command name, or BufferEnd if there is none.
const char *skipCommandName(const char *BufferPtr, const char *BufferEnd) {
  const char *Cursor = BufferPtr;
  while (Cursor != BufferEnd && isCommandNameCharacter(*Cursor))
    ++Cursor;
  return Cursor;
}
/// Return the one past end pointer for BCPL comments.
/// Handles newlines escaped with a backslash or with the trigraph for a
/// backslash ("??/").
///
/// NOTE(review): the backwards scan for the escape character reads characters
/// before the newline without a lower bound; this presumably relies on the
/// comment's leading slashes preceding BufferPtr -- confirm with callers.
const char *findBCPLCommentEnd(const char *BufferPtr, const char *BufferEnd) {
  const char *Pos = BufferPtr;
  while (Pos != BufferEnd) {
    // Advance to the next line terminator.
    while (*Pos != '\n' && *Pos != '\r') {
      ++Pos;
      if (Pos == BufferEnd)
        return BufferEnd;
    }

    // Look at the last character before the newline that is not horizontal
    // whitespace.
    const char *LastChar = Pos - 1;
    while (isHorizontalWhitespace(*LastChar))
      --LastChar;

    if (*LastChar != '\\' &&
        !(LastChar - 2 >= BufferPtr && LastChar[0] == '/' &&
          LastChar[-1] == '?' && LastChar[-2] == '?'))
      return Pos; // Not an escaped newline.

    // We found an escaped newline; the comment continues on the next line.
    Pos = skipNewline(Pos, BufferEnd);
  }
  return BufferEnd;
}
/// Return the one past end pointer for C comments, i.e. a pointer to the star
/// of the closing '*/'.
/// Very dumb, does not handle escaped newlines or trigraphs.
///
/// The range must contain '*/'; running off the end is treated as unreachable.
const char *findCCommentEnd(const char *BufferPtr, const char *BufferEnd) {
  const char *Cursor = BufferPtr;
  while (Cursor != BufferEnd) {
    if (*Cursor != '*') {
      ++Cursor;
      continue;
    }
    assert(Cursor + 1 != BufferEnd);
    if (Cursor[1] == '/')
      return Cursor;
    ++Cursor;
  }
  llvm_unreachable("buffer end hit before '*/' was seen");
}
} // unnamed namespace
/// Lex one token from inside a comment, starting at BufferPtr.
///
/// When the lexer is in one of the verbatim or HTML-tag states the work is
/// delegated to the matching helper.  In the normal state the first character
/// decides what is lexed:
///  - '\\' or '@' starts an escape sequence (lexed as text) or a command,
///    verbatim block or verbatim line;
///  - '<' starts an HTML open tag or close tag, or is plain text when followed
///    by anything else;
///  - a newline is lexed as tok::newline (followed by decoration skipping in
///    C comments);
///  - anything else starts a run of plain text that ends before the next
///    newline, '\\', '@' or '<'.
///
/// Must be called with BufferPtr < CommentEnd.
void Lexer::lexCommentText(Token &T) {
  assert(CommentState == LCS_InsideBCPLComment ||
         CommentState == LCS_InsideCComment);

  switch (State) {
  case LS_Normal:
    break;
  case LS_VerbatimBlockFirstLine:
    lexVerbatimBlockFirstLine(T);
    return;
  case LS_VerbatimBlockBody:
    lexVerbatimBlockBody(T);
    return;
  case LS_HTMLOpenTag:
    lexHTMLOpenTag(T);
    return;
  }

  assert(State == LS_Normal);

  const char *TokenPtr = BufferPtr;
  assert(TokenPtr < CommentEnd);
  while (TokenPtr != CommentEnd) {
    switch(*TokenPtr) {
      case '\\':
      case '@': {
        TokenPtr++;
        if (TokenPtr == CommentEnd) {
          // A lone command marker at the end of the comment is plain text.
          formTokenWithChars(T, TokenPtr, tok::text);
          T.setText(StringRef(BufferPtr - T.getLength(), T.getLength()));
          return;
        }
        char C = *TokenPtr;
        switch (C) {
        default:
          break;

        case '\\': case '@': case '&': case '$':
        case '#':  case '<': case '>': case '%':
        case '\"': case '.': case ':':
          // This is one of \\ \@ \& \$ etc escape sequences.
          TokenPtr++;
          if (C == ':' && TokenPtr != CommentEnd && *TokenPtr == ':') {
            // This is the \:: escape sequence.
            TokenPtr++;
          }
          // The token spans the marker too, but its text excludes it.
          formTokenWithChars(T, TokenPtr, tok::text);
          T.setText(StringRef(BufferPtr - (T.getLength() - 1),
                              T.getLength() - 1));
          return;
        }

        // Don't make zero-length commands.
        if (!isCommandNameCharacter(*TokenPtr)) {
          formTokenWithChars(T, TokenPtr, tok::text);
          T.setText(StringRef(BufferPtr - T.getLength(), T.getLength()));
          return;
        }

        TokenPtr = skipCommandName(TokenPtr, CommentEnd);
        unsigned Length = TokenPtr - (BufferPtr + 1);

        // Hardcoded support for lexing LaTeX formula commands
        // \f$ \f[ \f] \f{ \f} as a single command.
        if (Length == 1 && TokenPtr[-1] == 'f' && TokenPtr != CommentEnd) {
          C = *TokenPtr;
          if (C == '$' || C == '[' || C == ']' || C == '{' || C == '}') {
            TokenPtr++;
            Length++;
          }
        }

        const StringRef CommandName(BufferPtr + 1, Length);
        StringRef EndName;

        if (isVerbatimBlockCommand(CommandName, EndName)) {
          setupAndLexVerbatimBlock(T, TokenPtr, *BufferPtr, EndName);
          return;
        }
        if (isVerbatimLineCommand(CommandName)) {
          lexVerbatimLine(T, TokenPtr);
          return;
        }
        formTokenWithChars(T, TokenPtr, tok::command);
        T.setCommandName(CommandName);
        return;
      }

      case '<': {
        TokenPtr++;
        if (TokenPtr == CommentEnd) {
          formTokenWithChars(T, TokenPtr, tok::text);
          T.setText(StringRef(BufferPtr - T.getLength(), T.getLength()));
          return;
        }
        const char C = *TokenPtr;
        if (isHTMLIdentifierCharacter(C))
          setupAndLexHTMLOpenTag(T);
        else if (C == '/')
          lexHTMLCloseTag(T);
        else {
          // A '<' that starts neither an open tag nor a close tag (as in
          // "a < b") is ordinary text.  Emit it as a one-character text
          // token so that every call produces a token and makes progress.
          formTokenWithChars(T, TokenPtr, tok::text);
          T.setText(StringRef(BufferPtr - T.getLength(), T.getLength()));
        }
        return;
      }

      case '\n':
      case '\r':
        TokenPtr = skipNewline(TokenPtr, CommentEnd);
        formTokenWithChars(T, TokenPtr, tok::newline);

        if (CommentState == LCS_InsideCComment)
          skipLineStartingDecorations();
        return;

      default: {
        // Plain text: extend the token up to the next character that could
        // start a different kind of token.
        while (true) {
          TokenPtr++;
          if (TokenPtr == CommentEnd)
            break;
          char C = *TokenPtr;
          if(C == '\n' || C == '\r' ||
             C == '\\' || C == '@' || C == '<')
            break;
        }
        formTokenWithChars(T, TokenPtr, tok::text);
        T.setText(StringRef(BufferPtr - T.getLength(), T.getLength()));
        return;
      }
    }
  }
}
/// Emit the verbatim_block_begin token for a command ending at \p TextBegin
/// and switch the lexer into verbatim block mode.
///
/// The end command to look for is remembered as \p Marker ('\\' or '@')
/// followed by \p EndName.
void Lexer::setupAndLexVerbatimBlock(Token &T,
                                     const char *TextBegin,
                                     char Marker, StringRef EndName) {
  VerbatimBlockEndCommandName.clear();
  VerbatimBlockEndCommandName.push_back(Marker == '\\' ? '\\' : '@');
  VerbatimBlockEndCommandName.append(EndName);

  formTokenWithChars(T, TextBegin, tok::verbatim_block_begin);

  // The block name is the token spelling without the leading marker.
  const unsigned NameLength = T.getLength() - 1;
  T.setVerbatimBlockName(StringRef(TextBegin - NameLength, NameLength));

  State = LS_VerbatimBlockFirstLine;
}
/// Lex one piece of a verbatim block starting at BufferPtr.
///
/// Emits verbatim_block_end (and returns to normal lexing) when the end
/// command is right at BufferPtr.  Otherwise emits a verbatim_block_line
/// covering either the text in front of an end command found later on the
/// line, or the whole line including its newline.
void Lexer::lexVerbatimBlockFirstLine(Token &T) {
  assert(BufferPtr < CommentEnd);

  // FIXME: It would be better to scan the text once, finding either the block
  // end command or newline.
  //
  // Extract current line.
  const char *LineEnd = findNewline(BufferPtr, CommentEnd);
  const StringRef CurrentLine(BufferPtr, LineEnd - BufferPtr);

  // Look for end command in current line.
  const size_t EndCommandPos = CurrentLine.find(VerbatimBlockEndCommandName);

  if (EndCommandPos == 0) {
    // The end command starts right here.
    const char *CommandEnd = BufferPtr + VerbatimBlockEndCommandName.size();
    formTokenWithChars(T, CommandEnd, tok::verbatim_block_end);
    const unsigned NameLength = T.getLength() - 1;
    T.setVerbatimBlockName(StringRef(CommandEnd - NameLength, NameLength));
    State = LS_Normal;
    return;
  }

  // Either the line is completely verbatim (take it with its newline), or
  // there is some text followed by the end command (take only the text).
  const char *TextEnd = EndCommandPos == StringRef::npos
                            ? skipNewline(LineEnd, CommentEnd)
                            : BufferPtr + EndCommandPos;

  formTokenWithChars(T, TextEnd, tok::verbatim_block_line);
  T.setVerbatimBlockText(StringRef(TextEnd - T.getLength(), T.getLength()));

  State = LS_VerbatimBlockBody;
}
/// Lex one line of a verbatim block body.
///
/// In C comments the line-starting decoration (whitespace and a '*') is
/// skipped first.  The rest of the line is lexed like the first line of the
/// block.
void Lexer::lexVerbatimBlockBody(Token &T) {
  assert(State == LS_VerbatimBlockBody);

  if (CommentState == LCS_InsideCComment)
    skipLineStartingDecorations();

  // Skipping the decoration can consume everything up to the end of the
  // comment (for instance when the last line is "**/").
  // lexVerbatimBlockFirstLine() requires BufferPtr < CommentEnd, so produce
  // an empty verbatim line here instead of tripping its assertion.
  if (BufferPtr == CommentEnd) {
    formTokenWithChars(T, BufferPtr, tok::verbatim_block_line);
    T.setVerbatimBlockText(StringRef(BufferPtr, 0));
    return;
  }

  lexVerbatimBlockFirstLine(T);
}
/// Lex a verbatim line command.  BufferPtr points at the command marker and
/// \p TextBegin just past the command name; the token extends to the next
/// newline (or the end of the comment) and carries both the command name and
/// the text that follows it.
void Lexer::lexVerbatimLine(Token &T, const char *TextBegin) {
  // Remember where the command starts: forming the token moves BufferPtr.
  const char *CommandBegin = BufferPtr;

  // Extract current line.
  const char *LineEnd = findNewline(CommandBegin, CommentEnd);

  formTokenWithChars(T, LineEnd, tok::verbatim_line);
  T.setVerbatimLineName(
      StringRef(CommandBegin + 1, TextBegin - CommandBegin - 1));
  T.setVerbatimLineText(StringRef(TextBegin, LineEnd - TextBegin));
}
/// Lex "<tag" as an html_tag_open token and decide how to continue.
///
/// Whitespace after the tag name is skipped.  A '>' that follows is consumed
/// without producing a token; if an identifier character follows instead, the
/// lexer switches to LS_HTMLOpenTag to lex the attributes.
void Lexer::setupAndLexHTMLOpenTag(Token &T) {
  assert(BufferPtr[0] == '<' && isHTMLIdentifierCharacter(BufferPtr[1]));

  const char *NameEnd = skipHTMLIdentifier(BufferPtr + 2, CommentEnd);
  formTokenWithChars(T, NameEnd, tok::html_tag_open);

  // The tag name is the token spelling without the leading '<'.
  const unsigned NameLength = T.getLength() - 1;
  T.setHTMLTagOpenName(StringRef(NameEnd - NameLength, NameLength));

  BufferPtr = skipWhitespace(BufferPtr, CommentEnd);
  if (BufferPtr == CommentEnd)
    return;

  if (*BufferPtr == '>') {
    BufferPtr++;
    return;
  }

  if (isHTMLIdentifierCharacter(*BufferPtr))
    State = LS_HTMLOpenTag;
}
/// Lex one token inside an HTML open tag: an attribute identifier, '=', a
/// quoted string or the closing '>'.
///
/// After '>' the lexer returns to normal mode.  After any other token the
/// following whitespace is skipped, and the lexer stays in LS_HTMLOpenTag only
/// if the next character can start another HTML token.
void Lexer::lexHTMLOpenTag(Token &T) {
  assert(State == LS_HTMLOpenTag);

  const char *TokenPtr = BufferPtr;
  char C = *TokenPtr;
  if (isHTMLIdentifierCharacter(C)) {
    TokenPtr = skipHTMLIdentifier(TokenPtr, CommentEnd);
    formTokenWithChars(T, TokenPtr, tok::html_ident);
    T.setHTMLIdent(StringRef(TokenPtr - T.getLength(), T.getLength()));
  } else {
    switch (C) {
    case '=':
      TokenPtr++;
      formTokenWithChars(T, TokenPtr, tok::html_equals);
      break;
    case '\"':
    case '\'': {
      const char *OpenQuote = TokenPtr;
      TokenPtr = skipHTMLQuotedString(TokenPtr, CommentEnd);
      const char *ClosingQuote = TokenPtr;
      if (TokenPtr != CommentEnd) // Skip closing quote.
        TokenPtr++;
      formTokenWithChars(T, TokenPtr, tok::html_quoted_string);
      // The string value excludes both quotes.
      T.setHTMLQuotedString(StringRef(OpenQuote + 1,
                                      ClosingQuote - (OpenQuote + 1)));
      break;
    }
    case '>':
      TokenPtr++;
      formTokenWithChars(T, TokenPtr, tok::html_greater);
      // '>' ends the tag.  Leave HTML mode unconditionally: whatever follows
      // is comment text, even if it starts with a letter or digit, and must
      // not be lexed as an attribute identifier.
      State = LS_Normal;
      return;
    }
  }

  // Now look ahead and return to normal state if we don't see any HTML tokens
  // ahead.
  BufferPtr = skipWhitespace(BufferPtr, CommentEnd);
  if (BufferPtr == CommentEnd) {
    State = LS_Normal;
    return;
  }

  C = *BufferPtr;
  if (!isHTMLIdentifierCharacter(C) &&
      C != '=' && C != '\"' && C != '\'' && C != '>') {
    State = LS_Normal;
    return;
  }
}
/// Lex an HTML close tag "</ name >" as a single html_tag_close token.
///
/// Whitespace is allowed around the name.  The closing '>' is included in the
/// token when present; the token's name is the identifier alone.
void Lexer::lexHTMLCloseTag(Token &T) {
  assert(BufferPtr[0] == '<' && BufferPtr[1] == '/');

  const char *NameBegin = skipWhitespace(BufferPtr + 2, CommentEnd);
  const char *NameEnd = skipHTMLIdentifier(NameBegin, CommentEnd);

  const char *TokenEnd = skipWhitespace(NameEnd, CommentEnd);
  const bool HasClosingBracket = TokenEnd != CommentEnd && *TokenEnd == '>';
  if (HasClosingBracket)
    TokenEnd++;

  formTokenWithChars(T, TokenEnd, tok::html_tag_close);
  T.setHTMLTagCloseName(StringRef(NameBegin, NameEnd - NameBegin));
}
/// Create a lexer for the text in [BufferStart, BufferEnd).
///
/// \param FileLoc source location that corresponds to \p BufferStart.
/// \param CommOpts lexer options; copied into the lexer.
///
/// Lexing starts at \p BufferStart, before any comment, in normal mode.
Lexer::Lexer(SourceLocation FileLoc, const CommentOptions &CommOpts,
             const char *BufferStart, const char *BufferEnd):
    BufferStart(BufferStart), BufferEnd(BufferEnd),
    FileLoc(FileLoc), CommOpts(CommOpts), BufferPtr(BufferStart),
    // CommentEnd is recomputed when a comment is entered; give it a
    // well-defined value until then instead of leaving it uninitialized.
    CommentEnd(BufferEnd),
    CommentState(LCS_BeforeComment), State(LS_Normal) {
}
/// Lex the next token into \p T.
///
/// This is the outer state machine that tracks whether the lexer is before,
/// inside or between comments:
///  - before a comment: emit tok::eof at the end of the buffer, otherwise skip
///    the comment's opening characters, compute CommentEnd and start over
///    inside the comment;
///  - inside a comment: delegate to lexCommentText() until CommentEnd is
///    reached, then leave the comment;
///  - between comments: turn everything up to the next '/' (or the end of the
///    buffer) into a single newline token.
void Lexer::lex(Token &T) {
again:
switch (CommentState) {
case LCS_BeforeComment:
if (BufferPtr == BufferEnd) {
formTokenWithChars(T, BufferPtr, tok::eof);
return;
}
assert(*BufferPtr == '/');
BufferPtr++; // Skip first slash.
switch(*BufferPtr) {
case '/': { // BCPL comment.
BufferPtr++; // Skip second slash.
if (BufferPtr != BufferEnd) {
// Skip Doxygen magic marker, if it is present.
// It might be missing because of a typo //< or /*<, or because we
// merged this non-Doxygen comment into a bunch of Doxygen comments
// around it: /** ... */ /* ... */ /** ... */
const char C = *BufferPtr;
if (C == '/' || C == '!')
BufferPtr++;
}
// Skip less-than symbol that marks trailing comments.
// Skip it even if the comment is not a Doxygen one, because //< and /*<
// are frequent typos.
if (BufferPtr != BufferEnd && *BufferPtr == '<')
BufferPtr++;
CommentState = LCS_InsideBCPLComment;
State = LS_Normal;
CommentEnd = findBCPLCommentEnd(BufferPtr, BufferEnd);
goto again;
}
case '*': { // C comment.
BufferPtr++; // Skip star.
// Skip Doxygen magic marker.
// A second star directly followed by '/' is the comment terminator of an
// empty comment, not a marker, so it is left in place.
const char C = *BufferPtr;
if ((C == '*' && *(BufferPtr + 1) != '/') || C == '!')
BufferPtr++;
// Skip less-than symbol that marks trailing comments.
if (BufferPtr != BufferEnd && *BufferPtr == '<')
BufferPtr++;
CommentState = LCS_InsideCComment;
State = LS_Normal;
CommentEnd = findCCommentEnd(BufferPtr, BufferEnd);
goto again;
}
default:
llvm_unreachable("second character of comment should be '/' or '*'");
}
case LCS_BetweenComments: {
// Consecutive comments are extracted only if there is only whitespace
// between them. So we can search for the start of the next comment.
const char *EndWhitespace = BufferPtr;
while(EndWhitespace != BufferEnd && *EndWhitespace != '/')
EndWhitespace++;
// Turn any whitespace between comments (and there is only whitespace
// between them) into a newline. We have two newlines between comments
// in total (first one was synthesized after a comment).
formTokenWithChars(T, EndWhitespace, tok::newline);
CommentState = LCS_BeforeComment;
break;
}
case LCS_InsideBCPLComment:
case LCS_InsideCComment:
if (BufferPtr != CommentEnd) {
lexCommentText(T);
break;
} else {
// Skip C comment closing sequence.
if (CommentState == LCS_InsideCComment) {
assert(BufferPtr[0] == '*' && BufferPtr[1] == '/');
BufferPtr += 2;
assert(BufferPtr <= BufferEnd);
// Synthesize newline just after the C comment, regardless if there is
// actually a newline.
formTokenWithChars(T, BufferPtr, tok::newline);
CommentState = LCS_BetweenComments;
break;
} else {
// Don't synthesize a newline after BCPL comment.
CommentState = LCS_BetweenComments;
goto again;
}
}
}
}
/// Return the spelling of \p Tok as it appears in the source buffer.
///
/// \param Tok token whose location and length select the text.
/// \param SourceMgr source manager used to find the buffer for the token's
///        location.
/// \param Invalid optional out-parameter; may be NULL.  When non-NULL it is
///        set to true if the buffer data could not be retrieved.  It is not
///        modified on success.
///
/// \returns the token spelling, or an empty string if the buffer is invalid.
StringRef Lexer::getSpelling(const Token &Tok,
                             const SourceManager &SourceMgr,
                             bool *Invalid) const {
  SourceLocation Loc = Tok.getLocation();
  std::pair<FileID, unsigned> LocInfo = SourceMgr.getDecomposedLoc(Loc);

  bool InvalidTemp = false;
  StringRef File = SourceMgr.getBufferData(LocInfo.first, &InvalidTemp);
  if (InvalidTemp) {
    // Invalid defaults to NULL (and Token::dump() relies on that default), so
    // it must be checked before it is written through.
    if (Invalid)
      *Invalid = true;
    return StringRef();
  }

  const char *Begin = File.data() + LocInfo.second;
  return StringRef(Begin, Tok.getLength());
}
/// Register a verbatim block that starts with command \p BeginName and ends
/// with command \p EndName.
///
/// Only the StringRefs are stored, so the character data they refer to must
/// outlive the lexer.
void Lexer::addVerbatimBlockCommand(StringRef BeginName, StringRef EndName) {
  const VerbatimBlockCommand Command = { BeginName, EndName };
  VerbatimBlockCommands.push_back(Command);
}
/// Register \p Name as a verbatim line command.
///
/// Only the StringRef is stored, so the character data it refers to must
/// outlive the lexer.
void Lexer::addVerbatimLineCommand(StringRef Name) {
  const VerbatimLineCommand Command = { Name };
  VerbatimLineCommands.push_back(Command);
}
} // end namespace comments
} // end namespace clang

View File

@ -8,6 +8,9 @@
//===----------------------------------------------------------------------===//
#include "clang/AST/RawCommentList.h"
#include "clang/AST/ASTContext.h"
#include "clang/AST/CommentLexer.h"
#include "clang/AST/CommentBriefParser.h"
#include "llvm/ADT/STLExtras.h"
using namespace clang;
@ -126,6 +129,24 @@ StringRef RawComment::getRawTextSlow(const SourceManager &SourceMgr) const {
return StringRef(BufferStart + BeginOffset, Length);
}
/// Run the brief-comment parser over this comment's raw text, store the
/// result in BriefText, mark it valid, and return it.
///
/// The resulting string is copied into storage obtained through the
/// ASTContext placement new and is NUL-terminated.
StringRef RawComment::extractBriefText(const ASTContext &Context) const {
  // Called for its side effect: RawText must be populated before lexing.
  getRawText(Context.getSourceManager());

  comments::Lexer CommentLexer(Range.getBegin(), comments::CommentOptions(),
                               RawText.begin(), RawText.end());
  comments::BriefParser Parser(CommentLexer);
  const std::string Brief = Parser.Parse();

  // Copy the parsed text, plus a terminating NUL, into context-allocated
  // storage so the returned StringRef does not point into the local string.
  const unsigned Size = Brief.size();
  char *Storage = new (Context) char[Size + 1];
  memcpy(Storage, Brief.data(), Size);
  Storage[Size] = '\0';

  BriefText = StringRef(Storage, Size);
  BriefTextValid = true;
  return BriefText;
}
namespace {
bool containsOnlyWhitespace(StringRef Str) {
return Str.find_first_not_of(" \t\f\v\r\n") == StringRef::npos;

View File

@ -163,6 +163,37 @@ class test42 {
int isdoxy42; /* NOT_DOXYGEN */ ///< isdoxy42 IS_DOXYGEN_SINGLE
};
/// IS_DOXYGEN_START
/// It is fine to have a command at the end of comment.
///\brief
///
/// Some malformed command.
/* \*/
/**
* \brief Aaa aaaaaaa aaaa.
* IS_DOXYGEN_END
*/
void isdoxy43(void);
/// IS_DOXYGEN_START Aaa bbb
/// ccc.
///
/// Ddd eee.
/// Fff.
///
/// Ggg. IS_DOXYGEN_END
void isdoxy44(void);
/// IS_DOXYGEN_START Aaa bbb
/// ccc.
///
/// \brief
/// Ddd eee.
/// Fff.
///
/// Ggg. IS_DOXYGEN_END
void isdoxy45(void);
#endif
// RUN: rm -rf %t
@ -187,8 +218,8 @@ class test42 {
// WRONG-NOT: IS_DOXYGEN_NOT_ATTACHED
// Ensure we don't pick up extra comments.
// WRONG-NOT: IS_DOXYGEN_START{{.*}}IS_DOXYGEN_START
// WRONG-NOT: IS_DOXYGEN_END{{.*}}IS_DOXYGEN_END
// WRONG-NOT: IS_DOXYGEN_START{{.*}}IS_DOXYGEN_START{{.*}}BriefComment=
// WRONG-NOT: IS_DOXYGEN_END{{.*}}IS_DOXYGEN_END{{.*}}BriefComment=
// RUN: FileCheck %s < %t/out.c-index-direct
// RUN: FileCheck %s < %t/out.c-index-pch
@ -226,4 +257,8 @@ class test42 {
// CHECK: annotate-comments.cpp:155:6: FunctionDecl=isdoxy40:{{.*}} isdoxy40 IS_DOXYGEN_SINGLE
// CHECK: annotate-comments.cpp:160:5: FunctionDecl=isdoxy41:{{.*}} isdoxy41 IS_DOXYGEN_SINGLE
// CHECK: annotate-comments.cpp:163:7: FieldDecl=isdoxy42:{{.*}} isdoxy42 IS_DOXYGEN_SINGLE
// CHECK: annotate-comments.cpp:176:6: FunctionDecl=isdoxy43:{{.*}} IS_DOXYGEN_START{{.*}} IS_DOXYGEN_END
// CHECK: annotate-comments.cpp:185:6: FunctionDecl=isdoxy44:{{.*}} BriefComment=[ IS_DOXYGEN_START Aaa bbb\n ccc.\n]
// CHECK: annotate-comments.cpp:195:6: FunctionDecl=isdoxy45:{{.*}} BriefComment=[\n Ddd eee.\n Fff.\n]

View File

@ -162,6 +162,24 @@ int parse_remapped_files(int argc, const char **argv, int start_arg,
/* Pretty-printing. */
/******************************************************************************/
/* Print " <Prefix>=[<CStr>]" to stdout.  Newline, carriage return, tab,
 * vertical tab and form feed characters in CStr are written as their
 * two-character backslash escapes; every other character is written as is.
 * A NULL or empty CStr prints just the empty brackets. */
static void PrintCString(const char *Prefix, const char *CStr) {
  const char *Cursor;
  const char *Escape;

  printf(" %s=[", Prefix);
  if (CStr != NULL) {
    for (Cursor = CStr; *Cursor != '\0'; ++Cursor) {
      /* Pick the escape sequence for this character, if it has one. */
      if (*Cursor == '\n')
        Escape = "\\n";
      else if (*Cursor == '\r')
        Escape = "\\r";
      else if (*Cursor == '\t')
        Escape = "\\t";
      else if (*Cursor == '\v')
        Escape = "\\v";
      else if (*Cursor == '\f')
        Escape = "\\f";
      else
        Escape = NULL;

      if (Escape != NULL)
        fputs(Escape, stdout);
      else
        putchar(*Cursor);
    }
  }
  printf("]");
}
static void PrintRange(CXSourceRange R, const char *str) {
CXFile begin_file, end_file;
unsigned begin_line, begin_column, end_line, end_column;
@ -218,8 +236,10 @@ static void PrintCursor(CXCursor Cursor) {
CXPlatformAvailability PlatformAvailability[2];
int NumPlatformAvailability;
int I;
CXString Comment;
const char *CommentCString;
CXString RawComment;
const char *RawCommentCString;
CXString BriefComment;
const char *BriefCommentCString;
ks = clang_getCursorKindSpelling(Cursor.kind);
string = want_display_name? clang_getCursorDisplayName(Cursor)
@ -401,21 +421,19 @@ static void PrintCursor(CXCursor Cursor) {
PrintRange(RefNameRange, "RefName");
}
Comment = clang_Cursor_getRawCommentText(Cursor);
CommentCString = clang_getCString(Comment);
if (CommentCString != NULL && CommentCString[0] != '\0') {
printf(" Comment=[");
for ( ; *CommentCString; ++CommentCString) {
if (*CommentCString != '\n')
putchar(*CommentCString);
else
printf("\\n");
}
printf("]");
RawComment = clang_Cursor_getRawCommentText(Cursor);
RawCommentCString = clang_getCString(RawComment);
if (RawCommentCString != NULL && RawCommentCString[0] != '\0') {
PrintCString("RawComment", RawCommentCString);
PrintRange(clang_Cursor_getCommentRange(Cursor), "RawCommentRange");
PrintRange(clang_Cursor_getCommentRange(Cursor), "CommentRange");
BriefComment = clang_Cursor_getBriefCommentText(Cursor);
BriefCommentCString = clang_getCString(BriefComment);
if (BriefCommentCString != NULL && BriefCommentCString[0] != '\0')
PrintCString("BriefComment", BriefCommentCString);
clang_disposeString(BriefComment);
}
clang_disposeString(Comment);
clang_disposeString(RawComment);
}
}

View File

@ -5707,6 +5707,24 @@ CXString clang_Cursor_getRawCommentText(CXCursor C) {
} // end: extern "C"
/// Return the brief text of the documentation comment attached to the
/// declaration that \p C refers to.
///
/// A null CXString is returned when the cursor is not a declaration, when no
/// raw comment is found for the declaration, or when the comment found is not
/// a documentation comment.
CXString clang_Cursor_getBriefCommentText(CXCursor C) {
  if (clang_isDeclaration(C.kind)) {
    const Decl *CursorDecl = getCursorDecl(C);
    const ASTContext &Ctx = getCursorContext(C);
    const RawComment *Comment = Ctx.getRawCommentForDecl(CursorDecl);

    if (Comment && Comment->isDocumentation()) {
      StringRef Brief = Comment->getBriefText(Ctx);
      // The text is not duplicated: RawComment ensures that this memory will
      // not go away.
      return createCXString(Brief, false);
    }
  }

  return createCXString((const char *) NULL);
}
//===----------------------------------------------------------------------===//
// C++ AST introspection.

View File

@ -5,6 +5,7 @@ clang_CXIndex_setGlobalOptions
clang_CXXMethod_isStatic
clang_CXXMethod_isVirtual
clang_Cursor_getArgument
clang_Cursor_getBriefCommentText
clang_Cursor_getCommentRange
clang_Cursor_getRawCommentText
clang_Cursor_getNumArguments

View File

@ -0,0 +1,7 @@
# Declare the ASTTests unit-test target; CommentLexer.cpp is its only source.
add_clang_unittest(ASTTests
CommentLexer.cpp
)
# Link the test target against the clangAST library.
target_link_libraries(ASTTests
clangAST
)

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,15 @@
##===- unittests/AST/Makefile ------------------------------*- Makefile -*-===##
#
# The LLVM Compiler Infrastructure
#
# This file is distributed under the University of Illinois Open Source
# License. See LICENSE.TXT for details.
#
##===----------------------------------------------------------------------===##
# Relative path used by the include at the bottom to find the shared
# unittests Makefile (presumably the clang source root -- confirm).
CLANG_LEVEL = ../..
# Name of this unit-test suite.
TESTNAME = AST
# NOTE(review): presumably the LLVM components to link against -- confirm
# against the shared unittests Makefile.
LINK_COMPONENTS := support mc
# Clang static libraries used by the tests.
USEDLIBS = clangAST.a clangBasic.a
# Pull in the common unit-test rules.
include $(CLANG_LEVEL)/unittests/Makefile

View File

@ -14,7 +14,7 @@ ifndef CLANG_LEVEL
IS_UNITTEST_LEVEL := 1
CLANG_LEVEL := ..
PARALLEL_DIRS = Basic Frontend Lex Tooling
PARALLEL_DIRS = Basic AST Frontend Lex Tooling
endif # CLANG_LEVEL