//===- TGLexer.cpp - Lexer for TableGen -----------------------------------===// // // The LLVM Compiler Infrastructure // // This file was developed by Chris Lattner and is distributed under // the University of Illinois Open Source License. See LICENSE.TXT for details. // //===----------------------------------------------------------------------===// // // Implement the Lexer for TableGen. // //===----------------------------------------------------------------------===// #include "TGLexer.h" #include "Record.h" #include "llvm/Support/Streams.h" #include "Record.h" #include "llvm/Support/MemoryBuffer.h" typedef std::pair*> SubClassRefTy; #include "FileParser.h" #include using namespace llvm; // FIXME: REMOVE THIS. #define YYEOF 0 #define YYERROR -2 TGLexer::TGLexer(MemoryBuffer *StartBuf) : CurLineNo(1), CurBuf(StartBuf) { CurPtr = CurBuf->getBufferStart(); TokStart = 0; } TGLexer::~TGLexer() { while (!IncludeStack.empty()) { delete IncludeStack.back().Buffer; IncludeStack.pop_back(); } delete CurBuf; } /// ReturnError - Set the error to the specified string at the specified /// location. This is defined to always return YYERROR. int TGLexer::ReturnError(const char *Loc, const std::string &Msg) { PrintError(Loc, Msg); return YYERROR; } std::ostream &TGLexer::err() const { PrintIncludeStack(*cerr.stream()); return *cerr.stream(); } void TGLexer::PrintIncludeStack(std::ostream &OS) const { for (unsigned i = 0, e = IncludeStack.size(); i != e; ++i) OS << "Included from " << IncludeStack[i].Buffer->getBufferIdentifier() << ":" << IncludeStack[i].LineNo << ":\n"; OS << "Parsing " << CurBuf->getBufferIdentifier() << ":" << CurLineNo << ": "; } /// PrintError - Print the error at the specified location. void TGLexer::PrintError(const char *ErrorLoc, const std::string &Msg) const { err() << Msg << "\n"; assert(ErrorLoc && "Location not specified!"); // Scan backward to find the start of the line. const char *LineStart = ErrorLoc; while (LineStart != CurBuf->getBufferStart() && LineStart[-1] != '\n' && LineStart[-1] != '\r') --LineStart; // Get the end of the line. const char *LineEnd = ErrorLoc; while (LineEnd != CurBuf->getBufferEnd() && LineEnd[0] != '\n' && LineEnd[0] != '\r') ++LineEnd; // Print out the line. cerr << std::string(LineStart, LineEnd) << "\n"; // Print out spaces before the carat. for (const char *Pos = LineStart; Pos != ErrorLoc; ++Pos) cerr << (*Pos == '\t' ? '\t' : ' '); cerr << "^\n"; } int TGLexer::getNextChar() { char CurChar = *CurPtr++; switch (CurChar) { default: return (unsigned char)CurChar; case 0: // A nul character in the stream is either the end of the current buffer or // a random nul in the file. Disambiguate that here. if (CurPtr-1 != CurBuf->getBufferEnd()) return 0; // Just whitespace. // If this is the end of an included file, pop the parent file off the // include stack. if (!IncludeStack.empty()) { delete CurBuf; CurBuf = IncludeStack.back().Buffer; CurLineNo = IncludeStack.back().LineNo; CurPtr = IncludeStack.back().CurPtr; IncludeStack.pop_back(); return getNextChar(); } // Otherwise, return end of file. --CurPtr; // Another call to lex will return EOF again. return EOF; case '\n': case '\r': // Handle the newline character by ignoring it and incrementing the line // count. However, be careful about 'dos style' files with \n\r in them. // Only treat a \n\r or \r\n as a single line. if ((*CurPtr == '\n' || (*CurPtr == '\r')) && *CurPtr != CurChar) ++CurPtr; // Eat the two char newline sequence. ++CurLineNo; return '\n'; } } int TGLexer::LexToken() { TokStart = CurPtr; // This always consumes at least one character. int CurChar = getNextChar(); switch (CurChar) { default: // Handle letters: [a-zA-Z_] if (isalpha(CurChar) || CurChar == '_') return LexIdentifier(); // Unknown character, return the char itself. return (unsigned char)CurChar; case EOF: return YYEOF; case 0: case ' ': case '\t': case '\n': case '\r': // Ignore whitespace. return LexToken(); case '/': // If this is the start of a // comment, skip until the end of the line or // the end of the buffer. if (*CurPtr == '/') SkipBCPLComment(); else if (*CurPtr == '*') { if (SkipCComment()) return YYERROR; } else // Otherwise, return this / as a token. return CurChar; return LexToken(); case '-': case '+': case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9': return LexNumber(); case '"': return LexString(); case '$': return LexVarName(); case '[': return LexBracket(); case '!': return LexExclaim(); } } /// LexString - Lex "[^"]*" int TGLexer::LexString() { const char *StrStart = CurPtr; while (*CurPtr != '"') { // If we hit the end of the buffer, report an error. if (*CurPtr == 0 && CurPtr == CurBuf->getBufferEnd()) return ReturnError(StrStart, "End of file in string literal"); if (*CurPtr == '\n' || *CurPtr == '\r') return ReturnError(StrStart, "End of line in string literal"); ++CurPtr; } Filelval.StrVal = new std::string(StrStart, CurPtr); ++CurPtr; return STRVAL; } int TGLexer::LexVarName() { if (!isalpha(CurPtr[0]) && CurPtr[0] != '_') return '$'; // Invalid varname. // Otherwise, we're ok, consume the rest of the characters. const char *VarNameStart = CurPtr++; while (isalpha(*CurPtr) || isdigit(*CurPtr) || *CurPtr == '_') ++CurPtr; Filelval.StrVal = new std::string(VarNameStart, CurPtr); return VARNAME; } int TGLexer::LexIdentifier() { // The first letter is [a-zA-Z_]. const char *IdentStart = CurPtr-1; // Match the rest of the identifier regex: [0-9a-zA-Z_]* while (isalpha(*CurPtr) || isdigit(*CurPtr) || *CurPtr == '_') ++CurPtr; // Check to see if this identifier is a keyword. unsigned Len = CurPtr-IdentStart; if (Len == 3 && !memcmp(IdentStart, "int", 3)) return INT; if (Len == 3 && !memcmp(IdentStart, "bit", 3)) return BIT; if (Len == 4 && !memcmp(IdentStart, "bits", 4)) return BITS; if (Len == 6 && !memcmp(IdentStart, "string", 6)) return STRING; if (Len == 4 && !memcmp(IdentStart, "list", 4)) return LIST; if (Len == 4 && !memcmp(IdentStart, "code", 4)) return CODE; if (Len == 3 && !memcmp(IdentStart, "dag", 3)) return DAG; if (Len == 5 && !memcmp(IdentStart, "class", 5)) return CLASS; if (Len == 3 && !memcmp(IdentStart, "def", 3)) return DEF; if (Len == 4 && !memcmp(IdentStart, "defm", 4)) return DEFM; if (Len == 10 && !memcmp(IdentStart, "multiclass", 10)) return MULTICLASS; if (Len == 5 && !memcmp(IdentStart, "field", 5)) return FIELD; if (Len == 3 && !memcmp(IdentStart, "let", 3)) return LET; if (Len == 2 && !memcmp(IdentStart, "in", 2)) return IN; if (Len == 7 && !memcmp(IdentStart, "include", 7)) { if (LexInclude()) return YYERROR; return LexToken(); } Filelval.StrVal = new std::string(IdentStart, CurPtr); return ID; } /// LexInclude - We just read the "include" token. Get the string token that /// comes next and enter the include. bool TGLexer::LexInclude() { // The token after the include must be a string. int Tok = LexToken(); if (Tok == YYERROR) return true; if (Tok != STRVAL) { PrintError(getTokenStart(), "Expected filename after include"); return true; } // Get the string. std::string Filename = *Filelval.StrVal; delete Filelval.StrVal; // Try to find the file. MemoryBuffer *NewBuf = MemoryBuffer::getFile(&Filename[0], Filename.size()); // If the file didn't exist directly, see if it's in an include path. for (unsigned i = 0, e = IncludeDirectories.size(); i != e && !NewBuf; ++i) { std::string IncFile = IncludeDirectories[i] + "/" + Filename; NewBuf = MemoryBuffer::getFile(&IncFile[0], IncFile.size()); } if (NewBuf == 0) { PrintError(getTokenStart(), "Could not find include file '" + Filename + "'"); return true; } // Save the line number and lex buffer of the includer. IncludeStack.push_back(IncludeRec(CurBuf, CurPtr, CurLineNo)); CurLineNo = 1; // Reset line numbering. CurBuf = NewBuf; CurPtr = CurBuf->getBufferStart(); return false; } void TGLexer::SkipBCPLComment() { ++CurPtr; // skip the second slash. while (1) { switch (*CurPtr) { case '\n': case '\r': return; // Newline is end of comment. case 0: // If this is the end of the buffer, end the comment. if (CurPtr == CurBuf->getBufferEnd()) return; break; } // Otherwise, skip the character. ++CurPtr; } } /// SkipCComment - This skips C-style /**/ comments. The only difference from C /// is that we allow nesting. bool TGLexer::SkipCComment() { const char *CommentStart = CurPtr-1; ++CurPtr; // skip the star. unsigned CommentDepth = 1; while (1) { int CurChar = getNextChar(); switch (CurChar) { case EOF: PrintError(CommentStart, "Unterminated comment!"); return true; case '*': // End of the comment? if (CurPtr[0] != '/') break; ++CurPtr; // End the */. if (--CommentDepth == 0) return false; break; case '/': // Start of a nested comment? if (CurPtr[0] != '*') break; ++CurPtr; ++CommentDepth; break; } } } /// LexNumber - Lex: /// [-+]?[0-9]+ /// 0x[0-9a-fA-F]+ /// 0b[01]+ int TGLexer::LexNumber() { const char *NumStart = CurPtr-1; if (CurPtr[-1] == '0') { if (CurPtr[0] == 'x') { ++CurPtr; NumStart = CurPtr; while (isxdigit(CurPtr[0])) ++CurPtr; // Requires at least one hex digit. if (CurPtr == NumStart) return ReturnError(CurPtr-2, "Invalid hexadecimal number"); Filelval.IntVal = strtol(NumStart, 0, 16); return INTVAL; } else if (CurPtr[0] == 'b') { ++CurPtr; NumStart = CurPtr; while (CurPtr[0] == '0' || CurPtr[0] == '1') ++CurPtr; // Requires at least one binary digit. if (CurPtr == NumStart) return ReturnError(CurPtr-2, "Invalid binary number"); Filelval.IntVal = strtol(NumStart, 0, 2); return INTVAL; } } // Check for a sign without a digit. if (CurPtr[-1] == '-' || CurPtr[-1] == '+') { if (!isdigit(CurPtr[0])) return CurPtr[-1]; } while (isdigit(CurPtr[0])) ++CurPtr; Filelval.IntVal = strtol(NumStart, 0, 10); return INTVAL; } /// LexBracket - We just read '['. If this is a code block, return it, /// otherwise return the bracket. Match: '[' and '[{ ( [^}]+ | }[^]] )* }]' int TGLexer::LexBracket() { if (CurPtr[0] != '{') return '['; ++CurPtr; const char *CodeStart = CurPtr; while (1) { int Char = getNextChar(); if (Char == EOF) break; if (Char != '}') continue; Char = getNextChar(); if (Char == EOF) break; if (Char == ']') { Filelval.StrVal = new std::string(CodeStart, CurPtr-2); return CODEFRAGMENT; } } return ReturnError(CodeStart-2, "Unterminated Code Block"); } /// LexExclaim - Lex '!' and '![a-zA-Z]+'. int TGLexer::LexExclaim() { if (!isalpha(*CurPtr)) return '!'; const char *Start = CurPtr++; while (isalpha(*CurPtr)) ++CurPtr; // Check to see which operator this is. unsigned Len = CurPtr-Start; if (Len == 3 && !memcmp(Start, "con", 3)) return CONCATTOK; if (Len == 3 && !memcmp(Start, "sra", 3)) return SRATOK; if (Len == 3 && !memcmp(Start, "srl", 3)) return SRLTOK; if (Len == 3 && !memcmp(Start, "shl", 3)) return SHLTOK; if (Len == 9 && !memcmp(Start, "strconcat", 9)) return STRCONCATTOK; return ReturnError(Start-1, "Unknown operator"); } //===----------------------------------------------------------------------===// // Interfaces used by the Bison parser. //===----------------------------------------------------------------------===// int Fileparse(); static TGLexer *TheLexer; namespace llvm { std::ostream &err() { return TheLexer->err(); } /// ParseFile - this function begins the parsing of the specified tablegen /// file. /// void ParseFile(const std::string &Filename, const std::vector &IncludeDirs) { std::string ErrorStr; MemoryBuffer *F = MemoryBuffer::getFileOrSTDIN(&Filename[0], Filename.size(), &ErrorStr); if (F == 0) { cerr << "Could not open input file '" + Filename + "': " << ErrorStr <<"\n"; exit(1); } assert(!TheLexer && "Lexer isn't reentrant yet!"); TheLexer = new TGLexer(F); // Record the location of the include directory so that the lexer can find // it later. TheLexer->setIncludeDirs(IncludeDirs); Fileparse(); // Cleanup delete TheLexer; TheLexer = 0; } } // End llvm namespace int Filelex() { assert(TheLexer && "No lexer setup yet!"); int Tok = TheLexer->LexToken(); if (Tok == YYERROR) exit(1); return Tok; }