//===--- GLRTest.cpp - Test the GLR parser ----------------------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include "clang-pseudo/GLR.h"
#include "clang-pseudo/Bracket.h"
#include "clang-pseudo/Language.h"
#include "clang-pseudo/Token.h"
#include "clang-pseudo/grammar/Grammar.h"
#include "clang/Basic/LangOptions.h"
#include "clang/Basic/TokenKinds.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/Support/FormatVariadic.h"
#include "gmock/gmock.h"
#include "gtest/gtest.h"
#include <memory>

namespace clang {
namespace pseudo {

llvm::raw_ostream &operator<<(llvm::raw_ostream &OS,
                              const std::vector<const GSS::Node *> &Heads) {
  for (const auto *Head : Heads)
    OS << *Head << "\n";
  return OS;
}

namespace {

using StateID = LRTable::StateID;
using testing::AllOf;
using testing::ElementsAre;
using testing::IsEmpty;
using testing::UnorderedElementsAre;

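// Matchers for inspecting GSS nodes: the LR state, the attached forest node
// (by identity or by symbol), and the start token index of its payload.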
MATCHER_P(state, StateID, "") { return arg->State == StateID; }
MATCHER_P(parsedSymbol, FNode, "") { return arg->Payload == FNode; }
MATCHER_P(parsedSymbolID, SID, "") { return arg->Payload->symbol() == SID; }
MATCHER_P(start, Start, "") { return arg->Payload->startTokenIndex() == Start; }

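// Matches a GSS node whose parents are exactly `Parents`, in any order.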
testing::Matcher<const GSS::Node *>
parents(llvm::ArrayRef<const GSS::Node *> Parents) {
  return testing::Property(&GSS::Node::parents,
                           testing::UnorderedElementsAreArray(Parents));
}

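// A test recovery strategy: the token before `Begin` must be a `{`; return the
// index of its matching `}` (per bracket pairing), or Token::Invalid if the
// brace is unpaired.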
Token::Index recoverBraces(Token::Index Begin, const TokenStream &Code) {
  EXPECT_GT(Begin, 0u);
  const Token &Left = Code.tokens()[Begin - 1];
  EXPECT_EQ(Left.Kind, tok::l_brace);
  if (const auto *Right = Left.pair()) {
    EXPECT_EQ(Right->Kind, tok::r_brace);
    return Code.index(*Right);
  }
  return Token::Invalid;
}

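// Test fixture: each test builds a small grammar plus an LR table (either
// hand-written via LRTable::Builder or generated with buildSLR), then drives
// glrShift/glrReduce/glrRecover/glrParse directly.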
class GLRTest : public ::testing::Test {
public:
  void build(llvm::StringRef GrammarBNF) {
    std::vector<std::string> Diags;
    TestLang.G = Grammar::parseBNF(GrammarBNF, Diags);
  }

  TokenStream emptyTokenStream() {
    TokenStream Empty;
    Empty.finalize();
    return Empty;
  }

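  // Builds a grammar from the given nonterminals and rules. A synthetic start
  // symbol `_` is added, along with a `_ := <nonterminal>` rule for each entry
  // in the (deduplicated) nonterminal list.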
  void buildGrammar(std::vector<std::string> Nonterminals,
                    std::vector<std::string> Rules) {
    Nonterminals.push_back("_");
    llvm::sort(Nonterminals);
    Nonterminals.erase(std::unique(Nonterminals.begin(), Nonterminals.end()),
                       Nonterminals.end());
    std::string FakeTestBNF;
    for (const auto &NT : Nonterminals)
      FakeTestBNF += llvm::formatv("{0} := {1}\n", "_", NT);
    FakeTestBNF += llvm::join(Rules, "\n");
    build(FakeTestBNF);
  }

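  // Looks up the SymbolID of a terminal or nonterminal by name; reports a test
  // failure if no such symbol exists.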
  SymbolID id(llvm::StringRef Name) const {
    for (unsigned I = 0; I < NumTerminals; ++I)
      if (TestLang.G.table().Terminals[I] == Name)
        return tokenSymbol(static_cast<tok::TokenKind>(I));
    for (SymbolID ID = 0; ID < TestLang.G.table().Nonterminals.size(); ++ID)
      if (TestLang.G.table().Nonterminals[ID].Name == Name)
        return ID;
    ADD_FAILURE() << "No such symbol found: " << Name;
    return 0;
  }
  ExtensionID extensionID(llvm::StringRef AttrValueName) const {
    for (ExtensionID EID = 0; EID < TestLang.G.table().AttributeValues.size();
         ++EID)
      if (TestLang.G.table().AttributeValues[EID] == AttrValueName)
        return EID;
    ADD_FAILURE() << "No such attribute value found: " << AttrValueName;
    return 0;
  }

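  // Returns the RuleID of the unique rule of a nonterminal; reports a test
  // failure if the nonterminal does not have exactly one rule.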
  RuleID ruleFor(llvm::StringRef NonterminalName) const {
    auto RuleRange =
        TestLang.G.table().Nonterminals[id(NonterminalName)].RuleRange;
    if (RuleRange.End - RuleRange.Start == 1)
      return TestLang.G.table()
          .Nonterminals[id(NonterminalName)]
          .RuleRange.Start;
    ADD_FAILURE() << "Expected a single rule for " << NonterminalName
                  << ", but it has " << RuleRange.End - RuleRange.Start
                  << " rules!\n";
    return 0;
  }

protected:
  Language TestLang;
  ForestArena Arena;
  GSS GSStack;
};

TEST_F(GLRTest, ShiftMergingHeads) {
  // Given three heads 1, 2, 3 in the GSS, where heads 1 and 2 have shift
  // actions to reach state 4, and head 3 has a shift action to reach state 5:
  //   0--1
  //   └--2
  //   └--3
  // After the shift action, the GSS (with new heads 4, 5) is:
  //   0---1---4
  //   └---2---┘
  //   └---3---5
  auto *GSSNode0 =
      GSStack.addNode(/*State=*/0, /*ForestNode=*/nullptr, /*Parents=*/{});
  auto *GSSNode1 = GSStack.addNode(/*State=*/1, /*ForestNode=*/nullptr,
                                   /*Parents=*/{GSSNode0});
  auto *GSSNode2 = GSStack.addNode(/*State=*/2, /*ForestNode=*/nullptr,
                                   /*Parents=*/{GSSNode0});
  auto *GSSNode3 = GSStack.addNode(/*State=*/3, /*ForestNode=*/nullptr,
                                   /*Parents=*/{GSSNode0});

  buildGrammar({}, {}); // Create a fake empty grammar.
  LRTable::Builder B(TestLang.G);
  B.Transition[{StateID{1}, tokenSymbol(tok::semi)}] = StateID{4};
  B.Transition[{StateID{2}, tokenSymbol(tok::semi)}] = StateID{4};
  B.Transition[{StateID{3}, tokenSymbol(tok::semi)}] = StateID{5};
  TestLang.Table = std::move(B).build();

  ForestNode &SemiTerminal = Arena.createTerminal(tok::semi, 0);
  std::vector<const GSS::Node *> NewHeads;
  glrShift({GSSNode1, GSSNode2, GSSNode3}, SemiTerminal,
           {emptyTokenStream(), Arena, GSStack}, TestLang, NewHeads);

  EXPECT_THAT(NewHeads,
              UnorderedElementsAre(AllOf(state(4), parsedSymbol(&SemiTerminal),
                                         parents({GSSNode1, GSSNode2})),
                                   AllOf(state(5), parsedSymbol(&SemiTerminal),
                                         parents({GSSNode3}))))
      << NewHeads;
}

TEST_F(GLRTest, ReduceConflictsSplitting) {
  // Before (splitting due to R/R conflict):
  //   0--1(IDENTIFIER)
  // After reducing 1 by `class-name := IDENTIFIER` and
  // `enum-name := IDENTIFIER`:
  //   0--2(class-name)  // 2 is goto(0, class-name)
  //   └--3(enum-name)   // 3 is goto(0, enum-name)
  buildGrammar({"class-name", "enum-name"},
               {"class-name := IDENTIFIER", "enum-name := IDENTIFIER"});
  LRTable::Builder B(TestLang.G);
  B.Transition[{StateID{0}, id("class-name")}] = StateID{2};
  B.Transition[{StateID{0}, id("enum-name")}] = StateID{3};
  B.Reduce[StateID{1}].insert(ruleFor("class-name"));
  B.Reduce[StateID{1}].insert(ruleFor("enum-name"));
  TestLang.Table = std::move(B).build();

  const auto *GSSNode0 =
      GSStack.addNode(/*State=*/0, /*ForestNode=*/nullptr, /*Parents=*/{});
  const auto *GSSNode1 =
      GSStack.addNode(1, &Arena.createTerminal(tok::identifier, 0), {GSSNode0});

  std::vector<const GSS::Node *> Heads = {GSSNode1};
  glrReduce(Heads, tokenSymbol(tok::eof),
            {emptyTokenStream(), Arena, GSStack}, TestLang);
  EXPECT_THAT(Heads, UnorderedElementsAre(
                         GSSNode1,
                         AllOf(state(2), parsedSymbolID(id("class-name")),
                               parents({GSSNode0})),
                         AllOf(state(3), parsedSymbolID(id("enum-name")),
                               parents({GSSNode0}))))
      << Heads;
}

TEST_F(GLRTest, ReduceSplittingDueToMultipleBases) {
  // Before (splitting due to multiple bases):
  //   2(class-name)--4(*)
  //   3(enum-name)---┘
  // After reducing 4 by `ptr-operator := *`:
  //   2(class-name)--5(ptr-operator)  // 5 is goto(2, ptr-operator)
  //   3(enum-name)---6(ptr-operator)  // 6 is goto(3, ptr-operator)
  buildGrammar({"ptr-operator", "class-name", "enum-name"},
               {"ptr-operator := *"});

  auto *ClassNameNode = &Arena.createOpaque(id("class-name"), /*TokenIndex=*/0);
  auto *EnumNameNode = &Arena.createOpaque(id("enum-name"), /*TokenIndex=*/0);

  const auto *GSSNode2 =
      GSStack.addNode(/*State=*/2, /*ForestNode=*/ClassNameNode, /*Parents=*/{});
  const auto *GSSNode3 =
      GSStack.addNode(/*State=*/3, /*ForestNode=*/EnumNameNode, /*Parents=*/{});
  const auto *GSSNode4 = GSStack.addNode(
      /*State=*/4, &Arena.createTerminal(tok::star, /*TokenIndex=*/1),
      /*Parents=*/{GSSNode2, GSSNode3});

  LRTable::Builder B(TestLang.G);
  B.Transition[{StateID{2}, id("ptr-operator")}] = StateID{5};
  B.Transition[{StateID{3}, id("ptr-operator")}] = StateID{6};
  B.Reduce[StateID{4}].insert(ruleFor("ptr-operator"));
  TestLang.Table = std::move(B).build();

  std::vector<const GSS::Node *> Heads = {GSSNode4};
  glrReduce(Heads, tokenSymbol(tok::eof), {emptyTokenStream(), Arena, GSStack},
            TestLang);

  EXPECT_THAT(Heads, UnorderedElementsAre(
                         GSSNode4,
                         AllOf(state(5), parsedSymbolID(id("ptr-operator")),
                               parents({GSSNode2})),
                         AllOf(state(6), parsedSymbolID(id("ptr-operator")),
                               parents({GSSNode3}))))
      << Heads;
  // Verify that the payload of the two new heads is shared; only a single
  // ptr-operator node is created in the forest.
  EXPECT_EQ(Heads[1]->Payload, Heads[2]->Payload);
}

TEST_F(GLRTest, ReduceJoiningWithMultipleBases) {
  // Before (joining due to same goto state, multiple bases):
  //   0--1(cv-qualifier)--3(class-name)
  //   └--2(cv-qualifier)--4(enum-name)
  // After reducing 3 by `type-name := class-name` and
  // 4 by `type-name := enum-name`:
  //   0--1(cv-qualifier)--5(type-name)  // 5 is goto(1, type-name) and
  //   └--2(cv-qualifier)--┘             // goto(2, type-name)
  buildGrammar({"type-name", "class-name", "enum-name", "cv-qualifier"},
               {"type-name := class-name", "type-name := enum-name"});

  auto *CVQualifierNode =
      &Arena.createOpaque(id("cv-qualifier"), /*TokenIndex=*/0);
  auto *ClassNameNode = &Arena.createOpaque(id("class-name"), /*TokenIndex=*/1);
  auto *EnumNameNode = &Arena.createOpaque(id("enum-name"), /*TokenIndex=*/1);

  const auto *GSSNode0 =
      GSStack.addNode(/*State=*/0, /*ForestNode=*/nullptr, /*Parents=*/{});
  const auto *GSSNode1 = GSStack.addNode(
      /*State=*/1, /*ForestNode=*/CVQualifierNode, /*Parents=*/{GSSNode0});
  const auto *GSSNode2 = GSStack.addNode(
      /*State=*/2, /*ForestNode=*/CVQualifierNode, /*Parents=*/{GSSNode0});
  const auto *GSSNode3 = GSStack.addNode(
      /*State=*/3, /*ForestNode=*/ClassNameNode,
      /*Parents=*/{GSSNode1});
  const auto *GSSNode4 =
      GSStack.addNode(/*State=*/4, /*ForestNode=*/EnumNameNode,
                      /*Parents=*/{GSSNode2});

  // FIXME: figure out a way to get rid of the hard-coded reduce RuleID!
  LRTable::Builder B(TestLang.G);
  B.Transition[{StateID{1}, id("type-name")}] = StateID{5};
  B.Transition[{StateID{2}, id("type-name")}] = StateID{5};
  B.Reduce[StateID{3}].insert(/* type-name := class-name */ RuleID{0});
  B.Reduce[StateID{4}].insert(/* type-name := enum-name */ RuleID{1});
  TestLang.Table = std::move(B).build();

  std::vector<const GSS::Node *> Heads = {GSSNode3, GSSNode4};
  glrReduce(Heads, tokenSymbol(tok::eof), {emptyTokenStream(), Arena, GSStack},
            TestLang);

  // Verify that the stack heads are joined at state 5 after the reduces.
  EXPECT_THAT(Heads, UnorderedElementsAre(GSSNode3, GSSNode4,
                                          AllOf(state(5),
                                                parsedSymbolID(id("type-name")),
                                                parents({GSSNode1, GSSNode2}))))
      << Heads;
  // Verify that we create an ambiguous ForestNode for the two parses of
  // `type-name`.
  EXPECT_EQ(Heads.back()->Payload->dumpRecursive(TestLang.G),
            "[ 1, end) type-name := <ambiguous>\n"
            "[ 1, end) ├─type-name := class-name\n"
            "[ 1, end) │ └─class-name := <opaque>\n"
            "[ 1, end) └─type-name := enum-name\n"
            "[ 1, end) └─enum-name := <opaque>\n");
}

TEST_F(GLRTest, ReduceJoiningWithSameBase) {
  // Before (joining due to same goto state, the same base):
  //   0--1(class-name)--3(*)
  //   └--2(enum-name)--4(*)
  // After reducing 3 by `pointer := class-name *` and
  // 4 by `pointer := enum-name *`:
  //   0--5(pointer)  // 5 is goto(0, pointer)
  buildGrammar({"pointer", "class-name", "enum-name"},
               {"pointer := class-name *", "pointer := enum-name *"});

  auto *ClassNameNode = &Arena.createOpaque(id("class-name"), /*TokenIndex=*/0);
  auto *EnumNameNode = &Arena.createOpaque(id("enum-name"), /*TokenIndex=*/0);
  auto *StartTerminal = &Arena.createTerminal(tok::star, /*TokenIndex=*/1);

  const auto *GSSNode0 =
      GSStack.addNode(/*State=*/0, /*ForestNode=*/nullptr, /*Parents=*/{});
  const auto *GSSNode1 =
      GSStack.addNode(/*State=*/1, /*ForestNode=*/ClassNameNode,
                      /*Parents=*/{GSSNode0});
  const auto *GSSNode2 =
      GSStack.addNode(/*State=*/2, /*ForestNode=*/EnumNameNode,
                      /*Parents=*/{GSSNode0});
  const auto *GSSNode3 =
      GSStack.addNode(/*State=*/3, /*ForestNode=*/StartTerminal,
                      /*Parents=*/{GSSNode1});
  const auto *GSSNode4 =
      GSStack.addNode(/*State=*/4, /*ForestNode=*/StartTerminal,
                      /*Parents=*/{GSSNode2});

  // FIXME: figure out a way to get rid of the hard-coded reduce RuleID!
  LRTable::Builder B(TestLang.G);
  B.Transition[{StateID{0}, id("pointer")}] = StateID{5};
  B.Reduce[StateID{3}].insert(/* pointer := class-name */ RuleID{0});
  B.Reduce[StateID{4}].insert(/* pointer := enum-name */ RuleID{1});
  TestLang.Table = std::move(B).build();

  std::vector<const GSS::Node *> Heads = {GSSNode3, GSSNode4};
  glrReduce(Heads, tokenSymbol(tok::eof),
            {emptyTokenStream(), Arena, GSStack}, TestLang);

  EXPECT_THAT(
      Heads, UnorderedElementsAre(GSSNode3, GSSNode4,
                                  AllOf(state(5), parsedSymbolID(id("pointer")),
                                        parents({GSSNode0}))))
      << Heads;
  EXPECT_EQ(Heads.back()->Payload->dumpRecursive(TestLang.G),
            "[ 0, end) pointer := <ambiguous>\n"
            "[ 0, end) ├─pointer := class-name *\n"
            "[ 0, 1) │ ├─class-name := <opaque>\n"
            "[ 1, end) │ └─* := tok[1]\n"
            "[ 0, end) └─pointer := enum-name *\n"
            "[ 0, 1) ├─enum-name := <opaque>\n"
            "[ 1, end) └─* := tok[1]\n");
}

TEST_F(GLRTest, ReduceLookahead) {
  // A term can be followed by +, but not by -.
  buildGrammar({"sum", "term"}, {"sum := term + term", "term := IDENTIFIER"});
  LRTable::Builder B(TestLang.G);
  B.Transition[{StateID{0}, id("term")}] = StateID{2};
  B.Reduce[StateID{1}].insert(RuleID{0});
  TestLang.Table = std::move(B).build();

  auto *Identifier = &Arena.createTerminal(tok::identifier, /*Start=*/0);

  const auto *Root =
      GSStack.addNode(/*State=*/0, /*ForestNode=*/nullptr, /*Parents=*/{});
  const auto *GSSNode1 =
      GSStack.addNode(/*State=*/1, /*ForestNode=*/Identifier, {Root});

  // When the lookahead is +, reduce is performed.
  std::vector<const GSS::Node *> Heads = {GSSNode1};
  glrReduce(Heads, tokenSymbol(tok::plus), {emptyTokenStream(), Arena, GSStack},
            TestLang);
  EXPECT_THAT(Heads,
              ElementsAre(GSSNode1, AllOf(state(2), parsedSymbolID(id("term")),
                                          parents(Root))));

  // When the lookahead is -, reduce is not performed.
  Heads = {GSSNode1};
  glrReduce(Heads, tokenSymbol(tok::minus),
            {emptyTokenStream(), Arena, GSStack}, TestLang);
  EXPECT_THAT(Heads, ElementsAre(GSSNode1));
}

TEST_F(GLRTest, Recover) {
  // Recovery while parsing "word" inside braces.
  // Before:
  //   0--1({)--2(?)
  // After recovering a `word` at state 1:
  //   0--3(word)  // 3 is goto(1, word)
  buildGrammar({"word", "top"}, {"top := { word [recover=Braces] }"});
  LRTable::Builder B(TestLang.G);
  B.Transition[{StateID{1}, id("word")}] = StateID{3};
  B.Recoveries.push_back({StateID{1}, {extensionID("Braces"), id("word")}});
  TestLang.Table = std::move(B).build();
  TestLang.RecoveryStrategies.try_emplace(extensionID("Braces"), recoverBraces);

  auto *LBrace = &Arena.createTerminal(tok::l_brace, 0);
  auto *Question1 = &Arena.createTerminal(tok::question, 1);
  const auto *Root = GSStack.addNode(0, nullptr, {});
  const auto *OpenedBraces = GSStack.addNode(1, LBrace, {Root});
  const auto *AfterQuestion1 = GSStack.addNode(2, Question1, {OpenedBraces});

  // Need a token stream with paired braces so the strategy works.
  clang::LangOptions LOptions;
  TokenStream Tokens = cook(lex("{ ? ? ? }", LOptions), LOptions);
  pairBrackets(Tokens);
  std::vector<const GSS::Node *> NewHeads;

  unsigned TokenIndex = 2;
  glrRecover({AfterQuestion1}, TokenIndex, {Tokens, Arena, GSStack}, TestLang,
             NewHeads);
  EXPECT_EQ(TokenIndex, 4u) << "should skip ahead to matching brace";
  EXPECT_THAT(NewHeads, ElementsAre(AllOf(state(3), parsedSymbolID(id("word")),
                                          parents({OpenedBraces}), start(1u))));
  EXPECT_EQ(NewHeads.front()->Payload->kind(), ForestNode::Opaque);

  // Test recovery failure: omit the closing brace so the strategy fails.
  TokenStream NoRBrace = cook(lex("{ ? ? ? ?", LOptions), LOptions);
  pairBrackets(NoRBrace);
  NewHeads.clear();
  TokenIndex = 2;
  glrRecover({AfterQuestion1}, TokenIndex, {NoRBrace, Arena, GSStack}, TestLang,
             NewHeads);
  EXPECT_EQ(TokenIndex, 2u) << "should not advance on failure";
  EXPECT_THAT(NewHeads, IsEmpty());
}

TEST_F(GLRTest, RecoverRightmost) {
  // In a nested block structure, we recover at the innermost possible block.
  // Before:
  //   0--1({)--1({)--1({)
  // After recovering a `body` inside the second braces:
  //   0--1({)--2(body)  // 2 is goto(1, body)
  buildGrammar({"body", "top"}, {"top := { body [recover=Braces] }"});
  LRTable::Builder B(TestLang.G);
  B.Transition[{StateID{1}, id("body")}] = StateID{2};
  B.Recoveries.push_back({StateID{1}, {extensionID("Braces"), id("body")}});
  TestLang.Table = std::move(B).build();
  TestLang.RecoveryStrategies.try_emplace(extensionID("Braces"), recoverBraces);

  clang::LangOptions LOptions;
  // Innermost brace is unmatched, to test fallback to the next brace.
  TokenStream Tokens = cook(lex("{ { { ? } }", LOptions), LOptions);
  Tokens.tokens()[0].Pair = 5;
  Tokens.tokens()[1].Pair = 4;
  Tokens.tokens()[4].Pair = 1;
  Tokens.tokens()[5].Pair = 0;

  auto *Brace1 = &Arena.createTerminal(tok::l_brace, 0);
  auto *Brace2 = &Arena.createTerminal(tok::l_brace, 1);
  auto *Brace3 = &Arena.createTerminal(tok::l_brace, 2);
  const auto *Root = GSStack.addNode(0, nullptr, {});
  const auto *In1 = GSStack.addNode(1, Brace1, {Root});
  const auto *In2 = GSStack.addNode(1, Brace2, {In1});
  const auto *In3 = GSStack.addNode(1, Brace3, {In2});

  unsigned TokenIndex = 3;
  std::vector<const GSS::Node *> NewHeads;
  glrRecover({In3}, TokenIndex, {Tokens, Arena, GSStack}, TestLang, NewHeads);
  EXPECT_EQ(TokenIndex, 5u);
  EXPECT_THAT(NewHeads, ElementsAre(AllOf(state(2), parsedSymbolID(id("body")),
                                          parents({In2}), start(2u))));
}

TEST_F(GLRTest, RecoverAlternatives) {
  // Recovery inside braces with multiple equally good options.
  // Before:
  //   0--1({)
  // After recovering either `word` or `number` inside the braces:
  //   0--1({)--2(word)  // 2 is goto(1, word)
  //   └--3(number)      // 3 is goto(1, number)
  buildGrammar({"number", "word", "top"},
               {
                   "top := { number [recover=Braces] }",
                   "top := { word [recover=Braces] }",
               });
  LRTable::Builder B(TestLang.G);
  B.Transition[{StateID{1}, id("number")}] = StateID{2};
  B.Transition[{StateID{1}, id("word")}] = StateID{3};
  B.Recoveries.push_back({StateID{1}, {extensionID("Braces"), id("number")}});
  B.Recoveries.push_back({StateID{1}, {extensionID("Braces"), id("word")}});
  TestLang.RecoveryStrategies.try_emplace(extensionID("Braces"), recoverBraces);
  TestLang.Table = std::move(B).build();
  auto *LBrace = &Arena.createTerminal(tok::l_brace, 0);
  const auto *Root = GSStack.addNode(0, nullptr, {});
  const auto *OpenedBraces = GSStack.addNode(1, LBrace, {Root});

  clang::LangOptions LOptions;
  TokenStream Tokens = cook(lex("{ ? }", LOptions), LOptions);
  pairBrackets(Tokens);
  std::vector<const GSS::Node *> NewHeads;
  unsigned TokenIndex = 1;

  glrRecover({OpenedBraces}, TokenIndex, {Tokens, Arena, GSStack}, TestLang,
             NewHeads);
  EXPECT_EQ(TokenIndex, 2u);
  EXPECT_THAT(NewHeads,
              UnorderedElementsAre(AllOf(state(2), parsedSymbolID(id("number")),
                                         parents({OpenedBraces}), start(1u)),
                                   AllOf(state(3), parsedSymbolID(id("word")),
                                         parents({OpenedBraces}), start(1u))));
}

TEST_F(GLRTest, PerfectForestNodeSharing) {
  // Run the GLR on a simple grammar and test that we build exactly one forest
  // node per (SymbolID, token range).

  // This is a grammar where the original parsing-stack-based forest node
  // sharing approach will fail. In its LR0 graph, it has two states containing
  // item `expr := • IDENTIFIER`, and both have different goto states on the
  // nonterminal `expr`.
  build(R"bnf(
    _ := test EOF

    test := { expr
    test := { IDENTIFIER
    test := left-paren expr
    left-paren := {
    expr := IDENTIFIER
  )bnf");
  TestLang.Table = LRTable::buildSLR(TestLang.G);
  clang::LangOptions LOptions;
  const TokenStream &Tokens = cook(lex("{ abc", LOptions), LOptions);

  const ForestNode &Parsed =
      glrParse({Tokens, Arena, GSStack}, id("test"), TestLang);
  // Verify that there is no duplicated sequence node of `expr := IDENTIFIER`
  // in the forest; see the `#1` and `=#1` markers in the dump string.
  EXPECT_EQ(Parsed.dumpRecursive(TestLang.G),
            "[ 0, end) test := <ambiguous>\n"
            "[ 0, end) ├─test := { expr\n"
            "[ 0, 1) │ ├─{ := tok[0]\n"
            "[ 1, end) │ └─expr := IDENTIFIER #1\n"
            "[ 1, end) │ └─IDENTIFIER := tok[1]\n"
            "[ 0, end) ├─test := { IDENTIFIER\n"
            "[ 0, 1) │ ├─{ := tok[0]\n"
            "[ 1, end) │ └─IDENTIFIER := tok[1]\n"
            "[ 0, end) └─test := left-paren expr\n"
            "[ 0, 1) ├─left-paren := {\n"
            "[ 0, 1) │ └─{ := tok[0]\n"
            "[ 1, end) └─expr =#1\n");
}

TEST_F(GLRTest, GLRReduceOrder) {
  // Given the following grammar and the input `IDENTIFIER`, reductions should
  // be performed in the following order:
  //   1. foo := IDENTIFIER
  //   2. { test := IDENTIFIER, test := foo }
  // foo should be reduced first, so that in step 2 both reduces for test are
  // complete and form a single ambiguous forest node.
  build(R"bnf(
    _ := test EOF

    test := IDENTIFIER
    test := foo
    foo := IDENTIFIER
  )bnf");
  clang::LangOptions LOptions;
  const TokenStream &Tokens = cook(lex("IDENTIFIER", LOptions), LOptions);
  TestLang.Table = LRTable::buildSLR(TestLang.G);

  const ForestNode &Parsed =
      glrParse({Tokens, Arena, GSStack}, id("test"), TestLang);
  EXPECT_EQ(Parsed.dumpRecursive(TestLang.G),
            "[ 0, end) test := <ambiguous>\n"
            "[ 0, end) ├─test := IDENTIFIER\n"
            "[ 0, end) │ └─IDENTIFIER := tok[0]\n"
            "[ 0, end) └─test := foo\n"
            "[ 0, end) └─foo := IDENTIFIER\n"
            "[ 0, end) └─IDENTIFIER := tok[0]\n");
}

TEST_F(GLRTest, RecoveryEndToEnd) {
  // Simple example of brace-based recovery showing:
  //  - recovered region includes tokens both ahead of and behind the cursor
  //  - multiple possible recovery rules
  //  - recovery from outer scopes is rejected
  build(R"bnf(
    _ := block EOF

    block := { block [recover=Braces] }
    block := { numbers [recover=Braces] }
    numbers := NUMERIC_CONSTANT NUMERIC_CONSTANT
  )bnf");
  TestLang.Table = LRTable::buildSLR(TestLang.G);
  TestLang.RecoveryStrategies.try_emplace(extensionID("Braces"), recoverBraces);
  clang::LangOptions LOptions;
  TokenStream Tokens = cook(lex("{ { 42 ? } }", LOptions), LOptions);
  pairBrackets(Tokens);

  const ForestNode &Parsed =
      glrParse({Tokens, Arena, GSStack}, id("block"), TestLang);
  EXPECT_EQ(Parsed.dumpRecursive(TestLang.G),
            "[ 0, end) block := { block [recover=Braces] }\n"
            "[ 0, 1) ├─{ := tok[0]\n"
            "[ 1, 5) ├─block := <ambiguous>\n"
            "[ 1, 5) │ ├─block := { block [recover=Braces] }\n"
            "[ 1, 2) │ │ ├─{ := tok[1]\n"
            "[ 2, 4) │ │ ├─block := <opaque>\n"
            "[ 4, 5) │ │ └─} := tok[4]\n"
            "[ 1, 5) │ └─block := { numbers [recover=Braces] }\n"
            "[ 1, 2) │ ├─{ := tok[1]\n"
            "[ 2, 4) │ ├─numbers := <opaque>\n"
            "[ 4, 5) │ └─} := tok[4]\n"
            "[ 5, end) └─} := tok[5]\n");
}

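// Recovery can also produce a terminal: the trailing `;` is missing from the
// input and is recovered as an opaque forest node.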
TEST_F(GLRTest, RecoverTerminal) {
  build(R"bnf(
    _ := stmt EOF

    stmt := IDENTIFIER ; [recover=Skip]
  )bnf");
  TestLang.Table = LRTable::buildSLR(TestLang.G);
  TestLang.RecoveryStrategies.try_emplace(
      extensionID("Skip"),
      [](Token::Index Start, const TokenStream &) { return Start; });
  clang::LangOptions LOptions;
  TokenStream Tokens = cook(lex("foo", LOptions), LOptions);

  const ForestNode &Parsed =
      glrParse({Tokens, Arena, GSStack}, id("stmt"), TestLang);
  EXPECT_EQ(Parsed.dumpRecursive(TestLang.G),
            "[ 0, end) stmt := IDENTIFIER ; [recover=Skip]\n"
            "[ 0, 1) ├─IDENTIFIER := tok[0]\n"
            "[ 1, end) └─; := <opaque>\n");
}

TEST_F(GLRTest, RecoverUnrestrictedReduce) {
  // Here, ! is not in any rule and therefore not in the follow set of `word`.
  // We would not normally reduce `word := IDENTIFIER`, but do so for recovery.

  build(R"bnf(
    _ := sentence EOF

    word := IDENTIFIER
    sentence := word word [recover=AcceptAnyTokenInstead]
  )bnf");

  clang::LangOptions LOptions;
  const TokenStream &Tokens = cook(lex("id !", LOptions), LOptions);
  TestLang.Table = LRTable::buildSLR(TestLang.G);
  TestLang.RecoveryStrategies.try_emplace(
      extensionID("AcceptAnyTokenInstead"),
      [](Token::Index Start, const TokenStream &Stream) { return Start + 1; });

  const ForestNode &Parsed =
      glrParse({Tokens, Arena, GSStack}, id("sentence"), TestLang);
  EXPECT_EQ(Parsed.dumpRecursive(TestLang.G),
            "[ 0, end) sentence := word word [recover=AcceptAnyTokenInstead]\n"
            "[ 0, 1) ├─word := IDENTIFIER\n"
            "[ 0, 1) │ └─IDENTIFIER := tok[0]\n"
            "[ 1, end) └─word := <opaque>\n");
}

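// Recovery can fire before any token has been consumed: with input `?`, the
// Fallback strategy swallows the whole stream and `start` becomes opaque.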
TEST_F(GLRTest, RecoveryFromStartOfInput) {
  build(R"bnf(
    _ := start [recover=Fallback] EOF

    start := IDENTIFIER
  )bnf");
  TestLang.Table = LRTable::buildSLR(TestLang.G);
  bool fallback_recovered = false;
  auto fallback = [&](Token::Index Start, const TokenStream &Code) {
    fallback_recovered = true;
    return Code.tokens().size();
  };
  TestLang.RecoveryStrategies.try_emplace(extensionID("Fallback"), fallback);
  clang::LangOptions LOptions;
  TokenStream Tokens = cook(lex("?", LOptions), LOptions);

  const ForestNode &Parsed =
      glrParse({Tokens, Arena, GSStack}, id("start"), TestLang);
  EXPECT_TRUE(fallback_recovered);
  EXPECT_EQ(Parsed.dumpRecursive(TestLang.G),
            "[ 0, end) start := <opaque>\n");
}

TEST_F(GLRTest, RepeatedRecovery) {
  // We require multiple steps of recovery at eof and then a reduction in order
  // to successfully parse.
  build(R"bnf(
    _ := function EOF
    # FIXME: this forces EOF to be in follow(signature).
    # Remove it once we use unconstrained reduction for recovery.
    _ := signature EOF

    function := signature body [recover=Skip]
    signature := IDENTIFIER params [recover=Skip]
    params := ( )
    body := { }
  )bnf");
  TestLang.Table = LRTable::buildSLR(TestLang.G);
  TestLang.RecoveryStrategies.try_emplace(
      extensionID("Skip"),
      [](Token::Index Start, const TokenStream &) { return Start; });
  clang::LangOptions LOptions;
  TokenStream Tokens = cook(lex("main", LOptions), LOptions);

  const ForestNode &Parsed =
      glrParse({Tokens, Arena, GSStack}, id("function"), TestLang);
  EXPECT_EQ(Parsed.dumpRecursive(TestLang.G),
            "[ 0, end) function := signature body [recover=Skip]\n"
            "[ 0, 1) ├─signature := IDENTIFIER params [recover=Skip]\n"
            "[ 0, 1) │ ├─IDENTIFIER := tok[0]\n"
            "[ 1, 1) │ └─params := <opaque>\n"
            "[ 1, end) └─body := <opaque>\n");
}

TEST_F(GLRTest, NoExplicitAccept) {
  build(R"bnf(
    _ := test EOF

    test := IDENTIFIER test
    test := IDENTIFIER
  )bnf");
  clang::LangOptions LOptions;
  // Given the following input and the grammar above, we perform two reductions
  // of the nonterminal `test` when the next token is `eof`; verify that the
  // parser stops at the right state.
  const TokenStream &Tokens = cook(lex("id id", LOptions), LOptions);
  TestLang.Table = LRTable::buildSLR(TestLang.G);

  const ForestNode &Parsed =
      glrParse({Tokens, Arena, GSStack}, id("test"), TestLang);
  EXPECT_EQ(Parsed.dumpRecursive(TestLang.G),
            "[ 0, end) test := IDENTIFIER test\n"
            "[ 0, 1) ├─IDENTIFIER := tok[0]\n"
            "[ 1, end) └─test := IDENTIFIER\n"
            "[ 1, end) └─IDENTIFIER := tok[1]\n");
}

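// A [guard] extension lets a callback veto a reduction: `start := IDENTIFIER`
// is accepted only when the identifier's text is "test".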
TEST_F(GLRTest, GuardExtension) {
  build(R"bnf(
    _ := start EOF

    start := IDENTIFIER [guard]
  )bnf");
  TestLang.Guards.try_emplace(
      ruleFor("start"), [&](const GuardParams &P) {
        assert(P.RHS.size() == 1 &&
               P.RHS.front()->symbol() ==
                   tokenSymbol(clang::tok::identifier));
        return P.Tokens.tokens()[P.RHS.front()->startTokenIndex()]
                   .text() == "test";
      });
  clang::LangOptions LOptions;
  TestLang.Table = LRTable::buildSLR(TestLang.G);

  std::string Input = "test";
  const TokenStream &Succeeded = cook(lex(Input, LOptions), LOptions);
  EXPECT_EQ(glrParse({Succeeded, Arena, GSStack}, id("start"), TestLang)
                .dumpRecursive(TestLang.G),
            "[ 0, end) start := IDENTIFIER [guard]\n"
            "[ 0, end) └─IDENTIFIER := tok[0]\n");

  Input = "notest";
  const TokenStream &Failed = cook(lex(Input, LOptions), LOptions);
  EXPECT_EQ(glrParse({Failed, Arena, GSStack}, id("start"), TestLang)
                .dumpRecursive(TestLang.G),
            "[ 0, end) start := <opaque>\n");
}

TEST(GSSTest, GC) {
  //      ┌-A-┬-AB
  //      ├-B-┘
  // Root-+-C
  //      ├-D
  //      └-E
  GSS GSStack;
  auto *Root = GSStack.addNode(0, nullptr, {});
  auto *A = GSStack.addNode(0, nullptr, {Root});
  auto *B = GSStack.addNode(0, nullptr, {Root});
  auto *C = GSStack.addNode(0, nullptr, {Root});
  auto *D = GSStack.addNode(0, nullptr, {Root});
  auto *AB = GSStack.addNode(0, nullptr, {A, B});

  EXPECT_EQ(1u, GSStack.gc({AB, C})) << "D is destroyed";
  EXPECT_EQ(0u, GSStack.gc({AB, C})) << "D is already gone";
  auto *E = GSStack.addNode(0, nullptr, {Root});
  EXPECT_EQ(D, E) << "Storage of GCed node D is reused for E";
  EXPECT_EQ(3u, GSStack.gc({A, E})) << "Destroys B, AB, C";
  EXPECT_EQ(1u, GSStack.gc({E})) << "Destroys A";
}

} // namespace
} // namespace pseudo
} // namespace clang