From b82068f6b8bf3d5da2e1e30474bf50086620d23c Mon Sep 17 00:00:00 2001 From: Jan Travnicek <Jan.Travnicek@fit.cvut.cz> Date: Mon, 16 Jun 2014 22:56:43 +0200 Subject: [PATCH] regexp string parser and composer in hierarchy --- alib2/src/regexp/RegExpFromStringLexer.cpp | 84 ++------ alib2/src/regexp/RegExpFromStringLexer.h | 7 +- alib2/src/regexp/RegExpFromStringParser.cpp | 218 ++++++++++++-------- alib2/src/regexp/RegExpFromStringParser.h | 11 +- alib2/src/regexp/RegExpToStringComposer.cpp | 62 +++--- alib2/src/regexp/RegExpToStringComposer.h | 7 +- alib2/test-src/regexp/RegExpTest.cpp | 58 +++++- 7 files changed, 238 insertions(+), 209 deletions(-) diff --git a/alib2/src/regexp/RegExpFromStringLexer.cpp b/alib2/src/regexp/RegExpFromStringLexer.cpp index 9f11b08279..debd44b1a2 100644 --- a/alib2/src/regexp/RegExpFromStringLexer.cpp +++ b/alib2/src/regexp/RegExpFromStringLexer.cpp @@ -2,118 +2,72 @@ namespace regexp { -RegExpFromStringLexer::RegExpFromStringLexer(const std::string& in) : m_In(in) { - this->next(); +RegExpFromStringLexer::RegExpFromStringLexer(std::stringstream& in) : m_In(in) { + m_Current.type = TokenType::ERROR; + m_Current.value = ""; } RegExpFromStringLexer& RegExpFromStringLexer::next() { char character; m_Current.value = ""; + std::streampos pos = m_In.tellg(); L0: character = m_In.get(); if(m_In.eof()) { + m_In.seekg(pos); m_Current.type = TokenType::TEOF; return *this; } else if(character == ' ' || character == '\n' || character == '\t') { goto L0; - } else if(character == '"') { - goto L3; - } else if((character >= 'a' && character <= 'z') || (character >= 'A' && character <= 'Z') || (character >= '0' && character <= '9')) { - m_Current.type = TokenType::SYMBOL; - m_Current.value += character; - goto L2; } else if(character == '(') { m_Current.type = TokenType::LPAR; + m_Current.value += character; return *this; } else if(character == ')') { m_Current.type = TokenType::RPAR; + m_Current.value += character; return *this; } else if(character == '+') { m_Current.type = TokenType::PLUS; + m_Current.value += character; return *this; } else if(character == '*') { m_Current.type = TokenType::STAR; + m_Current.value += character; return *this; - } else if(character == '\\') { + } else if(character == '#') { + m_Current.value += character; goto L1; } else { - m_In.unget(); + m_In.seekg(pos); m_Current.type = TokenType::ERROR; return *this; } L1: character = m_In.get(); if(m_In.eof()) { + m_In.seekg(pos); m_Current.type = TokenType::ERROR; return *this; - } else if(character == 'e') { + } else if(character == 'E') { + m_Current.value += character; m_Current.type = TokenType::EPS; return *this; } else if(character == '0') { - m_Current.type = TokenType::EMPTY; - return *this; - } else { - m_In.unget(); - m_Current.type = TokenType::ERROR; - return *this; - } -L2: - character = m_In.get(); - if(m_In.eof()) { - return *this; - } else if((character >= 'a' && character <= 'z') || (character >= 'A' && character <= 'Z') || (character >= '0' && character <= '9')) { m_Current.value += character; - goto L2; - } else { - m_In.unget(); - return *this; - } -L3: - character = m_In.get(); - if(m_In.eof()) { - m_Current.type = TokenType::ERROR; - return *this; - } else if(character == '"') { - m_Current.type = TokenType::EPS; + m_Current.type = TokenType::EMPTY; return *this; - } else if(character == '\\') { - m_Current.type = TokenType::SYMBOL; - goto L5; } else { - m_Current.type = TokenType::SYMBOL; - m_Current.value += character; - goto L4; - } -L4: - character = m_In.get(); - if(m_In.eof()) { + m_In.seekg(pos); m_Current.type = TokenType::ERROR; return *this; - } else if(character == '"') { - return *this; - } else if(character == '\\') { - goto L5; - } else { - m_Current.value += character; - goto L4; } -L5: - character = m_In.get(); - if(m_In.eof()) { - m_Current.type = TokenType::ERROR; - return *this; - } else if(character == '"' || character == '\\') { - m_Current.value += character; - goto L4; - } else { - m_Current.type = TokenType::ERROR; - return *this; - } } RegExpFromStringLexer::Token RegExpFromStringLexer::token() { return m_Current; } -} +} /* namespace regexp */ + diff --git a/alib2/src/regexp/RegExpFromStringLexer.h b/alib2/src/regexp/RegExpFromStringLexer.h index 4b41808b89..411f95ac8c 100644 --- a/alib2/src/regexp/RegExpFromStringLexer.h +++ b/alib2/src/regexp/RegExpFromStringLexer.h @@ -9,7 +9,6 @@ namespace regexp { class RegExpFromStringLexer { public: enum class TokenType { - SYMBOL, LPAR, RPAR, PLUS, @@ -25,11 +24,11 @@ public: std::string value; }; private: - std::stringstream m_In; + std::stringstream& m_In; Token m_Current; public: - - RegExpFromStringLexer(const std::string&); + + RegExpFromStringLexer(std::stringstream&); RegExpFromStringLexer& next(); Token token(); }; diff --git a/alib2/src/regexp/RegExpFromStringParser.cpp b/alib2/src/regexp/RegExpFromStringParser.cpp index 2d5f79ea72..4f4d4faedd 100644 --- a/alib2/src/regexp/RegExpFromStringParser.cpp +++ b/alib2/src/regexp/RegExpFromStringParser.cpp @@ -4,124 +4,162 @@ namespace regexp { -RegExpFromStringParser::RegExpFromStringParser(const std::string& lexer) : m_Lexer(lexer) { +RegExpFromStringParser::RegExpFromStringParser(std::stringstream& input) : m_RegexpLexer(input), m_SymbolParser(input) { } -RegExp RegExpFromStringParser::parse() { - RegExpElement* element = this->alternation(); - RegExpFromStringLexer::Token token = m_Lexer.token(); - if(token.type != RegExpFromStringLexer::TokenType::TEOF) throw alib::AlibException(); - RegExp regexp = RegExp(std::move(*element)); - delete element; - return regexp; +RegExp* RegExpFromStringParser::parse() { + m_RegexpLexer.next(); + RegExpElement* element = this->alternation(); + if(element != NULL) { + RegExp* regexp = new RegExp(std::move(*element)); + delete element; + return regexp; + } else { + return NULL; + } } +RegExp RegExpFromStringParser::parseRegexp() { + RegExp* regexp = parse(); + + RegExpFromStringLexer::Token token = m_RegexpLexer.token(); + if(token.type == RegExpFromStringLexer::TokenType::TEOF && regexp != NULL) { + return *regexp; + } else { + throw alib::AlibException(); + } +} + + RegExpElement* RegExpFromStringParser::alternation() { - return this->alternationCont(this->concatenation()); + RegExpElement* concatenation = this->concatenation(); + if(concatenation != NULL) { + return this->alternationCont(concatenation); + } else { + return NULL; + } } RegExpElement* RegExpFromStringParser::alternationCont(RegExpElement* left) { - RegExpFromStringLexer::Token token = m_Lexer.token(); - if(token.type == RegExpFromStringLexer::TokenType::PLUS) { - m_Lexer.next(); - - RegExpElement* right = this->concatenation(); - Alternation* res = new Alternation(std::move(*left), std::move(*right)); - - delete left; - delete right; - - return this->alternationContCont(res); - } else { - return left; - } + RegExpFromStringLexer::Token token = m_RegexpLexer.token(); + if(token.type == RegExpFromStringLexer::TokenType::PLUS) { + m_RegexpLexer.next(); + + RegExpElement* right = this->concatenation(); + if(right != NULL) { + Alternation* res = new Alternation(std::move(*left), std::move(*right)); + + delete left; + delete right; + return this->alternationContCont(res); + } else { + delete left; + return NULL; + } + } else { + return left; + } } RegExpElement* RegExpFromStringParser::alternationContCont(Alternation* res) { - RegExpFromStringLexer::Token token = m_Lexer.token(); - if(token.type == RegExpFromStringLexer::TokenType::PLUS) { - m_Lexer.next(); - - RegExpElement* next = this->concatenation(); - res->appendElement(std::move(*next)); - delete next; - - return this->alternationContCont(res); - } else { - return res; - } + RegExpFromStringLexer::Token token = m_RegexpLexer.token(); + if(token.type == RegExpFromStringLexer::TokenType::PLUS) { + m_RegexpLexer.next(); + + RegExpElement* next = this->concatenation(); + if(next != NULL) { + res->appendElement(std::move(*next)); + delete next; + } else { + return NULL; + } + + return this->alternationContCont(res); + } else { + return res; + } } RegExpElement* RegExpFromStringParser::concatenation() { - return this->concatenationCont(this->factor()); + RegExpElement* factor = this->factor(); + if(factor != NULL) { + return this->concatenationCont(factor); + } else { + return NULL; + } } RegExpElement* RegExpFromStringParser::concatenationCont(RegExpElement* left) { - RegExpFromStringLexer::Token token = m_Lexer.token(); - if(token.type == RegExpFromStringLexer::TokenType::SYMBOL || token.type == RegExpFromStringLexer::TokenType::LPAR || token.type == RegExpFromStringLexer::TokenType::EPS || token.type == RegExpFromStringLexer::TokenType::EMPTY) { - - RegExpElement* right = this->factor(); - Concatenation* res = new Concatenation(std::move(*left), std::move(*right)); - - delete left; - delete right; - - return this->concatenationContCont(res); - } else { - return left; - } + RegExpFromStringLexer::Token token = m_RegexpLexer.token(); + if(token.type == RegExpFromStringLexer::TokenType::ERROR || token.type == RegExpFromStringLexer::TokenType::LPAR || token.type == RegExpFromStringLexer::TokenType::EPS || token.type == RegExpFromStringLexer::TokenType::EMPTY) { + + RegExpElement* right = this->factor(); + if(right != NULL) { + Concatenation* res = new Concatenation(std::move(*left), std::move(*right)); + + delete left; + delete right; + return this->concatenationContCont(res); + } else { + delete left; + return NULL; + } + } else { + return left; + } } RegExpElement* RegExpFromStringParser::concatenationContCont(Concatenation* res) { - RegExpFromStringLexer::Token token = m_Lexer.token(); - if(token.type == RegExpFromStringLexer::TokenType::SYMBOL || token.type == RegExpFromStringLexer::TokenType::LPAR || token.type == RegExpFromStringLexer::TokenType::EPS || token.type == RegExpFromStringLexer::TokenType::EMPTY) { - - RegExpElement* next = this->factor(); - res->appendElement(std::move(*next)); - delete next; - - return this->concatenationContCont(res); - } else { - return res; - } + RegExpFromStringLexer::Token token = m_RegexpLexer.token(); + if(token.type == RegExpFromStringLexer::TokenType::ERROR || token.type == RegExpFromStringLexer::TokenType::LPAR || token.type == RegExpFromStringLexer::TokenType::EPS || token.type == RegExpFromStringLexer::TokenType::EMPTY) { + + RegExpElement* next = this->factor(); + if(next != NULL) { + res->appendElement(std::move(*next)); + delete next; + } else { + return NULL; + } + return this->concatenationContCont(res); + } else { + return res; + } } RegExpElement* RegExpFromStringParser::factor() { - RegExpFromStringLexer::Token token = m_Lexer.token(); - if(token.type == RegExpFromStringLexer::TokenType::LPAR) { - m_Lexer.next(); - RegExpElement* base = this->alternation(); - token = m_Lexer.token(); - if(token.type != RegExpFromStringLexer::TokenType::RPAR) throw alib::AlibException(); - m_Lexer.next(); - return this->star(base); - } else if(token.type == RegExpFromStringLexer::TokenType::EPS) { - m_Lexer.next(); - return this->star(new RegExpEpsilon()); - } else if(token.type == RegExpFromStringLexer::TokenType::EMPTY) { - m_Lexer.next(); - return this->star(new RegExpEmpty()); - } else if(token.type == RegExpFromStringLexer::TokenType::SYMBOL) { - std::string symbol = token.value; - m_Lexer.next(); - return this->star(new RegExpSymbol(alphabet::Symbol(alphabet::LabeledSymbol(label::Label(label::StringLabel(symbol)))))); - } else { - throw alib::AlibException(); - } + RegExpFromStringLexer::Token token = m_RegexpLexer.token(); + if(token.type == RegExpFromStringLexer::TokenType::LPAR) { + m_RegexpLexer.next(); + RegExpElement* base = this->alternation(); + token = m_RegexpLexer.token(); + if(token.type != RegExpFromStringLexer::TokenType::RPAR) return NULL; + return this->star(base); + } else if(token.type == RegExpFromStringLexer::TokenType::EPS) { + return this->star(new RegExpEpsilon()); + } else if(token.type == RegExpFromStringLexer::TokenType::EMPTY) { + return this->star(new RegExpEmpty()); + } else if(token.type == RegExpFromStringLexer::TokenType::ERROR) { + alphabet::Symbol* symbol = m_SymbolParser.parse(); + if(symbol != NULL) + return this->star(new RegExpSymbol(*symbol)); + else + return NULL; + } else { + return NULL; + } } RegExpElement* RegExpFromStringParser::star(RegExpElement* elem) { - RegExpFromStringLexer::Token token = m_Lexer.token(); - if(token.type == RegExpFromStringLexer::TokenType::STAR) { - m_Lexer.next(); - Iteration* iter = new Iteration(std::move(*elem)); - delete elem; - return this->star(iter); - } else { - return elem; - } + RegExpFromStringLexer::Token token = m_RegexpLexer.next().token(); + if(token.type == RegExpFromStringLexer::TokenType::STAR) { + Iteration* iter = new Iteration(std::move(*elem)); + delete elem; + return this->star(iter); + } else { + return elem; + } } } diff --git a/alib2/src/regexp/RegExpFromStringParser.h b/alib2/src/regexp/RegExpFromStringParser.h index 277c61768a..910ce2821e 100644 --- a/alib2/src/regexp/RegExpFromStringParser.h +++ b/alib2/src/regexp/RegExpFromStringParser.h @@ -12,6 +12,7 @@ #include "RegExpElements.h" #include "RegExpFromStringLexer.h" +#include "../alphabet/SymbolFromStringParser.h" namespace regexp { @@ -27,11 +28,13 @@ class RegExpFromStringParser { RegExpElement* factor(); RegExpElement* star(RegExpElement* elem); - RegExpFromStringLexer m_Lexer; - + RegExpFromStringLexer m_RegexpLexer; + alphabet::SymbolFromStringParser m_SymbolParser; + public: - RegExpFromStringParser(const std::string&); - RegExp parse(); + RegExpFromStringParser(std::stringstream&); + RegExp* parse(); + RegExp parseRegexp(); }; diff --git a/alib2/src/regexp/RegExpToStringComposer.cpp b/alib2/src/regexp/RegExpToStringComposer.cpp index 0e610d7548..dc1c188c25 100644 --- a/alib2/src/regexp/RegExpToStringComposer.cpp +++ b/alib2/src/regexp/RegExpToStringComposer.cpp @@ -8,6 +8,7 @@ #include <sstream> #include <algorithm> #include "RegExpToStringComposer.h" +#include "../alphabet/SymbolToStringComposer.h" namespace regexp { @@ -20,84 +21,73 @@ void RegExpToStringComposer::Visit(void* userData, const RegExpElement::element_ } void RegExpToStringComposer::Visit(void* userData, const Alternation& alternation) { - std::stringstream &out = *((std::stringstream*) userData); + std::pair<Priority, std::stringstream> &out = *((std::pair<Priority, std::stringstream>*) userData); - out << '('; + Priority outerPriorityMinimum = out.first; + if(outerPriorityMinimum == Priority::CONCATENATION || outerPriorityMinimum == Priority::FACTOR) out.second << '('; bool first = true; for (const auto& element : alternation.getElements()) { if(first) { first = false; } else { - out << '+'; + out.second << '+'; } const RegExpElement::element_type& object = static_cast<const RegExpElement::element_type&>(*element); + out.first = Priority::ALTERNATION; object.Accept(userData, *this); } - out << ')'; + if(outerPriorityMinimum == Priority::CONCATENATION || outerPriorityMinimum == Priority::FACTOR) out.second << ')'; } void RegExpToStringComposer::Visit(void* userData, const Concatenation& concatenation) { - std::stringstream &out = *((std::stringstream*) userData); + std::pair<Priority, std::stringstream> &out = *((std::pair<Priority, std::stringstream>*) userData); + Priority outerPriorityMinimum = out.first; + if(outerPriorityMinimum == Priority::FACTOR) out.second << '('; bool first = true; for (auto element : concatenation.getElements()) { if(first) first = false; else - out << ' '; + out.second << ' '; const RegExpElement::element_type& object = static_cast<const RegExpElement::element_type&>(*element); + out.first = Priority::CONCATENATION; object.Accept(userData, *this); } + if(outerPriorityMinimum == Priority::FACTOR) out.second << ')'; } void RegExpToStringComposer::Visit(void* userData, const Iteration& iteration) { - std::stringstream &out = *((std::stringstream*) userData); + std::pair<Priority, std::stringstream> &out = *((std::pair<Priority, std::stringstream>*) userData); - out << '('; const RegExpElement::element_type& object = static_cast<const RegExpElement::element_type&>(iteration.getElement()); + out.first = Priority::FACTOR; object.Accept(userData, *this); - out << ")*"; + out.second << "*"; } void RegExpToStringComposer::Visit(void* userData, const RegExpSymbol& symbol) { - std::stringstream &out = *((std::stringstream*) userData); + std::pair<Priority, std::stringstream> &out = *((std::pair<Priority, std::stringstream>*) userData); - auto testEscape = [](char c) { - if(c == '"' || c == '\\' || c == ' ') return true; - return false; - }; - - auto replace = [](std::string& str, const std::string& what, const std::string& with) { - size_t index = 0; - while((index = str.find(what, index)) != std::string::npos) { - str.replace(index, what.length(), with); - index += with.length(); - } - }; - std::string tmp = (std::string) symbol.getSymbol(); - if( std::any_of(tmp.begin(), tmp.end(), testEscape) ) { - replace(tmp, "\\", "\\\\" ); - replace(tmp, "\"", "\\\"" ); - out << '"' << tmp << '"'; - } else { - out << tmp; - } + alphabet::SymbolToStringComposer composer; + out.second << composer.compose(symbol.getSymbol()); } void RegExpToStringComposer::Visit(void* userData, const RegExpEpsilon&) { - std::stringstream &out = *((std::stringstream*) userData); - out << "\\e"; + std::pair<Priority, std::stringstream> &out = *((std::pair<Priority, std::stringstream>*) userData); + out.second << "#E"; } void RegExpToStringComposer::Visit(void* userData, const RegExpEmpty&) { - std::stringstream &out = *((std::stringstream*) userData); - out << "\\0"; + std::pair<Priority, std::stringstream> &out = *((std::pair<Priority, std::stringstream>*) userData); + out.second << "#0"; } std::string RegExpToStringComposer::compose(const RegExp& regexp) { - std::stringstream out; + std::pair<Priority, std::stringstream> out; + out.first = Priority::ALTERNATION; regexp.getRegExp().Accept((void*) &out, *this); - return std::move(out).str(); + return std::move(out.second).str(); } } /* namespace regexp */ diff --git a/alib2/src/regexp/RegExpToStringComposer.h b/alib2/src/regexp/RegExpToStringComposer.h index 4eba2a2d49..00f23cdee7 100644 --- a/alib2/src/regexp/RegExpToStringComposer.h +++ b/alib2/src/regexp/RegExpToStringComposer.h @@ -26,7 +26,12 @@ class RegExpToStringComposer : public RegExpElement::visitor_type { void Visit(void*, const RegExp& empty); - + + enum class Priority { + ALTERNATION, + CONCATENATION, + FACTOR + }; public: /** * Composes string representation of RegExp. diff --git a/alib2/test-src/regexp/RegExpTest.cpp b/alib2/test-src/regexp/RegExpTest.cpp index b1963615ff..5743976036 100644 --- a/alib2/test-src/regexp/RegExpTest.cpp +++ b/alib2/test-src/regexp/RegExpTest.cpp @@ -49,20 +49,60 @@ void RegExpTest::testCopyConstruct() { } void RegExpTest::testEqual() { - std::string input = "a+a"; + { + std::string input = "#E* #0*"; + std::stringstream inputs(input); - regexp::RegExpFromStringParser parser(input); - regexp::RegExp regexp = parser.parse(); + regexp::RegExpFromStringParser parser(inputs); + regexp::RegExp regexp = parser.parseRegexp(); - regexp::RegExpToStringComposer composer; - std::string output = composer.compose(regexp); + regexp::RegExpToStringComposer composer; + std::string output = composer.compose(regexp); + std::stringstream outputs(output); -// CPPUNIT_ASSERT( input == output ); + CPPUNIT_ASSERT( input == output ); - regexp::RegExpFromStringParser parser2(output); - regexp::RegExp regexp2 = parser2.parse(); + regexp::RegExpFromStringParser parser2(outputs); + regexp::RegExp regexp2 = parser2.parseRegexp(); - CPPUNIT_ASSERT( regexp == regexp2 ); + CPPUNIT_ASSERT( regexp == regexp2 ); + } + { + std::string input = "a+a"; + std::stringstream inputs(input); + + regexp::RegExpFromStringParser parser(inputs); + regexp::RegExp regexp = parser.parseRegexp(); + + regexp::RegExpToStringComposer composer; + std::string output = composer.compose(regexp); + std::stringstream outputs(output); + + CPPUNIT_ASSERT( input == output ); + + regexp::RegExpFromStringParser parser2(outputs); + regexp::RegExp regexp2 = parser2.parseRegexp(); + + CPPUNIT_ASSERT( regexp == regexp2 ); + } + { + std::string input = "a+a (b+a)*"; + std::stringstream inputs(input); + + regexp::RegExpFromStringParser parser(inputs); + regexp::RegExp regexp = parser.parseRegexp(); + + regexp::RegExpToStringComposer composer; + std::string output = composer.compose(regexp); + std::stringstream outputs(output); + + CPPUNIT_ASSERT( input == output ); + + regexp::RegExpFromStringParser parser2(outputs); + regexp::RegExp regexp2 = parser2.parseRegexp(); + + CPPUNIT_ASSERT( regexp == regexp2 ); + } } void RegExpTest::testXMLParser() { -- GitLab