From f6ea746bfab66d6bb43e155ba3dea4808d89b87e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Pecka?= <peckato1@fit.cvut.cz> Date: Sat, 6 Sep 2014 22:58:43 +0200 Subject: [PATCH] algo: brzozowski, thompson --- aconversions/src/re2fa/Brzozowski.cpp | 111 -------- aconversions/src/re2fa/Brzozowski.h | 58 ---- aconversions/src/re2fa/Thompson.cpp | 148 ---------- aconversions/src/re2fa/Thompson.h | 82 ------ aconversions2/src/ConversionHandler.cpp | 21 +- .../src/conversions/re2fa/Brzozowski.cpp | 128 +++++++++ alib2algo/src/conversions/re2fa/Brzozowski.h | 42 +++ alib2algo/src/conversions/re2fa/Thompson.cpp | 261 ++++++++++++++++++ alib2algo/src/conversions/re2fa/Thompson.h | 59 ++++ 9 files changed, 499 insertions(+), 411 deletions(-) delete mode 100644 aconversions/src/re2fa/Brzozowski.cpp delete mode 100644 aconversions/src/re2fa/Brzozowski.h delete mode 100644 aconversions/src/re2fa/Thompson.cpp delete mode 100644 aconversions/src/re2fa/Thompson.h create mode 100644 alib2algo/src/conversions/re2fa/Brzozowski.cpp create mode 100644 alib2algo/src/conversions/re2fa/Brzozowski.h create mode 100644 alib2algo/src/conversions/re2fa/Thompson.cpp create mode 100644 alib2algo/src/conversions/re2fa/Thompson.h diff --git a/aconversions/src/re2fa/Brzozowski.cpp b/aconversions/src/re2fa/Brzozowski.cpp deleted file mode 100644 index 8009aec9a1..0000000000 --- a/aconversions/src/re2fa/Brzozowski.cpp +++ /dev/null @@ -1,111 +0,0 @@ -/* - * Brzozowski.cpp - * - * Created on: 11. 1. 2014 - * Author: tomas - */ - -#include "Brzozowski.h" - -using namespace std; -using namespace alib; -using namespace automaton; -using namespace regexp; - -namespace conversions -{ - -Brzozowski::Brzozowski( const RegExp & re ) : m_re( re ) -{ - -} - -Brzozowski::~Brzozowski( void ) -{ - -} - -FSM Brzozowski::convert( void ) -{ - RegExpOptimize opt; - - // 1. 
- RegExp V = opt.optimize( m_re ); - set<alphabet::Symbol> alphabet = m_re.getAlphabet( ); - - set<RegExp> Q = { V }; - deque<set<RegExp>> Qi; - - Qi.push_back( set<RegExp>( ) ); - Qi.at( 0 ).insert( V ); - - int i = 1; - - // 2. - while( ! Qi.at( i - 1 ).empty( ) ) - { - Qi.push_back( set<RegExp>( ) ); // initialize set Q_i - - for( const auto & regexp : Qi.at( i - 1 ) ) - { - RegExpDerivation deriv( regexp ); - - for( const auto & a : alphabet ) - { - RegExp derived = deriv.derivation( a ); - derived = opt.optimize( derived ); - - // this will also add \emptyset as a regexp (and as FA state) - if( ! isInSet( derived, Q ) ) // if this state has already been found, do not add - Qi.at( i ).insert( derived ); - - } - } - - Q.insert( Qi.at( i ).begin( ), Qi.at( i ).end( ) ); - - i += 1; - } - - // ------------------------------------------------------------------------ - // 3. - - FSM automaton; - int stateId = 0; - map<RegExp, State> stateMap; - - for( const auto & r : Q ) - { - State q( toBase26( stateId ++ ) ); - stateMap.insert( std::pair<RegExp,State>( r, q ) ); - automaton.addState( q ); - } - - for( const auto & a : alphabet ) - automaton.addInputSymbol( a.getSymbol( ) ); - - for( const auto & r : Q ) - { - RegExpDerivation deriv( r ); - - for( const auto & a: automaton.getInputAlphabet( ) ) - { - RegExp derived = deriv.derivation( a ); - derived = opt.optimize( derived ); - - TransitionFSM t( stateMap.find( r )->second, a, stateMap.find( derived )->second ); - if( ! 
isInSet( t, automaton.getTransitions( ) ) ) - automaton.addTransition( t ); - } - } - - automaton.addInitialState( stateMap.find( V )->second ); - - for( const auto & U : Q ) - if( U.containsEmptyString( ) ) - automaton.addFinalState( stateMap.find( U )->second ); - - return automaton; -} - -} /* namespace conversions */ diff --git a/aconversions/src/re2fa/Brzozowski.h b/aconversions/src/re2fa/Brzozowski.h deleted file mode 100644 index 732c6b3851..0000000000 --- a/aconversions/src/re2fa/Brzozowski.h +++ /dev/null @@ -1,58 +0,0 @@ -/* - * Brzozowski.h - * - * Created on: 11. 1. 2014 - * Author: tomas - */ - -#ifndef BRZOZOWSKI_H_ -#define BRZOZOWSKI_H_ - -#include <map> -#include <set> -#include <string> -#include <deque> - -#include <automaton/State.h> -#include <AlibException.h> - -#include "../interface/IConversionFSM.h" -#include "../shared/Hexavigesimal.h" -#include "../include/macros.h" - -#include "RegExpDerivation.h" -#include "RegExpOptimize.h" - -namespace conversions -{ - -/** - * Converts regular expression to finite automaton using Brzozowski algorithm (derivations of regular expressions). - * Source: Melichar 2.110 - */ -class Brzozowski : public IConversionFSM -{ -public: - /** - * @param re Source regular expression. - */ - Brzozowski( const regexp::RegExp & re ); - - ~Brzozowski( void ); - - /** - * Performs conversion. - * @return FSM equivalent to original regular expression. - */ - automaton::FSM convert( void ); - -private: - /** - * input regexp - */ - const regexp::RegExp & m_re; -}; - -} /* namespace conversions */ - -#endif /* BRZOZOWSKI_H_ */ diff --git a/aconversions/src/re2fa/Thompson.cpp b/aconversions/src/re2fa/Thompson.cpp deleted file mode 100644 index b23f79c9e4..0000000000 --- a/aconversions/src/re2fa/Thompson.cpp +++ /dev/null @@ -1,148 +0,0 @@ -/* - * Thompson.cpp - * - * Created on: 11. 1. 
2014 - * Author: tomas - */ -#include "Thompson.h" - -using namespace alib; -using namespace automaton; -using namespace regexp; - -namespace conversions -{ - -Thompson::Thompson( const RegExp & re ) : m_re( re ) -{ - -} - -Thompson::~Thompson( void ) -{ - -} - - -FSM Thompson::convert( void ) -{ - m_fsm = FSM( ); - m_stateId = 0; - - for( const auto & symbol : m_re.getAlphabet( ) ) - m_fsm.addInputSymbol( symbol.getSymbol( ) ); - - SubexpressionTails st = processRegExpNode( m_re.getRegExp( ) ); - - m_fsm.addInitialState( st.m_head ); - m_fsm.addFinalState( st.m_tail ); - - return m_fsm; -} - -Thompson::SubexpressionTails Thompson::processRegExpNode( const RegExpElement * node ) -{ - const Alternation* alternation = dynamic_cast<const Alternation*>( node ); - const Concatenation* concatenation = dynamic_cast<const Concatenation*>( node ); - const Iteration* iteration = dynamic_cast<const Iteration*>( node ); - const RegExpSymbol* symbol = dynamic_cast<const RegExpSymbol*>( node ); - const RegExpEmpty* empty = dynamic_cast<const RegExpEmpty*>( node ); - const RegExpEpsilon* eps = dynamic_cast<const RegExpEpsilon*>( node ); - - if( alternation ) - return processRegExpNode( alternation ); - else if( concatenation ) - return processRegExpNode( concatenation ); - else if( iteration ) - return processRegExpNode( iteration ); - else if( symbol ) - return processRegExpNode( symbol ); - else if( eps ) - return processRegExpNode( eps ); - else if( empty ) - return processRegExpNode( empty ); - - throw AlibException( "Thompson::process - invalid RegExpElement node." 
); -} - -Thompson::SubexpressionTails Thompson::processRegExpNode( const Iteration * node ) -{ - State head = m_fsm.createUniqueState( toBase26( m_stateId ) + "0", true ); - State tail = m_fsm.createUniqueState( toBase26( m_stateId ++ ) + "1", true ); - - SubexpressionTails st = processRegExpNode( node->getElement( ) ); - - m_fsm.addTransition( head, Symbol( "" ), st.m_head ); - m_fsm.addTransition( head, Symbol( "" ), tail ); - m_fsm.addTransition( st.m_tail, Symbol( "" ), tail ); - m_fsm.addTransition( st.m_tail, Symbol( "" ), st.m_head ); - - return SubexpressionTails( head, tail ); -} - -Thompson::SubexpressionTails Thompson::processRegExpNode( const Alternation * node ) -{ - State head = m_fsm.createUniqueState( toBase26( m_stateId ) + "0", true ); - State tail = m_fsm.createUniqueState( toBase26( m_stateId ++ ) + "1", true ); - - for( const auto & element : node->getElements( ) ) - { - SubexpressionTails st = processRegExpNode( element ); - - m_fsm.addTransition( head, Symbol( "" ), st.m_head ); - m_fsm.addTransition( st.m_tail, Symbol( "" ), tail ); - } - - return SubexpressionTails( head, tail ); -} - -Thompson::SubexpressionTails Thompson::processRegExpNode( const Concatenation * node ) -{ - vector<SubexpressionTails> st; - for( const auto & element : node->getElements( ) ) - st.push_back( processRegExpNode( element ) ); - - for( size_t i = 1; i < st.size( ); i ++ ) - m_fsm.addTransition( st[ i - 1 ].m_tail, Symbol( "" ), st[ i ].m_head ); - - return SubexpressionTails( st[ 0 ].m_head, st[ st.size( ) - 1 ].m_tail ); -} - -Thompson::SubexpressionTails Thompson::processRegExpNode( const RegExpSymbol * node ) -{ - Symbol symb( node->getSymbol( ) ); - State head = m_fsm.createUniqueState( toBase26( m_stateId ) + "0", true ); - State tail = m_fsm.createUniqueState( toBase26( m_stateId ++ ) + "1", true ); - - m_fsm.addTransition( head, symb, tail ); - - return SubexpressionTails( head, tail ); -} - -Thompson::SubexpressionTails Thompson::processRegExpNode( const 
RegExpEpsilon * node ) -{ - Symbol symb( "" ); - State head = m_fsm.createUniqueState( toBase26( m_stateId ) + "0", true ); - State tail = m_fsm.createUniqueState( toBase26( m_stateId ++ ) + "1", true ); - - m_fsm.addTransition( head, symb, tail ); - - return SubexpressionTails( head, tail ); -} - -Thompson::SubexpressionTails Thompson::processRegExpNode( const RegExpEmpty * node ) -{ - State head = m_fsm.createUniqueState( toBase26( m_stateId ) + "0", true ); - State tail = m_fsm.createUniqueState( toBase26( m_stateId ++ ) + "1", true ); - - return SubexpressionTails( head, tail ); -} - -// ---------------------------------------------------------------------------- - -Thompson::SubexpressionTails::SubexpressionTails( const State & head, const State & tail ) : m_head( head ), m_tail ( tail ) -{ - -} - -} /* namespace conversions */ diff --git a/aconversions/src/re2fa/Thompson.h b/aconversions/src/re2fa/Thompson.h deleted file mode 100644 index ae2f5452bb..0000000000 --- a/aconversions/src/re2fa/Thompson.h +++ /dev/null @@ -1,82 +0,0 @@ -/* - * Thompson.h - * - * Created on: 11. 1. 2014 - * Author: tomas - */ - -#ifndef THOMPSON_H_ -#define THOMPSON_H_ - -#include <set> -#include <vector> - -#include <AlibException.h> -#include <automaton/FSM/FSM.h> -#include <regexp/RegExp.h> -#include <regexp/RegExpElements.h> - -#include "../interface/IConversionFSM.h" -#include "../include/macros.h" -#include "../shared/Hexavigesimal.h" - - -namespace conversions -{ - -/** - * Converts regular expression to finite automaton using Thompson's Construction Algorithm (TCA). - * Sources: - * Hopcroft, section 3.2.3 - * http://www.eecis.udel.edu/~cavazos/cisc672/lectures/Lecture-04.pdf - * http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.21.7450&rep=rep1&type=ps - * Melichar 2.112 - */ -class Thompson : public IConversionFSM -{ -public: - /** - * @param re Source regular expression. 
- */ - Thompson( const regexp::RegExp & re ); - ~Thompson( void ); - - /** - * Performs conversion. - * @return FSM equivalent to original regular expression. - */ - automaton::FSM convert( void ); - -private: - /** - * input regexp - */ - const regexp::RegExp & m_re; - - /** - * output FSM ($\varepsilon$--NFA) - */ - automaton::FSM m_fsm; - int m_stateId; - - /** - * Stores head and tail state of "subautomaton" created in regexp subtree. - */ - struct SubexpressionTails - { - SubexpressionTails( const automaton::State & head, const automaton::State & tail ); - automaton::State m_head, m_tail; - }; - - SubexpressionTails processRegExpNode( const regexp::RegExpElement * node ); - SubexpressionTails processRegExpNode( const regexp::Alternation * node ); - SubexpressionTails processRegExpNode( const regexp::Concatenation * node ); - SubexpressionTails processRegExpNode( const regexp::Iteration * node ); - SubexpressionTails processRegExpNode( const regexp::RegExpSymbol * node ); - SubexpressionTails processRegExpNode( const regexp::RegExpEmpty * node ); - SubexpressionTails processRegExpNode( const regexp::RegExpEpsilon * node ); -}; - -} /* namespace conversions */ - -#endif /* THOMPSON_H_ */ diff --git a/aconversions2/src/ConversionHandler.cpp b/aconversions2/src/ConversionHandler.cpp index 814185c436..2ad49e893a 100644 --- a/aconversions2/src/ConversionHandler.cpp +++ b/aconversions2/src/ConversionHandler.cpp @@ -11,8 +11,8 @@ #include "conversions/fa2re/BrzozowskiAlgebraic.h" #include "conversions/re2fa/Glushkov.h" -//#include "conversions/re2fa/Thompson.h" -//#include "conversions/re2fa/Brzozowski.h" +#include "conversions/re2fa/Thompson.h" +#include "conversions/re2fa/Brzozowski.h" #include "conversions/fa2rg/fa2lrg/FAtoLRGConverter.h" #include "conversions/fa2rg/fa2rrg/FAtoRRGConverter.h" @@ -173,27 +173,24 @@ void ConversionHandler::convertFSMtoLRG( void ) void ConversionHandler::convertREtoFSM( void ) { - const regexp::UnboundedRegExp regexp = 
alib::DataFactory::fromTokens<regexp::UnboundedRegExp>( m_tokens ); + const regexp::RegExp regexp = alib::DataFactory::fromTokens<regexp::RegExp>(m_tokens); switch( m_algorithm ) { case BRZOZOWSKI_DERIVATION: { -/* re2fa::Brzozowski conv( regexp ); - automaton::DFA dfa = conv.convert(); - alib::DataFactory::toStdout(dfa);*/ + re2fa::Brzozowski conv; + alib::DataFactory::toStdout(conv.convert(regexp)); break; } case THOMPSON_NFA: { -/* re2fa::Thompson conv( regexp ); - autoamton::EpsilonNFA enfa = conv.convert(); - alib::DataFactory::toStdout(enfa);*/ + re2fa::Thompson conv; + alib::DataFactory::toStdout(conv.convert(regexp)); break; } case GLUSHKOV_NFA: default: { - re2fa::Glushkov conv( regexp ); - automaton::NFA nfa = conv.convert(); - alib::DataFactory::toStdout(nfa); + //re2fa::Glushkov conv; + //alib::DataFactory::toStdout(conv.convert(regexp)); break; } } diff --git a/alib2algo/src/conversions/re2fa/Brzozowski.cpp b/alib2algo/src/conversions/re2fa/Brzozowski.cpp new file mode 100644 index 0000000000..d420f67ab6 --- /dev/null +++ b/alib2algo/src/conversions/re2fa/Brzozowski.cpp @@ -0,0 +1,128 @@ +/* + * Brzozowski.cpp + * + * Created on: 11. 1. 
2014
+ * Author: tomas
+ */
+
+#include "Brzozowski.h"
+
+#include <set>
+#include <deque>
+#include <map>	// std::map, used for stateMap in convert()
+#include <vector>
+
+#include <string/LinearString.h>
+#include <std/hexavigesimal.h>
+#include <label/StringLabel.h>
+
+#include "../../regexp/RegExpDerivation.h"
+//#include "regexp/RegExpOptimize.h"
+
+namespace re2fa
+{
+
+Brzozowski::Brzozowski(void){}
+Brzozowski::~Brzozowski(void){}
+
+// Visit overloads: userData points at a std::pair<alphabet, bool>; they store the expression's alphabet and whether it contains the empty string.
+void Brzozowski::Visit(void* userData, const regexp::FormalRegExp& regexp)
+{
+	std::pair<std::set<alphabet::Symbol>, bool>& out = *(std::pair<std::set<alphabet::Symbol>, bool>*) userData;
+	out.first = regexp.getAlphabet();
+	out.second = regexp.containsEmptyString();
+}
+void Brzozowski::Visit(void* userData, const regexp::UnboundedRegExp& regexp)
+{
+	std::pair<std::set<alphabet::Symbol>, bool>& out = *(std::pair<std::set<alphabet::Symbol>, bool>*) userData;
+	out.first = regexp.getAlphabet();
+	out.second = regexp.containsEmptyString();
+}
+// Builds an NFA with one state per distinct derivative of regexp and one transition per (state, symbol) pair; returns it by value.
+automaton::NFA Brzozowski::convert(const regexp::RegExp& regexp)
+{
+	// 1. Start from the input expression unchanged (the optimisation step below is disabled).
+	// regexp::RegExpOptimize opt;
+	// regexp::RegExp V = opt.optimize(regexp);
+	regexp::RegExp V = regexp;
+
+	std::pair<std::set<alphabet::Symbol>, bool> out({}, false);
+	regexp.getData().Accept((void*) &out, *this);
+	const std::set<alphabet::Symbol>& alphabet = out.first; // reference into out, not a copy
+
+	std::set<regexp::RegExp> Q = { V };
+	std::deque<std::set<regexp::RegExp>> Qi;
+
+	Qi.push_back(std::set<regexp::RegExp>());
+	Qi.at(0).insert(V);
+
+	int i = 1;
+
+	// 2. Qi[i] collects derivatives of Qi[i-1] not yet in Q; stop when a round adds nothing. NOTE(review): with optimize() disabled, termination presumably relies on RegExpDerivation normalising its results - confirm.
+	while(! Qi.at(i - 1).empty())
+	{
+		Qi.push_back(std::set<regexp::RegExp>()); // initialize set Q_i
+
+		for(const auto& dregexp : Qi.at(i - 1))
+		{
+			regexp::RegExpDerivation deriv;
+
+			for(const auto& a : alphabet)
+			{
+				string::LinearString string(std::vector<alphabet::Symbol>{a});
+				regexp::RegExp derived = deriv.derivation(dregexp, string);
+				// derived = opt.optimize(derived);
+
+				// this will also add \emptyset as a regexp (and as FA state)
+				if(Q.count(derived) == 0) // if this state has already been found, do not add
+					Qi.at(i).insert(derived);
+			}
+		}
+
+		Q.insert(Qi.at(i).begin(), Qi.at(i).end());
+		i += 1;
+	}
+
+	// ------------------------------------------------------------------------
+	// 3. One automaton state per expression in Q, labelled by a base-26 counter.
+
+	automaton::NFA automaton;
+	int stateId = 0;
+	std::map<regexp::RegExp, automaton::State> stateMap;
+
+	for(const auto& r : Q)
+	{
+		automaton::State q(label::Label(label::StringLabel(std::toBase26(stateId++))));
+		stateMap.insert(std::make_pair(r, q));
+		automaton.addState(q);
+	}
+
+	automaton.setInputSymbols(alphabet);
+
+	for(const auto& r : Q)
+	{
+		regexp::RegExpDerivation deriv;
+
+		for(const auto& a: alphabet)
+		{
+			string::LinearString string(std::vector<alphabet::Symbol>{a});
+			regexp::RegExp derived = deriv.derivation(r, string);
+			// derived = opt.optimize(derived);
+
+			automaton.addTransition(stateMap.find(r)->second, a, stateMap.find(derived)->second);
+		}
+	}
+
+	automaton.addInitialState(stateMap.find(V)->second);
+
+	for(const auto& r : Q)
+	{
+		r.getData().Accept((void*) &out, *this); // query r itself, not the input regexp; overwrites out, which is safe because alphabet is not read past this point
+		if(out.second) // if(r.containsEmptyString())
+			automaton.addFinalState(stateMap.find(r)->second);
+	}
+
+	return automaton;
+}
+
+} /* namespace re2fa */
diff --git a/alib2algo/src/conversions/re2fa/Brzozowski.h b/alib2algo/src/conversions/re2fa/Brzozowski.h
new file mode 100644
index 0000000000..58d4d0aaa5
--- /dev/null
+++ b/alib2algo/src/conversions/re2fa/Brzozowski.h
@@ -0,0 +1,42 @@
+/*
+ * Brzozowski.h
+ *
+ * Created on: 11. 1. 
2014
+ * Author: tomas
+ */
+
+#ifndef BRZOZOWSKI_H_
+#define BRZOZOWSKI_H_
+
+#include <regexp/RegExp.h>
+#include <regexp/formal/FormalRegExp.h>
+#include <regexp/unbounded/UnboundedRegExp.h>
+#include <automaton/FSM/NFA.h>
+
+namespace re2fa
+{
+
+/**
+ * Converts regular expression to finite automaton using Brzozowski algorithm (derivations of regular expressions).
+ * Source: Melichar 2.110
+ */
+class Brzozowski : public regexp::VisitableRegExpBase::visitor_type
+{
+public:
+	Brzozowski(void);
+	~Brzozowski(void);
+
+	/**
+	 * Performs conversion: one state per distinct derivative of regexp, one transition per (state, input symbol).
+	 * @return automaton::NFA built from the derivatives; intended to be equivalent to the original regular expression.
+	 */
+	automaton::NFA convert(const regexp::RegExp& regexp);
+
+private:
+	void Visit(void* , const regexp::FormalRegExp& regexp); // fills the std::pair behind the void* with the alphabet and the contains-empty-string flag
+	void Visit(void* , const regexp::UnboundedRegExp& regexp); // same as above, for the unbounded representation
+};
+
+} /* namespace re2fa */
+
+#endif /* BRZOZOWSKI_H_ */
diff --git a/alib2algo/src/conversions/re2fa/Thompson.cpp b/alib2algo/src/conversions/re2fa/Thompson.cpp
new file mode 100644
index 0000000000..1b3520fe04
--- /dev/null
+++ b/alib2algo/src/conversions/re2fa/Thompson.cpp
@@ -0,0 +1,261 @@
+/*
+ * Thompson.cpp
+ *
+ * Created on: 11. 1. 
2014
+ * Author: tomas
+ */
+#include "Thompson.h"
+#include <tuple>
+#include <label/Label.h>
+#include <label/IntegerLabel.h>
+
+namespace re2fa
+{
+
+Thompson::Thompson(void){}
+Thompson::~Thompson(void){}
+
+automaton::EpsilonNFA Thompson::convert(const regexp::RegExp& regexp)
+{
+	std::tuple<automaton::EpsilonNFA, int, const automaton::State*, const automaton::State*> out(automaton::EpsilonNFA(), 0, nullptr, nullptr);
+	automaton::EpsilonNFA& automaton = std::get<0>(out);
+	// out = <automaton being built, next integer state label, head state, tail state of the last visited subexpression>
+	regexp.getData().Accept((void*) &out, *this);
+	// the visitors leave the head and tail of the whole expression in slots 2 and 3
+	automaton.setInitialStates({*std::get<2>(out)});
+	automaton.setFinalStates(std::set<automaton::State>{*std::get<3>(out)});
+
+	return std::get<0>(out);
+}
+
+void Thompson::Visit(void* userData, const regexp::FormalRegExp& regexp)
+{
+	std::tuple<automaton::EpsilonNFA, int, const automaton::State*, const automaton::State*> &out = *(std::tuple<automaton::EpsilonNFA, int, const automaton::State*, const automaton::State*>*) userData;
+	automaton::EpsilonNFA& automaton = std::get<0>(out);
+	// copy the alphabet from the expression, then descend into its root element
+	automaton.setInputSymbols(regexp.getAlphabet());
+	regexp.getRegExp().Accept((void*) &out, *this);
+}
+
+void Thompson::Visit(void* userData, const regexp::FormalRegExpAlternation& alternation)
+{
+	std::tuple<automaton::EpsilonNFA, int, const automaton::State*, const automaton::State*> &out = *(std::tuple<automaton::EpsilonNFA, int, const automaton::State*, const automaton::State*>*) userData;
+	automaton::EpsilonNFA& automaton = std::get<0>(out);
+	// fresh head/tail; each operand is wired head -eps-> operand head and operand tail -eps-> tail
+	automaton::State head = automaton::State(label::Label(label::IntegerLabel(std::get<1>(out)++)));
+	automaton::State tail = automaton::State(label::Label(label::IntegerLabel(std::get<1>(out)++)));
+	automaton.addState(head);
+	automaton.addState(tail);
+
+	static_cast<const regexp::FormalRegExpElement&>(alternation.getLeftElement()).Accept(userData, *this);
+	automaton.addTransition(head, string::Epsilon::EPSILON, *std::get<2>(out));
+	automaton.addTransition(*std::get<3>(out), string::Epsilon::EPSILON, tail);
+
+	static_cast<const regexp::FormalRegExpElement&>(alternation.getRightElement()).Accept(userData, *this);
+	automaton.addTransition(head, string::Epsilon::EPSILON, *std::get<2>(out));
+	automaton.addTransition(*std::get<3>(out), string::Epsilon::EPSILON, tail);
+
+	std::get<2>(out) = &(*automaton.getStates().find(head)); // NOTE(review): stores the address of an element of getStates(); presumably that is the automaton's own set, returned by reference - confirm
+	std::get<3>(out) = &(*automaton.getStates().find(tail));
+}
+
+void Thompson::Visit(void* userData, const regexp::FormalRegExpConcatenation& concatenation)
+{
+	std::tuple<automaton::EpsilonNFA, int, const automaton::State*, const automaton::State*> &out = *(std::tuple<automaton::EpsilonNFA, int, const automaton::State*, const automaton::State*>*) userData;
+	// no new states: left tail -eps-> right head; the result spans left head .. right tail
+	automaton::EpsilonNFA& automaton = std::get<0>(out);
+
+	static_cast<const regexp::FormalRegExpElement&>(concatenation.getLeftElement()).Accept(userData, *this);
+	const automaton::State* leftHead = std::get<2>(out);
+	const automaton::State* leftTail = std::get<3>(out);
+
+	static_cast<const regexp::FormalRegExpElement&>(concatenation.getRightElement()).Accept(userData, *this);
+	automaton.addTransition(*leftTail, string::Epsilon::EPSILON, *std::get<2>(out));
+
+	std::get<2>(out) = &(*automaton.getStates().find(*leftHead));
+	// std::get<3>(out) is left as set by the right operand (its tail)
+}
+
+void Thompson::Visit(void* userData, const regexp::FormalRegExpIteration& iteration)
+{
+	std::tuple<automaton::EpsilonNFA, int, const automaton::State*, const automaton::State*> &out = *(std::tuple<automaton::EpsilonNFA, int, const automaton::State*, const automaton::State*>*) userData;
+	// head -eps-> inner head, head -eps-> tail (skip), inner tail -eps-> tail, inner tail -eps-> inner head (repeat)
+	automaton::EpsilonNFA& automaton = std::get<0>(out);
+
+	automaton::State head = automaton::State(label::Label(label::IntegerLabel(std::get<1>(out)++)));
+	automaton::State tail = automaton::State(label::Label(label::IntegerLabel(std::get<1>(out)++)));
+	automaton.addState(head);
+	automaton.addState(tail);
+
+	static_cast<const regexp::FormalRegExpElement&>(iteration.getElement()).Accept(userData, *this);
+	automaton.addTransition(head, string::Epsilon::EPSILON, *std::get<2>(out));
+	automaton.addTransition(head, string::Epsilon::EPSILON, tail);
+	automaton.addTransition(*std::get<3>(out), string::Epsilon::EPSILON, tail);
+	automaton.addTransition(*std::get<3>(out), string::Epsilon::EPSILON, *std::get<2>(out));
+
+	std::get<2>(out) = &(*automaton.getStates().find(head));
+	std::get<3>(out) = &(*automaton.getStates().find(tail));
+}
+
+void Thompson::Visit(void* userData, const regexp::FormalRegExpSymbol& symbol)
+{
+	std::tuple<automaton::EpsilonNFA, int, const automaton::State*, const automaton::State*> &out = *(std::tuple<automaton::EpsilonNFA, int, const automaton::State*, const automaton::State*>*) userData;
+	// two fresh states joined by one transition on the symbol
+	automaton::EpsilonNFA& automaton = std::get<0>(out);
+
+	automaton::State head = automaton::State(label::Label(label::IntegerLabel(std::get<1>(out)++)));
+	automaton::State tail = automaton::State(label::Label(label::IntegerLabel(std::get<1>(out)++)));
+	automaton.addState(head);
+	automaton.addState(tail);
+
+	automaton.addTransition(head, symbol.getSymbol(), tail);
+	std::get<2>(out) = &(*automaton.getStates().find(head));
+	std::get<3>(out) = &(*automaton.getStates().find(tail));
+}
+
+void Thompson::Visit(void* userData, const regexp::FormalRegExpEpsilon& epsilon)
+{
+	std::tuple<automaton::EpsilonNFA, int, const automaton::State*, const automaton::State*> &out = *(std::tuple<automaton::EpsilonNFA, int, const automaton::State*, const automaton::State*>*) userData;
+	// two fresh states joined by one epsilon transition
+	automaton::EpsilonNFA& automaton = std::get<0>(out);
+
+	automaton::State head = automaton::State(label::Label(label::IntegerLabel(std::get<1>(out)++)));
+	automaton::State tail = automaton::State(label::Label(label::IntegerLabel(std::get<1>(out)++)));
+	automaton.addState(head);
+	automaton.addState(tail);
+
+	automaton.addTransition(head, string::Epsilon::EPSILON, tail);
+	std::get<2>(out) = &(*automaton.getStates().find(head));
+	std::get<3>(out) = &(*automaton.getStates().find(tail));
+}
+
+void Thompson::Visit(void* userData, const regexp::FormalRegExpEmpty& empty)
+{
+	std::tuple<automaton::EpsilonNFA, int, const automaton::State*, const automaton::State*> &out = *(std::tuple<automaton::EpsilonNFA, int, const automaton::State*, const automaton::State*>*) userData;
+	// two fresh states with no transition between them
+	automaton::EpsilonNFA& automaton = std::get<0>(out);
+
+	automaton::State head = automaton::State(label::Label(label::IntegerLabel(std::get<1>(out)++)));
+
+	automaton::State tail = automaton::State(label::Label(label::IntegerLabel(std::get<1>(out)++)));
+	automaton.addState(head);
+	automaton.addState(tail);
+
+	std::get<2>(out) = &(*automaton.getStates().find(head));
+	std::get<3>(out) = &(*automaton.getStates().find(tail));
+}
+
+void Thompson::Visit(void* userData, const regexp::UnboundedRegExp& regexp)
+{
+	std::tuple<automaton::EpsilonNFA, int, const automaton::State*, const automaton::State*> &out = *(std::tuple<automaton::EpsilonNFA, int, const automaton::State*, const automaton::State*>*) userData;
+	automaton::EpsilonNFA& automaton = std::get<0>(out);
+	// copy the alphabet from the expression, then descend into its root element
+	automaton.setInputSymbols(regexp.getAlphabet());
+	regexp.getRegExp().Accept((void*) &out, *this);
+}
+
+void Thompson::Visit(void* userData, const regexp::UnboundedRegExpAlternation& alternation)
+{
+	std::tuple<automaton::EpsilonNFA, int, const automaton::State*, const automaton::State*> &out = *(std::tuple<automaton::EpsilonNFA, int, const automaton::State*, const automaton::State*>*) userData;
+	automaton::EpsilonNFA& automaton = std::get<0>(out);
+	// fresh head/tail; every alternative is wired head -eps-> element head and element tail -eps-> tail
+	automaton::State head = automaton::State(label::Label(label::IntegerLabel(std::get<1>(out)++)));
+	automaton::State tail = automaton::State(label::Label(label::IntegerLabel(std::get<1>(out)++)));
+	automaton.addState(head);
+	automaton.addState(tail);
+
+	for(const auto& element : alternation.getElements())
+	{
+		static_cast<const regexp::UnboundedRegExpElement&>(*element).Accept(userData, *this);
+		automaton.addTransition(head, string::Epsilon::EPSILON, *std::get<2>(out));
+		automaton.addTransition(*std::get<3>(out), string::Epsilon::EPSILON, tail);
+	}
+
+	std::get<2>(out) = &(*automaton.getStates().find(head));
+	std::get<3>(out) = &(*automaton.getStates().find(tail));
+}
+
+void Thompson::Visit(void* userData, const regexp::UnboundedRegExpConcatenation& concatenation)
+{
+	std::tuple<automaton::EpsilonNFA, int, const automaton::State*, const automaton::State*> &out = *(std::tuple<automaton::EpsilonNFA, int, const automaton::State*, const automaton::State*>*) userData;
+	automaton::EpsilonNFA& automaton = std::get<0>(out);
+	// build every element in order, remembering its (head, tail); then chain tail[i-1] -eps-> head[i]
+	std::vector<std::pair<const automaton::State*, const automaton::State*>> tails;
+	for(const auto& element : concatenation.getElements())
+	{
+		static_cast<const regexp::UnboundedRegExpElement&>(*element).Accept(userData, *this);
+		tails.push_back(std::make_pair(std::get<2>(out), std::get<3>(out)));
+	}
+
+	for(size_t i = 1; i < tails.size(); i++)
+		automaton.addTransition(*tails[i-1].second, string::Epsilon::EPSILON, *tails[i].first);
+	// NOTE(review): the two lines below index tails[0]; assumes getElements() is never empty - confirm
+	std::get<2>(out) = tails[0].first;
+	std::get<3>(out) = tails[tails.size()-1].second;
+}
+
+void Thompson::Visit(void* userData, const regexp::UnboundedRegExpIteration& iteration)
+{
+	std::tuple<automaton::EpsilonNFA, int, const automaton::State*, const automaton::State*> &out = *(std::tuple<automaton::EpsilonNFA, int, const automaton::State*, const automaton::State*>*) userData;
+	automaton::EpsilonNFA& automaton = std::get<0>(out);
+	// head -eps-> inner head, head -eps-> tail (skip), inner tail -eps-> tail, inner tail -eps-> inner head (repeat)
+	automaton::State head = automaton::State(label::Label(label::IntegerLabel(std::get<1>(out)++)));
+	automaton::State tail = automaton::State(label::Label(label::IntegerLabel(std::get<1>(out)++)));
+	automaton.addState(head);
+	automaton.addState(tail);
+
+	static_cast<const regexp::UnboundedRegExpElement&>(iteration.getElement()).Accept(userData, *this);
+	automaton.addTransition(head, string::Epsilon::EPSILON, *std::get<2>(out));
+	automaton.addTransition(head, string::Epsilon::EPSILON, tail);
+	automaton.addTransition(*std::get<3>(out), string::Epsilon::EPSILON, tail);
+	automaton.addTransition(*std::get<3>(out), string::Epsilon::EPSILON, *std::get<2>(out));
+
+	std::get<2>(out) = &(*automaton.getStates().find(head));
+	std::get<3>(out) = &(*automaton.getStates().find(tail));
+}
+
+void Thompson::Visit(void* userData, const regexp::UnboundedRegExpSymbol& symbol)
+{
+	std::tuple<automaton::EpsilonNFA, int, const automaton::State*, const automaton::State*> &out = *(std::tuple<automaton::EpsilonNFA, int, const automaton::State*, const automaton::State*>*) userData;
+	automaton::EpsilonNFA& automaton = std::get<0>(out);
+	// two fresh states joined by one transition on the symbol
+	automaton::State head = automaton::State(label::Label(label::IntegerLabel(std::get<1>(out)++)));
+	automaton::State tail = automaton::State(label::Label(label::IntegerLabel(std::get<1>(out)++)));
+	automaton.addState(head);
+	automaton.addState(tail);
+
+	automaton.addTransition(head, symbol.getSymbol(), tail);
+	std::get<2>(out) = &(*automaton.getStates().find(head));
+	std::get<3>(out) = &(*automaton.getStates().find(tail));
+}
+
+void Thompson::Visit(void* userData, const regexp::UnboundedRegExpEpsilon& epsilon)
+{
+	std::tuple<automaton::EpsilonNFA, int, const automaton::State*, const automaton::State*> &out = *(std::tuple<automaton::EpsilonNFA, int, const automaton::State*, const automaton::State*>*) userData;
+	automaton::EpsilonNFA& automaton = std::get<0>(out);
+	// two fresh states joined by one epsilon transition
+	automaton::State head = automaton::State(label::Label(label::IntegerLabel(std::get<1>(out)++)));
+	automaton::State tail = automaton::State(label::Label(label::IntegerLabel(std::get<1>(out)++)));
+	automaton.addState(head);
+	automaton.addState(tail);
+
+	automaton.addTransition(head, string::Epsilon::EPSILON, tail);
+	std::get<2>(out) = &(*automaton.getStates().find(head));
+	std::get<3>(out) = &(*automaton.getStates().find(tail));
+}
+
+void Thompson::Visit(void* userData, const regexp::UnboundedRegExpEmpty& empty)
+{
+	std::tuple<automaton::EpsilonNFA, int, const automaton::State*, const automaton::State*> &out = *(std::tuple<automaton::EpsilonNFA, int, const automaton::State*, const automaton::State*>*) userData;
+	automaton::EpsilonNFA& automaton = std::get<0>(out);
+	// two fresh states with no transition between them
+	automaton::State head = automaton::State(label::Label(label::IntegerLabel(std::get<1>(out)++)));
+	automaton::State tail = automaton::State(label::Label(label::IntegerLabel(std::get<1>(out)++)));
+	automaton.addState(head);
+	automaton.addState(tail);
+
+	std::get<2>(out) = &(*automaton.getStates().find(head));
+	std::get<3>(out) = &(*automaton.getStates().find(tail));
+}
+
+} /* namespace re2fa */
diff --git a/alib2algo/src/conversions/re2fa/Thompson.h b/alib2algo/src/conversions/re2fa/Thompson.h
new file mode 100644
index 0000000000..ace518cd99
--- /dev/null
+++ b/alib2algo/src/conversions/re2fa/Thompson.h
@@ -0,0 +1,59 @@
+/*
+ * Thompson.h
+ *
+ * Created on: 11. 1. 2014
+ * Author: tomas
+ */
+
+#ifndef THOMPSON_H_
+#define THOMPSON_H_
+
+#include <regexp/RegExp.h>
+#include <regexp/formal/FormalRegExpElements.h>
+#include <regexp/unbounded/UnboundedRegExpElements.h>
+#include <automaton/FSM/EpsilonNFA.h>
+
+namespace re2fa
+{
+
+/**
+ * Converts regular expression to finite automaton using Thompson's Construction Algorithm (TCA).
+ * Sources:
+ *  Hopcroft, section 3.2.3
+ *  http://www.eecis.udel.edu/~cavazos/cisc672/lectures/Lecture-04.pdf
+ *  http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.21.7450&rep=rep1&type=ps
+ *  Melichar 2.112
+ */
+class Thompson : public regexp::VisitableRegExpBase::visitor_type, regexp::FormalRegExpElement::visitor_type, regexp::UnboundedRegExpElement::visitor_type
+{
+public:
+	Thompson(void);
+	~Thompson(void);
+	/**
+	 * Performs conversion of a formal or unbounded regular expression.
+	 * @return nondeterministic finite automaton with epsilon transitions accepting language described by the regexp
+	 */
+	automaton::EpsilonNFA convert(const regexp::RegExp& regexp);
+
+private:
+	void Visit(void*, const regexp::UnboundedRegExp& regexp); // sets the input alphabet, then visits the root element
+	void Visit(void*, const regexp::FormalRegExp& regexp); // sets the input alphabet, then visits the root element
+	// element visitors for the unbounded representation (alternation/concatenation iterate over getElements())
+	void Visit(void*, const regexp::UnboundedRegExpAlternation& alternation);
+	void Visit(void*, const regexp::UnboundedRegExpConcatenation& concatenation);
+	void Visit(void*, const regexp::UnboundedRegExpIteration& iteration);
+	void Visit(void*, const regexp::UnboundedRegExpSymbol& symbol);
+	void Visit(void*, const regexp::UnboundedRegExpEpsilon& epsilon);
+	void Visit(void*, const regexp::UnboundedRegExpEmpty& empty);
+	// element visitors for the formal representation (alternation/concatenation have a left and a right element)
+	void Visit(void*, const regexp::FormalRegExpAlternation& alternation);
+	void Visit(void*, const regexp::FormalRegExpConcatenation& concatenation);
+	void Visit(void*, const regexp::FormalRegExpIteration& iteration);
+	void Visit(void*, const regexp::FormalRegExpSymbol& symbol);
+	void Visit(void*, const regexp::FormalRegExpEpsilon& epsilon);
+	void Visit(void*, const regexp::FormalRegExpEmpty& empty);
+};
+
+} /* namespace re2fa */
+
+#endif /* THOMPSON_H_ */
-- 
GitLab