diff --git a/aconversions/src/conversions/re2fa/Thompson.cpp b/aconversions/src/conversions/re2fa/Thompson.cpp
index 3b606f8b5f836f30f9c93568f6759018299e73b7..4cdb865a9bb613539ff79f3f3a70f61888a44ca4 100644
--- a/aconversions/src/conversions/re2fa/Thompson.cpp
+++ b/aconversions/src/conversions/re2fa/Thompson.cpp
@@ -4,9 +4,206 @@
  * Created on: 11. 1. 2014
  * Author: tomas
  */
+#include <iostream>
 #include "Thompson.h"
 
+using namespace automaton;
+using namespace regexp;
+
 namespace conversions
 {
 
+/**
+ * Creates a converter for the given regular expression.
+ *
+ * @param re regular expression to convert; forwarded to AbstractREtoFAConverter
+ */
+Thompson::Thompson( const RegExp & re ) : AbstractREtoFAConverter( re )
+{
+
+}
+
+/**
+ * Builds the automaton for the whole expression: the head of the root
+ * fragment becomes the initial state and its tail the final state.
+ *
+ * @return copy of m_fsm after the fragment for the root has been added
+ * @throws ConversionException if process() meets an element it cannot handle
+ */
+const FSM Thompson::convert( void )
+{
+    // FIXME in alib! The const_cast is presumably needed because getRegExp()
+    // cannot be called on a const RegExp -- confirm against alib.
+    RegExp & re = const_cast<RegExp &>( m_re );
+    RegExpElement * treeRoot = re.getRegExp();
+
+    SubexpressionTails st = process( treeRoot );
+    m_fsm.addInitialState( st.m_head );
+    m_fsm.addFinalState( st.m_tail );
+
+    return m_fsm;
+}
+
+/**
+ * Dispatches on the dynamic type of the element and builds its fragment.
+ *
+ * @param element node of the regular expression tree
+ * @return head and tail state of the fragment built for the element
+ * @throws ConversionException if the element is none of Alternation,
+ *         Concatenation, Iteration or RegExpSymbol
+ */
+Thompson::SubexpressionTails Thompson::process( RegExpElement * element )
+{
+    // The casts are tried one at a time, in a fixed order, so that no further
+    // cast is performed once a match has been found.
+    if( Alternation * alternation = dynamic_cast<Alternation *>( element ) )
+        return processAlternation( alternation );
+
+    if( Concatenation * concatenation = dynamic_cast<Concatenation *>( element ) )
+        return processConcatenation( concatenation );
+
+    if( Iteration * iteration = dynamic_cast<Iteration *>( element ) )
+        return processIteration( iteration );
+
+    if( RegExpSymbol * symbol = dynamic_cast<RegExpSymbol *>( element ) )
+        return processSymbol( symbol );
+
+    throw ConversionException( "Captain's log. Stardate 3413.6. Encountered invalid RegExpElement. Sending away team to explore." );
+}
+
+/**
+ * Builds the fragment for an iteration of a subexpression.
+ *
+ * Transitions labelled Symbol( "" ) (assumed to be the library's epsilon)
+ * allow skipping the operand (head -> tail), entering it (head -> operand
+ * head), leaving it (operand tail -> tail) and repeating it (operand tail ->
+ * operand head).
+ *
+ * @param iteration iteration node whose operand is converted recursively
+ * @return the newly created head and tail state
+ */
+Thompson::SubexpressionTails Thompson::processIteration( Iteration * iteration )
+{
+    State head = AutomatonUtils::createUniqueState( "iter__head", m_fsm.getStates() );
+    State tail = AutomatonUtils::createUniqueState( "iter__tail", m_fsm.getStates() );
+    m_fsm.addState( head );
+    m_fsm.addState( tail );
+
+    SubexpressionTails st = process( iteration->getElement() );
+
+    m_fsm.addTransition( head, Symbol( "" ), st.m_head );
+    m_fsm.addTransition( head, Symbol( "" ), tail );
+    m_fsm.addTransition( st.m_tail, Symbol( "" ), tail );
+    m_fsm.addTransition( st.m_tail, Symbol( "" ), st.m_head );
+
+    return SubexpressionTails( head, tail );
+}
+
+/**
+ * Builds the fragment for an alternation.
+ *
+ * A new head is connected to the head of every operand and the tail of every
+ * operand to a new tail, always through Symbol( "" ) transitions.
+ *
+ * @param alternation alternation node whose operands are converted recursively
+ * @return the newly created head and tail state
+ */
+Thompson::SubexpressionTails Thompson::processAlternation( Alternation * alternation )
+{
+    State head = AutomatonUtils::createUniqueState( "alt__head", m_fsm.getStates() );
+    State tail = AutomatonUtils::createUniqueState( "alt__tail", m_fsm.getStates() );
+    m_fsm.addState( head );
+    m_fsm.addState( tail );
+
+    for( auto element : alternation->getElements() )
+    {
+        SubexpressionTails st = process( element );
+        m_fsm.addTransition( head, Symbol( "" ), st.m_head );
+        m_fsm.addTransition( st.m_tail, Symbol( "" ), tail );
+    }
+
+    return SubexpressionTails( head, tail );
+}
+
+/**
+ * Builds the fragment for a concatenation by chaining the fragments of its
+ * operands: the tail of each operand is linked to the head of the next one
+ * through a Symbol( "" ) transition.
+ *
+ * @param concatenation concatenation node whose operands are converted recursively
+ * @return head of the first operand and tail of the last operand
+ * @throws ConversionException if the concatenation has no operands
+ */
+Thompson::SubexpressionTails Thompson::processConcatenation( Concatenation * concatenation )
+{
+    const auto & elements = concatenation->getElements();
+    auto it = elements.begin();
+
+    // Without operands there is no head or tail to hand back to the caller.
+    if( it == elements.end() )
+        throw ConversionException( "Thompson: concatenation without operands." );
+
+    // State is said to have no op=, so only the names are tracked (they are
+    // values, which also means nothing has to be freed when process() throws).
+    SubexpressionTails first = process( * it );
+    auto headName = first.m_head.getName();
+    auto tailName = first.m_tail.getName();
+
+    for( ++ it; it != elements.end(); ++ it )
+    {
+        SubexpressionTails st = process( * it );
+
+        State previousTail( tailName );
+        m_fsm.addTransition( previousTail, Symbol( "" ), st.m_head );
+        tailName = st.m_tail.getName();
+    }
+
+    State head( headName ), tail( tailName );
+    return SubexpressionTails( head, tail );
+}
+
+/**
+ * Builds the fragment for a single symbol: two new states joined by one
+ * transition labelled with that symbol.
+ *
+ * @param symbol symbol node
+ * @return the newly created start and end state
+ */
+Thompson::SubexpressionTails Thompson::processSymbol( RegExpSymbol * symbol )
+{
+    Symbol symb( symbol->getSymbol() );
+    State head = AutomatonUtils::createUniqueState( "sym__start", m_fsm.getStates() );
+    State tail = AutomatonUtils::createUniqueState( "sym__end", m_fsm.getStates() );
+
+    try
+    {
+        m_fsm.addInputSymbol( symb );
+    }
+    catch( const AutomatonException & )
+    {
+        // Deliberately ignored -- presumably raised when the symbol is already
+        // part of the input alphabet (confirm against alib).
+    }
+
+    m_fsm.addState( head );
+    m_fsm.addState( tail );
+    m_fsm.addTransition( head, symb, tail );
+
+    return SubexpressionTails( head, tail );
+}
+
+// ----------------------------------------------------------------------------
+
+/**
+ * Stores copies of the two states, rebuilt from their names.
+ *
+ * @param head first state of the fragment
+ * @param tail last state of the fragment
+ */
+Thompson::SubexpressionTails::SubexpressionTails( State & head, State & tail ) : m_head( head.getName() ), m_tail( tail.getName() )
+{
+
+}
+
+
 } /* namespace conversions */
diff --git a/aconversions/src/conversions/re2fa/Thompson.h b/aconversions/src/conversions/re2fa/Thompson.h
index 0311c23f46930856308e6951e4ff262a94ab8974..816b2813455bf1f7c77f76e42110fe389e0d46cd 100644
--- a/aconversions/src/conversions/re2fa/Thompson.h
+++ b/aconversions/src/conversions/re2fa/Thompson.h
@@ -8,11 +8,51 @@
 #ifndef THOMPSON_H_
 #define THOMPSON_H_
 
+#include <automaton/FSM/FSM.h>
+
+#include <regexp/RegExp.h>
+#include <regexp/RegExpElement.h>
+#include <regexp/Alternation.h>
+#include <regexp/Concatenation.h>
+#include <regexp/Iteration.h>
+#include <regexp/RegExpSymbol.h>
+
+#include "../../utils/AutomatonUtils.h"
+#include "../../utils/ConversionException.h"
+
+#include "AbstractREtoFAConverter.h"
+
 namespace conversions
 {
 
+/**
+ * Converts a regular expression to a finite automaton using Thompson's Construction Algorithm (TCA).
+ * Sources:
+ * Hopcroft, section 3.2.3
+ * http://www.eecis.udel.edu/~cavazos/cisc672/lectures/Lecture-04.pdf
+ */
 class Thompson : public AbstractREtoFAConverter
 {
 
+public:
+    Thompson( const regexp::RegExp & re );
+    const automaton::FSM convert( void );
+
+private:
+    /**
+     * Head and tail state of the automaton fragment built for a subexpression.
+     */
+    struct SubexpressionTails
+    {
+        SubexpressionTails( automaton::State & head, automaton::State & tail );
+        automaton::State m_head, m_tail; // kept by value: references to the callee's locals would go out of scope. Alib implements op==, so the copies still compare equal; only a little overhead
+    };
+
+    SubexpressionTails process( regexp::RegExpElement * element );
+    SubexpressionTails processAlternation( regexp::Alternation * alternation );
+    SubexpressionTails processConcatenation( regexp::Concatenation * concatenation );
+    SubexpressionTails processIteration( regexp::Iteration * iteration );
+    SubexpressionTails processSymbol( regexp::RegExpSymbol * symbol );
+};
 
 } /* namespace conversions */