From 13a7dd525d64f45123810680e530081b1ba215a2 Mon Sep 17 00:00:00 2001 From: Jan Travnicek <Jan.Travnicek@fit.cvut.cz> Date: Sat, 6 Sep 2014 17:57:58 +0200 Subject: [PATCH] regexp optimize --- .../conversions/fa2re/BrzozowskiAlgebraic.cpp | 2 +- .../equations/LeftRegularEquationSolver.cpp | 8 +- .../src/equations/LeftRegularEquationSolver.h | 2 +- .../src/equations/RegularEquationSolver.cpp | 5 +- .../src/equations/RegularEquationSolver.h | 2 +- .../equations/RightRegularEquationSolver.cpp | 12 +- .../equations/RightRegularEquationSolver.h | 2 +- alib2algo/src/regexp/RegExpOptimize.cpp | 1208 +++++++++++++++++ alib2algo/src/regexp/RegExpOptimize.h | 99 ++ .../test-src/regexp/RegExpOptimizeTest.cpp | 42 + .../test-src/regexp/RegExpOptimizeTest.h | 19 + .../unbounded/UnboundedRegExpAlternation.h | 4 + .../unbounded/UnboundedRegExpConcatenation.h | 4 + .../unbounded/UnboundedRegExpIteration.h | 4 + 14 files changed, 1397 insertions(+), 16 deletions(-) create mode 100644 alib2algo/src/regexp/RegExpOptimize.cpp create mode 100644 alib2algo/src/regexp/RegExpOptimize.h create mode 100644 alib2algo/test-src/regexp/RegExpOptimizeTest.cpp create mode 100644 alib2algo/test-src/regexp/RegExpOptimizeTest.h diff --git a/alib2algo/src/conversions/fa2re/BrzozowskiAlgebraic.cpp b/alib2algo/src/conversions/fa2re/BrzozowskiAlgebraic.cpp index 1098a33dc8..f336c3f2a7 100644 --- a/alib2algo/src/conversions/fa2re/BrzozowskiAlgebraic.cpp +++ b/alib2algo/src/conversions/fa2re/BrzozowskiAlgebraic.cpp @@ -60,7 +60,7 @@ regexp::UnboundedRegExp BrzozowskiAlgebraic::convert( const automaton::DFA & aut solver.addEquation( alphabet::Symbol( alphabet::LabeledSymbol ( p.first.first.getName() ) ), alphabet::Symbol( alphabet::LabeledSymbol( p.second.getName() ) ), regexp::UnboundedRegExpSymbol { p.first.second } ); } - return regexp::UnboundedRegExp { solver.solve( alphabet::Symbol( alphabet::LabeledSymbol (automaton.getInitialState().getName() ) ) ).getRegExp() }; + return solver.solve( 
alphabet::Symbol( alphabet::LabeledSymbol (automaton.getInitialState().getName() ) ) ); } } /* namespace fa2re */ diff --git a/alib2algo/src/equations/LeftRegularEquationSolver.cpp b/alib2algo/src/equations/LeftRegularEquationSolver.cpp index 0d57d010ac..6d404df1d3 100644 --- a/alib2algo/src/equations/LeftRegularEquationSolver.cpp +++ b/alib2algo/src/equations/LeftRegularEquationSolver.cpp @@ -8,10 +8,12 @@ #include "LeftRegularEquationSolver.h" #include "regexp/unbounded/UnboundedRegExpElements.h" +#include "../regexp/RegExpOptimize.h" + namespace equations { -regexp::UnboundedRegExpElement* LeftRegularEquationSolver::eliminate( void ) { - // RegExpOptimize opt; // TODO uncomment when implemented +regexp::UnboundedRegExp LeftRegularEquationSolver::eliminate( void ) { + regexp::RegExpOptimize opt; for( auto itA = m_symbolsByDepth.rbegin( ); itA != m_symbolsByDepth.rend( ); itA ++ ) { const alphabet::Symbol & a = * itA; @@ -69,7 +71,7 @@ regexp::UnboundedRegExpElement* LeftRegularEquationSolver::eliminate( void ) { } } - return m_eqFinal[ * m_symbolsByDepth.begin( ) ].clone(); + return opt.optimize( regexp::UnboundedRegExp( std::move ( m_eqFinal[ * m_symbolsByDepth.begin( ) ] ) ) ); } } /* namespace equations */ diff --git a/alib2algo/src/equations/LeftRegularEquationSolver.h b/alib2algo/src/equations/LeftRegularEquationSolver.h index ff2917322e..d5f93780eb 100644 --- a/alib2algo/src/equations/LeftRegularEquationSolver.h +++ b/alib2algo/src/equations/LeftRegularEquationSolver.h @@ -16,7 +16,7 @@ class LeftRegularEquationSolver : public RegularEquationSolver { /** * @copydoc RegularEquationSolver::eliminate(void) */ - virtual regexp::UnboundedRegExpElement* eliminate( void ); + virtual regexp::UnboundedRegExp eliminate( void ); }; diff --git a/alib2algo/src/equations/RegularEquationSolver.cpp b/alib2algo/src/equations/RegularEquationSolver.cpp index 44eaf65582..74f8484fc9 100644 --- a/alib2algo/src/equations/RegularEquationSolver.cpp +++ 
b/alib2algo/src/equations/RegularEquationSolver.cpp @@ -22,10 +22,7 @@ regexp::UnboundedRegExp RegularEquationSolver::solve( const alphabet::Symbol & s */ symbolsByDepth( solveFor ); - regexp::UnboundedRegExpElement* result = eliminate( ); - - regexp::UnboundedRegExp ret( *result ); - return ret; + return eliminate( ); } void RegularEquationSolver::addSymbol( const alphabet::Symbol & symb ) { diff --git a/alib2algo/src/equations/RegularEquationSolver.h b/alib2algo/src/equations/RegularEquationSolver.h index 9c551075bf..9911783a3d 100644 --- a/alib2algo/src/equations/RegularEquationSolver.h +++ b/alib2algo/src/equations/RegularEquationSolver.h @@ -65,7 +65,7 @@ protected: * actual equations elimination * @return pointer to solutions RegExp tree root */ - virtual regexp::UnboundedRegExpElement* eliminate( void ) = 0; + virtual regexp::UnboundedRegExp eliminate( void ) = 0; /** * Runs BFS to determine depth of symbols in equation system and stores it in m_symbolsByDepth; diff --git a/alib2algo/src/equations/RightRegularEquationSolver.cpp b/alib2algo/src/equations/RightRegularEquationSolver.cpp index 21403a1db1..4a1ecbe5ae 100644 --- a/alib2algo/src/equations/RightRegularEquationSolver.cpp +++ b/alib2algo/src/equations/RightRegularEquationSolver.cpp @@ -8,10 +8,12 @@ #include "RightRegularEquationSolver.h" #include "regexp/unbounded/UnboundedRegExpElements.h" +#include "../regexp/RegExpOptimize.h" + namespace equations { -regexp::UnboundedRegExpElement* RightRegularEquationSolver::eliminate( void ) { - // RegExpOptimize opt; // TODO uncomment when implemented +regexp::UnboundedRegExp RightRegularEquationSolver::eliminate( void ) { + regexp::RegExpOptimize opt; for( auto itA = m_symbolsByDepth.rbegin( ); itA != m_symbolsByDepth.rend( ); itA ++ ) { const alphabet::Symbol & a = * itA; @@ -56,7 +58,7 @@ regexp::UnboundedRegExpElement* RightRegularEquationSolver::eliminate( void ) { regexp::UnboundedRegExpAlternation alt; alt.appendElement( std::move( m_eqTransition[ b ][ c 
] ) ); alt.appendElement( std::move( concat ) ); - m_eqTransition[ b ][ c ] = /* opt.optimize( */ std::move( alt ) /* ) */; + m_eqTransition[ b ][ c ] = std::move( alt ); } regexp::UnboundedRegExpConcatenation concat; @@ -65,11 +67,11 @@ regexp::UnboundedRegExpElement* RightRegularEquationSolver::eliminate( void ) { regexp::UnboundedRegExpAlternation alt; alt.appendElement( std::move( m_eqFinal[ b ] ) ); alt.appendElement( std::move( concat ) ); - m_eqFinal[ b ] = /* opt.optimize( */ std::move( alt ) /* ) */; + m_eqFinal[ b ] = std::move( alt ); } } - return m_eqFinal[ * m_symbolsByDepth.begin( ) ].clone(); + return opt.optimize( regexp::UnboundedRegExp( std::move ( m_eqFinal[ * m_symbolsByDepth.begin( ) ] ) ) ); } } /* namespace equations */ diff --git a/alib2algo/src/equations/RightRegularEquationSolver.h b/alib2algo/src/equations/RightRegularEquationSolver.h index 5785e6b12a..dffbedfc4e 100644 --- a/alib2algo/src/equations/RightRegularEquationSolver.h +++ b/alib2algo/src/equations/RightRegularEquationSolver.h @@ -16,7 +16,7 @@ class RightRegularEquationSolver : public RegularEquationSolver { /** * @copydoc RegularEquationSolver::eliminate(void) */ - virtual regexp::UnboundedRegExpElement* eliminate( void ); + virtual regexp::UnboundedRegExp eliminate( void ); }; diff --git a/alib2algo/src/regexp/RegExpOptimize.cpp b/alib2algo/src/regexp/RegExpOptimize.cpp new file mode 100644 index 0000000000..0d247c7493 --- /dev/null +++ b/alib2algo/src/regexp/RegExpOptimize.cpp @@ -0,0 +1,1208 @@ +/* + * RegExpOptimize.cpp + * + * Created on: 20. 1. 
2014 + * Author: Tomas Pecka + */ + +#include "RegExpOptimize.h" + +#include <cassert> +#include <iostream> +#include <iostream> /* NOTE(review): duplicate of the include above; redundant */ + +namespace regexp { +/** Optimizes a whole regexp: runs optimize() on the root element, hands the result to the returned UnboundedRegExp via std::move and deletes the temporary root node. */ +UnboundedRegExp RegExpOptimize::optimize( UnboundedRegExp const & regexp ) +{ + UnboundedRegExpElement* optimized = optimize( & regexp.getRegExp( ) ); + + UnboundedRegExp ret( std::move( * optimized ) ); + + delete optimized; + + return ret; +} +/** Dispatches on the dynamic type of node (dynamic_cast chain) to the matching optimize() overload; throws exception::AlibException when none of the known node types matches. */ +UnboundedRegExpElement* RegExpOptimize::optimize( UnboundedRegExpElement const * const & node ) +{ + const UnboundedRegExpAlternation * alternation = dynamic_cast<const UnboundedRegExpAlternation*>( node ); + if( alternation ) + return optimize( alternation ); + + const UnboundedRegExpConcatenation * concatenation = dynamic_cast<const UnboundedRegExpConcatenation*>( node ); + if( concatenation ) + return optimize( concatenation ); + + const UnboundedRegExpIteration * iteration = dynamic_cast<const UnboundedRegExpIteration*>( node ); + if( iteration ) + return optimize( iteration ); + + const UnboundedRegExpSymbol * symbol = dynamic_cast<const UnboundedRegExpSymbol*>( node ); + if( symbol ) + return optimize( symbol ); + + const UnboundedRegExpEmpty * empty= dynamic_cast<const UnboundedRegExpEmpty*>( node ); + if( empty ) + return optimize( empty ); + + const UnboundedRegExpEpsilon * eps = dynamic_cast<const UnboundedRegExpEpsilon*>( node ); + if( eps ) + return optimize( eps ); + + throw exception::AlibException( "RegExpOptimize::optimize - unknown UnboundedRegExpElement node" ); +} + +/** Optimizes an alternation: optimizes every child into a fresh node, applies the alternation rules until none reports a change, then collapses a one-element result to that element and an empty result to UnboundedRegExpEmpty. */ +UnboundedRegExpElement * RegExpOptimize::optimize( UnboundedRegExpAlternation const * const & node ) +{ + UnboundedRegExpAlternation* alt = new UnboundedRegExpAlternation( ); + + for( const auto & child : node->elements ) + alt->elements.push_back( optimize( child ) ); + + // optimize while you can: || short-circuits, so the chain restarts from A1 as soon as one rule reports a change and the loop ends once every rule returns false (the empty loop body is intentional) + while( A1( alt ) || A2( alt ) || A3( alt ) || A4( alt ) || A10( alt ) || V2( alt ) || V5( alt ) || V6( alt ) || X1( alt ) ); + + if( alt->elements.size( ) == 1 ) + { + 
UnboundedRegExpElement* ret = alt->elements.front( ); + alt->elements.clear( ); /* detach the child before deleting the wrapper (presumably the destructor deletes whatever is still in elements - confirm) */ + delete alt; + return ret; + } + + if( alt->elements.size( ) == 0 ) { + delete alt; + return new UnboundedRegExpEmpty( ); + } + + return alt; +} +/** Optimizes a concatenation: optimizes every child into a fresh node and applies the concatenation rules until none reports a change. Any UnboundedRegExpEmpty child turns the whole result into UnboundedRegExpEmpty; a one-element result collapses to that element and an empty result to UnboundedRegExpEpsilon. */ +UnboundedRegExpElement * RegExpOptimize::optimize( UnboundedRegExpConcatenation const * const & node ) +{ + UnboundedRegExpConcatenation* concat = new UnboundedRegExpConcatenation( ); + + for( const auto & child : node->elements ) + concat->elements.push_back( optimize( child ) ); + + do + { + // A7 (\0.x = x.\0 = \0) is implemented here, before every rule pass ~ if not here (i.e. with A7() called from the loop condition), it went into infinite loop FIXME + if( std::any_of( concat->elements.begin( ), concat->elements.end( ), []( UnboundedRegExpElement const * const & a ) -> bool{ return dynamic_cast<UnboundedRegExpEmpty const *>( a ); } ) ) + { + delete concat; + return new UnboundedRegExpEmpty( ); + } + } + while( A5( concat ) || A6( concat ) || /* A7( concat ) || */ A8( concat ) || A9( concat ) || V8( concat ) );//|| V9( concat ) ); + + if( concat->elements.size( ) == 1 ) + { + UnboundedRegExpElement* ret = concat->elements.front( ); + concat->elements.clear( ); + delete concat; + return ret; + } + + if( concat->elements.size( ) == 0 ) { + delete concat; + return new UnboundedRegExpEpsilon( ); + } + + return concat; +} +/** Optimizes an iteration: wraps the optimized child in a new iteration node and applies the iteration rules until none reports a change; an iteration over UnboundedRegExpEmpty becomes UnboundedRegExpEpsilon. NOTE(review): the pointer returned by the inner optimize() is only dereferenced here - confirm the constructor takes ownership, otherwise that tree leaks. */ +UnboundedRegExpElement * RegExpOptimize::optimize( UnboundedRegExpIteration const * const & node ) +{ + UnboundedRegExpIteration* iter = new UnboundedRegExpIteration(* optimize( node->element ) ); + + do + { + // V1 (\0* = \e) is implemented right here + if( dynamic_cast<UnboundedRegExpEmpty*>( iter->element ) ) + { + delete iter; + return new UnboundedRegExpEpsilon( ); + } + } + while( A11( iter ) || V1( iter ) || V3( iter ) || V4( iter ) || V10( iter ) ); + + return iter; +} +/** Leaf node, nothing to rewrite: returns node->clone( ). */ +UnboundedRegExpElement * RegExpOptimize::optimize( UnboundedRegExpSymbol const * const & node ) +{ + return node->clone( ); +} +/** Leaf node, nothing to rewrite: returns node->clone( ). */ +UnboundedRegExpElement * RegExpOptimize::optimize( UnboundedRegExpEmpty const * const & node ) +{ + return 
node->clone( ); +} +/** Leaf node, nothing to rewrite: returns node->clone( ). */ +UnboundedRegExpElement * RegExpOptimize::optimize( UnboundedRegExpEpsilon const * const & node ) +{ + return node->clone( ); +} + +// ---------------------------------------------------------------------------- + +/** + * optimization A1: x + ( y + z ) = ( x + y ) + z = x + y + z (splices the children of a nested alternation into node at the position of that child) + * @param node UnboundedRegExpAlternation node, modified in place + * @return bool true if optimization applied else false + */ +bool RegExpOptimize::A1( UnboundedRegExpAlternation * const & node ) +{ + bool optimized = false; + + for( auto it = node->elements.begin( ); it != node->elements.end( ); ) + { + UnboundedRegExpAlternation * const & childUnboundedRegExpAlternation = dynamic_cast<UnboundedRegExpAlternation *>( * it ); + + if( childUnboundedRegExpAlternation ) + { + it = node->elements.erase( it ); + + size_t off = it - node->elements.begin(); /* remember the position as an index, it is recomputed from begin() after the insert */ + node->elements.insert( it, childUnboundedRegExpAlternation->elements.begin( ), childUnboundedRegExpAlternation->elements.end( ) ); + it = node->elements.begin() + off; + + //TODO on g++-4.9 use: it = node->elements.insert( it, childUnboundedRegExpAlternation->elements.begin( ), childUnboundedRegExpAlternation->elements.end( ) ); + + childUnboundedRegExpAlternation->elements.clear( ); /* the children now sit in node; empty the child before deleting it */ + delete childUnboundedRegExpAlternation; + + optimized = true; + } + else + { + it ++; + } + } + + return optimized; +} + +/** + * optimization A2: x + y = y + x (sort) - orders the children by operator< on the pointed-to elements + * @param node UnboundedRegExpAlternation node, modified in place + * @return bool true if optimization applied (children were not sorted yet) else false + */ +bool RegExpOptimize::A2( UnboundedRegExpAlternation * const & node ) +{ + std::function<bool( UnboundedRegExpElement const * const & a, UnboundedRegExpElement const * const & b )> cmp = [ ]( UnboundedRegExpElement const * const & a, UnboundedRegExpElement const * const & b ) -> bool { return *a < *b; }; + + if( std::is_sorted( node->elements.begin( ), node->elements.end( ), cmp ) ) + return false; + + std::sort( node->elements.begin(), node->elements.end(), cmp ); + return true; +} 
+ +/** + * optimization A3: x + \0 = x (deletes \0 children while node still has more than one element) + * @param node UnboundedRegExpAlternation node, modified in place + * @return bool true if optimization applied else false + */ +bool RegExpOptimize::A3( UnboundedRegExpAlternation * const & node ) +{ + bool optimized = false; + + // input can be \0 + \0, so at least one element must be preserved + + // FIXME: alib2 uses shared_ptrs, rewrite this using remove_if then + + for( auto it = node->elements.begin( ); it != node->elements.end( ); ) + { + UnboundedRegExpEmpty const * const & empty = dynamic_cast<UnboundedRegExpEmpty const *>( * it ); + + if( empty && node->elements.size( ) > 1 ) + { + it = node->elements.erase( it ); + delete empty; + + optimized = true; + } + else + { + it ++; + } + } + + return optimized; +} + +/** + * optimization A4: x + x = x (deletes every child that equals its immediate predecessor) + * @param node UnboundedRegExpAlternation node, modified in place + * @return bool true if optimization applied else false + */ +bool RegExpOptimize::A4( UnboundedRegExpAlternation * const & node ) +{ + /* + * two ways of implementing this optimization: + * - sort and call std::unique ( O(n lg n) + O(n) ), but it also sorts... 
+ * - check every element against other ( O(n*n) ) + * + * As we always sort in optimization, we can use the first version, but A4 must be __always__ called __after__ A2 + */ + + // uncomment if smart ptrs used + // node->elements.unique( [ ]( UnboundedRegExpElement const * const & a, UnboundedRegExpElement const * const & b ) -> bool { + // return *a == *b; + // } ); + + bool optimized = false; + if(node->elements.size() != 0) for( auto it = std::next( node->elements.begin( ) ); it != node->elements.end( ); ) + { + if ( ** it == ** std::prev( it ) ) /* only neighbours are compared, hence the sorted-input requirement stated above */ + { + delete * it; + it = node->elements.erase( it ); + optimized = true; + } + else + { + it ++; + } + } + + return optimized; +} + +/** + * optimization A5: x.(y.z) = (x.y).z = x.y.z (splices the children of a nested concatenation into node at the position of that child) + * @param node UnboundedRegExpConcatenation node, modified in place + * @return bool true if optimization applied else false + */ +bool RegExpOptimize::A5( UnboundedRegExpConcatenation * const & node ) +{ + bool optimized = false; + + for( auto it = node->elements.begin( ); it != node->elements.end( ); ) + { + UnboundedRegExpConcatenation * const & childUnboundedRegExpConcatenation = dynamic_cast<UnboundedRegExpConcatenation *>( * it ); + + if( childUnboundedRegExpConcatenation ) + { + it = node->elements.erase( it ); + + size_t off = it - node->elements.begin(); + node->elements.insert( it, childUnboundedRegExpConcatenation->elements.begin( ), childUnboundedRegExpConcatenation->elements.end( ) ); + it = node->elements.begin() + off; + + //TODO on g++-4.9 use: it = node->elements.insert( it, childUnboundedRegExpConcatenation->elements.begin( ), childUnboundedRegExpConcatenation->elements.end( ) ); + + childUnboundedRegExpConcatenation->elements.clear( ); + delete childUnboundedRegExpConcatenation; + + optimized = true; + } + else + it ++; + } + + return optimized; +} + +/** + * optimization A6: \e.x = x.\e = x (deletes \e children while node still has more than one element) + * @param node UnboundedRegExpConcatenation node, modified in place + * @return bool true if optimization applied else false + */ +bool RegExpOptimize::A6( 
UnboundedRegExpConcatenation * const & node ) +{ + bool optimized = false; + + // FIXME: alib2 uses shared_ptrs, rewrite this using remove_if then + + for( auto it = node->elements.begin( ); it != node->elements.end( ); ) + { + UnboundedRegExpEpsilon* epsilon = dynamic_cast<UnboundedRegExpEpsilon*>( * it ); + if( epsilon && node->elements.size( ) > 1 ) + { + delete * it; + it = node->elements.erase( it ); + + optimized = true; + } + else + it ++; + } + + return optimized; +} + +/** + * optimization A7: \0.x = x.\0 = \0 (NOTE: reports a change whenever a \0 child is present, even if node already is the single element \0, so it never settles inside a repeat-until-unchanged loop) + * @param node UnboundedRegExpConcatenation node, modified in place + * @return bool true if optimization applied else false + */ +bool RegExpOptimize::A7( UnboundedRegExpConcatenation * const & node ) +{ + bool optimized = false; + + // FIXME: alib2 uses shared_ptrs, rewrite this using remove_if then + + if( std::any_of( node->elements.begin( ), node->elements.end( ), []( UnboundedRegExpElement const * const & a ) -> bool{ return dynamic_cast<UnboundedRegExpEmpty const *>( a ); } ) ) + { + for( auto const& child : node->elements ) + delete child; + + node->elements.clear( ); + node->elements.push_back( new UnboundedRegExpEmpty( ) ); + + optimized = true; + } + + return optimized; +} + +/** + * optimization A8: x.(y+z) = x.y + x.z (currently disabled: the implementation below is commented out and the function always returns false) + * @param node UnboundedRegExpConcatenation node (unused while the rule is disabled) + * @return bool true if optimization applied else false + */ +bool RegExpOptimize::A8( UnboundedRegExpConcatenation * const & node ) +{ +/* + bool optimized = false; + + for( auto it = std::next( node->elements.begin( ) ); it != node->elements.end( ); ) + { + UnboundedRegExpAlternation * alt = dynamic_cast<UnboundedRegExpAlternation*>( * it ); + if( ! alt ) + { + it ++; + continue; + } + + // take everything to the left and copy it as prefix of every element in alternation. 
+ UnboundedRegExpConcatenation * leftPart = new UnboundedRegExpConcatenation( ); + leftPart->elements.insert( leftPart->elements.end( ), node->elements.begin( ), it ); + + for( auto altIt = alt->elements.begin( ); altIt != alt->elements.end( ); altIt ++ ) + { + UnboundedRegExpConcatenation * altElem = new UnboundedRegExpConcatenation( ); + altElem->elements.push_back( leftPart->clone( ) ); + altElem->elements.push_back( * altIt ); + + * altIt = altElem; + } + + UnboundedRegExpElement * optIt = optimize( * it ); + delete *it; + *it = optIt; + + delete leftPart; + it = node->elements.erase( node->elements.begin( ), it ); + + optimized = true; + it ++; + } + + return optimized; +*/ + return false; +} + +/** + * optimization A9: (x+y).z = x.z + y.z (currently disabled: the implementation below is commented out and the function always returns false) + * @param node UnboundedRegExpConcatenation node (unused while the rule is disabled) + * @return bool true if optimization applied else false + */ +bool RegExpOptimize::A9( UnboundedRegExpConcatenation * const & node ) +{ +/* + bool optimized = false; + + for( auto it = node->elements.begin( ); it != std::prev( node->elements.end( ) ); ) + { + UnboundedRegExpAlternation * alt = dynamic_cast<UnboundedRegExpAlternation*>( * it ); + if( ! alt ) + { + it ++; + continue; + } + + // take everything to the right and copy it as suffix of every element in alternation. 
+ UnboundedRegExpConcatenation * rest = new UnboundedRegExpConcatenation( ); + rest->elements.insert( rest->elements.end( ), std::next( it ), node->elements.end( ) ); + + for( auto altIt = alt->elements.begin( ); altIt != alt->elements.end( ); altIt ++ ) + { + UnboundedRegExpConcatenation * altElem = new UnboundedRegExpConcatenation( ); + altElem->elements.push_back( * altIt ); + altElem->elements.push_back( rest->clone( ) ); + + * altIt = altElem; + } + + UnboundedRegExpElement * optIt = optimize( * it ); + delete *it; + *it = optIt; + + delete rest; + it = node->elements.erase( std::next( it ), node->elements.end( ) ); + optimized = true; + + // as we move (delete) the rest of this expression, it surely won't do another round. More optimizations to be performed are in subtree now. + // we do not care about this here as method optimize(UnboundedRegExpAlternation) will take care of this in next iteration + // it ++; + break; + } + + return optimized; +*/ + return false; +} + +/** + * optimization A10: x* = \e + x*x (a child x*x, i.e. a concatenation whose leading iteration wraps exactly the optimized rest of that concatenation, is replaced by x* appended at the end of node; one \e child, if present, is deleted as well) + * @param node UnboundedRegExpAlternation node, modified in place + * @return bool true if optimization applied else false + */ +bool RegExpOptimize::A10( UnboundedRegExpAlternation * const & node ) +{ + bool optimized = false, optimizedIter = false; + + /* + * problem: + * - \e + x*x = x* + * - but if we do not have the eps, but we do have iteration, then \e \in h(iter), therefore \e in h(node). 
+ */ + + for( auto it = node->elements.begin( ); it != node->elements.end( ); ) + { + optimizedIter = false; + + // check if we have some epsilon or iteration left, else nothing to do + auto eps = find_if( node->elements.begin( ), node->elements.end( ), [ ]( UnboundedRegExpElement const * const & a ) -> bool { + return dynamic_cast<UnboundedRegExpEpsilon const *>( a ) || dynamic_cast<UnboundedRegExpIteration const*>( a ); + }); + if( eps == node->elements.end( ) ) + break; + + UnboundedRegExpConcatenation const * const & childConcat = dynamic_cast<UnboundedRegExpConcatenation const *>( *it ); + if( childConcat ) + { + // if iteration is first element of concatenation + UnboundedRegExpIteration const * const & iter = dynamic_cast<UnboundedRegExpIteration const *>( childConcat->elements.front( ) ); + + if( iter ) + { + // concatenation without the iteration node + UnboundedRegExpConcatenation *tmpConcat = dynamic_cast<UnboundedRegExpConcatenation *>( childConcat->clone( ) ); + delete tmpConcat->elements.front( ); + tmpConcat->elements.erase( tmpConcat->elements.begin( ) ); + UnboundedRegExpElement * tmpConcatOpt = optimize( tmpConcat ); + + // check if iteration element is the same subtree as rest of concatenation + if( * iter->element == * tmpConcatOpt ) + { + optimized = optimizedIter = true; + + node->elements.push_back( iter->clone( ) ); /* NOTE(review): push_back happens while it is still in use; if elements is a std::vector a reallocation here would invalidate it before the erase below - confirm */ + + delete childConcat; + it = node->elements.erase( it ); + + // find the eps again - invalidated after prev erase + eps = find_if( node->elements.begin( ), node->elements.end( ), [ ]( UnboundedRegExpElement const * const & a ) -> bool { + return dynamic_cast<UnboundedRegExpEpsilon const *>( a ); + }); + // if it was eps, delete it + // if it was not the eps but iteration, keep it + if( eps != node->elements.end( ) ) + { + delete *eps; + it = node->elements.erase( eps ); + } + } + delete tmpConcat; + delete tmpConcatOpt; + } + } + + if( ! 
optimizedIter ) + it ++; + } + + return optimized; +} + +/** + * optimization A11: x* = (\e + x)* (deletes the first \e found in an alternation that is the direct element of the iteration) + * @param node UnboundedRegExpIteration node, its child alternation is modified in place + * @return bool true if optimization applied else false + */ +bool RegExpOptimize::A11( UnboundedRegExpIteration * const & node ) +{ + bool optimized = false; + + UnboundedRegExpAlternation * const & childAlt = dynamic_cast<UnboundedRegExpAlternation *>( node->element ); + + if( childAlt ) + { + // check if eps inside iteration's alternation + auto eps = find_if( childAlt->elements.begin( ), childAlt->elements.end( ), [ ]( UnboundedRegExpElement const * const & a ) -> bool { + return dynamic_cast<UnboundedRegExpEpsilon const *>( a ); + }); + + // if no eps + if( eps == childAlt->elements.end( ) ) + return false; + + // remove eps from alternation + optimized = true; + delete * eps; + childAlt->elements.erase( eps ); + } + + return optimized; +} + +/** + * optimization V1: \0* = \e (stub: the rule itself lives in optimize( UnboundedRegExpIteration ), this function always returns false) + * @param node UnboundedRegExpIteration node (unused) + * @return bool true if optimization applied else false + */ +bool RegExpOptimize::V1( UnboundedRegExpIteration * const & node ) +{ + // implemented in optimize( UnboundedRegExpIteration ) + + return false; +} + +/** + * optimization V2: x* + x = x* (deletes children of node that equal the element of an iteration child; for an iteration over an alternation its members are deleted only if every one of them is present in node) + * @param node UnboundedRegExpAlternation node, modified in place + * @return bool true if optimization applied else false + */ +bool RegExpOptimize::V2( UnboundedRegExpAlternation * const & node ) +{ + bool optimized = false; + + /* + * Bit tricky + * We need also to cover the cases like (a+b)* + a + b + c = (a+b)* + c + */ + + std::list<UnboundedRegExpElement*> iterElements; + // cache iter elements because of iterator invalidation after erase + for( const auto & n : node->elements ) + { + UnboundedRegExpIteration* iter = dynamic_cast<UnboundedRegExpIteration*>( n ); + if( iter ) + iterElements.push_back( iter->element ); + } + + for( const auto & n : iterElements ) + { + // if alternation is inside, we need to make sure that every element of alternation is inside 
node->elements. if so, delete them all + UnboundedRegExpAlternation * tmpAlt = dynamic_cast<UnboundedRegExpAlternation*>( n ); + if( tmpAlt ) + { + bool every = true; + for( const auto & altElem : tmpAlt->elements ) + { + auto it = find_if( node->elements.begin( ), node->elements.end( ), [ altElem ]( UnboundedRegExpElement const * const & a ) -> bool { + return *a == *altElem; + }); + + if( it == node->elements.end( ) ) + every = false; + } + + if ( every == true ) + { + optimized = true; + + for( const auto & altElem : tmpAlt->elements ) + { + auto it = find_if( node->elements.begin( ), node->elements.end( ), [ altElem ]( UnboundedRegExpElement const * const & a ) -> bool { + return *a == *altElem; + }); + assert( it != node->elements.end( ) ); + + delete *it; + node->elements.erase( it ); + } + } + } + + // else (NOTE: not an actual else branch - this loop runs for every cached iteration element): delete each child of node that equals the iteration's element + for( auto it = node->elements.begin( ); it != node->elements.end( ); ) + { + if( *n == **it ) + { + optimized = true; + + delete *it; + it = node->elements.erase( it ); + } + else + { + it ++; + } + } + } + + return optimized; +} + +/** + * optimization V3: x** = x* (the inner iteration is unwrapped: node adopts its element and the inner iteration node is deleted) + * @param node UnboundedRegExpIteration node, modified in place + * @return bool true if optimization applied else false + */ +bool RegExpOptimize::V3( UnboundedRegExpIteration * const & node ) +{ + UnboundedRegExpIteration* childIter = dynamic_cast<UnboundedRegExpIteration*>( node->element ); + if( childIter ) + { + node->element = childIter->element; + childIter->element = NULL; /* detach the adopted subtree before deleting the wrapper (presumes the destructor tolerates NULL - confirm) */ + delete childIter; + + return true; + } + + return false; +} + +/** + * optimization V4: (x+y)* = (x*y*)* (applied right to left: an iteration over a concatenation consisting only of iterations becomes an iteration over the optimized alternation of their elements) + * @param node UnboundedRegExpIteration node, modified in place + * @return bool true if optimization applied else false + */ +bool RegExpOptimize::V4( UnboundedRegExpIteration * const & node ) +{ + // interpretation: if iteration's element is concat and every concat's element is iteration (note: the variable named alt below holds that concatenation) + UnboundedRegExpConcatenation* alt = dynamic_cast<UnboundedRegExpConcatenation*>( node->element ); + if( ! alt || ! 
all_of( alt->elements.begin( ), alt->elements.end( ), [] ( UnboundedRegExpElement const * const & a ) -> bool{ return dynamic_cast<UnboundedRegExpIteration const * const >( a ); } ) ) + return false; + + UnboundedRegExpAlternation * newAlt = new UnboundedRegExpAlternation( ); + + for( const auto & n : alt->elements ) + { + UnboundedRegExpIteration * iter = dynamic_cast<UnboundedRegExpIteration*>( n ); + newAlt->elements.push_back( iter->element ); + iter->element = NULL; + } + + node->element = optimize( newAlt ); /* node->element pointed at alt until here; alt and the temporary newAlt are deleted right below */ + delete alt; + delete newAlt; + + return true; +} + +/** + * optimization V5: x*y = y + x*xy (for children prefix.x*x.suffix and prefix.suffix of node, both are removed and prefix.x*.suffix is inserted instead) + * @param node UnboundedRegExpAlternation node, modified in place + * @return bool true if optimization applied else false + */ +bool RegExpOptimize::V5( UnboundedRegExpAlternation * const & node ) +{ + bool optimized = false; + + // reinterpretation: ax*y = ay+ax*xy + // so, if we find iter, a = everything that is before it (prefix) + // x = iter's content + // behind iter must be exactly iter's content + // y = rest (suffix) + // prefix.x*x.suffix + prefix.suffix = prefix.x*.suffix + + for( auto itA = node->elements.begin( ); itA != node->elements.end( ); ) + { + UnboundedRegExpConcatenation * concat = dynamic_cast<UnboundedRegExpConcatenation*>( * itA ); + if( ! concat ) + { + itA ++; + continue; + } + + for( auto itC = concat->elements.begin( ); itC != std::prev( concat->elements.end( ) ); ) + { + UnboundedRegExpIteration * iter = dynamic_cast<UnboundedRegExpIteration*>( *itC ); + if( ! iter ) + { + itC ++; + continue; + } + + // iteration's element must follow the iteration (x*x) + auto itStartY = std::next( itC ); //itStartY points to y in expression x*xy + + // if iter's element is concat + if( dynamic_cast<UnboundedRegExpConcatenation*>( iter->element ) ) + { + UnboundedRegExpConcatenation * iterConcat = dynamic_cast<UnboundedRegExpConcatenation*>( iter->element ); + + // std::cout << "....." 
<< std::endl; + // std::cout << RegExp( concat ) << std::endl; + // std::cout << RegExp( iterConcat ) << std::endl; + // UnboundedRegExpConcatenation * tmp = new UnboundedRegExpConcatenation( ); + // tmp->elements.insert( tmp->elements.end( ), std::next( itC ), concat->elements.end( ) ); + // std::cout << RegExp( tmp) << std::endl; +/* the length check is against everything after the iteration, so with a concatenation as x this branch only matches when x is the whole remainder (y empty) */ + if( distance( iterConcat->elements.begin( ), iterConcat->elements.end( ) ) != distance( std::next( itC ), concat->elements.end( ) ) + || ! equal( iterConcat->elements.begin( ), iterConcat->elements.end( ), std::next( itC ), + [ ]( UnboundedRegExpElement const * const & a, UnboundedRegExpElement const * const & b ) -> bool{ return *a == *b; } ) ) + { + itC++; + continue; + } + advance( itStartY, (int)iterConcat->elements.size( ) ); + } + // else + else + { + if( * iter->element != ** std::next( itC ) ) + { + itC ++; + continue; + } + + advance( itStartY, 1 ); + } + + // store everything before iteration as "a" + UnboundedRegExpElement * regexpA; + if( concat->elements.begin( ) == itC ) + { + regexpA = new UnboundedRegExpEpsilon( ); + } + else + { + UnboundedRegExpConcatenation * tmpA = new UnboundedRegExpConcatenation( ); + tmpA->elements.insert( tmpA->elements.end( ), concat->elements.begin( ), itC ); + regexpA = optimize( tmpA ); + tmpA->elements.clear( ); + delete tmpA; + } + + // store everything behind iteration's followup element as "y" + UnboundedRegExpElement * regexpY; + if( itStartY == concat->elements.end( ) ) + { + regexpY = new UnboundedRegExpEpsilon( ); + } + else + { + UnboundedRegExpConcatenation* tmpY = new UnboundedRegExpConcatenation( ); + tmpY->elements.insert( tmpY->elements.end( ), itStartY, concat->elements.end( ) ); + regexpY = optimize( tmpY ); + tmpY->elements.clear( ); + delete tmpY; + } + + // concatenate "a" and "y" and see if they exist somewhere in parent alternation ( node->elements ) + UnboundedRegExpConcatenation* tmpAY = new UnboundedRegExpConcatenation( ); + tmpAY->elements.push_back( regexpA ); + 
tmpAY->elements.push_back( regexpY ); + UnboundedRegExpElement * regexpAY = optimize( tmpAY ); + tmpAY->elements.clear( ); + delete tmpAY; + + auto iterAY = find_if( node->elements.begin( ), node->elements.end( ), [ regexpAY ] ( UnboundedRegExpElement const * const & a ) -> bool{ return *a == *regexpAY; } ); + if( iterAY == node->elements.end( ) ) + { + itC ++; /* NOTE(review): regexpA, regexpY and regexpAY are not deleted on this path */ + continue; + } + + // if AY exists, then we can simply do this: + //iterator invalidated, need to backup concat node + UnboundedRegExpElement * tmpItA = *itA; + + delete *iterAY; + node->elements.erase( iterAY ); + + // iterator invalidated, need to recall before erase + itA = find_if( node->elements.begin( ), node->elements.end( ), [ tmpItA ]( UnboundedRegExpElement const * const & a ) -> bool { return *a == *tmpItA; } ); + + UnboundedRegExpConcatenation * tmpAltered = new UnboundedRegExpConcatenation( ); + tmpAltered->elements.push_back( regexpA ); + tmpAltered->elements.push_back( * itC ); + tmpAltered->elements.push_back( regexpY ); + UnboundedRegExpElement * regexpAltered = optimize( tmpAltered ); + + tmpAltered->elements.clear( ); + delete tmpAltered; + + delete regexpA; + delete regexpY; + delete regexpAY; + + delete *itA; + itA = node->elements.erase( itA ); + + node->elements.insert( itA, regexpAltered ); /* NOTE(review): the iterator returned by insert is discarded, yet itA is incremented right after the break - confirm itA is still valid there */ + + optimized = true; + break; + } + + itA ++; + } + + return optimized; +} + +/** + * optimization V6: x*y = y + xx*y + * @param node UnboundedRegExpAlternation node + * @return bool true if optimization applied else false + */ +bool RegExpOptimize::V6( UnboundedRegExpAlternation * const & node ) +{ + bool optimized = false; + + // reinterpretation: ax*y = ay+axx*y + // so, if we find iter, a = everything that is before it (prefix) + // x = iter's content + // before iter must be exactly iter's content + // y = rest (suffix) + // prefix.xx*.suffix + prefix.suffix = prefix.x*.suffix + + for( auto itA = node->elements.begin( ); itA != node->elements.end( ); ) + { + UnboundedRegExpConcatenation * 
concat = dynamic_cast<UnboundedRegExpConcatenation*>( * itA ); + if( ! concat ) + { + itA ++; + continue; + } + + for( auto itC = std::next( concat->elements.begin( ) ); itC != concat->elements.end( ); ) + { + UnboundedRegExpIteration * iter = dynamic_cast<UnboundedRegExpIteration*>( * itC ); + if( ! iter ) + { + itC ++; + continue; + } + + // iteration's element must preceed the iteration (xx*) + auto itStartX = itC; //itStartX points to first x in expression xx*, everything before is therefore prefix - regexp "a" + + // if iter's element is concat + if( dynamic_cast<UnboundedRegExpConcatenation*>( iter->element ) ) + { + UnboundedRegExpConcatenation * iterConcat = dynamic_cast<UnboundedRegExpConcatenation*>( iter->element ); + + if( distance( concat->elements.begin( ), itC ) < (int)iterConcat->elements.size( ) ) + { + itC ++; + continue; + } + advance( itStartX, - (int)(iterConcat->elements.size( ) ) ); + + if( distance( iterConcat->elements.begin( ), iterConcat->elements.end( ) ) != distance( itStartX, concat->elements.end( ) ) + || + ! 
equal( iterConcat->elements.begin( ), iterConcat->elements.end( ), itStartX, + []( UnboundedRegExpElement const * const & a, UnboundedRegExpElement const * const & b ) -> bool{ return *a == *b; } ) ) + { + itC++; + continue; + } + } + // else + else + { + if( * iter->element != ** std::prev( itC ) ) + { + itC ++; + continue; + } + + advance( itStartX, -1 ); + } + + // store everything before x as "a" + UnboundedRegExpElement * regexpA; + if( concat->elements.begin( ) == itStartX ) + { + regexpA = new UnboundedRegExpEpsilon( ); + } + else + { + UnboundedRegExpConcatenation* tmpA = new UnboundedRegExpConcatenation( ); + tmpA->elements.insert( tmpA->elements.end( ), concat->elements.begin( ), itStartX ); + regexpA = optimize( tmpA ); + tmpA->elements.clear( ); + delete tmpA; + } + + // store everything behind iteration's followup element as "y" + UnboundedRegExpElement * regexpY; + if( std::next( itC ) == concat->elements.end( ) ) + { + regexpY = new UnboundedRegExpEpsilon( ); + } + else + { + UnboundedRegExpConcatenation* tmpY = new UnboundedRegExpConcatenation( ); + tmpY->elements.insert( tmpY->elements.end( ), std::next( itC ), concat->elements.end( ) ); + regexpY = optimize( tmpY ); + tmpY->elements.clear( ); + delete tmpY; + } + + // concatenate "a" and "y" and see if they exist somewhere in parent alternation ( node->elements ) + UnboundedRegExpConcatenation* tmpAY = new UnboundedRegExpConcatenation( ); + tmpAY->elements.push_back( regexpA ); + tmpAY->elements.push_back( regexpY ); + UnboundedRegExpElement * regexpAY = optimize( tmpAY ); + tmpAY->elements.clear( ); + delete tmpAY; + + auto iterAY = find_if( node->elements.begin( ), node->elements.end( ), [ regexpAY ] ( UnboundedRegExpElement const * const & a ) -> bool{ return *a == *regexpAY; } ); + if( iterAY == node->elements.end( ) ) + { + itC ++; + continue; + } + + // if AY exists, then we can simply do this: + //iterator invalidated, need to backup concat node + UnboundedRegExpElement * tmpItA = *itA; + 
delete *iterAY; + node->elements.erase( iterAY ); + + // iterator invalidated, need to recall before erase + itA = find_if( node->elements.begin( ), node->elements.end( ), [ tmpItA ]( UnboundedRegExpElement const * const & a ) -> bool { return *a == *tmpItA; } ); + + UnboundedRegExpConcatenation * tmpAltered = new UnboundedRegExpConcatenation( ); + tmpAltered->elements.push_back( regexpA ); + tmpAltered->elements.push_back( * itC ); + tmpAltered->elements.push_back( regexpY ); + UnboundedRegExpElement * regexpAltered = optimize( tmpAltered ); + + tmpAltered->elements.clear( ); + delete tmpAltered; + + delete regexpA; + delete regexpY; + delete regexpAY; + + delete *itA; + itA = node->elements.erase( itA ); + + node->elements.insert( itA, regexpAltered ); + optimized = true; + break; + } + + itA ++; + } + + return optimized; +} + +/** + * optimization V8: \e in h(x) => xx*=x* + * @param node UnboundedRegExpConcatenation node + * @return bool true if optimization applied else false + */ +bool RegExpOptimize::V8( UnboundedRegExpConcatenation * const & node ) +{ + bool optimized = false; + + // interpretation: if there is iteration in concatenation node, and element of iteration contains eps and is straight before this iteration, then this element can be omitted + + for( auto it = next( node->elements.begin( ) ); it != node->elements.end( ); ) + { + UnboundedRegExpIteration* iter = dynamic_cast<UnboundedRegExpIteration*>( * it ); + + if( ! 
iter )
+		{
+			it ++;
+			continue;
+		}
+
+		// if element of iteration is concatenation, we need to check this specially
+		UnboundedRegExpConcatenation * concat = dynamic_cast<UnboundedRegExpConcatenation*>( iter->element );
+
+		if( concat )
+		{
+			// check if not out of bounds: x needs as many elements in front of the iteration as it has itself
+			if( distance( node->elements.begin( ), it ) < distance( concat->elements.begin(), concat->elements.end() ) )
+			{
+				it ++;
+				continue;
+			}
+
+			// it2 = first element of the candidate copy of x that directly precedes the iteration
+			auto it2 = it;
+			advance( it2, - distance( concat->elements.begin( ), concat->elements.end( ) ) );
+
+			// [it2, it) has the length of x by construction, so only the contents need comparing.
+			// The former extra test required distance( it2, node end ) to equal the length of x;
+			// that range also contains the iteration itself, so the test was never satisfied and
+			// this branch never fired.
+			if( concat->containsEmptyString( ) &&
+				equal( concat->elements.begin( ), concat->elements.end( ), it2, [] ( UnboundedRegExpElement const * const & a, UnboundedRegExpElement const * const & b ) -> bool { return *a == *b; } ) )
+			{
+				optimized = true;
+
+				// drop the redundant copy of x; erase( ) hands back the iteration, which is
+				// examined again in case another copy of x precedes it
+				for( auto delIt = it2 ; delIt != it ; delIt ++ )
+					delete *delIt;
+				it = node->elements.erase( it2, it );
+			}
+			else
+			{
+				it ++;
+			}
+		}
+		// else
+		else
+		{
+			// "it" can end up on the first element after the step back below; nothing precedes it then
+			if( it == node->elements.begin( ) )
+			{
+				it++;
+				continue;
+			}
+
+			auto prev = std::prev( it );
+
+			if( iter->element->containsEmptyString( ) && *( iter->element ) == **prev )
+			{
+				delete * prev;
+				it = node->elements.erase( prev );
+				optimized = true;
+
+				// in case xxx*, step back so that the iteration is reached again and the next x in front of it is tested
+				if( it != node->elements.begin( ) )
+					it = std::prev( it );
+			}
+			else
+			{
+				it ++;
+			}
+		}
+	}
+
+	return optimized;
+}
+
+/**
+ * optimization V9: (xy)*x = x(yx)*
+ * @param node UnboundedRegExpConcatenation node
+ * @return bool true if optimization applied else false
+ */
+bool RegExpOptimize::V9( UnboundedRegExpConcatenation * const & node )
+{
+	bool optimized = false;
+
+	// interpretation: if concat (C1) with iter && iteration's element is concat (C2), then:
+	//                 simultaneously iterate through C1 and C2. (axy)*axz=ax(yax)*z -> get ax that is same and relocate them...
+
+	for( auto it = node->elements.begin( ) ; it != node->elements.end( ) ; )
+	{
+		UnboundedRegExpIteration * iter = dynamic_cast<UnboundedRegExpIteration*>( * it );
+		if ( ! iter )
+		{
+			it++;
+			continue;
+		}
+		UnboundedRegExpConcatenation * concat = dynamic_cast<UnboundedRegExpConcatenation*>( iter->element );
+		if( ! concat )
+		{
+			it++;
+			continue;
+		}
+
+		// find range from <it+1;sth> and <concat.begin;sth> that is equal
+		auto c1Iter = std::next( it ), c2Iter = concat->elements.begin( );
+		while( c1Iter != node->elements.end() && c2Iter != concat->elements.end( ) && **c1Iter == ** c2Iter )
+		{
+			c1Iter ++;
+			c2Iter ++;
+		}
+
+		// nothing behind the iteration matches the front of its element
+		if( c1Iter == std::next( it ) )
+		{
+			it ++;
+			continue;
+		}
+
+		// copy the range <it+1;c1Iter), delete it and go back to the iter node
+		std::vector<UnboundedRegExpElement*> copyRange;
+		copyRange.insert( copyRange.end(), std::next( it ), c1Iter );
+		it = node->elements.erase( std::next( it ), c1Iter );
+		it = std::prev( it );
+
+		// insert that range before it position; insert( ) may invalidate "it", so re-seat it
+		// from the returned iterator (first inserted element) and step over the inserted
+		// range, back onto the iteration, which is then examined again
+		it = node->elements.insert( it, copyRange.begin( ), copyRange.end( ) );
+		advance( it, static_cast<int>( copyRange.size( ) ) );
+
+		// alter the iteration's concat node: rotate the matched prefix to its end, (xy)* -> (yx)*
+		copyRange.clear( );
+		copyRange.insert( copyRange.end(), concat->elements.begin( ), c2Iter );
+		concat->elements.erase( concat->elements.begin( ), c2Iter );
+		concat->elements.insert( concat->elements.end(), copyRange.begin( ), copyRange.end( ) );
+
+		// the node was rewritten; this flag was never set before, so the function reported
+		// "not applied" even after changing the tree
+		optimized = true;
+	}
+
+	return optimized;
+}
+
+/**
+ * optimization V10: (x+y)* = (x*+y*)*
+ * @param node UnboundedRegExpIteration node
+ * @return bool true if optimization applied else false
+ */
+bool RegExpOptimize::V10( UnboundedRegExpIteration * const & node )
+{
+	// interpretation: if iter's child is alternation where its every child is iteration, then they do not have to be iteration
+	
UnboundedRegExpAlternation* alt = dynamic_cast<UnboundedRegExpAlternation*>( node->element );
+	// nothing to do unless the child is an alternation whose every operand is an iteration
+	if( ! alt || ! all_of( alt->elements.begin( ), alt->elements.end( ), [] ( UnboundedRegExpElement const * const & a ) -> bool{ return dynamic_cast<UnboundedRegExpIteration const * const >( a ); } ) )
+		return false;
+
+	UnboundedRegExpAlternation * newAlt = new UnboundedRegExpAlternation( );
+
+	// hand the inner element of every operand over to newAlt and detach it from its iteration
+	for( const auto & n : alt->elements )
+	{
+		UnboundedRegExpIteration * iter = dynamic_cast<UnboundedRegExpIteration*>( n );
+		newAlt->elements.push_back( iter->element );
+		iter->element = NULL;
+	}
+
+	// the optimized alternation becomes the new child; the old alternation and the scratch node go away
+	// NOTE(review): newAlt is deleted without clearing its elements, unlike the scratch nodes in V5/V6 -
+	// presumably so the inner elements moved into it are released with it; confirm against the destructor
+	node->element = optimize( newAlt );
+	delete alt;
+	delete newAlt;
+
+	return true;
+}
+
+/**
+ * optimization X1: a* + \e = a*
+ *
+ * Removes one epsilon operand from the alternation when the alternation also
+ * has an iteration operand. The node is modified in place.
+ *
+ * @param node UnboundedRegExpAlternation node
+ * @return bool true if optimization applied else false
+ */
+bool RegExpOptimize::X1( UnboundedRegExpAlternation * const & node )
+{
+	// theorem: In regexp like a* + \e, \e is described twice, first in a*, second in \e.
+	//          therefore we can delete the \e as it is redundant
+
+	// first iteration operand and first epsilon operand, end( ) when there is none
+	auto iter = find_if( node->elements.begin( ), node->elements.end( ), [] (UnboundedRegExpElement const * const & a ) -> bool { return dynamic_cast<UnboundedRegExpIteration const * const>( a );} );
+	auto eps = find_if( node->elements.begin( ), node->elements.end( ), [] (UnboundedRegExpElement const * const & a ) -> bool { return dynamic_cast<UnboundedRegExpEpsilon const * const>( a );} );
+
+	if( iter != node->elements.end( ) && eps != node->elements.end( ) )
+	{
+		// only this one epsilon is removed per call
+		delete *eps;
+		node->elements.erase( eps );
+		return true;
+	}
+
+	return false;
+}
+
+}
diff --git a/alib2algo/src/regexp/RegExpOptimize.h b/alib2algo/src/regexp/RegExpOptimize.h
new file mode 100644
index 0000000000..afb794762d
--- /dev/null
+++ b/alib2algo/src/regexp/RegExpOptimize.h
@@ -0,0 +1,99 @@
+/*
+ * RegExpOptimize.h
+ *
+ *  Created on: 20. 1. 
2014
+ *      Author: Tomas Pecka
+ */
+
+#ifndef REGEXPOPTIMIZE_H_
+#define REGEXPOPTIMIZE_H_
+
+#include <algorithm>
+#include <functional>
+#include <iterator>
+
+#include <regexp/unbounded/UnboundedRegExp.h>
+#include <regexp/unbounded/UnboundedRegExpElements.h>
+
+#include <exception/AlibException.h>
+
+namespace regexp {
+
+/*
+ * Optimizes RegExp (or its subtree) using axioms defined in Melichar 2.87
+ * (A1 to A10) and Melichar 2.95(V1 through V6 and V8, V9, V10)
+ * The optimize() overloads return a new tree; the rule methods (A1 ... X1)
+ * take a mutable node and return a bool.
+ *
+ * List of optimization on nodes (grouped by the node type each rule method takes):
+ * 	- RegExpAlternation: A1, A2, A3, A4, A10, V2, V5, V6, X1
+ * 	- RegExpConcatenation: A5, A6, A7, A8, A9, V8, V9
+ * 	- RegExpIteration: A11, V1, V3, V4, V10
+ *
+ * Details: ( id : direction of optim. : optim )
+ *  - A1 : -> : x + ( y + z ) = ( x + y ) + z = x + y + z
+ *  - A2 : <- : x + y = y + x
+ *  - A3 : -> : x + \0 = x
+ *  - A4 : -> : x + x = x
+ *  - A5 : -> : x(yz) = (xy)z = xyz
+ *  - A6 : -> : \ex = x\e = x
+ *  - A7 : -> : \0x = x\0 = \0
+ *  - A8 : -> : x( y + z ) = xy + xz
+ *  - A9 : -> : ( x + y )z = xz + yz
+ *  - A10: <- : x* = \e + x*x
+ *  - A11: <- : x* = ( \e + x )*
+ *  - V1 : -> : \0* = \e
+ *  - V2 : -> : x* + x = x*
+ *  - V3 : -> : x** = x*
+ *  - V4 : <- : ( x + y )* = (x*y*)*
+ *  - V5 : <- : x*y = y + x*xy
+ *  - V6 : <- : x*y = y + xx*y
+ *  - V7 :    : no V7 method is declared in this class
+ *  - V8 : -> : if \e in h(x) => xx* = x*
+ *  - V9 : -> : (xy)*x = x(yx)*
+ *  - V10: <- : ( x + y )* = ( x* + y* )*
+ *
+ *  - X1 : -> : a* + \e = a*
+ */
+class RegExpOptimize
+{
+public:
+	// public entry point: takes the expression by const reference and returns the result by value
+	regexp::UnboundedRegExp optimize( const regexp::UnboundedRegExp & regexp );
+private:
+	// one overload per node type; each takes a const node and returns a raw pointer
+	// NOTE(review): callers in RegExpOptimize.cpp delete the returned pointer, so it is presumably owned by the caller
+	regexp::UnboundedRegExpElement * optimize( regexp::UnboundedRegExpElement const * const & node );
+	regexp::UnboundedRegExpElement * optimize( regexp::UnboundedRegExpAlternation const * const & node );
+	regexp::UnboundedRegExpElement * optimize( regexp::UnboundedRegExpConcatenation const * const & node );
+	regexp::UnboundedRegExpElement * optimize( regexp::UnboundedRegExpIteration const * const & node );
+	regexp::UnboundedRegExpElement * optimize( regexp::UnboundedRegExpSymbol const * const & node );
+	regexp::UnboundedRegExpElement * optimize( regexp::UnboundedRegExpEpsilon const * const & node );
+	regexp::UnboundedRegExpElement * optimize( regexp::UnboundedRegExpEmpty const * const & node );
+
+
+
+private:
+	// rewrite rules, see the table above for the identity behind each id
+	// NOTE(review): the definitions of V5 ... X1 document the return value as "true if optimization applied";
+	// presumably the same holds for the A* and V1 ... V4 rules - confirm in RegExpOptimize.cpp
+	bool A1( regexp::UnboundedRegExpAlternation * const & node );
+	bool A2( regexp::UnboundedRegExpAlternation * const & node );
+	bool A3( regexp::UnboundedRegExpAlternation * const & node );
+	bool A4( regexp::UnboundedRegExpAlternation * const & node );
+	bool A5( regexp::UnboundedRegExpConcatenation * const & node );
+	bool A6( regexp::UnboundedRegExpConcatenation * const & node );
+	bool A7( regexp::UnboundedRegExpConcatenation * const & node );
+	bool A8( regexp::UnboundedRegExpConcatenation * const & node );
+	bool A9( regexp::UnboundedRegExpConcatenation * const & node );
+	bool A10( regexp::UnboundedRegExpAlternation * const & node );
+	bool A11( regexp::UnboundedRegExpIteration * const & node );
+	bool V1( regexp::UnboundedRegExpIteration * const & node );
+	bool V2( regexp::UnboundedRegExpAlternation * const & node );
+	bool V3( regexp::UnboundedRegExpIteration * const & node );
+	bool V4( regexp::UnboundedRegExpIteration * const & node );
+	bool V5( regexp::UnboundedRegExpAlternation * const & node );
+	bool V6( regexp::UnboundedRegExpAlternation * const & node );
+	bool V8( regexp::UnboundedRegExpConcatenation * const & node );
+	bool V9( regexp::UnboundedRegExpConcatenation * const & node );
+	bool V10( regexp::UnboundedRegExpIteration * const & node );
+
+	bool X1( regexp::UnboundedRegExpAlternation * const & node );
+};
+
+}
+
+#endif /* REGEXPOPTIMIZE_H_ */
diff --git a/alib2algo/test-src/regexp/RegExpOptimizeTest.cpp b/alib2algo/test-src/regexp/RegExpOptimizeTest.cpp
new file mode 100644
index 0000000000..13896b5189
--- /dev/null
+++ b/alib2algo/test-src/regexp/RegExpOptimizeTest.cpp
@@ -0,0 +1,42 @@
+#include <list>
+#include 
"RegExpOptimizeTest.h"
+
+#include "regexp/unbounded/UnboundedRegExp.h"
+#include "regexp/RegExpFromStringParser.h"
+
+#include "regexp/RegExpOptimize.h"
+
+// assertion helpers; neither macro is used in this file
+#define CPPUNIT_IMPLY(x, y) CPPUNIT_ASSERT(!(x) || (y))
+#define CPPUNIT_EXCLUSIVE_OR(x, y) CPPUNIT_ASSERT((!(x) && (y)) || ((x) && !(y)))
+
+CPPUNIT_TEST_SUITE_REGISTRATION( RegExpOptimizeTest );
+
+// no fixture state to prepare
+void RegExpOptimizeTest::setUp() {
+}
+
+// no fixture state to release
+void RegExpOptimizeTest::tearDown() {
+}
+
+/**
+ * Runs RegExpOptimize on a parsed expression and prints the result.
+ * The test contains no assertions, so it only checks that parsing and
+ * optimizing run through.
+ */
+void RegExpOptimizeTest::testOptimize() {
+	{
+		// first input: parsed, optimized and printed to std::cout
+		std::string input = "(a+a)b + (#0 b + (#0 a + (#0 b + a)))";
+		std::stringstream inputs(input);
+
+		regexp::RegExpFromStringParser parser(inputs);
+		regexp::UnboundedRegExp regexp( static_cast<const regexp::UnboundedRegExp &>( parser.parseValue().getData() ) );
+
+		regexp::RegExpOptimize opt;
+		regexp::UnboundedRegExp res = opt.optimize(regexp);
+		std::cout << res << std::endl;
+	}
+	{
+		// second input: only parsed - the optimizer is not called on it and nothing is checked
+		std::string input = "a+a* (b+a)* c";
+		std::stringstream inputs(input);
+
+		regexp::RegExpFromStringParser parser(inputs);
+		regexp::UnboundedRegExp regexp( static_cast<const regexp::UnboundedRegExp &>( parser.parseValue().getData() ) );
+
+	}
+
+}
+
diff --git a/alib2algo/test-src/regexp/RegExpOptimizeTest.h b/alib2algo/test-src/regexp/RegExpOptimizeTest.h
new file mode 100644
index 0000000000..4e584efdc0
--- /dev/null
+++ b/alib2algo/test-src/regexp/RegExpOptimizeTest.h
@@ -0,0 +1,19 @@
+#ifndef REG_EXP_OPTIMIZE_TEST_H_
+#define REG_EXP_OPTIMIZE_TEST_H_
+
+#include <cppunit/extensions/HelperMacros.h>
+
+// CppUnit fixture for RegExpOptimize; registers the single test testOptimize
+class RegExpOptimizeTest : public CppUnit::TestFixture
+{
+	CPPUNIT_TEST_SUITE( RegExpOptimizeTest );
+	CPPUNIT_TEST( testOptimize );
+	CPPUNIT_TEST_SUITE_END();
+
+public:
+	void setUp();
+	void tearDown();
+
+	void testOptimize();
+};
+
+#endif // REG_EXP_OPTIMIZE_TEST_H_
diff --git a/alib2data/src/regexp/unbounded/UnboundedRegExpAlternation.h b/alib2data/src/regexp/unbounded/UnboundedRegExpAlternation.h
index 819b3d4c96..96e9547215 100644
--- 
a/alib2data/src/regexp/unbounded/UnboundedRegExpAlternation.h +++ b/alib2data/src/regexp/unbounded/UnboundedRegExpAlternation.h @@ -13,6 +13,8 @@ namespace regexp { +class RegExpOptimize; + /** * Represents alternation operator in the regular expression. Contains list of UnboundedRegExpElement * as operands of the operator. @@ -101,6 +103,8 @@ public: * @copydoc UnboundedRegExpElement::isEmpty() const */ virtual bool isEmpty() const; + + friend class RegExpOptimize; }; } /* namespace regexp */ diff --git a/alib2data/src/regexp/unbounded/UnboundedRegExpConcatenation.h b/alib2data/src/regexp/unbounded/UnboundedRegExpConcatenation.h index 7710de8291..deb28d5af1 100644 --- a/alib2data/src/regexp/unbounded/UnboundedRegExpConcatenation.h +++ b/alib2data/src/regexp/unbounded/UnboundedRegExpConcatenation.h @@ -13,6 +13,8 @@ namespace regexp { +class RegExpOptimize; + /** * Represents concatenation operator in the regular expression. Contains list of UnboundedRegExpElement * as operands of the operator. @@ -100,6 +102,8 @@ public: * @copydoc UnboundedRegExpElement::isEmpty() const */ virtual bool isEmpty() const; + + friend class RegExpOptimize; }; } /* namespace regexp */ diff --git a/alib2data/src/regexp/unbounded/UnboundedRegExpIteration.h b/alib2data/src/regexp/unbounded/UnboundedRegExpIteration.h index d2e3566ddd..75cb849ee2 100644 --- a/alib2data/src/regexp/unbounded/UnboundedRegExpIteration.h +++ b/alib2data/src/regexp/unbounded/UnboundedRegExpIteration.h @@ -13,6 +13,8 @@ namespace regexp { +class RegExpOptimize; + /** * Represents iteration operator in the regular expression. Contains one UnboundedRegExpElement * as operand. @@ -99,6 +101,8 @@ public: * @copydoc UnboundedRegExpElement::isEmpty() const */ virtual bool isEmpty() const; + + friend class RegExpOptimize; }; } /* namespace regexp */ -- GitLab