From 47e55caf3b40542597d82380d61493953b92980d Mon Sep 17 00:00:00 2001 From: Jan Travnicek <Jan.Travnicek@fit.cvut.cz> Date: Fri, 17 Oct 2014 21:08:32 +0200 Subject: [PATCH] base implementation of FormalRegExp optimizations --- alib2algo/src/regexp/RegExpOptimize.cpp | 1227 +---------------- alib2algo/src/regexp/RegExpOptimize.h | 26 + .../src/regexp/RegExpOptimizeFormalPart.cxx | 570 ++++++++ .../regexp/RegExpOptimizeUnboundedPart.cxx | 1162 ++++++++++++++++ .../regexp/formal/FormalRegExpAlternation.h | 2 + .../regexp/formal/FormalRegExpConcatenation.h | 2 + .../src/regexp/formal/FormalRegExpIteration.h | 2 + 7 files changed, 1816 insertions(+), 1175 deletions(-) create mode 100644 alib2algo/src/regexp/RegExpOptimizeFormalPart.cxx create mode 100644 alib2algo/src/regexp/RegExpOptimizeUnboundedPart.cxx diff --git a/alib2algo/src/regexp/RegExpOptimize.cpp b/alib2algo/src/regexp/RegExpOptimize.cpp index 73a9740a4c..ed2e2c0049 100644 --- a/alib2algo/src/regexp/RegExpOptimize.cpp +++ b/alib2algo/src/regexp/RegExpOptimize.cpp @@ -40,12 +40,60 @@ void RegExpOptimize::Visit(void* userData, const regexp::UnboundedRegExp& regexp FormalRegExp RegExpOptimize::optimize( FormalRegExp const & regexp ) { - throw exception::AlibException("Unimplemented"); + FormalRegExpElement* optimized = optimize( & regexp.getRegExp( ) ); /* heap-allocated result; released below once its contents are moved into ret */ + + FormalRegExp ret( std::move( * optimized ) ); + + delete optimized; + + return ret; } void RegExpOptimize::optimize( FormalRegExpElement & element ) { - throw exception::AlibException("Unimplemented"); + FormalRegExpElement* optimized = optimize( & element ); /* owned by this function: every exit path must delete it */ + + FormalRegExpAlternation * alternation = dynamic_cast<FormalRegExpAlternation *>( & element ); + if( alternation ) { + FormalRegExpAlternation * alternationOptimized = dynamic_cast<FormalRegExpAlternation *>( optimized ); + if( alternationOptimized ) { + * alternation = std::move( * alternationOptimized ); + delete alternationOptimized; + } else { + * alternation = FormalRegExpAlternation { 
std::move( * optimized ), FormalRegExpEmpty { } }; /* result is no longer an alternation: re-wrap it so it can be assigned to the alternation node */ + delete optimized; + } + return; + } + + FormalRegExpConcatenation * concatenation = dynamic_cast<FormalRegExpConcatenation *>( & element ); + if( concatenation ) { + FormalRegExpConcatenation * concatenationOptimized = dynamic_cast<FormalRegExpConcatenation *>( optimized ); + if( concatenationOptimized ) { + * concatenation = std::move( * concatenationOptimized ); + delete concatenationOptimized; + } else { + * concatenation = FormalRegExpConcatenation { std::move( * optimized ), FormalRegExpEpsilon { } }; /* result is no longer a concatenation: re-wrap it so it can be assigned to the concatenation node */ + delete optimized; + } + return; + } + + FormalRegExpIteration * iteration = dynamic_cast<FormalRegExpIteration *>( & element ); + if( iteration ) { + FormalRegExpIteration * iterationOptimized = dynamic_cast<FormalRegExpIteration *>( optimized ); + if( iterationOptimized ) { + * iteration = std::move( * iterationOptimized ); + delete iterationOptimized; + } else { + * iteration = FormalRegExpIteration { std::move( * optimized ) }; /* result is no longer an iteration: re-wrap it so it can be assigned to the iteration node */ + delete optimized; + } + return; + } + + // Nothing to write back: the original element was a FormalRegExpSymbol, FormalRegExpEpsilon, or FormalRegExpEmpty; only the optimized copy has to be released + delete optimized; } UnboundedRegExp RegExpOptimize::optimize( UnboundedRegExp const & regexp ) @@ -107,1178 +155,7 @@ void RegExpOptimize::optimize( UnboundedRegExpElement & element ) { return; } -UnboundedRegExpElement* RegExpOptimize::optimize( UnboundedRegExpElement const * const & node ) -{ - const UnboundedRegExpAlternation * alternation = dynamic_cast<const UnboundedRegExpAlternation*>( node ); - if( alternation ) - return optimize( alternation ); - - const UnboundedRegExpConcatenation * concatenation = dynamic_cast<const UnboundedRegExpConcatenation*>( node ); - if( concatenation ) - return optimize( concatenation ); - - const UnboundedRegExpIteration * iteration = dynamic_cast<const UnboundedRegExpIteration*>( node ); - if( iteration ) - return optimize( iteration ); - - const UnboundedRegExpSymbol * symbol = dynamic_cast<const 
UnboundedRegExpSymbol*>( node ); - if( symbol ) - return optimize( symbol ); - - const UnboundedRegExpEmpty * empty= dynamic_cast<const UnboundedRegExpEmpty*>( node ); - if( empty ) - return optimize( empty ); - - const UnboundedRegExpEpsilon * eps = dynamic_cast<const UnboundedRegExpEpsilon*>( node ); - if( eps ) - return optimize( eps ); - - throw exception::AlibException( "RegExpOptimize::optimize - unknown UnboundedRegExpElement node" ); -} - - -UnboundedRegExpElement * RegExpOptimize::optimize( UnboundedRegExpAlternation const * const & node ) -{ - UnboundedRegExpAlternation* alt = new UnboundedRegExpAlternation( ); - - for( const auto & child : node->elements ) - alt->elements.push_back( optimize( child ) ); - - // optimize while you can - while( A1( alt ) || A2( alt ) || A3( alt ) || A4( alt ) || A10( alt ) || V2( alt ) || V5( alt ) || V6( alt ) || X1( alt ) ); - - if( alt->elements.size( ) == 1 ) - { - UnboundedRegExpElement* ret = alt->elements.front( ); - alt->elements.clear( ); - delete alt; - return ret; - } - - if( alt->elements.size( ) == 0 ) { - delete alt; - return new UnboundedRegExpEmpty( ); - } - - return alt; -} - -UnboundedRegExpElement * RegExpOptimize::optimize( UnboundedRegExpConcatenation const * const & node ) -{ - UnboundedRegExpConcatenation* concat = new UnboundedRegExpConcatenation( ); - - for( const auto & child : node->elements ) - concat->elements.push_back( optimize( child ) ); - - do - { - // A7 is implemented here ~ if not here, it went into infinite loop FIXME - if( std::any_of( concat->elements.begin( ), concat->elements.end( ), []( UnboundedRegExpElement const * const & a ) -> bool{ return dynamic_cast<UnboundedRegExpEmpty const *>( a ); } ) ) - { - delete concat; - return new UnboundedRegExpEmpty( ); - } - } - while( A5( concat ) || A6( concat ) || /* A7( concat ) || */ A8( concat ) || A9( concat ) || V8( concat ) );//|| V9( concat ) ); - - if( concat->elements.size( ) == 1 ) - { - UnboundedRegExpElement* ret = 
concat->elements.front( ); - concat->elements.clear( ); - delete concat; - return ret; - } - - if( concat->elements.size( ) == 0 ) { - delete concat; - return new UnboundedRegExpEpsilon( ); - } - - return concat; -} - -UnboundedRegExpElement * RegExpOptimize::optimize( UnboundedRegExpIteration const * const & node ) -{ - UnboundedRegExpIteration* iter = new UnboundedRegExpIteration(* optimize( node->element ) ); - - do - { - // V1 is implemented right here - if( dynamic_cast<UnboundedRegExpEmpty*>( iter->element ) ) - { - delete iter; - return new UnboundedRegExpEpsilon( ); - } - } - while( A11( iter ) || V1( iter ) || V3( iter ) || V4( iter ) || V10( iter ) ); - - return iter; -} - -UnboundedRegExpElement * RegExpOptimize::optimize( UnboundedRegExpSymbol const * const & node ) -{ - return node->clone( ); -} - -UnboundedRegExpElement * RegExpOptimize::optimize( UnboundedRegExpEmpty const * const & node ) -{ - return node->clone( ); -} - -UnboundedRegExpElement * RegExpOptimize::optimize( UnboundedRegExpEpsilon const * const & node ) -{ - return node->clone( ); -} - -// ---------------------------------------------------------------------------- - -/** - * optimization A1: x + ( y + z ) = ( x + y ) + z = x + y + z - * @param node UnboundedRegExpAlternation node - * @return bool true if optimization applied else false - */ -bool RegExpOptimize::A1( UnboundedRegExpAlternation * const & node ) -{ - bool optimized = false; - - for( auto it = node->elements.begin( ); it != node->elements.end( ); ) - { - UnboundedRegExpAlternation * const & childUnboundedRegExpAlternation = dynamic_cast<UnboundedRegExpAlternation *>( * it ); - - if( childUnboundedRegExpAlternation ) - { - it = node->elements.erase( it ); - - size_t off = it - node->elements.begin(); - node->elements.insert( it, childUnboundedRegExpAlternation->elements.begin( ), childUnboundedRegExpAlternation->elements.end( ) ); - it = node->elements.begin() + off; - - //TODO on g++-4.9 use: it = node->elements.insert( 
it, childUnboundedRegExpAlternation->elements.begin( ), childUnboundedRegExpAlternation->elements.end( ) ); - - childUnboundedRegExpAlternation->elements.clear( ); - delete childUnboundedRegExpAlternation; - - optimized = true; - } - else - { - it ++; - } - } - - return optimized; -} - -/** - * optimization A2: x + y = y + x (sort) - * @param node UnboundedRegExpAlternation node - * @return bool true if optimization applied else false - */ -bool RegExpOptimize::A2( UnboundedRegExpAlternation * const & node ) -{ - std::function<bool( UnboundedRegExpElement const * const & a, UnboundedRegExpElement const * const & b )> cmp = [ ]( UnboundedRegExpElement const * const & a, UnboundedRegExpElement const * const & b ) -> bool { return *a < *b; }; - - if( std::is_sorted( node->elements.begin( ), node->elements.end( ), cmp ) ) - return false; - - std::sort( node->elements.begin(), node->elements.end(), cmp ); - return true; -} - -/** - * optimization A3: x + \0 = x - * @param node UnboundedRegExpAlternation node - * @return bool true if optimization applied else false - */ -bool RegExpOptimize::A3( UnboundedRegExpAlternation * const & node ) -{ - bool optimized = false; - - // input can be \0 + \0, so at least one element must be preserved - - for( auto it = node->elements.begin( ); it != node->elements.end( ); ) - { - UnboundedRegExpEmpty const * const & empty = dynamic_cast<UnboundedRegExpEmpty const *>( * it ); - - if( empty && node->elements.size( ) > 1 ) - { - it = node->elements.erase( it ); - delete empty; - - optimized = true; - } - else - { - it ++; - } - } - - return optimized; -} - -/** - * optimization A4: x + x = x - * @param node UnboundedRegExpAlternation node - * @return bool true if optimization applied else false - */ -bool RegExpOptimize::A4( UnboundedRegExpAlternation * const & node ) -{ - /* - * two ways of implementing this opitimization: - * - sort and call std::unique ( O(n lg n) + O(n) ), but it also sorts... 
- * - check every element against other ( O(n*n) ) - * - * As we always sort in optimization, we can use the first version, but A4 must be __always__ called __after__ A2 - */ - - bool optimized = false; - if(node->elements.size() != 0) for( auto it = std::next( node->elements.begin( ) ); it != node->elements.end( ); ) - { - if ( ** it == ** std::prev( it ) ) - { - delete * it; - it = node->elements.erase( it ); - optimized = true; - } - else - { - it ++; - } - } - - return optimized; -} - -/** - * optimization A5: x.(y.z) = (x.y).z = x.y.z - * @param node UnboundedRegExpConcatenation node - * @return bool true if optimization applied else false - */ -bool RegExpOptimize::A5( UnboundedRegExpConcatenation * const & node ) -{ - bool optimized = false; - - for( auto it = node->elements.begin( ); it != node->elements.end( ); ) - { - UnboundedRegExpConcatenation * const & childUnboundedRegExpConcatenation = dynamic_cast<UnboundedRegExpConcatenation *>( * it ); - - if( childUnboundedRegExpConcatenation ) - { - it = node->elements.erase( it ); - - size_t off = it - node->elements.begin(); - node->elements.insert( it, childUnboundedRegExpConcatenation->elements.begin( ), childUnboundedRegExpConcatenation->elements.end( ) ); - it = node->elements.begin() + off; - - //TODO on g++-4.9 use: it = node->elements.insert( it, childUnboundedRegExpConcatenation->elements.begin( ), childUnboundedRegExpConcatenation->elements.end( ) ); - - childUnboundedRegExpConcatenation->elements.clear( ); - delete childUnboundedRegExpConcatenation; - - optimized = true; - } - else - it ++; - } - - return optimized; -} - -/** - * optimization A6: \e.x = x.\e = x - * @param node UnboundedRegExpConcatenation node - * @return bool true if optimization applied else false - */ -bool RegExpOptimize::A6( UnboundedRegExpConcatenation * const & node ) -{ - bool optimized = false; - - for( auto it = node->elements.begin( ); it != node->elements.end( ); ) - { - UnboundedRegExpEpsilon* epsilon = 
dynamic_cast<UnboundedRegExpEpsilon*>( * it ); - if( epsilon && node->elements.size( ) > 1 ) - { - delete * it; - it = node->elements.erase( it ); - - optimized = true; - } - else - it ++; - } - - return optimized; -} - -/** - * optimization A7: \0.x = x.\0 = \0 - * @param node UnboundedRegExpConcatenation node - * @return bool true if optimization applied else false - */ -bool RegExpOptimize::A7( UnboundedRegExpConcatenation * const & node ) -{ - bool optimized = false; - - // FIXME: alib2 uses shared_ptrs, rewrite this using remove_if then - - if( std::any_of( node->elements.begin( ), node->elements.end( ), []( UnboundedRegExpElement const * const & a ) -> bool{ return dynamic_cast<UnboundedRegExpEmpty const *>( a ); } ) ) - { - for( auto const& child : node->elements ) - delete child; - - node->elements.clear( ); - node->elements.push_back( new UnboundedRegExpEmpty( ) ); - - optimized = true; - } - - return optimized; -} - -/** - * optimization A8: x.(y+z) = x.y + x.z - * @param node UnboundedRegExpConcatenation node - * @return bool true if optimization applied else false - */ -bool RegExpOptimize::A8( UnboundedRegExpConcatenation * const & node ) -{ -/* - bool optimized = false; - - for( auto it = std::next( node->elements.begin( ) ); it != node->elements.end( ); ) - { - UnboundedRegExpAlternation * alt = dynamic_cast<UnboundedRegExpAlternation*>( * it ); - if( ! alt ) - { - it ++; - continue; - } - - // take everything to the left and copy it as prefix of every element in alternation. 
- UnboundedRegExpConcatenation * leftPart = new UnboundedRegExpConcatenation( ); - leftPart->elements.insert( leftPart->elements.end( ), node->elements.begin( ), it ); - - for( auto altIt = alt->elements.begin( ); altIt != alt->elements.end( ); altIt ++ ) - { - UnboundedRegExpConcatenation * altElem = new UnboundedRegExpConcatenation( ); - altElem->elements.push_back( leftPart->clone( ) ); - altElem->elements.push_back( * altIt ); - - * altIt = altElem; - } - - UnboundedRegExpElement * optIt = optimize( * it ); - delete *it; - *it = optIt; - - delete leftPart; - it = node->elements.erase( node->elements.begin( ), it ); - - optimized = true; - it ++; - } - - return optimized; -*/ - return false; -} - -/** - * optimization A9: (x+y).z = x.z + y.z - * @param node UnboundedRegExpConcatenation node - * @return bool true if optimization applied else false - */ -bool RegExpOptimize::A9( UnboundedRegExpConcatenation * const & node ) -{ -/* - bool optimized = false; - - for( auto it = node->elements.begin( ); it != std::prev( node->elements.end( ) ); ) - { - UnboundedRegExpAlternation * alt = dynamic_cast<UnboundedRegExpAlternation*>( * it ); - if( ! alt ) - { - it ++; - continue; - } - - // take everything to the right and copy it as suffix of every element in alternation. 
- UnboundedRegExpConcatenation * rest = new UnboundedRegExpConcatenation( ); - rest->elements.insert( rest->elements.end( ), std::next( it ), node->elements.end( ) ); - - for( auto altIt = alt->elements.begin( ); altIt != alt->elements.end( ); altIt ++ ) - { - UnboundedRegExpConcatenation * altElem = new UnboundedRegExpConcatenation( ); - altElem->elements.push_back( * altIt ); - altElem->elements.push_back( rest->clone( ) ); - - * altIt = altElem; - } - - UnboundedRegExpElement * optIt = optimize( * it ); - delete *it; - *it = optIt; - - delete rest; - it = node->elements.erase( std::next( it ), node->elements.end( ) ); - optimized = true; - - // as we move (delete) the rest of this expression, it surely wont do another round. More optimizations to be performerd are in subtree now. - // we do not care about this here as method optimize(UnboundedRegExpAlternation) will take care of this in next iteration - // it ++; - break; - } - - return optimized; -*/ - return false; -} - -/** - * optimization A10: x* = \e + x*x - * @param node UnboundedRegExpAlternation node - * @return bool true if optimization applied else false - */ -bool RegExpOptimize::A10( UnboundedRegExpAlternation * const & node ) -{ - bool optimized = false, optimizedIter = false; - - /* - * problem: - * - \e + x*x = x* - * - but if we do not have the eps, but we do have iteration, then \e \in h(iter), therefore \e in h(node). 
- */ - - for( auto it = node->elements.begin( ); it != node->elements.end( ); ) - { - optimizedIter = false; - - // check if we have some epsilon or iteration left, else nothing to do - auto eps = find_if( node->elements.begin( ), node->elements.end( ), [ ]( UnboundedRegExpElement const * const & a ) -> bool { - return dynamic_cast<UnboundedRegExpEpsilon const *>( a ) || dynamic_cast<UnboundedRegExpIteration const*>( a ); - }); - if( eps == node->elements.end( ) ) - break; - - UnboundedRegExpConcatenation const * const & childConcat = dynamic_cast<UnboundedRegExpConcatenation const *>( *it ); - if( childConcat ) - { - // if iteration is first element of concatenation - UnboundedRegExpIteration const * const & iter = dynamic_cast<UnboundedRegExpIteration const *>( childConcat->elements.front( ) ); - - if( iter ) - { - // concatenation without the iteration node - UnboundedRegExpConcatenation *tmpConcat = dynamic_cast<UnboundedRegExpConcatenation *>( childConcat->clone( ) ); - delete tmpConcat->elements.front( ); - tmpConcat->elements.erase( tmpConcat->elements.begin( ) ); - UnboundedRegExpElement * tmpConcatOpt = optimize( tmpConcat ); - - // check if iteration element is the same subtree as rest of concatenation - if( * iter->element == * tmpConcatOpt ) - { - optimized = optimizedIter = true; - - size_t off = it - node->elements.begin(); - node->elements.push_back( iter->clone( ) ); - it = node->elements.begin() + off; - - delete childConcat; - it = node->elements.erase( it ); - - // find the eps again - invalidated after prev erase - eps = find_if( node->elements.begin( ), node->elements.end( ), [ ]( UnboundedRegExpElement const * const & a ) -> bool { - return dynamic_cast<UnboundedRegExpEpsilon const *>( a ); - }); - // if it was eps, delete it - // if it was not the eps but iteration, keep it - if( eps != node->elements.end( ) ) - { - delete *eps; - it = node->elements.erase( eps ); - } - } - delete tmpConcat; - delete tmpConcatOpt; - } - } - - if( ! 
optimizedIter ) - it ++; - } - - return optimized; -} - -/** - * optimization A11: x* = (\e + x)* - * @param node UnboundedRegExpIteration node - * @return bool true if optimization applied else false - */ -bool RegExpOptimize::A11( UnboundedRegExpIteration * const & node ) -{ - bool optimized = false; - - UnboundedRegExpAlternation * const & childAlt = dynamic_cast<UnboundedRegExpAlternation *>( node->element ); - - if( childAlt ) - { - // check if eps inside iteration's alternation - auto eps = find_if( childAlt->elements.begin( ), childAlt->elements.end( ), [ ]( UnboundedRegExpElement const * const & a ) -> bool { - return dynamic_cast<UnboundedRegExpEpsilon const *>( a ); - }); - - // if no eps - if( eps == childAlt->elements.end( ) ) - return false; - - // remove eps from alternation - optimized = true; - delete * eps; - childAlt->elements.erase( eps ); - } - - return optimized; -} - -/** - * optimization V1: \0* = \e - * @param node UnboundedRegExpIteration node - * @return bool true if optimization applied else false - */ -bool RegExpOptimize::V1( UnboundedRegExpIteration * const & node ) -{ - // implemented in optimize( UnboundedRegExpIteration ) - - return false; -} - -/** - * optimization V2: x* + x = x* - * @param node UnboundedRegExpAlternation node - * @return bool true if optimization applied else false - */ -bool RegExpOptimize::V2( UnboundedRegExpAlternation * const & node ) -{ - bool optimized = false; - - /* - * Bit tricky - * We need also to cover the cases like (a+b)* + a + b + c = (a+b)* + c - */ - - std::list<UnboundedRegExpElement*> iterElements; - // cache iter elements because of operator invalidation after erase - for( const auto & n : node->elements ) - { - UnboundedRegExpIteration* iter = dynamic_cast<UnboundedRegExpIteration*>( n ); - if( iter ) - iterElements.push_back( iter->element ); - } - - for( const auto & n : iterElements ) - { - // if alternation is inside, we need to make sure that every element of alternation is inside 
node->elements. if so, delete them all - UnboundedRegExpAlternation * tmpAlt = dynamic_cast<UnboundedRegExpAlternation*>( n ); - if( tmpAlt ) - { - bool every = true; - for( const auto & altElem : tmpAlt->elements ) - { - auto it = find_if( node->elements.begin( ), node->elements.end( ), [ altElem ]( UnboundedRegExpElement const * const & a ) -> bool { - return *a == *altElem; - }); - - if( it == node->elements.end( ) ) - every = false; - } - - if ( every == true ) - { - optimized = true; - - for( const auto & altElem : tmpAlt->elements ) - { - auto it = find_if( node->elements.begin( ), node->elements.end( ), [ altElem ]( UnboundedRegExpElement const * const & a ) -> bool { - return *a == *altElem; - }); - assert( it != node->elements.end( ) ); - - delete *it; - node->elements.erase( it ); - } - } - } - - // else - for( auto it = node->elements.begin( ); it != node->elements.end( ); ) - { - if( *n == **it ) - { - optimized = true; - - delete *it; - it = node->elements.erase( it ); - } - else - { - it ++; - } - } - } - - return optimized; -} - -/** - * optimization V3: x** = x* - * @param node UnboundedRegExpIteration node - * @return bool true if optimization applied else false - */ -bool RegExpOptimize::V3( UnboundedRegExpIteration * const & node ) -{ - UnboundedRegExpIteration* childIter = dynamic_cast<UnboundedRegExpIteration*>( node->element ); - if( childIter ) - { - node->element = childIter->element; - childIter->element = NULL; - delete childIter; - - return true; - } - - return false; -} - -/** - * optimization V4: (x+y)* = (x*y*)* - * @param node UnboundedRegExpIteration node - * @return bool true if optimization applied else false - */ -bool RegExpOptimize::V4( UnboundedRegExpIteration * const & node ) -{ - // interpretation: if iteration's element is concat and every concat's element is iteration - UnboundedRegExpConcatenation* alt = dynamic_cast<UnboundedRegExpConcatenation*>( node->element ); - if( ! alt || ! 
all_of( alt->elements.begin( ), alt->elements.end( ), [] ( UnboundedRegExpElement const * const & a ) -> bool{ return dynamic_cast<UnboundedRegExpIteration const * const >( a ); } ) ) - return false; - - UnboundedRegExpAlternation * newAlt = new UnboundedRegExpAlternation( ); - - for( const auto & n : alt->elements ) - { - UnboundedRegExpIteration * iter = dynamic_cast<UnboundedRegExpIteration*>( n ); - newAlt->elements.push_back( iter->element ); - iter->element = NULL; - } - - node->element = optimize( newAlt ); - delete alt; - delete newAlt; - - return true; -} - -/** - * optimization V5: x*y = y + x*xy - * @param node UnboundedRegExpAlternation node - * @return bool true if optimization applied else false - */ -bool RegExpOptimize::V5( UnboundedRegExpAlternation * const & node ) -{ - bool optimized = false; - - // reinterpretation: ax*y = ay+ax*xy - // so, if we find iter, a = everything that is before it (prefix) - // x = iter's content - // behind iter must be exactly iter's content - // y = rest (suffix) - // prefix.x*x.suffix + prefix.suffix = prefix.x*.suffix - - for( auto itA = node->elements.begin( ); itA != node->elements.end( ); ) - { - UnboundedRegExpConcatenation * concat = dynamic_cast<UnboundedRegExpConcatenation*>( * itA ); - if( ! concat ) - { - itA ++; - continue; - } - - for( auto itC = concat->elements.begin( ); itC != std::prev( concat->elements.end( ) ); ) - { - UnboundedRegExpIteration * iter = dynamic_cast<UnboundedRegExpIteration*>( *itC ); - if( ! iter ) - { - itC ++; - continue; - } - - // iteration's element must follow the iteration (x*x) - auto itStartY = std::next( itC ); //itStartY points to y in expression x*xy - - // if iter's element is concat - if( dynamic_cast<UnboundedRegExpConcatenation*>( iter->element ) ) - { - UnboundedRegExpConcatenation * iterConcat = dynamic_cast<UnboundedRegExpConcatenation*>( iter->element ); - - // std::cout << "....." 
<< std::endl; - // std::cout << RegExp( concat ) << std::endl; - // std::cout << RegExp( iterConcat ) << std::endl; - // UnboundedRegExpConcatenation * tmp = new UnboundedRegExpConcatenation( ); - // tmp->elements.insert( tmp->elements.end( ), std::next( itC ), concat->elements.end( ) ); - // std::cout << RegExp( tmp) << std::endl; - - if( distance( iterConcat->elements.begin( ), iterConcat->elements.end( ) ) != distance( std::next( itC ), concat->elements.end( ) ) - || ! equal( iterConcat->elements.begin( ), iterConcat->elements.end( ), std::next( itC ), - [ ]( UnboundedRegExpElement const * const & a, UnboundedRegExpElement const * const & b ) -> bool{ return *a == *b; } ) ) - { - itC++; - continue; - } - advance( itStartY, (int)iterConcat->elements.size( ) ); - } - // else - else - { - if( * iter->element != ** std::next( itC ) ) - { - itC ++; - continue; - } - - advance( itStartY, 1 ); - } - - // store everything before iteration as "a" - UnboundedRegExpElement * regexpA; - if( concat->elements.begin( ) == itC ) - { - regexpA = new UnboundedRegExpEpsilon( ); - } - else - { - UnboundedRegExpConcatenation * tmpA = new UnboundedRegExpConcatenation( ); - tmpA->elements.insert( tmpA->elements.end( ), concat->elements.begin( ), itC ); - regexpA = optimize( tmpA ); - tmpA->elements.clear( ); - delete tmpA; - } - - // store everything behind iteration's followup element as "y" - UnboundedRegExpElement * regexpY; - if( itStartY == concat->elements.end( ) ) - { - regexpY = new UnboundedRegExpEpsilon( ); - } - else - { - UnboundedRegExpConcatenation* tmpY = new UnboundedRegExpConcatenation( ); - tmpY->elements.insert( tmpY->elements.end( ), itStartY, concat->elements.end( ) ); - regexpY = optimize( tmpY ); - tmpY->elements.clear( ); - delete tmpY; - } - - // concatenate "a" and "y" and see if they exist somewhere in parent alternation ( node->elements ) - UnboundedRegExpConcatenation* tmpAY = new UnboundedRegExpConcatenation( ); - tmpAY->elements.push_back( regexpA ); - 
tmpAY->elements.push_back( regexpY ); - UnboundedRegExpElement * regexpAY = optimize( tmpAY ); - tmpAY->elements.clear( ); - delete tmpAY; - - auto iterAY = find_if( node->elements.begin( ), node->elements.end( ), [ regexpAY ] ( UnboundedRegExpElement const * const & a ) -> bool{ return *a == *regexpAY; } ); - if( iterAY == node->elements.end( ) ) - { - itC ++; - continue; - } - - // if AY exists, then we can simply do this: - //iterator invalidated, need to backup concat node - UnboundedRegExpElement * tmpItA = *itA; - - delete *iterAY; - node->elements.erase( iterAY ); - - // iterator invalidated, need to recall before erase - itA = find_if( node->elements.begin( ), node->elements.end( ), [ tmpItA ]( UnboundedRegExpElement const * const & a ) -> bool { return *a == *tmpItA; } ); - - UnboundedRegExpConcatenation * tmpAltered = new UnboundedRegExpConcatenation( ); - tmpAltered->elements.push_back( regexpA ); - tmpAltered->elements.push_back( * itC ); - tmpAltered->elements.push_back( regexpY ); - UnboundedRegExpElement * regexpAltered = optimize( tmpAltered ); - - tmpAltered->elements.clear( ); - delete tmpAltered; - - delete regexpA; - delete regexpY; - delete regexpAY; - - delete *itA; - itA = node->elements.erase( itA ); - - node->elements.insert( itA, regexpAltered ); - - optimized = true; - break; - } - - itA ++; - } - - return optimized; -} - -/** - * optimization V6: x*y = y + xx*y - * @param node UnboundedRegExpAlternation node - * @return bool true if optimization applied else false - */ -bool RegExpOptimize::V6( UnboundedRegExpAlternation * const & node ) -{ - bool optimized = false; - - // reinterpretation: ax*y = ay+axx*y - // so, if we find iter, a = everything that is before it (prefix) - // x = iter's content - // before iter must be exactly iter's content - // y = rest (suffix) - // prefix.xx*.suffix + prefix.suffix = prefix.x*.suffix - - for( auto itA = node->elements.begin( ); itA != node->elements.end( ); ) - { - UnboundedRegExpConcatenation * 
concat = dynamic_cast<UnboundedRegExpConcatenation*>( * itA ); - if( ! concat ) - { - itA ++; - continue; - } - - for( auto itC = std::next( concat->elements.begin( ) ); itC != concat->elements.end( ); ) - { - UnboundedRegExpIteration * iter = dynamic_cast<UnboundedRegExpIteration*>( * itC ); - if( ! iter ) - { - itC ++; - continue; - } - - // iteration's element must preceed the iteration (xx*) - auto itStartX = itC; //itStartX points to first x in expression xx*, everything before is therefore prefix - regexp "a" - - // if iter's element is concat - if( dynamic_cast<UnboundedRegExpConcatenation*>( iter->element ) ) - { - UnboundedRegExpConcatenation * iterConcat = dynamic_cast<UnboundedRegExpConcatenation*>( iter->element ); - - if( distance( concat->elements.begin( ), itC ) < (int)iterConcat->elements.size( ) ) - { - itC ++; - continue; - } - advance( itStartX, - (int)(iterConcat->elements.size( ) ) ); - - if( distance( iterConcat->elements.begin( ), iterConcat->elements.end( ) ) != distance( itStartX, concat->elements.end( ) ) - || - ! 
equal( iterConcat->elements.begin( ), iterConcat->elements.end( ), itStartX, - []( UnboundedRegExpElement const * const & a, UnboundedRegExpElement const * const & b ) -> bool{ return *a == *b; } ) ) - { - itC++; - continue; - } - } - // else - else - { - if( * iter->element != ** std::prev( itC ) ) - { - itC ++; - continue; - } - - advance( itStartX, -1 ); - } - - // store everything before x as "a" - UnboundedRegExpElement * regexpA; - if( concat->elements.begin( ) == itStartX ) - { - regexpA = new UnboundedRegExpEpsilon( ); - } - else - { - UnboundedRegExpConcatenation* tmpA = new UnboundedRegExpConcatenation( ); - tmpA->elements.insert( tmpA->elements.end( ), concat->elements.begin( ), itStartX ); - regexpA = optimize( tmpA ); - tmpA->elements.clear( ); - delete tmpA; - } - - // store everything behind iteration's followup element as "y" - UnboundedRegExpElement * regexpY; - if( std::next( itC ) == concat->elements.end( ) ) - { - regexpY = new UnboundedRegExpEpsilon( ); - } - else - { - UnboundedRegExpConcatenation* tmpY = new UnboundedRegExpConcatenation( ); - tmpY->elements.insert( tmpY->elements.end( ), std::next( itC ), concat->elements.end( ) ); - regexpY = optimize( tmpY ); - tmpY->elements.clear( ); - delete tmpY; - } - - // concatenate "a" and "y" and see if they exist somewhere in parent alternation ( node->elements ) - UnboundedRegExpConcatenation* tmpAY = new UnboundedRegExpConcatenation( ); - tmpAY->elements.push_back( regexpA ); - tmpAY->elements.push_back( regexpY ); - UnboundedRegExpElement * regexpAY = optimize( tmpAY ); - tmpAY->elements.clear( ); - delete tmpAY; - - auto iterAY = find_if( node->elements.begin( ), node->elements.end( ), [ regexpAY ] ( UnboundedRegExpElement const * const & a ) -> bool{ return *a == *regexpAY; } ); - if( iterAY == node->elements.end( ) ) - { - itC ++; - continue; - } - - // if AY exists, then we can simply do this: - //iterator invalidated, need to backup concat node - UnboundedRegExpElement * tmpItA = *itA; - 
delete *iterAY; - node->elements.erase( iterAY ); - - // iterator invalidated, need to recall before erase - itA = find_if( node->elements.begin( ), node->elements.end( ), [ tmpItA ]( UnboundedRegExpElement const * const & a ) -> bool { return *a == *tmpItA; } ); - - UnboundedRegExpConcatenation * tmpAltered = new UnboundedRegExpConcatenation( ); - tmpAltered->elements.push_back( regexpA ); - tmpAltered->elements.push_back( * itC ); - tmpAltered->elements.push_back( regexpY ); - UnboundedRegExpElement * regexpAltered = optimize( tmpAltered ); - - tmpAltered->elements.clear( ); - delete tmpAltered; - - delete regexpA; - delete regexpY; - delete regexpAY; - - delete *itA; - itA = node->elements.erase( itA ); - - node->elements.insert( itA, regexpAltered ); - optimized = true; - break; - } - - itA ++; - } - - return optimized; -} - -/** - * optimization V8: \e in h(x) => xx*=x* - * @param node UnboundedRegExpConcatenation node - * @return bool true if optimization applied else false - */ -bool RegExpOptimize::V8( UnboundedRegExpConcatenation * const & node ) -{ - bool optimized = false; - - // interpretation: if there is iteration in concatenation node, and element of iteration contains eps and is straight before this iteration, then this element can be omitted - - for( auto it = next( node->elements.begin( ) ); it != node->elements.end( ); ) - { - UnboundedRegExpIteration* iter = dynamic_cast<UnboundedRegExpIteration*>( * it ); - - if( ! 
iter ) - { - it ++; - continue; - } - - // if element of iteration is concatenation, we need to check this specially - UnboundedRegExpConcatenation * concat = dynamic_cast<UnboundedRegExpConcatenation*>( iter->element ); - - if( concat ) - { - // check if not out of bounds - if( distance( node->elements.begin( ), it ) < distance( concat->elements.begin(), concat->elements.end() ) ) - { - it ++; - continue; - } - - //FIXME: int cast - auto it2 = it; - advance( it2, - (int)concat->elements.size( ) ); - - if( regexp::RegExpEpsilon::languageContainsEpsilon(*concat) && - distance( concat->elements.begin( ), concat->elements.end( )) == distance ( it2, node->elements.end( ) ) && - equal( concat->elements.begin( ), concat->elements.end( ), it2, [] ( UnboundedRegExpElement const * const & a, UnboundedRegExpElement const * const & b ) -> bool { return *a == *b; } ) ) - { - optimized = true; - - for( auto delIt = it2 ; delIt != it ; delIt ++ ) - delete *delIt; - it = node->elements.erase( it2, it ); - } - else - { - it ++; - } - } - // else - else - { - if( it == node->elements.begin( ) ) - { - it++; - continue; - } - - auto prev = std::prev( it ); - - if( regexp::RegExpEpsilon::languageContainsEpsilon(*(iter->element)) && *( iter->element ) == **prev ) - { - delete * prev; - it = node->elements.erase( prev ); - optimized = true; - - // in case xxx*, we need to stay on the iter element, not to go behind it - if( it != node->elements.begin( ) ) - it = std::prev( it ); - } - else - { - it ++; - } - } - } - - return optimized; -} - -/** - * optimization V9: (xy)*x = x(yx)* - * @param node UnboundedRegExpConcatenation node - * @return bool true if optimization applied else false - */ -bool RegExpOptimize::V9( UnboundedRegExpConcatenation * const & node ) -{ - bool optimized = false; - - // interpretation: if concat (C1) with iter && iteration's element is concat (C2), then: - // simultaneously iterate through C1 and C2. 
(axy)*axz=ax(yax)*z -> get ax that is same and relocate them... - - for( auto it = node->elements.begin( ) ; it != node->elements.end( ) ; ) - { - UnboundedRegExpIteration * iter = dynamic_cast<UnboundedRegExpIteration*>( * it ); - if ( ! iter ) - { - it++; - continue; - } - UnboundedRegExpConcatenation * concat = dynamic_cast<UnboundedRegExpConcatenation*>( iter->element ); - if( ! concat ) - { - it++; - continue; - } - - // find range from <it+1;sth> and <concat.begin;sth> that is equal - auto c1Iter = std::next( it ), c2Iter = concat->elements.begin( ); - while( c1Iter != node->elements.end() && c2Iter != concat->elements.end( ) && **c1Iter == ** c2Iter ) - { - c1Iter ++; - c2Iter ++; - } - - if( c1Iter == std::next( it ) ) - { - it ++; - continue; - } - - // std::cout << "xy" << std::endl; - // UnboundedRegExpConcatenation* tmp = new UnboundedRegExpConcatenation( ); - // tmp->elements.insert( tmp->elements.end( ), std::next( it ), c1Iter ); - // std::cout << RegExp( tmp ) << std::endl; - - // copy the range <it;sth>, delete it and go back to the iter node - std::vector<UnboundedRegExpElement*> copyRange; - copyRange.insert( copyRange.end(), std::next( it ), c1Iter ); - it = node->elements.erase( std::next( it ), c1Iter ); - it = std::prev( it ); - - // insert that range before it position - node->elements.insert( it, copyRange.begin( ), copyRange.end( ) ); - - // alter the iteration's concat node - copyRange.clear( ); - copyRange.insert( copyRange.end(), concat->elements.begin( ), c2Iter ); - concat->elements.erase( concat->elements.begin( ), c2Iter ); - concat->elements.insert( concat->elements.end(), copyRange.begin( ), copyRange.end( ) ); - } - - return optimized; -} - -/** - * optimization V10: (x+y)* = (x*+y*)* - * @param node UnboundedRegExpIteration node - * @return bool true if optimization applied else false - */ -bool RegExpOptimize::V10( UnboundedRegExpIteration * const & node ) -{ - // interpretation: if iter's child is alternation where its every 
child is iteration, then they do not have to be iteration - UnboundedRegExpAlternation* alt = dynamic_cast<UnboundedRegExpAlternation*>( node->element ); - if( ! alt || ! all_of( alt->elements.begin( ), alt->elements.end( ), [] ( UnboundedRegExpElement const * const & a ) -> bool{ return dynamic_cast<UnboundedRegExpIteration const * const >( a ); } ) ) - return false; - - UnboundedRegExpAlternation * newAlt = new UnboundedRegExpAlternation( ); - - for( const auto & n : alt->elements ) - { - UnboundedRegExpIteration * iter = dynamic_cast<UnboundedRegExpIteration*>( n ); - newAlt->elements.push_back( iter->element ); - iter->element = NULL; - } - - node->element = optimize( newAlt ); - delete alt; - delete newAlt; - - return true; -} - -/** - * optimization X1: a* + \e = a* - * @param node UnboundedRegExpAlternation node - * @return bool true if optimization applied else false - */ -bool RegExpOptimize::X1( UnboundedRegExpAlternation * const & node ) -{ - // theorem: In regexp like a* + \e, \e is described twice, first in a*, second in \e. 
- // therefore we can delete the \e as it is redundant - - auto iter = find_if( node->elements.begin( ), node->elements.end( ), [] (UnboundedRegExpElement const * const & a ) -> bool { return dynamic_cast<UnboundedRegExpIteration const * const>( a );} ); - auto eps = find_if( node->elements.begin( ), node->elements.end( ), [] (UnboundedRegExpElement const * const & a ) -> bool { return dynamic_cast<UnboundedRegExpEpsilon const * const>( a );} ); - - if( iter != node->elements.end( ) && eps != node->elements.end( ) ) - { - delete *eps; - node->elements.erase( eps ); - return true; - } - - return false; -} +#include "RegExpOptimizeUnboundedPart.cxx" +#include "RegExpOptimizeFormalPart.cxx" } diff --git a/alib2algo/src/regexp/RegExpOptimize.h b/alib2algo/src/regexp/RegExpOptimize.h index 56fe7ea06c..eb0fb7e135 100644 --- a/alib2algo/src/regexp/RegExpOptimize.h +++ b/alib2algo/src/regexp/RegExpOptimize.h @@ -70,6 +70,8 @@ public: regexp::FormalRegExp optimize( const regexp::FormalRegExp & regexp ); void optimize( regexp::FormalRegExpElement & regexp ); private: + regexp::FormalRegExpElement * optimize( regexp::FormalRegExpElement const * const & node ); + regexp::UnboundedRegExpElement * optimize( regexp::UnboundedRegExpElement const * const & node ); regexp::UnboundedRegExpElement * optimize( regexp::UnboundedRegExpAlternation const * const & node ); regexp::UnboundedRegExpElement * optimize( regexp::UnboundedRegExpConcatenation const * const & node ); @@ -104,6 +106,30 @@ private: bool V10( regexp::UnboundedRegExpIteration * const & node ); bool X1( regexp::UnboundedRegExpAlternation * const & node ); + + bool S( regexp::FormalRegExpElement * & node ); + bool A1( regexp::FormalRegExpElement * & node ); + bool A2( regexp::FormalRegExpElement * & node ); + bool A3( regexp::FormalRegExpElement * & node ); + bool A4( regexp::FormalRegExpElement * & node ); + bool A5( regexp::FormalRegExpElement * & node ); + bool A6( regexp::FormalRegExpElement * & node ); + bool A7( 
regexp::FormalRegExpElement * & node );
+	bool A8( regexp::FormalRegExpElement * & node );
+	bool A9( regexp::FormalRegExpElement * & node );
+	bool A10( regexp::FormalRegExpElement * & node );
+	bool A11( regexp::FormalRegExpElement * & node );
+	bool V1( regexp::FormalRegExpElement * & node );
+	bool V2( regexp::FormalRegExpElement * & node );
+	bool V3( regexp::FormalRegExpElement * & node );
+	bool V4( regexp::FormalRegExpElement * & node );
+	bool V5( regexp::FormalRegExpElement * & node );
+	bool V6( regexp::FormalRegExpElement * & node );
+	bool V8( regexp::FormalRegExpElement * & node );
+	bool V9( regexp::FormalRegExpElement * & node );
+	bool V10( regexp::FormalRegExpElement * & node );
+
+	bool X1( regexp::FormalRegExpElement * & node );
 };
 
 }
diff --git a/alib2algo/src/regexp/RegExpOptimizeFormalPart.cxx b/alib2algo/src/regexp/RegExpOptimizeFormalPart.cxx
new file mode 100644
index 0000000000..9fc1361f41
--- /dev/null
+++ b/alib2algo/src/regexp/RegExpOptimizeFormalPart.cxx
@@ -0,0 +1,570 @@
+/**
+ * Optimizes a formal regexp subtree.
+ * The input is cloned and the rewrite rules are retried on the clone until none of
+ * them reports a change; the input node itself is not modified.
+ * @param node root of the subtree to optimize
+ * @return newly allocated optimized subtree (callers delete it)
+ */
+FormalRegExpElement* RegExpOptimize::optimize( FormalRegExpElement const * const & node )
+{
+	FormalRegExpElement* elem = node->clone();
+
+	// optimize while you can
+	while( A1( elem ) || A2( elem ) || A3( elem ) || A4( elem ) || A10( elem ) || V2( elem ) || V5( elem ) || V6( elem ) || X1( elem )
+	    || A5( elem ) || A6( elem ) || A7( elem ) || A8( elem ) || A9( elem ) || V8( elem ) //|| V9( elem )
+	    || A11( elem ) || V1( elem ) || V3( elem ) || V4( elem ) || V10( elem ) || S(elem) );
+
+	return elem;
+}
+
+/**
+ * helper S: optimizes the children of an alternation, concatenation or iteration
+ * Every child is replaced by its optimized clone and the previous child subtree is freed.
+ * A change is reported only if the optimized child differs from the original one by value.
+ * Comparing the pointers is not enough: optimize( ) always returns a fresh clone, so the
+ * pointers always differ and the loop in optimize( ) would never reach a fixed point.
+ * @param node FormalRegExpElement node
+ * @return bool true if at least one child changed else false
+ */
+bool RegExpOptimize::S( FormalRegExpElement * & node )
+{
+	bool optimized = false;
+
+	FormalRegExpAlternation * alternation = dynamic_cast<FormalRegExpAlternation*>( node );
+	if( alternation ) {
+		FormalRegExpElement * optimizedLeft = optimize( alternation->left );
+		if( ! ( * optimizedLeft == * alternation->left ) ) optimized = true;
+		delete alternation->left;
+		alternation->left = optimizedLeft;
+
+		FormalRegExpElement * optimizedRight = optimize( alternation->right );
+		if( ! ( * optimizedRight == * alternation->right ) ) optimized = true;
+		delete alternation->right;
+		alternation->right = optimizedRight;
+
+		return optimized;
+	}
+
+	FormalRegExpConcatenation * concatenation = dynamic_cast<FormalRegExpConcatenation*>( node );
+	if( concatenation ) {
+		FormalRegExpElement * optimizedLeft = optimize( concatenation->left );
+		if( ! ( * optimizedLeft == * concatenation->left ) ) optimized = true;
+		delete concatenation->left;
+		concatenation->left = optimizedLeft;
+
+		FormalRegExpElement * optimizedRight = optimize( concatenation->right );
+		if( ! ( * optimizedRight == * concatenation->right ) ) optimized = true;
+		delete concatenation->right;
+		concatenation->right = optimizedRight;
+
+		return optimized;
+	}
+
+	FormalRegExpIteration * iteration = dynamic_cast<FormalRegExpIteration*>( node );
+	if( iteration ) {
+		FormalRegExpElement * optimizedElement = optimize( iteration->element );
+		if( ! ( * optimizedElement == * iteration->element ) ) optimized = true;
+		delete iteration->element;
+		iteration->element = optimizedElement;
+
+		// report the change flag, not the (always non-null) node pointer
+		return optimized;
+	}
+
+	// symbol, epsilon or empty: no children to optimize
+	return optimized;
+}
+
+
+/**
+ * optimization A1: ( x + y ) + z = x + ( y + z )
+ * Rotates a left-nested alternation to the right; the inner alternation node is reused,
+ * nothing is allocated or freed.
+ * @param n FormalRegExpElement node
+ * @return bool true if optimization applied else false
+ */
+bool RegExpOptimize::A1( FormalRegExpElement * & n )
+{
+	FormalRegExpAlternation * node = dynamic_cast<FormalRegExpAlternation *>( n );
+	if( ! node ) return false;
+
+	FormalRegExpAlternation * leftAlt = dynamic_cast<FormalRegExpAlternation *>( node->left );
+
+	if( leftAlt ) {
+		FormalRegExpElement * x = leftAlt->left;
+		FormalRegExpElement * y = leftAlt->right;
+		FormalRegExpElement * z = node->right;
+
+		node->left = x;
+		node->right = leftAlt;
+		leftAlt->left = y;
+		leftAlt->right = z;
+
+		return true;
+	}
+
+	return false;
+}
+
+/**
+ * optimization A2: x + y = y + x (sort)
+ * @param n FormalRegExpElement node
+ * @return bool true if optimization applied else false
+ */
+bool RegExpOptimize::A2( FormalRegExpElement * & n )
+{
+	FormalRegExpAlternation * node = dynamic_cast<FormalRegExpAlternation *>( n );
+	if( !
node ) return false; + + FormalRegExpAlternation * rightAlt = dynamic_cast<FormalRegExpAlternation *>( node->right ); + + if( rightAlt ) { + FormalRegExpElement * x = node->left; + FormalRegExpElement * y = rightAlt->left; + + if(*x > *y) { + node->left = y; + rightAlt->left = x; + } else { + return false; + } + } + + return false; +} + +/** + * optimization A3: x + \0 = x + * @param node FormalRegExpElement node + * @return bool true if optimization applied else false + */ +bool RegExpOptimize::A3( FormalRegExpElement * & n ) +{ + FormalRegExpAlternation * node = dynamic_cast<FormalRegExpAlternation *>( n ); + if( ! node ) return false; + + // input can be \0 + \0, so at least one element must be preserved + + FormalRegExpEmpty * rightEmp = dynamic_cast<FormalRegExpEmpty *>( node->right ); + if( rightEmp ) { + delete rightEmp; + n = node->left; + node->left = NULL; + delete node; + return true; + } + + FormalRegExpEmpty * leftEmp = dynamic_cast<FormalRegExpEmpty *>( node->left ); + if( leftEmp ) { + delete leftEmp; + n = node->right; + node->right = NULL; + delete node; + return true; + } + + return false; +} + +/** + * optimization A4: x + x = x + * @param node FormalRegExpElement node + * @return bool true if optimization applied else false + */ +bool RegExpOptimize::A4( FormalRegExpElement * & n ) +{ + /* + * two ways of implementing this opitimization: + * - sort and call std::unique ( O(n lg n) + O(n) ), but it also sorts... + * - check every element against other ( O(n*n) ) + * + * As we always sort in optimization, we can use the first version, but A4 must be __always__ called __after__ A2 + */ + + FormalRegExpAlternation * node = dynamic_cast<FormalRegExpAlternation *>( n ); + if( ! 
node ) return false; + + if( node->left == node->right ) { + delete node->left; + n = node->right; + node->right = NULL; + delete node; + return true; + } + + return false; +} + +/** + * optimization A5: x.(y.z) = (x.y).z = x.y.z + * @param node FormalRegExpElement node + * @return bool true if optimization applied else false + */ +bool RegExpOptimize::A5( FormalRegExpElement * & n ) +{ + FormalRegExpConcatenation * node = dynamic_cast<FormalRegExpConcatenation *>( n ); + if( ! node ) return false; + + FormalRegExpConcatenation * leftCon = dynamic_cast<FormalRegExpConcatenation *>( node->left ); + + if( leftCon ) { + FormalRegExpElement * x = leftCon->left; + FormalRegExpElement * y = leftCon->right; + FormalRegExpElement * z = node->right; + + node->left = x; + node->right = leftCon; + leftCon->left = y; + leftCon->right = z; + + return true; + } + + return false; +} + +/** + * optimization A6: \e.x = x.\e = x + * @param node FormalRegExpElement node + * @return bool true if optimization applied else false + */ +bool RegExpOptimize::A6( FormalRegExpElement * & n ) +{ + FormalRegExpConcatenation * node = dynamic_cast<FormalRegExpConcatenation *>( n ); + if( ! node ) return false; + + // input can be \e + \e, so at least one element must be preserved + + FormalRegExpEpsilon * rightEmp = dynamic_cast<FormalRegExpEpsilon *>( node->right ); + if( rightEmp ) { + delete rightEmp; + n = node->left; + node->left = NULL; + delete node; + return true; + } + + FormalRegExpEpsilon * leftEmp = dynamic_cast<FormalRegExpEpsilon *>( node->left ); + if( leftEmp ) { + delete leftEmp; + n = node->right; + node->right = NULL; + delete node; + return true; + } + + return false; +} + +/** + * optimization A7: \0.x = x.\0 = \0 + * @param node FormalRegExpElement node + * @return bool true if optimization applied else false + */ +bool RegExpOptimize::A7( FormalRegExpElement * & n ) +{ + FormalRegExpConcatenation * node = dynamic_cast<FormalRegExpConcatenation *>( n ); + if( ! 
node ) return false;
+
+	if( dynamic_cast<FormalRegExpEmpty *>( node->right ) || dynamic_cast<FormalRegExpEmpty *>( node->left ) ) {
+		delete node;
+		n = new FormalRegExpEmpty { };
+		return true;
+	}
+
+	return false;
+}
+
+/**
+ * optimization A8: x.(y+z) = x.y + x.z
+ * Not implemented for formal regexps; always reports that nothing changed.
+ * @param n FormalRegExpElement node
+ * @return bool true if optimization applied else false
+ */
+bool RegExpOptimize::A8( FormalRegExpElement * & n )
+{
+	return false;
+}
+
+/**
+ * optimization A9: (x+y).z = x.z + y.z
+ * Not implemented for formal regexps; always reports that nothing changed.
+ * @param n FormalRegExpElement node
+ * @return bool true if optimization applied else false
+ */
+bool RegExpOptimize::A9( FormalRegExpElement * & n )
+{
+	return false;
+}
+
+/**
+ * optimization A10: x* = \e + x*x
+ * Handles \e + x*x, \e + xx*, x*x + \e and xx* + \e; the alternation is replaced by x*.
+ * @param n FormalRegExpElement node
+ * @return bool true if optimization applied else false
+ */
+bool RegExpOptimize::A10( FormalRegExpElement * & n )
+{
+	FormalRegExpAlternation * node = dynamic_cast<FormalRegExpAlternation *>( n );
+	if( ! node ) return false;
+
+	// one operand has to be \e, the other one a concatenation
+	FormalRegExpConcatenation * con = NULL;
+	if( dynamic_cast<FormalRegExpEpsilon *>( node->left ) )
+		con = dynamic_cast<FormalRegExpConcatenation *>( node->right );
+	else if( dynamic_cast<FormalRegExpEpsilon *>( node->right ) )
+		con = dynamic_cast<FormalRegExpConcatenation *>( node->left );
+	if( ! con ) return false;
+
+	// x*x: the iterated element has to equal the other operand by value (the operands are
+	// distinct objects, their addresses never match)
+	FormalRegExpIteration * leftIte = dynamic_cast<FormalRegExpIteration *>( con->left );
+	if( leftIte && * leftIte->element == * con->right ) {
+		// keep x*; \e, the concatenation and the spare x are released with the alternation
+		n = con->left;
+		con->left = NULL;
+		delete node;
+		return true;
+	}
+
+	// xx*
+	FormalRegExpIteration * rightIte = dynamic_cast<FormalRegExpIteration *>( con->right );
+	if( rightIte && * rightIte->element == * con->left ) {
+		n = con->right;
+		con->right = NULL;
+		delete node;
+		return true;
+	}
+
+	return false;
+}
+
+/**
+ * optimization A11: x* = (\e + x)*
+ * @param n FormalRegExpElement node
+ * @return bool true if optimization applied else false
+ */
+bool RegExpOptimize::A11( FormalRegExpElement * & n )
+{
+	FormalRegExpIteration * node = dynamic_cast<FormalRegExpIteration *>( n );
+	if( !
node ) return false; + + FormalRegExpAlternation * childAlt = dynamic_cast<FormalRegExpAlternation *>( node->element ); + if( childAlt ) + { + if(dynamic_cast<FormalRegExpEpsilon*>(childAlt->left)) { + node->element = childAlt->right; + childAlt->right = NULL; + delete childAlt; + return true; + } + if(dynamic_cast<FormalRegExpEpsilon*>(childAlt->right)) { + node->element = childAlt->left; + childAlt->left = NULL; + delete childAlt; + return true; + } + } + + return false; +} + +/** + * optimization V1: \0* = \e + * @param node FormalRegExpElement node + * @return bool true if optimization applied else false + */ +bool RegExpOptimize::V1( FormalRegExpElement * & n ) +{ + FormalRegExpIteration * node = dynamic_cast<FormalRegExpIteration *>( n ); + if( ! node ) return false; + + if( dynamic_cast<FormalRegExpEmpty*>( node->element ) ) + { + delete node; + n = new FormalRegExpEpsilon( ); + return true; + } + return false; +} + +/** + * optimization V2: x* + x = x* + * @param node FormalRegExpElement node + * @return bool true if optimization applied else false + */ +bool RegExpOptimize::V2( FormalRegExpElement * & n ) +{ + FormalRegExpAlternation * node = dynamic_cast<FormalRegExpAlternation *>( n ); + if( ! 
node ) return false;
+
+	// the iterated element has to equal the other operand by value; the operands are
+	// distinct objects, so comparing their addresses would never match
+	FormalRegExpIteration * leftIte = dynamic_cast<FormalRegExpIteration *>( node->left );
+	if( leftIte && * leftIte->element == * node->right ) {
+		// x* + x: keep x*, the spare x is released with the alternation
+		n = node->left;
+		node->left = NULL;
+		delete node;
+		return true;
+	}
+
+	FormalRegExpIteration * rightIte = dynamic_cast<FormalRegExpIteration *>( node->right );
+	if( rightIte && * rightIte->element == * node->left ) {
+		// x + x*
+		n = node->right;
+		node->right = NULL;
+		delete node;
+		return true;
+	}
+
+	return false;
+}
+
+/**
+ * optimization V3: x** = x*
+ * @param n FormalRegExpElement node
+ * @return bool true if optimization applied else false
+ */
+bool RegExpOptimize::V3( FormalRegExpElement * & n )
+{
+	FormalRegExpIteration * node = dynamic_cast<FormalRegExpIteration *>( n );
+	if( ! node ) return false;
+
+	FormalRegExpIteration* childIter = dynamic_cast<FormalRegExpIteration*>( node->element );
+	if( childIter )
+	{
+		// adopt the inner element and drop the redundant inner iteration
+		node->element = childIter->element;
+		childIter->element = NULL;
+		delete childIter;
+
+		return true;
+	}
+
+	return false;
+}
+
+/**
+ * optimization V4: (x+y)* = (x*y*)*
+ * Applies only below an iteration: ( x*y* )* is rewritten to ( x + y )*.
+ * A bare x*y* must not be touched, it does not describe the same language as x + y.
+ * @param n FormalRegExpElement node
+ * @return bool true if optimization applied else false
+ */
+bool RegExpOptimize::V4( FormalRegExpElement * & n )
+{
+	FormalRegExpIteration * node = dynamic_cast<FormalRegExpIteration *>( n );
+	if( ! node ) return false;
+
+	FormalRegExpConcatenation * childCon = dynamic_cast<FormalRegExpConcatenation *>( node->element );
+	if( ! childCon ) return false;
+
+	FormalRegExpIteration * leftIte = dynamic_cast<FormalRegExpIteration *>( childCon->left );
+	if( ! leftIte ) return false;
+
+	FormalRegExpIteration * rightIte = dynamic_cast<FormalRegExpIteration *>( childCon->right );
+	if( ! rightIte ) return false;
+
+	// x and y are moved into the new alternation; the remains of x*y* are released afterwards
+	node->element = new FormalRegExpAlternation(std::move( *leftIte->element ), std::move(*rightIte->element));
+
+	delete childCon;
+	return true;
+}
+
+/**
+ * optimization V5: x*y = y + x*xy
+ * Not implemented for formal regexps; always reports that nothing changed.
+ * @param n FormalRegExpElement node
+ * @return bool true if optimization applied else false
+ */
+bool RegExpOptimize::V5( FormalRegExpElement * & n )
+{
+	return false;
+}
+
+/**
+ * optimization V6: x*y = y + xx*y
+ * Not implemented for formal regexps; always reports that nothing changed.
+ * @param n FormalRegExpElement node
+ * @return bool true if optimization applied else false
+ */
+bool RegExpOptimize::V6( FormalRegExpElement * & n )
+{
+	return false;
+}
+
+/**
+ * optimization V8: \e in h(x) => xx*=x*
+ * Not implemented for formal regexps; always reports that nothing changed.
+ * @param n FormalRegExpElement node
+ * @return bool true if optimization applied else false
+ */
+bool RegExpOptimize::V8( FormalRegExpElement * & n )
+{
+	return false;
+}
+
+/**
+ * optimization V9: (xy)*x = x(yx)*
+ * Not implemented for formal regexps; always reports that nothing changed.
+ * @param n FormalRegExpElement node
+ * @return bool true if optimization applied else false
+ */
+bool RegExpOptimize::V9( FormalRegExpElement * & n )
+{
+	return false;
+}
+
+/**
+ * optimization V10: (x+y)* = (x*+y*)*
+ * Applies only below an iteration: ( x* + y* )* is rewritten to ( x + y )*.
+ * A bare x* + y* must not be touched, it does not describe the same language as x.y or x + y.
+ * @param n FormalRegExpElement node
+ * @return bool true if optimization applied else false
+ */
+bool RegExpOptimize::V10( FormalRegExpElement * & n )
+{
+	FormalRegExpIteration * node = dynamic_cast<FormalRegExpIteration *>( n );
+	if( ! node ) return false;
+
+	FormalRegExpAlternation * childAlt = dynamic_cast<FormalRegExpAlternation *>( node->element );
+	if( ! childAlt ) return false;
+
+	FormalRegExpIteration * leftIte = dynamic_cast<FormalRegExpIteration *>( childAlt->left );
+	if( ! leftIte ) return false;
+
+	FormalRegExpIteration * rightIte = dynamic_cast<FormalRegExpIteration *>( childAlt->right );
+	if( ! rightIte ) return false;
+
+	// x and y are moved into the new alternation; the remains of x* + y* are released afterwards
+	node->element = new FormalRegExpAlternation(std::move( *leftIte->element ), std::move(*rightIte->element));
+
+	delete childAlt;
+	return true;
+}
+
+/**
+ * optimization X1: a* + \e = a*
+ * An iteration already describes \e, so the epsilon operand is redundant.
+ * @param n FormalRegExpElement node
+ * @return bool true if optimization applied else false
+ */
+bool RegExpOptimize::X1( FormalRegExpElement * & n )
+{
+	FormalRegExpAlternation * node = dynamic_cast<FormalRegExpAlternation *>( n );
+	if( ! node ) return false;
+
+	FormalRegExpIteration * leftIte = dynamic_cast<FormalRegExpIteration *>( node->left );
+	if( leftIte ) {
+		if(dynamic_cast<FormalRegExpEpsilon*>(node->right)) {
+			n = node->left;
+			node->left = NULL;
+			delete node;
+			return true;
+		}
+	}
+
+	FormalRegExpIteration * rightIte = dynamic_cast<FormalRegExpIteration *>( node->right );
+	if( rightIte ) {
+		if(dynamic_cast<FormalRegExpEpsilon*>(node->left)) {
+			n = node->right;
+			node->right = NULL;
+			delete node;
+			return true;
+		}
+	}
+
+	return false;
+}
diff --git a/alib2algo/src/regexp/RegExpOptimizeUnboundedPart.cxx b/alib2algo/src/regexp/RegExpOptimizeUnboundedPart.cxx
new file mode 100644
index 0000000000..f375cbadfa
--- /dev/null
+++ b/alib2algo/src/regexp/RegExpOptimizeUnboundedPart.cxx
@@ -0,0 +1,1162 @@
+UnboundedRegExpElement* RegExpOptimize::optimize( UnboundedRegExpElement const * const & node )
+{
+	const UnboundedRegExpAlternation * alternation = dynamic_cast<const UnboundedRegExpAlternation*>( node );
+	if( alternation )
+		return optimize( alternation );
+
+	const UnboundedRegExpConcatenation * concatenation = dynamic_cast<const UnboundedRegExpConcatenation*>( node );
+	if( concatenation )
+		return optimize( concatenation );
+
+	const UnboundedRegExpIteration * iteration = dynamic_cast<const UnboundedRegExpIteration*>( node );
+	if( iteration )
+		return optimize( iteration );
+
+	const UnboundedRegExpSymbol * symbol = dynamic_cast<const UnboundedRegExpSymbol*>( node );
+	if( symbol )
+		return optimize( symbol );
+
+	const
UnboundedRegExpEmpty * empty= dynamic_cast<const UnboundedRegExpEmpty*>( node ); + if( empty ) + return optimize( empty ); + + const UnboundedRegExpEpsilon * eps = dynamic_cast<const UnboundedRegExpEpsilon*>( node ); + if( eps ) + return optimize( eps ); + + throw exception::AlibException( "RegExpOptimize::optimize - unknown UnboundedRegExpElement node" ); +} + + +UnboundedRegExpElement * RegExpOptimize::optimize( UnboundedRegExpAlternation const * const & node ) +{ + UnboundedRegExpAlternation* alt = new UnboundedRegExpAlternation( ); + + for( const auto & child : node->elements ) + alt->elements.push_back( optimize( child ) ); + + // optimize while you can + while( A1( alt ) || A2( alt ) || A3( alt ) || A4( alt ) || A10( alt ) || V2( alt ) || V5( alt ) || V6( alt ) || X1( alt ) ); + + if( alt->elements.size( ) == 1 ) + { + UnboundedRegExpElement* ret = alt->elements.front( ); + alt->elements.clear( ); + delete alt; + return ret; + } + + if( alt->elements.size( ) == 0 ) { + delete alt; + return new UnboundedRegExpEmpty( ); + } + + return alt; +} + +UnboundedRegExpElement * RegExpOptimize::optimize( UnboundedRegExpConcatenation const * const & node ) +{ + UnboundedRegExpConcatenation* concat = new UnboundedRegExpConcatenation( ); + + for( const auto & child : node->elements ) + concat->elements.push_back( optimize( child ) ); + + while( A5( concat ) || A6( concat ) || A7( concat ) || A8( concat ) || A9( concat ) || V8( concat ) );//|| V9( concat ) ); + + if( concat->elements.size( ) == 1 ) + { + UnboundedRegExpElement* ret = concat->elements.front( ); + concat->elements.clear( ); + delete concat; + return ret; + } + + if( concat->elements.size( ) == 0 ) { + delete concat; + return new UnboundedRegExpEpsilon( ); + } + + return concat; +} + +UnboundedRegExpElement * RegExpOptimize::optimize( UnboundedRegExpIteration const * const & node ) +{ + UnboundedRegExpIteration* iter = new UnboundedRegExpIteration(* optimize( node->element ) ); + + do + { + // V1 is 
implemented right here + if( dynamic_cast<UnboundedRegExpEmpty*>( iter->element ) ) + { + delete iter; + return new UnboundedRegExpEpsilon( ); + } + } + while( A11( iter ) || V1( iter ) || V3( iter ) || V4( iter ) || V10( iter ) ); + + return iter; +} + +UnboundedRegExpElement * RegExpOptimize::optimize( UnboundedRegExpSymbol const * const & node ) +{ + return node->clone( ); +} + +UnboundedRegExpElement * RegExpOptimize::optimize( UnboundedRegExpEmpty const * const & node ) +{ + return node->clone( ); +} + +UnboundedRegExpElement * RegExpOptimize::optimize( UnboundedRegExpEpsilon const * const & node ) +{ + return node->clone( ); +} + +/** + * optimization A1: x + ( y + z ) = ( x + y ) + z = x + y + z + * @param node UnboundedRegExpAlternation node + * @return bool true if optimization applied else false + */ +bool RegExpOptimize::A1( UnboundedRegExpAlternation * const & node ) +{ + bool optimized = false; + + for( auto it = node->elements.begin( ); it != node->elements.end( ); ) + { + UnboundedRegExpAlternation * const & childUnboundedRegExpAlternation = dynamic_cast<UnboundedRegExpAlternation *>( * it ); + + if( childUnboundedRegExpAlternation ) + { + it = node->elements.erase( it ); + + size_t off = it - node->elements.begin(); + node->elements.insert( it, childUnboundedRegExpAlternation->elements.begin( ), childUnboundedRegExpAlternation->elements.end( ) ); + it = node->elements.begin() + off; + + //TODO on g++-4.9 use: it = node->elements.insert( it, childUnboundedRegExpAlternation->elements.begin( ), childUnboundedRegExpAlternation->elements.end( ) ); + + childUnboundedRegExpAlternation->elements.clear( ); + delete childUnboundedRegExpAlternation; + + optimized = true; + } + else + { + it ++; + } + } + + return optimized; +} + +/** + * optimization A2: x + y = y + x (sort) + * @param node UnboundedRegExpAlternation node + * @return bool true if optimization applied else false + */ +bool RegExpOptimize::A2( UnboundedRegExpAlternation * const & node ) +{ + 
std::function<bool( UnboundedRegExpElement const * const & a, UnboundedRegExpElement const * const & b )> cmp = [ ]( UnboundedRegExpElement const * const & a, UnboundedRegExpElement const * const & b ) -> bool { return *a < *b; }; + + if( std::is_sorted( node->elements.begin( ), node->elements.end( ), cmp ) ) + return false; + + std::sort( node->elements.begin(), node->elements.end(), cmp ); + return true; +} + +/** + * optimization A3: x + \0 = x + * @param node UnboundedRegExpAlternation node + * @return bool true if optimization applied else false + */ +bool RegExpOptimize::A3( UnboundedRegExpAlternation * const & node ) +{ + bool optimized = false; + + // input can be \0 + \0, so at least one element must be preserved + + for( auto it = node->elements.begin( ); it != node->elements.end( ); ) + { + UnboundedRegExpEmpty const * const & empty = dynamic_cast<UnboundedRegExpEmpty const *>( * it ); + + if( empty && node->elements.size( ) > 1 ) + { + it = node->elements.erase( it ); + delete empty; + + optimized = true; + } + else + { + it ++; + } + } + + return optimized; +} + +/** + * optimization A4: x + x = x + * @param node UnboundedRegExpAlternation node + * @return bool true if optimization applied else false + */ +bool RegExpOptimize::A4( UnboundedRegExpAlternation * const & node ) +{ + /* + * two ways of implementing this opitimization: + * - sort and call std::unique ( O(n lg n) + O(n) ), but it also sorts... 
+ * - check every element against other ( O(n*n) ) + * + * As we always sort in optimization, we can use the first version, but A4 must be __always__ called __after__ A2 + */ + + bool optimized = false; + if(node->elements.size() != 0) for( auto it = std::next( node->elements.begin( ) ); it != node->elements.end( ); ) + { + if ( ** it == ** std::prev( it ) ) + { + delete * it; + it = node->elements.erase( it ); + optimized = true; + } + else + { + it ++; + } + } + + return optimized; +} + +/** + * optimization A5: x.(y.z) = (x.y).z = x.y.z + * @param node UnboundedRegExpConcatenation node + * @return bool true if optimization applied else false + */ +bool RegExpOptimize::A5( UnboundedRegExpConcatenation * const & node ) +{ + bool optimized = false; + + for( auto it = node->elements.begin( ); it != node->elements.end( ); ) + { + UnboundedRegExpConcatenation * const & childUnboundedRegExpConcatenation = dynamic_cast<UnboundedRegExpConcatenation *>( * it ); + + if( childUnboundedRegExpConcatenation ) + { + it = node->elements.erase( it ); + + size_t off = it - node->elements.begin(); + node->elements.insert( it, childUnboundedRegExpConcatenation->elements.begin( ), childUnboundedRegExpConcatenation->elements.end( ) ); + it = node->elements.begin() + off; + + //TODO on g++-4.9 use: it = node->elements.insert( it, childUnboundedRegExpConcatenation->elements.begin( ), childUnboundedRegExpConcatenation->elements.end( ) ); + + childUnboundedRegExpConcatenation->elements.clear( ); + delete childUnboundedRegExpConcatenation; + + optimized = true; + } + else + it ++; + } + + return optimized; +} + +/** + * optimization A6: \e.x = x.\e = x + * @param node UnboundedRegExpConcatenation node + * @return bool true if optimization applied else false + */ +bool RegExpOptimize::A6( UnboundedRegExpConcatenation * const & node ) +{ + bool optimized = false; + + for( auto it = node->elements.begin( ); it != node->elements.end( ); ) + { + UnboundedRegExpEpsilon* epsilon = 
dynamic_cast<UnboundedRegExpEpsilon*>( * it ); + if( epsilon && node->elements.size( ) > 1 ) + { + delete * it; + it = node->elements.erase( it ); + + optimized = true; + } + else + it ++; + } + + return optimized; +} + +/** + * optimization A7: \0.x = x.\0 = \0 + * @param node UnboundedRegExpConcatenation node + * @return bool true if optimization applied else false + */ +bool RegExpOptimize::A7( UnboundedRegExpConcatenation * const & node ) +{ + bool optimized = false; + + if( std::any_of( node->elements.begin( ), node->elements.end( ), []( UnboundedRegExpElement const * const & a ) -> bool{ return dynamic_cast<UnboundedRegExpEmpty const *>( a ); } ) ) + { + if(node->elements.size() == 1) return false; + + for( auto const& child : node->elements ) + delete child; + + node->elements.clear( ); + node->elements.push_back( new UnboundedRegExpEmpty( ) ); + + optimized = true; + } + + return optimized; +} + +/** + * optimization A8: x.(y+z) = x.y + x.z + * @param node UnboundedRegExpConcatenation node + * @return bool true if optimization applied else false + */ +bool RegExpOptimize::A8( UnboundedRegExpConcatenation * const & node ) +{ +/* + bool optimized = false; + + for( auto it = std::next( node->elements.begin( ) ); it != node->elements.end( ); ) + { + UnboundedRegExpAlternation * alt = dynamic_cast<UnboundedRegExpAlternation*>( * it ); + if( ! alt ) + { + it ++; + continue; + } + + // take everything to the left and copy it as prefix of every element in alternation. 
+ UnboundedRegExpConcatenation * leftPart = new UnboundedRegExpConcatenation( ); + leftPart->elements.insert( leftPart->elements.end( ), node->elements.begin( ), it ); + + for( auto altIt = alt->elements.begin( ); altIt != alt->elements.end( ); altIt ++ ) + { + UnboundedRegExpConcatenation * altElem = new UnboundedRegExpConcatenation( ); + altElem->elements.push_back( leftPart->clone( ) ); + altElem->elements.push_back( * altIt ); + + * altIt = altElem; + } + + UnboundedRegExpElement * optIt = optimize( * it ); + delete *it; + *it = optIt; + + delete leftPart; + it = node->elements.erase( node->elements.begin( ), it ); + + optimized = true; + it ++; + } + + return optimized; +*/ + return false; +} + +/** + * optimization A9: (x+y).z = x.z + y.z + * @param node UnboundedRegExpConcatenation node + * @return bool true if optimization applied else false + */ +bool RegExpOptimize::A9( UnboundedRegExpConcatenation * const & node ) +{ +/* + bool optimized = false; + + for( auto it = node->elements.begin( ); it != std::prev( node->elements.end( ) ); ) + { + UnboundedRegExpAlternation * alt = dynamic_cast<UnboundedRegExpAlternation*>( * it ); + if( ! alt ) + { + it ++; + continue; + } + + // take everything to the right and copy it as suffix of every element in alternation. 
+ UnboundedRegExpConcatenation * rest = new UnboundedRegExpConcatenation( ); + rest->elements.insert( rest->elements.end( ), std::next( it ), node->elements.end( ) ); + + for( auto altIt = alt->elements.begin( ); altIt != alt->elements.end( ); altIt ++ ) + { + UnboundedRegExpConcatenation * altElem = new UnboundedRegExpConcatenation( ); + altElem->elements.push_back( * altIt ); + altElem->elements.push_back( rest->clone( ) ); + + * altIt = altElem; + } + + UnboundedRegExpElement * optIt = optimize( * it ); + delete *it; + *it = optIt; + + delete rest; + it = node->elements.erase( std::next( it ), node->elements.end( ) ); + optimized = true; + + // as we move (delete) the rest of this expression, it surely won't do another round. More optimizations to be performed are in subtree now. + // we do not care about this here as method optimize(UnboundedRegExpAlternation) will take care of this in next iteration + // it ++; + break; + } + + return optimized; +*/ + return false; +} + +/** + * optimization A10: x* = \e + x*x + * @param node UnboundedRegExpAlternation node + * @return bool true if optimization applied else false + */ +bool RegExpOptimize::A10( UnboundedRegExpAlternation * const & node ) +{ + bool optimized = false, optimizedIter = false; + + /* + * problem: + * - \e + x*x = x* + * - but if we do not have the eps, but we do have iteration, then \e \in h(iter), therefore \e in h(node). 
+ */ + + for( auto it = node->elements.begin( ); it != node->elements.end( ); ) + { + optimizedIter = false; + + // check if we have some epsilon or iteration left, else nothing to do + auto eps = find_if( node->elements.begin( ), node->elements.end( ), [ ]( UnboundedRegExpElement const * const & a ) -> bool { + return dynamic_cast<UnboundedRegExpEpsilon const *>( a ) || dynamic_cast<UnboundedRegExpIteration const*>( a ); + }); + if( eps == node->elements.end( ) ) + break; + + UnboundedRegExpConcatenation const * const & childConcat = dynamic_cast<UnboundedRegExpConcatenation const *>( *it ); + if( childConcat ) + { + // if iteration is first element of concatenation + UnboundedRegExpIteration const * const & iter = dynamic_cast<UnboundedRegExpIteration const *>( childConcat->elements.front( ) ); + + if( iter ) + { + // concatenation without the iteration node + UnboundedRegExpConcatenation *tmpConcat = dynamic_cast<UnboundedRegExpConcatenation *>( childConcat->clone( ) ); + delete tmpConcat->elements.front( ); + tmpConcat->elements.erase( tmpConcat->elements.begin( ) ); + UnboundedRegExpElement * tmpConcatOpt = optimize( tmpConcat ); + + // check if iteration element is the same subtree as rest of concatenation + if( * iter->element == * tmpConcatOpt ) + { + optimized = optimizedIter = true; + + size_t off = it - node->elements.begin(); + node->elements.push_back( iter->clone( ) ); + it = node->elements.begin() + off; + + delete childConcat; + it = node->elements.erase( it ); + + // find the eps again - invalidated after prev erase + eps = find_if( node->elements.begin( ), node->elements.end( ), [ ]( UnboundedRegExpElement const * const & a ) -> bool { + return dynamic_cast<UnboundedRegExpEpsilon const *>( a ); + }); + // if it was eps, delete it + // if it was not the eps but iteration, keep it + if( eps != node->elements.end( ) ) + { + delete *eps; + it = node->elements.erase( eps ); + } + } + delete tmpConcat; + delete tmpConcatOpt; + } + } + + if( ! 
optimizedIter ) + it ++; + } + + return optimized; +} + +/** + * optimization A11: x* = (\e + x)* + * @param node UnboundedRegExpIteration node + * @return bool true if optimization applied else false + */ +bool RegExpOptimize::A11( UnboundedRegExpIteration * const & node ) +{ + bool optimized = false; + + UnboundedRegExpAlternation * const & childAlt = dynamic_cast<UnboundedRegExpAlternation *>( node->element ); + + if( childAlt ) + { + // check if eps inside iteration's alternation + auto eps = find_if( childAlt->elements.begin( ), childAlt->elements.end( ), [ ]( UnboundedRegExpElement const * const & a ) -> bool { + return dynamic_cast<UnboundedRegExpEpsilon const *>( a ); + }); + + // if no eps + if( eps == childAlt->elements.end( ) ) + return false; + + // remove eps from alternation + optimized = true; + delete * eps; + childAlt->elements.erase( eps ); + } + + return optimized; +} + +/** + * optimization V1: \0* = \e + * @param node UnboundedRegExpIteration node + * @return bool true if optimization applied else false + */ +bool RegExpOptimize::V1( UnboundedRegExpIteration * const & node ) +{ + // implemented in optimize( UnboundedRegExpIteration ) + + return false; +} + +/** + * optimization V2: x* + x = x* + * @param node UnboundedRegExpAlternation node + * @return bool true if optimization applied else false + */ +bool RegExpOptimize::V2( UnboundedRegExpAlternation * const & node ) +{ + bool optimized = false; + + /* + * Bit tricky + * We need also to cover the cases like (a+b)* + a + b + c = (a+b)* + c + */ + + std::list<UnboundedRegExpElement*> iterElements; + // cache iter elements because of iterator invalidation after erase + for( const auto & n : node->elements ) + { + UnboundedRegExpIteration* iter = dynamic_cast<UnboundedRegExpIteration*>( n ); + if( iter ) + iterElements.push_back( iter->element ); + } + + for( const auto & n : iterElements ) + { + // if alternation is inside, we need to make sure that every element of alternation is inside 
node->elements. if so, delete them all + UnboundedRegExpAlternation * tmpAlt = dynamic_cast<UnboundedRegExpAlternation*>( n ); + if( tmpAlt ) + { + bool every = true; + for( const auto & altElem : tmpAlt->elements ) + { + auto it = find_if( node->elements.begin( ), node->elements.end( ), [ altElem ]( UnboundedRegExpElement const * const & a ) -> bool { + return *a == *altElem; + }); + + if( it == node->elements.end( ) ) + every = false; + } + + if ( every == true ) + { + optimized = true; + + for( const auto & altElem : tmpAlt->elements ) + { + auto it = find_if( node->elements.begin( ), node->elements.end( ), [ altElem ]( UnboundedRegExpElement const * const & a ) -> bool { + return *a == *altElem; + }); + assert( it != node->elements.end( ) ); + + delete *it; + node->elements.erase( it ); + } + } + } + + // else + for( auto it = node->elements.begin( ); it != node->elements.end( ); ) + { + if( *n == **it ) + { + optimized = true; + + delete *it; + it = node->elements.erase( it ); + } + else + { + it ++; + } + } + } + + return optimized; +} + +/** + * optimization V3: x** = x* + * @param node UnboundedRegExpIteration node + * @return bool true if optimization applied else false + */ +bool RegExpOptimize::V3( UnboundedRegExpIteration * const & node ) +{ + UnboundedRegExpIteration* childIter = dynamic_cast<UnboundedRegExpIteration*>( node->element ); + if( childIter ) + { + node->element = childIter->element; + childIter->element = NULL; + delete childIter; + + return true; + } + + return false; +} + +/** + * optimization V4: (x+y)* = (x*y*)* + * @param node UnboundedRegExpIteration node + * @return bool true if optimization applied else false + */ +bool RegExpOptimize::V4( UnboundedRegExpIteration * const & node ) +{ + // interpretation: if iteration's element is concat and every concat's element is iteration + UnboundedRegExpConcatenation* alt = dynamic_cast<UnboundedRegExpConcatenation*>( node->element ); + if( ! alt || ! 
all_of( alt->elements.begin( ), alt->elements.end( ), [] ( UnboundedRegExpElement const * const & a ) -> bool{ return dynamic_cast<UnboundedRegExpIteration const * const >( a ); } ) ) + return false; + + UnboundedRegExpAlternation * newAlt = new UnboundedRegExpAlternation( ); + + for( const auto & n : alt->elements ) + { + UnboundedRegExpIteration * iter = dynamic_cast<UnboundedRegExpIteration*>( n ); + newAlt->elements.push_back( iter->element ); + iter->element = NULL; + } + + node->element = optimize( newAlt ); + delete alt; + delete newAlt; + + return true; +} + +/** + * optimization V5: x*y = y + x*xy + * @param node UnboundedRegExpAlternation node + * @return bool true if optimization applied else false + */ +bool RegExpOptimize::V5( UnboundedRegExpAlternation * const & node ) +{ + bool optimized = false; + + // reinterpretation: ax*y = ay+ax*xy + // so, if we find iter, a = everything that is before it (prefix) + // x = iter's content + // behind iter must be exactly iter's content + // y = rest (suffix) + // prefix.x*x.suffix + prefix.suffix = prefix.x*.suffix + + for( auto itA = node->elements.begin( ); itA != node->elements.end( ); ) + { + UnboundedRegExpConcatenation * concat = dynamic_cast<UnboundedRegExpConcatenation*>( * itA ); + if( ! concat ) + { + itA ++; + continue; + } + + for( auto itC = concat->elements.begin( ); itC != std::prev( concat->elements.end( ) ); ) + { + UnboundedRegExpIteration * iter = dynamic_cast<UnboundedRegExpIteration*>( *itC ); + if( ! iter ) + { + itC ++; + continue; + } + + // iteration's element must follow the iteration (x*x) + auto itStartY = std::next( itC ); //itStartY points to y in expression x*xy + + // if iter's element is concat + if( dynamic_cast<UnboundedRegExpConcatenation*>( iter->element ) ) + { + UnboundedRegExpConcatenation * iterConcat = dynamic_cast<UnboundedRegExpConcatenation*>( iter->element ); + + // std::cout << "....." 
<< std::endl; + // std::cout << RegExp( concat ) << std::endl; + // std::cout << RegExp( iterConcat ) << std::endl; + // UnboundedRegExpConcatenation * tmp = new UnboundedRegExpConcatenation( ); + // tmp->elements.insert( tmp->elements.end( ), std::next( itC ), concat->elements.end( ) ); + // std::cout << RegExp( tmp) << std::endl; + + if( distance( iterConcat->elements.begin( ), iterConcat->elements.end( ) ) != distance( std::next( itC ), concat->elements.end( ) ) + || ! equal( iterConcat->elements.begin( ), iterConcat->elements.end( ), std::next( itC ), + [ ]( UnboundedRegExpElement const * const & a, UnboundedRegExpElement const * const & b ) -> bool{ return *a == *b; } ) ) + { + itC++; + continue; + } + advance( itStartY, (int)iterConcat->elements.size( ) ); + } + // else + else + { + if( * iter->element != ** std::next( itC ) ) + { + itC ++; + continue; + } + + advance( itStartY, 1 ); + } + + // store everything before iteration as "a" + UnboundedRegExpElement * regexpA; + if( concat->elements.begin( ) == itC ) + { + regexpA = new UnboundedRegExpEpsilon( ); + } + else + { + UnboundedRegExpConcatenation * tmpA = new UnboundedRegExpConcatenation( ); + tmpA->elements.insert( tmpA->elements.end( ), concat->elements.begin( ), itC ); + regexpA = optimize( tmpA ); + tmpA->elements.clear( ); + delete tmpA; + } + + // store everything behind iteration's followup element as "y" + UnboundedRegExpElement * regexpY; + if( itStartY == concat->elements.end( ) ) + { + regexpY = new UnboundedRegExpEpsilon( ); + } + else + { + UnboundedRegExpConcatenation* tmpY = new UnboundedRegExpConcatenation( ); + tmpY->elements.insert( tmpY->elements.end( ), itStartY, concat->elements.end( ) ); + regexpY = optimize( tmpY ); + tmpY->elements.clear( ); + delete tmpY; + } + + // concatenate "a" and "y" and see if they exist somewhere in parent alternation ( node->elements ) + UnboundedRegExpConcatenation* tmpAY = new UnboundedRegExpConcatenation( ); + tmpAY->elements.push_back( regexpA ); + 
tmpAY->elements.push_back( regexpY ); + UnboundedRegExpElement * regexpAY = optimize( tmpAY ); + tmpAY->elements.clear( ); + delete tmpAY; + + auto iterAY = find_if( node->elements.begin( ), node->elements.end( ), [ regexpAY ] ( UnboundedRegExpElement const * const & a ) -> bool{ return *a == *regexpAY; } ); + if( iterAY == node->elements.end( ) ) + { + itC ++; + continue; + } + + // if AY exists, then we can simply do this: + //iterator invalidated, need to backup concat node + UnboundedRegExpElement * tmpItA = *itA; + + delete *iterAY; + node->elements.erase( iterAY ); + + // iterator invalidated, need to recall before erase + itA = find_if( node->elements.begin( ), node->elements.end( ), [ tmpItA ]( UnboundedRegExpElement const * const & a ) -> bool { return *a == *tmpItA; } ); + + UnboundedRegExpConcatenation * tmpAltered = new UnboundedRegExpConcatenation( ); + tmpAltered->elements.push_back( regexpA ); + tmpAltered->elements.push_back( * itC ); + tmpAltered->elements.push_back( regexpY ); + UnboundedRegExpElement * regexpAltered = optimize( tmpAltered ); + + tmpAltered->elements.clear( ); + delete tmpAltered; + + delete regexpA; + delete regexpY; + delete regexpAY; + + delete *itA; + itA = node->elements.erase( itA ); + + node->elements.insert( itA, regexpAltered ); + + optimized = true; + break; + } + + itA ++; + } + + return optimized; +} + +/** + * optimization V6: x*y = y + xx*y + * @param node UnboundedRegExpAlternation node + * @return bool true if optimization applied else false + */ +bool RegExpOptimize::V6( UnboundedRegExpAlternation * const & node ) +{ + bool optimized = false; + + // reinterpretation: ax*y = ay+axx*y + // so, if we find iter, a = everything that is before it (prefix) + // x = iter's content + // before iter must be exactly iter's content + // y = rest (suffix) + // prefix.xx*.suffix + prefix.suffix = prefix.x*.suffix + + for( auto itA = node->elements.begin( ); itA != node->elements.end( ); ) + { + UnboundedRegExpConcatenation * 
concat = dynamic_cast<UnboundedRegExpConcatenation*>( * itA ); + if( ! concat ) + { + itA ++; + continue; + } + + for( auto itC = std::next( concat->elements.begin( ) ); itC != concat->elements.end( ); ) + { + UnboundedRegExpIteration * iter = dynamic_cast<UnboundedRegExpIteration*>( * itC ); + if( ! iter ) + { + itC ++; + continue; + } + + // iteration's element must precede the iteration (xx*) + auto itStartX = itC; //itStartX points to first x in expression xx*, everything before is therefore prefix - regexp "a" + + // if iter's element is concat + if( dynamic_cast<UnboundedRegExpConcatenation*>( iter->element ) ) + { + UnboundedRegExpConcatenation * iterConcat = dynamic_cast<UnboundedRegExpConcatenation*>( iter->element ); + + if( distance( concat->elements.begin( ), itC ) < (int)iterConcat->elements.size( ) ) + { + itC ++; + continue; + } + advance( itStartX, - (int)(iterConcat->elements.size( ) ) ); + + if( distance( iterConcat->elements.begin( ), iterConcat->elements.end( ) ) != distance( itStartX, concat->elements.end( ) ) + || + ! 
equal( iterConcat->elements.begin( ), iterConcat->elements.end( ), itStartX, + []( UnboundedRegExpElement const * const & a, UnboundedRegExpElement const * const & b ) -> bool{ return *a == *b; } ) ) + { + itC++; + continue; + } + } + // else + else + { + if( * iter->element != ** std::prev( itC ) ) + { + itC ++; + continue; + } + + advance( itStartX, -1 ); + } + + // store everything before x as "a" + UnboundedRegExpElement * regexpA; + if( concat->elements.begin( ) == itStartX ) + { + regexpA = new UnboundedRegExpEpsilon( ); + } + else + { + UnboundedRegExpConcatenation* tmpA = new UnboundedRegExpConcatenation( ); + tmpA->elements.insert( tmpA->elements.end( ), concat->elements.begin( ), itStartX ); + regexpA = optimize( tmpA ); + tmpA->elements.clear( ); + delete tmpA; + } + + // store everything behind iteration's followup element as "y" + UnboundedRegExpElement * regexpY; + if( std::next( itC ) == concat->elements.end( ) ) + { + regexpY = new UnboundedRegExpEpsilon( ); + } + else + { + UnboundedRegExpConcatenation* tmpY = new UnboundedRegExpConcatenation( ); + tmpY->elements.insert( tmpY->elements.end( ), std::next( itC ), concat->elements.end( ) ); + regexpY = optimize( tmpY ); + tmpY->elements.clear( ); + delete tmpY; + } + + // concatenate "a" and "y" and see if they exist somewhere in parent alternation ( node->elements ) + UnboundedRegExpConcatenation* tmpAY = new UnboundedRegExpConcatenation( ); + tmpAY->elements.push_back( regexpA ); + tmpAY->elements.push_back( regexpY ); + UnboundedRegExpElement * regexpAY = optimize( tmpAY ); + tmpAY->elements.clear( ); + delete tmpAY; + + auto iterAY = find_if( node->elements.begin( ), node->elements.end( ), [ regexpAY ] ( UnboundedRegExpElement const * const & a ) -> bool{ return *a == *regexpAY; } ); + if( iterAY == node->elements.end( ) ) + { + itC ++; + continue; + } + + // if AY exists, then we can simply do this: + //iterator invalidated, need to backup concat node + UnboundedRegExpElement * tmpItA = *itA; + 
delete *iterAY; + node->elements.erase( iterAY ); + + // iterator invalidated, need to recall before erase + itA = find_if( node->elements.begin( ), node->elements.end( ), [ tmpItA ]( UnboundedRegExpElement const * const & a ) -> bool { return *a == *tmpItA; } ); + + UnboundedRegExpConcatenation * tmpAltered = new UnboundedRegExpConcatenation( ); + tmpAltered->elements.push_back( regexpA ); + tmpAltered->elements.push_back( * itC ); + tmpAltered->elements.push_back( regexpY ); + UnboundedRegExpElement * regexpAltered = optimize( tmpAltered ); + + tmpAltered->elements.clear( ); + delete tmpAltered; + + delete regexpA; + delete regexpY; + delete regexpAY; + + delete *itA; + itA = node->elements.erase( itA ); + + node->elements.insert( itA, regexpAltered ); + optimized = true; + break; + } + + itA ++; + } + + return optimized; +} + +/** + * optimization V8: \e in h(x) => xx*=x* + * @param node UnboundedRegExpConcatenation node + * @return bool true if optimization applied else false + */ +bool RegExpOptimize::V8( UnboundedRegExpConcatenation * const & node ) +{ + bool optimized = false; + + // interpretation: if there is iteration in concatenation node, and element of iteration contains eps and is straight before this iteration, then this element can be omitted + + for( auto it = next( node->elements.begin( ) ); it != node->elements.end( ); ) + { + UnboundedRegExpIteration* iter = dynamic_cast<UnboundedRegExpIteration*>( * it ); + + if( ! 
iter ) + { + it ++; + continue; + } + + // if element of iteration is concatenation, we need to check this specially + UnboundedRegExpConcatenation * concat = dynamic_cast<UnboundedRegExpConcatenation*>( iter->element ); + + if( concat ) + { + // check if not out of bounds + if( distance( node->elements.begin( ), it ) < distance( concat->elements.begin(), concat->elements.end() ) ) + { + it ++; + continue; + } + + //FIXME: int cast + auto it2 = it; + advance( it2, - (int)concat->elements.size( ) ); + + if( regexp::RegExpEpsilon::languageContainsEpsilon(*concat) && + distance( concat->elements.begin( ), concat->elements.end( )) == distance ( it2, node->elements.end( ) ) && + equal( concat->elements.begin( ), concat->elements.end( ), it2, [] ( UnboundedRegExpElement const * const & a, UnboundedRegExpElement const * const & b ) -> bool { return *a == *b; } ) ) + { + optimized = true; + + for( auto delIt = it2 ; delIt != it ; delIt ++ ) + delete *delIt; + it = node->elements.erase( it2, it ); + } + else + { + it ++; + } + } + // else + else + { + if( it == node->elements.begin( ) ) + { + it++; + continue; + } + + auto prev = std::prev( it ); + + if( regexp::RegExpEpsilon::languageContainsEpsilon(*(iter->element)) && *( iter->element ) == **prev ) + { + delete * prev; + it = node->elements.erase( prev ); + optimized = true; + + // in case xxx*, we need to stay on the iter element, not to go behind it + if( it != node->elements.begin( ) ) + it = std::prev( it ); + } + else + { + it ++; + } + } + } + + return optimized; +} + +/** + * optimization V9: (xy)*x = x(yx)* + * @param node UnboundedRegExpConcatenation node + * @return bool true if optimization applied else false + */ +bool RegExpOptimize::V9( UnboundedRegExpConcatenation * const & node ) +{ + bool optimized = false; + + // interpretation: if concat (C1) with iter && iteration's element is concat (C2), then: + // simultaneously iterate through C1 and C2. 
(axy)*axz=ax(yax)*z -> get ax that is same and relocate them... + + for( auto it = node->elements.begin( ) ; it != node->elements.end( ) ; ) + { + UnboundedRegExpIteration * iter = dynamic_cast<UnboundedRegExpIteration*>( * it ); + if ( ! iter ) + { + it++; + continue; + } + UnboundedRegExpConcatenation * concat = dynamic_cast<UnboundedRegExpConcatenation*>( iter->element ); + if( ! concat ) + { + it++; + continue; + } + + // find range from <it+1;sth> and <concat.begin;sth> that is equal + auto c1Iter = std::next( it ), c2Iter = concat->elements.begin( ); + while( c1Iter != node->elements.end() && c2Iter != concat->elements.end( ) && **c1Iter == ** c2Iter ) + { + c1Iter ++; + c2Iter ++; + } + + if( c1Iter == std::next( it ) ) + { + it ++; + continue; + } + + // std::cout << "xy" << std::endl; + // UnboundedRegExpConcatenation* tmp = new UnboundedRegExpConcatenation( ); + // tmp->elements.insert( tmp->elements.end( ), std::next( it ), c1Iter ); + // std::cout << RegExp( tmp ) << std::endl; + + // copy the range <it;sth>, delete it and go back to the iter node + std::vector<UnboundedRegExpElement*> copyRange; + copyRange.insert( copyRange.end(), std::next( it ), c1Iter ); + it = node->elements.erase( std::next( it ), c1Iter ); + it = std::prev( it ); + + // insert that range before it position + node->elements.insert( it, copyRange.begin( ), copyRange.end( ) ); + + // alter the iteration's concat node + copyRange.clear( ); + copyRange.insert( copyRange.end(), concat->elements.begin( ), c2Iter ); + concat->elements.erase( concat->elements.begin( ), c2Iter ); + concat->elements.insert( concat->elements.end(), copyRange.begin( ), copyRange.end( ) ); + } + + return optimized; +} + +/** + * optimization V10: (x+y)* = (x*+y*)* + * @param node UnboundedRegExpIteration node + * @return bool true if optimization applied else false + */ +bool RegExpOptimize::V10( UnboundedRegExpIteration * const & node ) +{ + // interpretation: if iter's child is alternation where its every 
child is iteration, then they do not have to be iteration + UnboundedRegExpAlternation* alt = dynamic_cast<UnboundedRegExpAlternation*>( node->element ); + if( ! alt || ! all_of( alt->elements.begin( ), alt->elements.end( ), [] ( UnboundedRegExpElement const * const & a ) -> bool{ return dynamic_cast<UnboundedRegExpIteration const * const >( a ); } ) ) + return false; + + UnboundedRegExpAlternation * newAlt = new UnboundedRegExpAlternation( ); + + for( const auto & n : alt->elements ) + { + UnboundedRegExpIteration * iter = dynamic_cast<UnboundedRegExpIteration*>( n ); + newAlt->elements.push_back( iter->element ); + iter->element = NULL; + } + + node->element = optimize( newAlt ); + delete alt; + delete newAlt; + + return true; +} + +/** + * optimization X1: a* + \e = a* + * @param node UnboundedRegExpAlternation node + * @return bool true if optimization applied else false + */ +bool RegExpOptimize::X1( UnboundedRegExpAlternation * const & node ) +{ + // theorem: In regexp like a* + \e, \e is described twice, first in a*, second in \e. 
+ // therefore we can delete the \e as it is redundant + + auto iter = find_if( node->elements.begin( ), node->elements.end( ), [] (UnboundedRegExpElement const * const & a ) -> bool { return dynamic_cast<UnboundedRegExpIteration const * const>( a );} ); + auto eps = find_if( node->elements.begin( ), node->elements.end( ), [] (UnboundedRegExpElement const * const & a ) -> bool { return dynamic_cast<UnboundedRegExpEpsilon const * const>( a );} ); + + if( iter != node->elements.end( ) && eps != node->elements.end( ) ) + { + delete *eps; + node->elements.erase( eps ); + return true; + } + + return false; +} diff --git a/alib2data/src/regexp/formal/FormalRegExpAlternation.h b/alib2data/src/regexp/formal/FormalRegExpAlternation.h index cdc61ee511..cb53ab3f24 100644 --- a/alib2data/src/regexp/formal/FormalRegExpAlternation.h +++ b/alib2data/src/regexp/formal/FormalRegExpAlternation.h @@ -98,6 +98,8 @@ public: */ virtual void operator>>(std::ostream& out) const; + friend class RegExpOptimize; + virtual operator std::string() const; virtual int selfTypeId() const { diff --git a/alib2data/src/regexp/formal/FormalRegExpConcatenation.h b/alib2data/src/regexp/formal/FormalRegExpConcatenation.h index ecc788c3df..1f539385db 100644 --- a/alib2data/src/regexp/formal/FormalRegExpConcatenation.h +++ b/alib2data/src/regexp/formal/FormalRegExpConcatenation.h @@ -96,6 +96,8 @@ public: */ virtual void operator>>(std::ostream& out) const; + friend class RegExpOptimize; + virtual operator std::string() const; virtual int selfTypeId() const { diff --git a/alib2data/src/regexp/formal/FormalRegExpIteration.h b/alib2data/src/regexp/formal/FormalRegExpIteration.h index ad71fca8bc..aa35c9b21c 100644 --- a/alib2data/src/regexp/formal/FormalRegExpIteration.h +++ b/alib2data/src/regexp/formal/FormalRegExpIteration.h @@ -91,6 +91,8 @@ public: */ virtual void operator>>(std::ostream& out) const; + friend class RegExpOptimize; + virtual operator std::string() const; virtual int selfTypeId() const { 
-- GitLab