/* * RegExpOptimize.cpp * * Created on: 20. 1. 2014 * Author: Tomas Pecka */ #include <iostream> namespace regexp { namespace simplify { template < class SymbolType > void RegExpOptimize::optimize( UnboundedRegExpAlternation < SymbolType > & alt ) { for( unsigned i = 0; i < alt.getChildren ( ).size ( ); i++ ) alt.setChild ( optimizeInner ( alt.getChildren ( ) [ i ] ), i ); while ( A1( alt ) || A2( alt ) || A3( alt ) || A4( alt ) || A10( alt ) || V2( alt ) || V5( alt ) || V6( alt ) || X1( alt ) ); } template < class SymbolType > void RegExpOptimize::optimize( UnboundedRegExpConcatenation < SymbolType > & concat ) { for( unsigned i = 0; i < concat.getChildren ( ).size ( ); i++ ) concat.setChild ( optimizeInner ( concat.getChildren ( ) [ i ] ), i ); while ( A5( concat ) || A6( concat ) || A7( concat ) || A8( concat ) || A9( concat ) || V8( concat ) || V9( concat ) ); } template < class SymbolType > void RegExpOptimize::optimize( UnboundedRegExpIteration < SymbolType > & iter ) { iter.setChild ( optimizeInner ( iter.getChild ( ) ) ); while ( A11( iter ) || V1( iter ) || V3( iter ) || V4( iter ) || V10( iter ) ); } template < class SymbolType > ext::ptr_value < regexp::UnboundedRegExpElement < SymbolType > > RegExpOptimize::optimizeInner( const UnboundedRegExpElement < SymbolType > & node ) { const UnboundedRegExpAlternation < SymbolType > * alternation = dynamic_cast<const UnboundedRegExpAlternation < SymbolType >*>( & node ); if( alternation ) return optimizeInner( * alternation ); const UnboundedRegExpConcatenation < SymbolType > * concatenation = dynamic_cast<const UnboundedRegExpConcatenation < SymbolType >*>( & node ); if( concatenation ) return optimizeInner( * concatenation ); const UnboundedRegExpIteration < SymbolType > * iteration = dynamic_cast<const UnboundedRegExpIteration < SymbolType >*>( & node ); if( iteration ) return optimizeInner( * iteration ); const UnboundedRegExpSymbol < SymbolType > * symbol = dynamic_cast<const UnboundedRegExpSymbol < SymbolType >*>( & node ); if( symbol ) return optimizeInner( * symbol ); const UnboundedRegExpEmpty < SymbolType > * empty = dynamic_cast<const UnboundedRegExpEmpty < SymbolType >*>( & node ); if( empty ) return optimizeInner( * empty ); const UnboundedRegExpEpsilon < SymbolType > * eps = dynamic_cast<const UnboundedRegExpEpsilon < SymbolType >*>( & node ); if( eps ) return optimizeInner( * eps ); throw exception::CommonException( "RegExpOptimize::optimize - unknown UnboundedRegExpElement < SymbolType > node" ); } template < class SymbolType > ext::ptr_value < regexp::UnboundedRegExpElement < SymbolType > > RegExpOptimize::optimizeInner( const UnboundedRegExpAlternation < SymbolType > & node ) { UnboundedRegExpAlternation < SymbolType > alt; for( const UnboundedRegExpElement < SymbolType > & child : node.getElements ( ) ) alt.pushBackChild ( optimizeInner( child ) ); // optimize while you can while( A1 ( alt ) || A2 ( alt ) || A3 ( alt ) || A4 ( alt ) || A10 ( alt ) || V2 ( alt ) || V5 ( alt ) || V6 ( alt ) || X1 ( alt ) ); if( alt.getElements ( ).size( ) == 1 ) return ext::ptr_value < regexp::UnboundedRegExpElement < SymbolType > > ( std::move ( alt.getChildren ( ).front ( ) ) ); if( alt.getElements ( ).size( ) == 0 ) return ext::ptr_value < regexp::UnboundedRegExpElement < SymbolType > > ( UnboundedRegExpEmpty < SymbolType > ( ) ); return ext::ptr_value < regexp::UnboundedRegExpElement < SymbolType > > ( std::move ( alt ) ); } template < class SymbolType > ext::ptr_value < regexp::UnboundedRegExpElement < SymbolType > > RegExpOptimize::optimizeInner( const UnboundedRegExpConcatenation < SymbolType > & node ) { UnboundedRegExpConcatenation < SymbolType > concat; for( const UnboundedRegExpElement < SymbolType > & child : node.getElements ( ) ) concat.pushBackChild ( optimizeInner( child ) ); while( A5 ( concat ) || A6 ( concat ) || A7 ( concat ) || A8 ( concat ) || A9 ( concat ) || V8 ( concat ) || V9 ( concat ) ); if( concat.getElements ( ).size( ) == 1 ) return ext::ptr_value < regexp::UnboundedRegExpElement < SymbolType > > ( std::move ( concat.getChildren ( ).front ( ) ) ); if( concat.getElements ( ).size( ) == 0 ) return ext::ptr_value < regexp::UnboundedRegExpElement < SymbolType > > ( UnboundedRegExpEpsilon < SymbolType > ( ) ); return ext::ptr_value < regexp::UnboundedRegExpElement < SymbolType > > ( std::move ( concat ) ); } template < class SymbolType > ext::ptr_value < regexp::UnboundedRegExpElement < SymbolType > > RegExpOptimize::optimizeInner( const UnboundedRegExpIteration < SymbolType > & node ) { UnboundedRegExpIteration < SymbolType > iter ( optimizeInner ( node.getChild ( ) ) ); while ( A11 ( iter ) || V1 ( iter ) || V3 ( iter ) || V4 ( iter ) || V10 ( iter ) ); // V1 is implemented right here if( dynamic_cast < UnboundedRegExpEmpty < SymbolType > * > ( & iter.getChild ( ) ) ) return ext::ptr_value < regexp::UnboundedRegExpElement < SymbolType > > ( UnboundedRegExpEpsilon < SymbolType > ( ) ); // T1 is implemented right here \e* = \e if ( dynamic_cast < UnboundedRegExpEpsilon < SymbolType > * > ( & iter.getChild ( ) ) ) return ext::ptr_value < regexp::UnboundedRegExpElement < SymbolType > > ( UnboundedRegExpEpsilon < SymbolType > ( ) ); return ext::ptr_value < regexp::UnboundedRegExpElement < SymbolType > > ( iter ); } template < class SymbolType > ext::ptr_value < regexp::UnboundedRegExpElement < SymbolType > > RegExpOptimize::optimizeInner( const UnboundedRegExpSymbol < SymbolType > & node ) { return ext::ptr_value < regexp::UnboundedRegExpElement < SymbolType > > ( node ); } template < class SymbolType > ext::ptr_value < regexp::UnboundedRegExpElement < SymbolType > > RegExpOptimize::optimizeInner( const UnboundedRegExpEmpty < SymbolType > & node ) { return ext::ptr_value < regexp::UnboundedRegExpElement < SymbolType > > ( node ); } template < class SymbolType > ext::ptr_value < regexp::UnboundedRegExpElement < SymbolType > > RegExpOptimize::optimizeInner( const UnboundedRegExpEpsilon < SymbolType > & node ) { return ext::ptr_value < regexp::UnboundedRegExpElement < SymbolType > > ( node ); } /** * optimization A1: x + ( y + z ) = ( x + y ) + z = x + y + z * @param node UnboundedRegExpAlternation < SymbolType > node * @return bool true if optimization applied else false */ template < class SymbolType > bool RegExpOptimize::A1( UnboundedRegExpAlternation < SymbolType > & node ) { bool optimized = false; for( auto it = node.begin( ); it != node.end( ); ) { if( dynamic_cast < UnboundedRegExpAlternation < SymbolType > * > ( & * it ) ) { UnboundedRegExpAlternation < SymbolType > childAlt ( static_cast < UnboundedRegExpAlternation < SymbolType > && > ( std::move ( * it ) ) ); it = node.erase( it ); it = node.insert ( it, std::make_move_iterator ( childAlt.begin ( ) ), std::make_move_iterator ( childAlt.end ( ) ) ); optimized = true; } else it ++; } return optimized; } /** * optimization A2: x + y = y + x (sort) * @param node UnboundedRegExpAlternation < SymbolType > node * @return bool true if optimization applied else false */ template < class SymbolType > bool RegExpOptimize::A2( UnboundedRegExpAlternation < SymbolType > & node ) { auto cmp = [ ]( const UnboundedRegExpElement < SymbolType > * a, const UnboundedRegExpElement < SymbolType > * b ) -> bool { return * a < * b; }; if ( std::is_sorted ( node.begin( ).base ( ), node.end( ).base ( ), cmp ) ) return false; std::sort ( node.begin ( ).base ( ), node.end ( ).base ( ), cmp ); return true; } /** * optimization A3: x + \0 = x * @param node UnboundedRegExpAlternation < SymbolType > node * @return bool true if optimization applied else false */ template < class SymbolType > bool RegExpOptimize::A3( UnboundedRegExpAlternation < SymbolType > & node ) { bool optimized = false; // alternation with no children is efectively \0 for( auto it = node.begin ( ); it != node.end ( ); ) { if( dynamic_cast < UnboundedRegExpEmpty < SymbolType > * >( & * it ) ) { it = node.erase ( it ); optimized = true; } else it ++; } return optimized; } /** * optimization A4: x + x = x * @param node UnboundedRegExpAlternation < SymbolType > node * @return bool true if optimization applied else false */ template < class SymbolType > bool RegExpOptimize::A4( UnboundedRegExpAlternation < SymbolType > & node ) { /* * two ways of implementing this opitimization: * - sort and call std::unique ( O(n lg n) + O(n) ), but it also sorts... * - check every element against other ( O(n*n) ) * * As we always sort in optimization, we can use the first version, but A4 must be __always__ called __after__ A2 */ auto cmp = [ ]( const UnboundedRegExpElement < SymbolType > * a, const UnboundedRegExpElement < SymbolType > * b ) -> bool { return * a == * b; }; size_t size = node.getChildren ( ).size ( ); node.erase ( dereferencer ( ext::unique ( node.begin ( ).base ( ), node.end ( ).base ( ), cmp ) ), node.end( ) ); return size != node.getChildren ( ).size ( ); } /** * optimization A5: x.(y.z) = (x.y).z = x.y.z * @param node UnboundedRegExpConcatenation < SymbolType > node * @return bool true if optimization applied else false */ template < class SymbolType > bool RegExpOptimize::A5( UnboundedRegExpConcatenation < SymbolType > & node ) { bool optimized = false; for( auto it = node.begin( ); it != node.end( ); ) { if( dynamic_cast < UnboundedRegExpConcatenation < SymbolType > * > ( & * it ) ) { UnboundedRegExpConcatenation < SymbolType > childConcat = static_cast < UnboundedRegExpConcatenation < SymbolType > && > ( std::move ( * it ) ); it = node.erase ( it ); it = node.insert ( it, std::make_move_iterator ( childConcat.begin( ) ), std::make_move_iterator ( childConcat.end( ) ) ); optimized = true; } else ++ it; } return optimized; } /** * optimization A6: \e.x = x.\e = x * @param node UnboundedRegExpConcatenation < SymbolType > node * @return bool true if optimization applied else false */ template < class SymbolType > bool RegExpOptimize::A6( UnboundedRegExpConcatenation < SymbolType > & node ) { bool optimized = false; // concatenation with no children is efectively \e for( auto it = node.begin( ); it != node.end( ); ) { if ( dynamic_cast < UnboundedRegExpEpsilon < SymbolType > * > ( & * it ) ) { it = node.erase ( it ); optimized = true; } else ++ it; } return optimized; } /** * optimization A7: \0.x = x.\0 = \0 * @param node UnboundedRegExpConcatenation < SymbolType > node * @return bool true if optimization applied else false */ template < class SymbolType > bool RegExpOptimize::A7( UnboundedRegExpConcatenation < SymbolType > & node ) { if(node.getChildren ( ).size() == 1) return false; if( std::any_of ( node.begin( ), node.end( ), [ ] ( const UnboundedRegExpElement < SymbolType > & a ) -> bool { return dynamic_cast < const UnboundedRegExpEmpty < SymbolType > * > ( & a ); } ) ) { node.clear( ); node.pushBackChild ( UnboundedRegExpEmpty < SymbolType > ( ) ); return true; } return false; } /** * optimization A8: x.(y+z) = x.y + x.z * @param node UnboundedRegExpConcatenation < SymbolType > node * @return bool true if optimization applied else false */ template < class SymbolType > bool RegExpOptimize::A8( UnboundedRegExpConcatenation < SymbolType > & /* node */) { /* bool optimized = false; for( auto it = std::next( node->elements.begin( ) ); it != node->elements.end( ); ) { UnboundedRegExpAlternation < SymbolType > * alt = dynamic_cast<UnboundedRegExpAlternation < SymbolType >*>( * it ); if( ! alt ) { it ++; continue; } // take everything to the left and copy it as prefix of every element in alternation. UnboundedRegExpConcatenation < SymbolType > leftPart; leftPart.elements.insert( leftPart.elements.end( ), node->elements.begin( ), it ); for( auto altIt = alt->elements.begin( ); altIt != alt->elements.end( ); altIt ++ ) { UnboundedRegExpConcatenation < SymbolType > * altElem = new UnboundedRegExpConcatenation < SymbolType >( ); altElem->elements.push_back( leftPart ); altElem->elements.push_back( * altIt ); * altIt = altElem; } UnboundedRegExpElement < SymbolType > * optIt = optimize( * it ); delete *it; *it = optIt; it = node->elements.erase( node->elements.begin( ), it ); optimized = true; it ++; } return optimized; */ return false; //TODO } /** * optimization A9: (x+y).z = x.z + y.z * @param node UnboundedRegExpConcatenation < SymbolType > node * @return bool true if optimization applied else false */ template < class SymbolType > bool RegExpOptimize::A9( UnboundedRegExpConcatenation < SymbolType > & /* node */) { /* bool optimized = false; for( auto it = node->elements.begin( ); it != std::prev( node->elements.end( ) ); ) { UnboundedRegExpAlternation < SymbolType > * alt = dynamic_cast<UnboundedRegExpAlternation < SymbolType >*>( * it ); if( ! alt ) { it ++; continue; } // take everything to the right and copy it as suffix of every element in alternation. UnboundedRegExpConcatenation < SymbolType > rest; rest.elements.insert( rest.elements.end( ), std::next( it ), node->elements.end( ) ); for( auto altIt = alt->elements.begin( ); altIt != alt->elements.end( ); altIt ++ ) { UnboundedRegExpConcatenation < SymbolType > * altElem = new UnboundedRegExpConcatenation < SymbolType >( ); altElem->elements.push_back( * altIt ); altElem->elements.push_back( rest ); * altIt = altElem; } UnboundedRegExpElement < SymbolType > * optIt = optimize( * it ); delete *it; *it = optIt; it = node->elements.erase( std::next( it ), node->elements.end( ) ); optimized = true; // as we move (delete) the rest of this expression, it surely wont do another round. More optimizations to be performerd are in subtree now. // we do not care about this here as method optimize(UnboundedRegExpAlternation < SymbolType >) will take care of this in next iteration // it ++; break; } return optimized; */ return false; //TODO } /** * optimization A10: x* = \e + x*x * @param node UnboundedRegExpAlternation < SymbolType > node * @return bool true if optimization applied else false */ template < class SymbolType > bool RegExpOptimize::A10( UnboundedRegExpAlternation < SymbolType > & /* node */ ) { bool optimized = false; /* * problem: * - \e + x*x = x* * - but if we do not have the eps, but we do have iteration, then \e \in h(iter), therefore \e in h(node). */ // check if we have some epsilon or iteration left, else nothing to do /*auto eps = find_if( node.getElements().begin( ), node.getElements().end( ), [ ]( const UnboundedRegExpElement < SymbolType > & a ) -> bool { return dynamic_cast < const UnboundedRegExpEpsilon < SymbolType > * > ( & a ) || dynamic_cast < const UnboundedRegExpIteration < SymbolType > * > ( & a ); }); if( eps == node.getElements().end( ) ) return false; for( unsigned i = 0; i < node.getChildren ( ).size ( ); i++ ) { UnboundedRegExpConcatenation < SymbolType > * childConcat = dynamic_cast < UnboundedRegExpConcatenation < SymbolType > * > ( & node.getChildren ( ) [ i ] ); if( ! childConcat ) continue; // if iteration is first element of concatenation UnboundedRegExpIteration < SymbolType > * iter = dynamic_cast < UnboundedRegExpIteration < SymbolType > * > ( & childConcat->getElements ( ).front( ) ); if( ! iter ) continue; // concatenation without the iteration node UnboundedRegExpConcatenation < SymbolType > tmpConcat ( * childConcat ); tmpConcat.getChildren ( ).erase ( tmpConcat.getChildren ( ).begin( ) ); ext::ptr_value < UnboundedRegExpElement < SymbolType > > tmpConcatOpt = optimizeInner ( tmpConcat ); // check if the iteration element is the same as the rest of the concatenation if( iter->getElement() == tmpConcatOpt ) { optimized = true; node.setChild ( std::move ( childConcat->getElements().front() ), i ); } } */ return optimized; } /** * optimization A11: x* = (\e + x)* * @param node UnboundedRegExpIteration < SymbolType > node * @return bool true if optimization applied else false */ template < class SymbolType > bool RegExpOptimize::A11( UnboundedRegExpIteration < SymbolType > & node ) { UnboundedRegExpAlternation < SymbolType > * childAlt = dynamic_cast<UnboundedRegExpAlternation < SymbolType > * >( & node.getChild ( ) ); if( childAlt ) { // check if eps inside iteration's alternation auto eps = find_if ( childAlt->begin( ), childAlt->end( ), [ ] ( const UnboundedRegExpElement < SymbolType > & a ) -> bool { return dynamic_cast < const UnboundedRegExpEpsilon < SymbolType > * > ( & a ); }); // if no eps if ( eps == childAlt->end( ) ) return false; // remove eps from alternation childAlt->erase ( eps ); return true; } return false; } /** * optimization V1: \0* = \e * @param node UnboundedRegExpIteration < SymbolType > node * @return bool true if optimization applied else false */ template < class SymbolType > bool RegExpOptimize::V1( UnboundedRegExpIteration < SymbolType > &) { // implemented in optimize( UnboundedRegExpIteration < SymbolType > ) return false; } /** * optimization V2: x* + x = x* * @param node UnboundedRegExpAlternation < SymbolType > node * @return bool true if optimization applied else false */ template < class SymbolType > bool RegExpOptimize::V2( UnboundedRegExpAlternation < SymbolType > & /* node */ ) { bool optimized = false; /* * Bit tricky * We need also to cover the cases like ( a + b + d )* + ( e )* + a + b + c + e = ( a + b + d )* + ( e )* + c */ /*ext::vector < const UnboundedRegExpElement < SymbolType > * > iterElements; // cache iter elements because of operator invalidation after erase for( const UnboundedRegExpElement < SymbolType > & n : node.getElements ( ) ) { const UnboundedRegExpIteration < SymbolType > * iter = dynamic_cast < const UnboundedRegExpIteration < SymbolType > * > ( & n ); if( iter ) { const UnboundedRegExpAlternation < SymbolType > * inner = dynamic_cast < const UnboundedRegExpAlternation < SymbolType > * > ( & iter->getChild ( ) ); if ( inner ) for ( const UnboundedRegExpElement < SymbolType > & innerElement : inner->getElements ( ) ) iterElements.push_back ( & innerElement ); else iterElements.push_back ( & iter->getChild ( ) ); } } for( const UnboundedRegExpElement < SymbolType > * n : iterElements ) { auto it = find_if ( node.getChildren ( ).begin ( ), node.getChildren ( ).end ( ), [ n ] ( const UnboundedRegExpElement < SymbolType > & a ) -> bool { return a == *n; }); if( it == node.getChildren().end() ) { continue; } optimized = true; node.getChildren ( ).erase( it ); }*/ return optimized; } /** * optimization V3: x** = x* * @param node UnboundedRegExpIteration < SymbolType > node * @return bool true if optimization applied else false */ template < class SymbolType > bool RegExpOptimize::V3 ( UnboundedRegExpIteration < SymbolType > & node ) { UnboundedRegExpIteration < SymbolType > * childIter = dynamic_cast < UnboundedRegExpIteration < SymbolType > * > ( & node.getChild ( ) ); if ( childIter ) { node.setChild ( std::move ( childIter->getChild ( ) ) ); return true; } return false; } /** * optimization V4: (x+y)* = (x*y*)* * @param node UnboundedRegExpIteration < SymbolType > node * @return bool true if optimization applied else false */ template < class SymbolType > bool RegExpOptimize::V4( UnboundedRegExpIteration < SymbolType > & node ) { // interpretation: if iteration's element is concat and every concat's element is iteration UnboundedRegExpConcatenation < SymbolType > * cont = dynamic_cast < UnboundedRegExpConcatenation < SymbolType > * > ( & node.getChild ( ) ); if ( ! cont || ! all_of ( cont->begin( ), cont->end ( ), [ ] ( const UnboundedRegExpElement < SymbolType > & a ) -> bool { return dynamic_cast < const UnboundedRegExpIteration < SymbolType > * > ( & a ); } ) ) return false; UnboundedRegExpAlternation < SymbolType > newAlt; for ( UnboundedRegExpElement < SymbolType > && n : ext::make_mover ( std::move ( * cont ).getChildren ( ) ) ) newAlt.pushBackChild ( std::move ( static_cast < UnboundedRegExpIteration < SymbolType > & > ( n ).getChild ( ) ) ); node.setChild ( optimizeInner ( newAlt ) ); return true; } /** * optimization V5: x*y = y + x*xy * @param node UnboundedRegExpAlternation < SymbolType > node * @return bool true if optimization applied else false */ template < class SymbolType > bool RegExpOptimize::V5( UnboundedRegExpAlternation < SymbolType > & /* node */ ) { bool optimized = false; // reinterpretation: ax*y = ay+ax*xy // so, if we find iter, // a = everything that is before it (prefix) // x = iter's content behind iter must be exactly iter's content // y = rest (suffix) // prefix.x*x.suffix + prefix.suffix = prefix.x*.suffix /* for( auto itA = node.getChildren().begin( ); itA != node.getChildren().end( ); ) { UnboundedRegExpConcatenation < SymbolType > * concat = dynamic_cast < UnboundedRegExpConcatenation < SymbolType > * > ( & * itA ); if( ! concat ) { ++ itA; continue; } for( auto itC = concat->getChildren().begin( ); itC != std::prev( concat->getChildren().end( ) ); ) { UnboundedRegExpIteration < SymbolType > * iter = dynamic_cast < UnboundedRegExpIteration < SymbolType > * > ( & * itC ); if( ! iter ) { ++ itC; continue; } // iteration's element must follow the iteration (x*x) auto itStartY = std::next( itC ); //itStartY points to y in expression x*xy // if iter's element is concat if( dynamic_cast < UnboundedRegExpConcatenation < SymbolType > * > ( & iter->getChild ( ) ) ) { UnboundedRegExpConcatenation < SymbolType > * iterConcat = dynamic_cast < UnboundedRegExpConcatenation < SymbolType > * > ( & iter->getChild ( ) ); if( iterConcat->getChildren().size( ) != ( unsigned ) distance ( std::next ( itC ), concat->getChildren ( ).end ( ) ) || ! equal ( iterConcat->getChildren ( ).begin ( ), iterConcat->getChildren ( ).end ( ), std::next ( itC ), [ ] ( const UnboundedRegExpElement < SymbolType > & a, const UnboundedRegExpElement < SymbolType > & b ) -> bool { return a == b; } ) ) { ++ itC; continue; } std::advance( itStartY, iterConcat->getChildren().size( ) ); } else { if( iter->getChild() != * std::next( itC ) ) { ++ itC; continue; } std::advance( itStartY, 1 ); } // store everything before iteration as "a" UnboundedRegExpConcatenation < SymbolType > tmpAY; if( concat->getChildren().begin( ) == itC ) { tmpAY.pushBackChild ( UnboundedRegExpEpsilon < SymbolType > ( ) ); } else { UnboundedRegExpConcatenation < SymbolType > tmpA; tmpA.insert( tmpA.getChildren().end( ), concat->getChildren().begin( ), itC ); tmpAY.pushBackChild ( optimizeInner( tmpA ) ); } // store everything behind iteration's followup element as "y" if( itStartY == concat->getChildren().end( ) ) { tmpAY.pushBackChild ( UnboundedRegExpEpsilon < SymbolType > ( ) ); } else { UnboundedRegExpConcatenation < SymbolType > tmpY; tmpY.insert( tmpY.getChildren().end( ), itStartY, concat->getChildren().end( ) ); tmpAY.pushBackChild ( optimizeInner ( tmpY ) ); } // concatenate "a" and "y" and see if they exist somewhere in parent alternation ( node.getChildren() ) ext::ptr_value < UnboundedRegExpElement < SymbolType > > regexpAY = optimizeInner( tmpAY ); auto iterAY = find_if ( node.getChildren().begin( ), node.getChildren().end( ), [ & ] ( const UnboundedRegExpElement < SymbolType > & a ) -> bool { return a == regexpAY.get ( ); } ); if( iterAY == node.getChildren().end( ) ) { ++ itC; continue; } tmpAY.insert ( tmpAY.getChildren ( ).begin ( ) + 1, * itC ); node.setChild ( optimizeInner( tmpAY ), itA ); itA = node.getChildren().erase( iterAY ); optimized = true; break; } ++ itA; }*/ return optimized; } /** * optimization V6: x*y = y + xx*y * @param node UnboundedRegExpAlternation < SymbolType > node * @return bool true if optimization applied else false */ template < class SymbolType > bool RegExpOptimize::V6( UnboundedRegExpAlternation < SymbolType > & /* node */ ) { bool optimized = false; // reinterpretation: ax*y = ay+axx*y // so, if we find iter // a = everything that is before it (prefix) // x = iter's content before iter must be exactly iter's content // y = rest (suffix) // prefix.xx*.suffix + prefix.suffix = prefix.x*.suffix /* for( auto itA = node.getChildren ( ).begin( ); itA != node.getChildren ( ).end( ); ) { UnboundedRegExpConcatenation < SymbolType > * concat = dynamic_cast < UnboundedRegExpConcatenation < SymbolType > * > ( & * itA ); if( ! concat ) { ++ itA; continue; } for( auto itC = std::next( concat->getChildren ( ).begin( ) ); itC != concat->getChildren ( ).end( ); ) { UnboundedRegExpIteration < SymbolType > * iter = dynamic_cast < UnboundedRegExpIteration < SymbolType > * > ( & * itC ); if( ! iter ) { ++ itC; continue; } // iteration's element must preceed the iteration (xx*) auto itStartX = itC; //itStartX points to first x in expression xx*, everything before is therefore prefix - regexp "a" // if iter's element is concat UnboundedRegExpConcatenation < SymbolType > * iterConcat = dynamic_cast < UnboundedRegExpConcatenation < SymbolType > * > ( & iter->getChild ( ) ); if( iterConcat ) { if( distance( concat->getChildren ( ).begin( ), itC ) < (int) iterConcat->getChildren ( ).size( ) ) { ++ itC; continue; } ext::retract ( itStartX, iterConcat->getChildren().size( ) ); if( iterConcat->getChildren ( ).size( ) != ( unsigned ) distance( itStartX, concat->getChildren ( ).end( ) ) || ! equal ( iterConcat->getChildren ( ).begin ( ), iterConcat->getChildren ( ).end ( ), itStartX, [ ] ( const UnboundedRegExpElement < SymbolType > & a, const UnboundedRegExpElement < SymbolType > & b ) -> bool { return a == b; } ) ) { ++ itC; continue; } } else { if( iter->getChild ( ) != * std::prev( itC ) ) { ++ itC; continue; } std::advance( itStartX, -1 ); } // concatenate "a" and "y" and see if they exist somewhere in parent alternation ( node->getChildren() ) UnboundedRegExpConcatenation < SymbolType > tmpAY; if( concat->getChildren ( ).begin ( ) == itStartX ) { tmpAY.pushBackChild ( UnboundedRegExpEpsilon < SymbolType > ( ) ); } else { UnboundedRegExpConcatenation < SymbolType > tmpA; tmpA.insert ( tmpA.getChildren ( ).end ( ), concat->getChildren().begin ( ), itStartX ); tmpAY.pushBackChild ( optimizeInner ( tmpA ) ); } if( std::next ( itC ) == concat->getChildren().end( ) ) { tmpAY.pushBackChild ( UnboundedRegExpEpsilon < SymbolType >( ) ); } else { UnboundedRegExpConcatenation < SymbolType > tmpY; tmpY.insert ( tmpY.getChildren().end( ), std::next( itC ), concat->getChildren ( ).end( ) ); tmpAY.pushBackChild ( optimizeInner( tmpY ) ); } ext::ptr_value < UnboundedRegExpElement < SymbolType > > regexpAY = optimizeInner ( tmpAY ); auto iterAY = find_if( node.getChildren().begin( ), node.getChildren().end( ), [ & ] ( const UnboundedRegExpElement < SymbolType > & a ) -> bool { return a == regexpAY.get ( ); } ); if( iterAY == node.getChildren().end( ) ) { ++ itC; continue; } // if so make a x* y and replace a x x* y tmpAY.insert( tmpAY.getChildren ( ).begin ( ) + 1, * itC ); node.setChild ( optimizeInner( tmpAY ), itA ); // remove a y itA = node.getChildren().erase( iterAY ); optimized = true; break; } ++ itA; } */ return optimized; } /** * optimization V8: \e in h(x) => xx*=x* * @param node UnboundedRegExpConcatenation < SymbolType > node * @return bool true if optimization applied else false */ template < class SymbolType > bool RegExpOptimize::V8( UnboundedRegExpConcatenation < SymbolType > & node ) { bool optimized = false; // interpretation: if there is iteration in concatenation node, and element of iteration contains eps and is straight before this iteration, then this element can be omitted if ( node.getChildren ( ).size ( ) == 0 ) return false; for( auto it = node.getChildren ( ).begin( ); it != node.getChildren ( ).end( ); ) { const UnboundedRegExpIteration < SymbolType > * iter = dynamic_cast < const UnboundedRegExpIteration < SymbolType > * > ( & * it ); if( ! iter ) { ++ it; continue; } // if element of iteration is concatenation, we need to check this specially const UnboundedRegExpConcatenation < SymbolType > * concat = dynamic_cast < const UnboundedRegExpConcatenation < SymbolType > * > ( & iter->getChild ( ) ); if( concat ) { // check if not out of bounds if( ( unsigned ) distance( node.getChildren ( ).begin( ), it ) < concat->getChildren ( ).size ( ) ) { it ++; continue; } auto it2 = it; ext::retract ( it2, concat->getChildren ( ).size ( ) ); if( regexp::properties::RegExpEpsilon::languageContainsEpsilon ( * concat ) && concat->getChildren().size ( ) == ( unsigned ) distance ( it2, node.getChildren ( ).end( ) ) && equal ( concat->getChildren ( ).begin( ), concat->getChildren ( ).end( ), it2, [ ] ( const UnboundedRegExpElement < SymbolType > & a, const UnboundedRegExpElement < SymbolType > & b ) -> bool { return a == b; } ) ) { optimized = true; it = node.erase ( it2, it ); } else ++ it; } else { // check if not at the first node if ( it == node.getChildren ( ).begin ( ) ) { it ++; continue; } auto prev = std::prev ( it ); if ( regexp::properties::RegExpEpsilon::languageContainsEpsilon ( iter->getElement ( ) ) && iter->getElement ( ) == * prev ) { it = node.erase ( prev ); optimized = true; } else ++ it; } } return optimized; } /** * optimization V9: (xy)*x = x(yx)* * @param node UnboundedRegExpConcatenation < SymbolType > node * @return bool true if optimization applied else false */ template < class SymbolType > bool RegExpOptimize::V9( UnboundedRegExpConcatenation < SymbolType > & /*node*/ ) { bool optimized = false; // interpretation: if concat (C1) with iter && iteration's element is concat (C2), then: // simultaneously iterate through C1 and C2. (axy)*axz=ax(yax)*z -> get ax that is same and relocate them... /*for( auto it = node.getChildren().begin( ) ; it != node.getChildren().end( ) ; ) { UnboundedRegExpIteration < SymbolType > * iter = dynamic_cast<UnboundedRegExpIteration < SymbolType > * > ( & * it ); if ( ! iter ) { ++ it; continue; } UnboundedRegExpConcatenation < SymbolType > * concat = dynamic_cast<UnboundedRegExpConcatenation < SymbolType > * > ( & iter->getChild( ) ); if( ! concat ) { ++it; continue; } // find range from <it+1;sth> and <concat.begin;sth> that is equal auto c1Iter = std::next( it ), c2Iter = concat->getChildren().begin( ); while( c1Iter != node.getChildren().end() && c2Iter != concat->getChildren().end( ) && *c1Iter == * c2Iter ) { ++ c1Iter; ++ c2Iter; } if( c1Iter == std::next( it ) ) { ++ it; continue; } // common::Streams::out << "xy" << std::endl; // UnboundedRegExpConcatenation < SymbolType >* tmp = new UnboundedRegExpConcatenation < SymbolType >( ); // tmp->insert( tmp->getChildren().end( ), std::next( it ), c1Iter ); // common::Streams::out << RegExp( tmp ) << std::endl; // copy the range <it;sth>, delete it and go back to the iter node ext::ptr_vector < UnboundedRegExpElement < SymbolType > > copyRange; copyRange.insert ( copyRange.end(), std::next( it ), c1Iter ); it = node.getChildren().erase( std::next( it ), c1Iter ); it = std::prev( it ); // insert that range before it position node.insert( it, copyRange.begin( ), copyRange.end( ) ); // alter the iteration's concat node copyRange.clear( ); copyRange.insert( copyRange.end(), concat->getChildren().begin( ), c2Iter ); concat->getChildren().erase( concat->getChildren().begin( ), c2Iter ); concat->insert( concat->getChildren().end(), copyRange.begin( ), copyRange.end( ) ); }*/ return optimized; } /** * optimization V10: (x+y)* = (x*+y*)* * generalized to (x+y)* = (x+y*)* = (x*+y)* = (x*+y*)* * @param node UnboundedRegExpIteration < SymbolType > node * @return bool true if optimization applied else false */ template < class SymbolType > bool RegExpOptimize::V10( UnboundedRegExpIteration < SymbolType > & node ) { // interpretation: if iter's child is alternation where some of its children are iteration, then they do not have to be iterations UnboundedRegExpAlternation < SymbolType > * alt = dynamic_cast < UnboundedRegExpAlternation < SymbolType > * > ( & node.getChild ( ) ); if ( ! alt || ! any_of ( alt->begin( ), alt->end( ), [] ( const UnboundedRegExpElement < SymbolType > & a ) -> bool { return dynamic_cast < const UnboundedRegExpIteration < SymbolType > * > ( & a ); } ) ) return false; for ( auto it = alt->begin( ); it != alt->end ( ); ++ it ) { if ( dynamic_cast < UnboundedRegExpIteration < SymbolType > * > ( & * it ) ) alt->setChild ( std::move ( static_cast < UnboundedRegExpIteration < SymbolType > & > ( * it ).getChild ( ) ), it ); } return true; } /** * optimization X1: a* + \e = a* * @param node UnboundedRegExpAlternation < SymbolType > node * @return bool true if optimization applied else false */ template < class SymbolType > bool RegExpOptimize::X1( UnboundedRegExpAlternation < SymbolType > & node ) { // theorem: In regexp like a* + \e, \e is described twice, first in a*, second in \e. // therefore we can delete the \e as it is redundant auto iter = find_if ( node.begin( ), node.end( ), [] ( const UnboundedRegExpElement < SymbolType > & a ) -> bool { return dynamic_cast < const UnboundedRegExpIteration < SymbolType > * > ( & a );} ); auto eps = find_if ( node.begin( ), node.end( ), [] ( const UnboundedRegExpElement < SymbolType > & a ) -> bool { return dynamic_cast < const UnboundedRegExpEpsilon < SymbolType > * > ( & a );} ); if( iter != node.end( ) && eps != node.end( ) ) { node.erase( eps ); return true; } return false; } } /* namespace simplify */ } /* namespace regexp */