-
Jan Trávníček authoredJan Trávníček authored
Code owners
Assign users and groups as approvers for specific file changes. Learn more.
RegExpOptimizeUnboundedPart.hpp 33.75 KiB
/*
* RegExpOptimize.cpp
*
* Created on: 20. 1. 2014
* Author: Tomas Pecka
*/
#include <iostream>
namespace regexp {
namespace simplify {
template < class SymbolType >
void RegExpOptimize::optimize( UnboundedRegExpAlternation < SymbolType > & alt ) {
for( unsigned i = 0; i < alt.getChildren ( ).size ( ); i++ )
alt.setChild ( optimizeInner ( alt.getChildren ( ) [ i ] ), i );
while ( A1( alt ) || A2( alt ) || A3( alt ) || A4( alt ) || A10( alt ) || V2( alt ) || V5( alt ) || V6( alt ) || X1( alt ) );
}
template < class SymbolType >
void RegExpOptimize::optimize( UnboundedRegExpConcatenation < SymbolType > & concat ) {
for( unsigned i = 0; i < concat.getChildren ( ).size ( ); i++ )
concat.setChild ( optimizeInner ( concat.getChildren ( ) [ i ] ), i );
while ( A5( concat ) || A6( concat ) || A7( concat ) || A8( concat ) || A9( concat ) || V8( concat ) || V9( concat ) );
}
template < class SymbolType >
void RegExpOptimize::optimize( UnboundedRegExpIteration < SymbolType > & iter ) {
iter.setChild ( optimizeInner ( iter.getChild ( ) ) );
while ( A11( iter ) || V1( iter ) || V3( iter ) || V4( iter ) || V10( iter ) );
}
template < class SymbolType >
ext::ptr_value < regexp::UnboundedRegExpElement < SymbolType > > RegExpOptimize::optimizeInner( const UnboundedRegExpElement < SymbolType > & node ) {
const UnboundedRegExpAlternation < SymbolType > * alternation = dynamic_cast<const UnboundedRegExpAlternation < SymbolType >*>( & node );
if( alternation )
return optimizeInner( * alternation );
const UnboundedRegExpConcatenation < SymbolType > * concatenation = dynamic_cast<const UnboundedRegExpConcatenation < SymbolType >*>( & node );
if( concatenation )
return optimizeInner( * concatenation );
const UnboundedRegExpIteration < SymbolType > * iteration = dynamic_cast<const UnboundedRegExpIteration < SymbolType >*>( & node );
if( iteration )
return optimizeInner( * iteration );
const UnboundedRegExpSymbol < SymbolType > * symbol = dynamic_cast<const UnboundedRegExpSymbol < SymbolType >*>( & node );
if( symbol )
return optimizeInner( * symbol );
const UnboundedRegExpEmpty < SymbolType > * empty = dynamic_cast<const UnboundedRegExpEmpty < SymbolType >*>( & node );
if( empty )
return optimizeInner( * empty );
const UnboundedRegExpEpsilon < SymbolType > * eps = dynamic_cast<const UnboundedRegExpEpsilon < SymbolType >*>( & node );
if( eps )
return optimizeInner( * eps );
throw exception::CommonException( "RegExpOptimize::optimize - unknown UnboundedRegExpElement < SymbolType > node" );
}
template < class SymbolType >
ext::ptr_value < regexp::UnboundedRegExpElement < SymbolType > > RegExpOptimize::optimizeInner( const UnboundedRegExpAlternation < SymbolType > & node ) {
UnboundedRegExpAlternation < SymbolType > alt;
for( const UnboundedRegExpElement < SymbolType > & child : node.getElements ( ) )
alt.pushBackChild ( optimizeInner( child ) );
// optimize while you can
while( A1 ( alt ) || A2 ( alt ) || A3 ( alt ) || A4 ( alt ) || A10 ( alt ) || V2 ( alt ) || V5 ( alt ) || V6 ( alt ) || X1 ( alt ) );
if( alt.getElements ( ).size( ) == 1 )
return ext::ptr_value < regexp::UnboundedRegExpElement < SymbolType > > ( std::move ( alt.getChildren ( ).front ( ) ) );
if( alt.getElements ( ).size( ) == 0 )
return ext::ptr_value < regexp::UnboundedRegExpElement < SymbolType > > ( UnboundedRegExpEmpty < SymbolType > ( ) );
return ext::ptr_value < regexp::UnboundedRegExpElement < SymbolType > > ( std::move ( alt ) );
}
template < class SymbolType >
ext::ptr_value < regexp::UnboundedRegExpElement < SymbolType > > RegExpOptimize::optimizeInner( const UnboundedRegExpConcatenation < SymbolType > & node ) {
UnboundedRegExpConcatenation < SymbolType > concat;
for( const UnboundedRegExpElement < SymbolType > & child : node.getElements ( ) )
concat.pushBackChild ( optimizeInner( child ) );
while( A5 ( concat ) || A6 ( concat ) || A7 ( concat ) || A8 ( concat ) || A9 ( concat ) || V8 ( concat ) || V9 ( concat ) );
if( concat.getElements ( ).size( ) == 1 )
return ext::ptr_value < regexp::UnboundedRegExpElement < SymbolType > > ( std::move ( concat.getChildren ( ).front ( ) ) );
if( concat.getElements ( ).size( ) == 0 )
return ext::ptr_value < regexp::UnboundedRegExpElement < SymbolType > > ( UnboundedRegExpEpsilon < SymbolType > ( ) );
return ext::ptr_value < regexp::UnboundedRegExpElement < SymbolType > > ( std::move ( concat ) );
}
template < class SymbolType >
ext::ptr_value < regexp::UnboundedRegExpElement < SymbolType > > RegExpOptimize::optimizeInner( const UnboundedRegExpIteration < SymbolType > & node ) {
UnboundedRegExpIteration < SymbolType > iter ( optimizeInner ( node.getChild ( ) ) );
while ( A11 ( iter ) || V1 ( iter ) || V3 ( iter ) || V4 ( iter ) || V10 ( iter ) );
// V1 is implemented right here
if( dynamic_cast < UnboundedRegExpEmpty < SymbolType > * > ( & iter.getChild ( ) ) )
return ext::ptr_value < regexp::UnboundedRegExpElement < SymbolType > > ( UnboundedRegExpEpsilon < SymbolType > ( ) );
// T1 is implemented right here \e* = \e
if ( dynamic_cast < UnboundedRegExpEpsilon < SymbolType > * > ( & iter.getChild ( ) ) )
return ext::ptr_value < regexp::UnboundedRegExpElement < SymbolType > > ( UnboundedRegExpEpsilon < SymbolType > ( ) );
return ext::ptr_value < regexp::UnboundedRegExpElement < SymbolType > > ( iter );
}
template < class SymbolType >
ext::ptr_value < regexp::UnboundedRegExpElement < SymbolType > > RegExpOptimize::optimizeInner( const UnboundedRegExpSymbol < SymbolType > & node ) {
return ext::ptr_value < regexp::UnboundedRegExpElement < SymbolType > > ( node );
}
template < class SymbolType >
ext::ptr_value < regexp::UnboundedRegExpElement < SymbolType > > RegExpOptimize::optimizeInner( const UnboundedRegExpEmpty < SymbolType > & node ) {
return ext::ptr_value < regexp::UnboundedRegExpElement < SymbolType > > ( node );
}
template < class SymbolType >
ext::ptr_value < regexp::UnboundedRegExpElement < SymbolType > > RegExpOptimize::optimizeInner( const UnboundedRegExpEpsilon < SymbolType > & node ) {
return ext::ptr_value < regexp::UnboundedRegExpElement < SymbolType > > ( node );
}
/**
* optimization A1: x + ( y + z ) = ( x + y ) + z = x + y + z
* @param node UnboundedRegExpAlternation < SymbolType > node
* @return bool true if optimization applied else false
*/
template < class SymbolType >
bool RegExpOptimize::A1( UnboundedRegExpAlternation < SymbolType > & node ) {
bool optimized = false;
for( auto it = node.begin( ); it != node.end( ); ) {
if( dynamic_cast < UnboundedRegExpAlternation < SymbolType > * > ( & * it ) ) {
UnboundedRegExpAlternation < SymbolType > childAlt ( static_cast < UnboundedRegExpAlternation < SymbolType > && > ( std::move ( * it ) ) );
it = node.erase( it );
it = node.insert ( it, std::make_move_iterator ( childAlt.begin ( ) ), std::make_move_iterator ( childAlt.end ( ) ) );
optimized = true;
} else
it ++;
}
return optimized;
}
/**
* optimization A2: x + y = y + x (sort)
* @param node UnboundedRegExpAlternation < SymbolType > node
* @return bool true if optimization applied else false
*/
template < class SymbolType >
bool RegExpOptimize::A2( UnboundedRegExpAlternation < SymbolType > & node ) {
auto cmp = [ ]( const UnboundedRegExpElement < SymbolType > * a, const UnboundedRegExpElement < SymbolType > * b ) -> bool { return * a < * b; };
if ( std::is_sorted ( node.begin( ).base ( ), node.end( ).base ( ), cmp ) )
return false;
std::sort ( node.begin ( ).base ( ), node.end ( ).base ( ), cmp );
return true;
}
/**
* optimization A3: x + \0 = x
* @param node UnboundedRegExpAlternation < SymbolType > node
* @return bool true if optimization applied else false
*/
template < class SymbolType >
bool RegExpOptimize::A3( UnboundedRegExpAlternation < SymbolType > & node ) {
bool optimized = false;
// alternation with no children is efectively \0
for( auto it = node.begin ( ); it != node.end ( ); ) {
if( dynamic_cast < UnboundedRegExpEmpty < SymbolType > * >( & * it ) ) {
it = node.erase ( it );
optimized = true;
} else
it ++;
}
return optimized;
}
/**
* optimization A4: x + x = x
* @param node UnboundedRegExpAlternation < SymbolType > node
* @return bool true if optimization applied else false
*/
template < class SymbolType >
bool RegExpOptimize::A4( UnboundedRegExpAlternation < SymbolType > & node ) {
/*
* two ways of implementing this opitimization:
* - sort and call std::unique ( O(n lg n) + O(n) ), but it also sorts...
* - check every element against other ( O(n*n) )
*
* As we always sort in optimization, we can use the first version, but A4 must be __always__ called __after__ A2
*/
auto cmp = [ ]( const UnboundedRegExpElement < SymbolType > * a, const UnboundedRegExpElement < SymbolType > * b ) -> bool { return * a == * b; };
size_t size = node.getChildren ( ).size ( );
node.erase ( dereferencer ( ext::unique ( node.begin ( ).base ( ), node.end ( ).base ( ), cmp ) ), node.end( ) );
return size != node.getChildren ( ).size ( );
}
/**
* optimization A5: x.(y.z) = (x.y).z = x.y.z
* @param node UnboundedRegExpConcatenation < SymbolType > node
* @return bool true if optimization applied else false
*/
template < class SymbolType >
bool RegExpOptimize::A5( UnboundedRegExpConcatenation < SymbolType > & node ) {
bool optimized = false;
for( auto it = node.begin( ); it != node.end( ); ) {
if( dynamic_cast < UnboundedRegExpConcatenation < SymbolType > * > ( & * it ) ) {
UnboundedRegExpConcatenation < SymbolType > childConcat = static_cast < UnboundedRegExpConcatenation < SymbolType > && > ( std::move ( * it ) );
it = node.erase ( it );
it = node.insert ( it, std::make_move_iterator ( childConcat.begin( ) ), std::make_move_iterator ( childConcat.end( ) ) );
optimized = true;
} else
++ it;
}
return optimized;
}
/**
* optimization A6: \e.x = x.\e = x
* @param node UnboundedRegExpConcatenation < SymbolType > node
* @return bool true if optimization applied else false
*/
template < class SymbolType >
bool RegExpOptimize::A6( UnboundedRegExpConcatenation < SymbolType > & node ) {
bool optimized = false;
// concatenation with no children is efectively \e
for( auto it = node.begin( ); it != node.end( ); ) {
if ( dynamic_cast < UnboundedRegExpEpsilon < SymbolType > * > ( & * it ) ) {
it = node.erase ( it );
optimized = true;
} else
++ it;
}
return optimized;
}
/**
* optimization A7: \0.x = x.\0 = \0
* @param node UnboundedRegExpConcatenation < SymbolType > node
* @return bool true if optimization applied else false
*/
template < class SymbolType >
bool RegExpOptimize::A7( UnboundedRegExpConcatenation < SymbolType > & node ) {
if(node.getChildren ( ).size() == 1) return false;
if( std::any_of ( node.begin( ), node.end( ), [ ] ( const UnboundedRegExpElement < SymbolType > & a ) -> bool { return dynamic_cast < const UnboundedRegExpEmpty < SymbolType > * > ( & a ); } ) ) {
node.clear( );
node.pushBackChild ( UnboundedRegExpEmpty < SymbolType > ( ) );
return true;
}
return false;
}
/**
* optimization A8: x.(y+z) = x.y + x.z
* @param node UnboundedRegExpConcatenation < SymbolType > node
* @return bool true if optimization applied else false
*/
template < class SymbolType >
bool RegExpOptimize::A8( UnboundedRegExpConcatenation < SymbolType > & /* node */) {
/*
bool optimized = false;
for( auto it = std::next( node->elements.begin( ) ); it != node->elements.end( ); )
{
UnboundedRegExpAlternation < SymbolType > * alt = dynamic_cast<UnboundedRegExpAlternation < SymbolType >*>( * it );
if( ! alt )
{
it ++;
continue;
}
// take everything to the left and copy it as prefix of every element in alternation.
UnboundedRegExpConcatenation < SymbolType > leftPart;
leftPart.elements.insert( leftPart.elements.end( ), node->elements.begin( ), it );
for( auto altIt = alt->elements.begin( ); altIt != alt->elements.end( ); altIt ++ )
{
UnboundedRegExpConcatenation < SymbolType > * altElem = new UnboundedRegExpConcatenation < SymbolType >( );
altElem->elements.push_back( leftPart );
altElem->elements.push_back( * altIt );
* altIt = altElem;
}
UnboundedRegExpElement < SymbolType > * optIt = optimize( * it );
delete *it;
*it = optIt;
it = node->elements.erase( node->elements.begin( ), it );
optimized = true;
it ++;
}
return optimized;
*/
return false; //TODO
}
/**
* optimization A9: (x+y).z = x.z + y.z
* @param node UnboundedRegExpConcatenation < SymbolType > node
* @return bool true if optimization applied else false
*/
template < class SymbolType >
bool RegExpOptimize::A9( UnboundedRegExpConcatenation < SymbolType > & /* node */) {
/*
bool optimized = false;
for( auto it = node->elements.begin( ); it != std::prev( node->elements.end( ) ); )
{
UnboundedRegExpAlternation < SymbolType > * alt = dynamic_cast<UnboundedRegExpAlternation < SymbolType >*>( * it );
if( ! alt )
{
it ++;
continue;
}
// take everything to the right and copy it as suffix of every element in alternation.
UnboundedRegExpConcatenation < SymbolType > rest;
rest.elements.insert( rest.elements.end( ), std::next( it ), node->elements.end( ) );
for( auto altIt = alt->elements.begin( ); altIt != alt->elements.end( ); altIt ++ )
{
UnboundedRegExpConcatenation < SymbolType > * altElem = new UnboundedRegExpConcatenation < SymbolType >( );
altElem->elements.push_back( * altIt );
altElem->elements.push_back( rest );
* altIt = altElem;
}
UnboundedRegExpElement < SymbolType > * optIt = optimize( * it );
delete *it;
*it = optIt;
it = node->elements.erase( std::next( it ), node->elements.end( ) );
optimized = true;
// as we move (delete) the rest of this expression, it surely wont do another round. More optimizations to be performerd are in subtree now.
// we do not care about this here as method optimize(UnboundedRegExpAlternation < SymbolType >) will take care of this in next iteration
// it ++;
break;
}
return optimized;
*/
return false; //TODO
}
/**
* optimization A10: x* = \e + x*x
* @param node UnboundedRegExpAlternation < SymbolType > node
* @return bool true if optimization applied else false
*/
template < class SymbolType >
bool RegExpOptimize::A10( UnboundedRegExpAlternation < SymbolType > & /* node */ ) {
bool optimized = false;
/*
* problem:
* - \e + x*x = x*
* - but if we do not have the eps, but we do have iteration, then \e \in h(iter), therefore \e in h(node).
*/
// check if we have some epsilon or iteration left, else nothing to do
/*auto eps = find_if( node.getElements().begin( ), node.getElements().end( ), [ ]( const UnboundedRegExpElement < SymbolType > & a ) -> bool {
return dynamic_cast < const UnboundedRegExpEpsilon < SymbolType > * > ( & a ) || dynamic_cast < const UnboundedRegExpIteration < SymbolType > * > ( & a );
});
if( eps == node.getElements().end( ) )
return false;
for( unsigned i = 0; i < node.getChildren ( ).size ( ); i++ ) {
UnboundedRegExpConcatenation < SymbolType > * childConcat = dynamic_cast < UnboundedRegExpConcatenation < SymbolType > * > ( & node.getChildren ( ) [ i ] );
if( ! childConcat )
continue;
// if iteration is first element of concatenation
UnboundedRegExpIteration < SymbolType > * iter = dynamic_cast < UnboundedRegExpIteration < SymbolType > * > ( & childConcat->getElements ( ).front( ) );
if( ! iter )
continue;
// concatenation without the iteration node
UnboundedRegExpConcatenation < SymbolType > tmpConcat ( * childConcat );
tmpConcat.getChildren ( ).erase ( tmpConcat.getChildren ( ).begin( ) );
ext::ptr_value < UnboundedRegExpElement < SymbolType > > tmpConcatOpt = optimizeInner ( tmpConcat );
// check if the iteration element is the same as the rest of the concatenation
if( iter->getElement() == tmpConcatOpt ) {
optimized = true;
node.setChild ( std::move ( childConcat->getElements().front() ), i );
}
} */
return optimized;
}
/**
* optimization A11: x* = (\e + x)*
* @param node UnboundedRegExpIteration < SymbolType > node
* @return bool true if optimization applied else false
*/
template < class SymbolType >
bool RegExpOptimize::A11( UnboundedRegExpIteration < SymbolType > & node ) {
UnboundedRegExpAlternation < SymbolType > * childAlt = dynamic_cast<UnboundedRegExpAlternation < SymbolType > * >( & node.getChild ( ) );
if( childAlt ) {
// check if eps inside iteration's alternation
auto eps = find_if ( childAlt->begin( ), childAlt->end( ), [ ] ( const UnboundedRegExpElement < SymbolType > & a ) -> bool {
return dynamic_cast < const UnboundedRegExpEpsilon < SymbolType > * > ( & a );
});
// if no eps
if ( eps == childAlt->end( ) )
return false;
// remove eps from alternation
childAlt->erase ( eps );
return true;
}
return false;
}
/**
* optimization V1: \0* = \e
* @param node UnboundedRegExpIteration < SymbolType > node
* @return bool true if optimization applied else false
*/
template < class SymbolType >
bool RegExpOptimize::V1( UnboundedRegExpIteration < SymbolType > &) {
// implemented in optimize( UnboundedRegExpIteration < SymbolType > )
return false;
}
/**
* optimization V2: x* + x = x*
* @param node UnboundedRegExpAlternation < SymbolType > node
* @return bool true if optimization applied else false
*/
template < class SymbolType >
bool RegExpOptimize::V2( UnboundedRegExpAlternation < SymbolType > & /* node */ ) {
bool optimized = false;
/*
* Bit tricky
* We need also to cover the cases like ( a + b + d )* + ( e )* + a + b + c + e = ( a + b + d )* + ( e )* + c
*/
/*ext::vector < const UnboundedRegExpElement < SymbolType > * > iterElements;
// cache iter elements because of operator invalidation after erase
for( const UnboundedRegExpElement < SymbolType > & n : node.getElements ( ) ) {
const UnboundedRegExpIteration < SymbolType > * iter = dynamic_cast < const UnboundedRegExpIteration < SymbolType > * > ( & n );
if( iter ) {
const UnboundedRegExpAlternation < SymbolType > * inner = dynamic_cast < const UnboundedRegExpAlternation < SymbolType > * > ( & iter->getChild ( ) );
if ( inner )
for ( const UnboundedRegExpElement < SymbolType > & innerElement : inner->getElements ( ) )
iterElements.push_back ( & innerElement );
else
iterElements.push_back ( & iter->getChild ( ) );
}
}
for( const UnboundedRegExpElement < SymbolType > * n : iterElements ) {
auto it = find_if ( node.getChildren ( ).begin ( ), node.getChildren ( ).end ( ), [ n ] ( const UnboundedRegExpElement < SymbolType > & a ) -> bool {
return a == *n;
});
if( it == node.getChildren().end() ) {
continue;
}
optimized = true;
node.getChildren ( ).erase( it );
}*/
return optimized;
}
/**
* optimization V3: x** = x*
* @param node UnboundedRegExpIteration < SymbolType > node
* @return bool true if optimization applied else false
*/
template < class SymbolType >
bool RegExpOptimize::V3 ( UnboundedRegExpIteration < SymbolType > & node ) {
UnboundedRegExpIteration < SymbolType > * childIter = dynamic_cast < UnboundedRegExpIteration < SymbolType > * > ( & node.getChild ( ) );
if ( childIter ) {
node.setChild ( std::move ( childIter->getChild ( ) ) );
return true;
}
return false;
}
/**
* optimization V4: (x+y)* = (x*y*)*
* @param node UnboundedRegExpIteration < SymbolType > node
* @return bool true if optimization applied else false
*/
template < class SymbolType >
bool RegExpOptimize::V4( UnboundedRegExpIteration < SymbolType > & node ) {
// interpretation: if iteration's element is concat and every concat's element is iteration
UnboundedRegExpConcatenation < SymbolType > * cont = dynamic_cast < UnboundedRegExpConcatenation < SymbolType > * > ( & node.getChild ( ) );
if ( ! cont || ! all_of ( cont->begin( ), cont->end ( ), [ ] ( const UnboundedRegExpElement < SymbolType > & a ) -> bool { return dynamic_cast < const UnboundedRegExpIteration < SymbolType > * > ( & a ); } ) )
return false;
UnboundedRegExpAlternation < SymbolType > newAlt;
for ( UnboundedRegExpElement < SymbolType > && n : ext::make_mover ( std::move ( * cont ).getChildren ( ) ) )
newAlt.pushBackChild ( std::move ( static_cast < UnboundedRegExpIteration < SymbolType > & > ( n ).getChild ( ) ) );
node.setChild ( optimizeInner ( newAlt ) );
return true;
}
/**
* optimization V5: x*y = y + x*xy
* @param node UnboundedRegExpAlternation < SymbolType > node
* @return bool true if optimization applied else false
*/
template < class SymbolType >
bool RegExpOptimize::V5( UnboundedRegExpAlternation < SymbolType > & /* node */ ) {
bool optimized = false;
// reinterpretation: ax*y = ay+ax*xy
// so, if we find iter,
// a = everything that is before it (prefix)
// x = iter's content behind iter must be exactly iter's content
// y = rest (suffix)
// prefix.x*x.suffix + prefix.suffix = prefix.x*.suffix
/* for( auto itA = node.getChildren().begin( ); itA != node.getChildren().end( ); ) {
UnboundedRegExpConcatenation < SymbolType > * concat = dynamic_cast < UnboundedRegExpConcatenation < SymbolType > * > ( & * itA );
if( ! concat ) {
++ itA;
continue;
}
for( auto itC = concat->getChildren().begin( ); itC != std::prev( concat->getChildren().end( ) ); ) {
UnboundedRegExpIteration < SymbolType > * iter = dynamic_cast < UnboundedRegExpIteration < SymbolType > * > ( & * itC );
if( ! iter ) {
++ itC;
continue;
}
// iteration's element must follow the iteration (x*x)
auto itStartY = std::next( itC ); //itStartY points to y in expression x*xy
// if iter's element is concat
if( dynamic_cast < UnboundedRegExpConcatenation < SymbolType > * > ( & iter->getChild ( ) ) ) {
UnboundedRegExpConcatenation < SymbolType > * iterConcat = dynamic_cast < UnboundedRegExpConcatenation < SymbolType > * > ( & iter->getChild ( ) );
if( iterConcat->getChildren().size( ) != ( unsigned ) distance ( std::next ( itC ), concat->getChildren ( ).end ( ) )
|| ! equal ( iterConcat->getChildren ( ).begin ( ), iterConcat->getChildren ( ).end ( ), std::next ( itC ),
[ ] ( const UnboundedRegExpElement < SymbolType > & a, const UnboundedRegExpElement < SymbolType > & b ) -> bool { return a == b; } ) ) {
++ itC;
continue;
}
std::advance( itStartY, iterConcat->getChildren().size( ) );
} else {
if( iter->getChild() != * std::next( itC ) ) {
++ itC;
continue;
}
std::advance( itStartY, 1 );
}
// store everything before iteration as "a"
UnboundedRegExpConcatenation < SymbolType > tmpAY;
if( concat->getChildren().begin( ) == itC ) {
tmpAY.pushBackChild ( UnboundedRegExpEpsilon < SymbolType > ( ) );
} else {
UnboundedRegExpConcatenation < SymbolType > tmpA;
tmpA.insert( tmpA.getChildren().end( ), concat->getChildren().begin( ), itC );
tmpAY.pushBackChild ( optimizeInner( tmpA ) );
}
// store everything behind iteration's followup element as "y"
if( itStartY == concat->getChildren().end( ) ) {
tmpAY.pushBackChild ( UnboundedRegExpEpsilon < SymbolType > ( ) );
} else {
UnboundedRegExpConcatenation < SymbolType > tmpY;
tmpY.insert( tmpY.getChildren().end( ), itStartY, concat->getChildren().end( ) );
tmpAY.pushBackChild ( optimizeInner ( tmpY ) );
}
// concatenate "a" and "y" and see if they exist somewhere in parent alternation ( node.getChildren() )
ext::ptr_value < UnboundedRegExpElement < SymbolType > > regexpAY = optimizeInner( tmpAY );
auto iterAY = find_if ( node.getChildren().begin( ), node.getChildren().end( ), [ & ] ( const UnboundedRegExpElement < SymbolType > & a ) -> bool { return a == regexpAY.get ( ); } );
if( iterAY == node.getChildren().end( ) ) {
++ itC;
continue;
}
tmpAY.insert ( tmpAY.getChildren ( ).begin ( ) + 1, * itC );
node.setChild ( optimizeInner( tmpAY ), itA );
itA = node.getChildren().erase( iterAY );
optimized = true;
break;
}
++ itA;
}*/
return optimized;
}
/**
* optimization V6: x*y = y + xx*y
* @param node UnboundedRegExpAlternation < SymbolType > node
* @return bool true if optimization applied else false
*/
template < class SymbolType >
bool RegExpOptimize::V6( UnboundedRegExpAlternation < SymbolType > & /* node */ ) {
bool optimized = false;
// reinterpretation: ax*y = ay+axx*y
// so, if we find iter
// a = everything that is before it (prefix)
// x = iter's content before iter must be exactly iter's content
// y = rest (suffix)
// prefix.xx*.suffix + prefix.suffix = prefix.x*.suffix
/* for( auto itA = node.getChildren ( ).begin( ); itA != node.getChildren ( ).end( ); ) {
UnboundedRegExpConcatenation < SymbolType > * concat = dynamic_cast < UnboundedRegExpConcatenation < SymbolType > * > ( & * itA );
if( ! concat ) {
++ itA;
continue;
}
for( auto itC = std::next( concat->getChildren ( ).begin( ) ); itC != concat->getChildren ( ).end( ); ) {
UnboundedRegExpIteration < SymbolType > * iter = dynamic_cast < UnboundedRegExpIteration < SymbolType > * > ( & * itC );
if( ! iter ) {
++ itC;
continue;
}
// iteration's element must preceed the iteration (xx*)
auto itStartX = itC; //itStartX points to first x in expression xx*, everything before is therefore prefix - regexp "a"
// if iter's element is concat
UnboundedRegExpConcatenation < SymbolType > * iterConcat = dynamic_cast < UnboundedRegExpConcatenation < SymbolType > * > ( & iter->getChild ( ) );
if( iterConcat ) {
if( distance( concat->getChildren ( ).begin( ), itC ) < (int) iterConcat->getChildren ( ).size( ) ) {
++ itC;
continue;
}
ext::retract ( itStartX, iterConcat->getChildren().size( ) );
if( iterConcat->getChildren ( ).size( ) != ( unsigned ) distance( itStartX, concat->getChildren ( ).end( ) )
|| ! equal ( iterConcat->getChildren ( ).begin ( ), iterConcat->getChildren ( ).end ( ), itStartX,
[ ] ( const UnboundedRegExpElement < SymbolType > & a, const UnboundedRegExpElement < SymbolType > & b ) -> bool { return a == b; } ) ) {
++ itC;
continue;
}
} else {
if( iter->getChild ( ) != * std::prev( itC ) ) {
++ itC;
continue;
}
std::advance( itStartX, -1 );
}
// concatenate "a" and "y" and see if they exist somewhere in parent alternation ( node->getChildren() )
UnboundedRegExpConcatenation < SymbolType > tmpAY;
if( concat->getChildren ( ).begin ( ) == itStartX ) {
tmpAY.pushBackChild ( UnboundedRegExpEpsilon < SymbolType > ( ) );
} else {
UnboundedRegExpConcatenation < SymbolType > tmpA;
tmpA.insert ( tmpA.getChildren ( ).end ( ), concat->getChildren().begin ( ), itStartX );
tmpAY.pushBackChild ( optimizeInner ( tmpA ) );
}
if( std::next ( itC ) == concat->getChildren().end( ) ) {
tmpAY.pushBackChild ( UnboundedRegExpEpsilon < SymbolType >( ) );
} else {
UnboundedRegExpConcatenation < SymbolType > tmpY;
tmpY.insert ( tmpY.getChildren().end( ), std::next( itC ), concat->getChildren ( ).end( ) );
tmpAY.pushBackChild ( optimizeInner( tmpY ) );
}
ext::ptr_value < UnboundedRegExpElement < SymbolType > > regexpAY = optimizeInner ( tmpAY );
auto iterAY = find_if( node.getChildren().begin( ), node.getChildren().end( ), [ & ] ( const UnboundedRegExpElement < SymbolType > & a ) -> bool { return a == regexpAY.get ( ); } );
if( iterAY == node.getChildren().end( ) ) {
++ itC;
continue;
}
// if so make a x* y and replace a x x* y
tmpAY.insert( tmpAY.getChildren ( ).begin ( ) + 1, * itC );
node.setChild ( optimizeInner( tmpAY ), itA );
// remove a y
itA = node.getChildren().erase( iterAY );
optimized = true;
break;
}
++ itA;
} */
return optimized;
}
/**
* optimization V8: \e in h(x) => xx*=x*
* @param node UnboundedRegExpConcatenation < SymbolType > node
* @return bool true if optimization applied else false
*/
template < class SymbolType >
bool RegExpOptimize::V8( UnboundedRegExpConcatenation < SymbolType > & node ) {
bool optimized = false;
// interpretation: if there is iteration in concatenation node, and element of iteration contains eps and is straight before this iteration, then this element can be omitted
if ( node.getChildren ( ).size ( ) == 0 )
return false;
for( auto it = node.getChildren ( ).begin( ); it != node.getChildren ( ).end( ); ) {
const UnboundedRegExpIteration < SymbolType > * iter = dynamic_cast < const UnboundedRegExpIteration < SymbolType > * > ( & * it );
if( ! iter ) {
++ it;
continue;
}
// if element of iteration is concatenation, we need to check this specially
const UnboundedRegExpConcatenation < SymbolType > * concat = dynamic_cast < const UnboundedRegExpConcatenation < SymbolType > * > ( & iter->getChild ( ) );
if( concat ) {
// check if not out of bounds
if( ( unsigned ) distance( node.getChildren ( ).begin( ), it ) < concat->getChildren ( ).size ( ) ) {
it ++;
continue;
}
auto it2 = it;
ext::retract ( it2, concat->getChildren ( ).size ( ) );
if( regexp::properties::RegExpEpsilon::languageContainsEpsilon ( * concat ) &&
concat->getChildren().size ( ) == ( unsigned ) distance ( it2, node.getChildren ( ).end( ) ) &&
equal ( concat->getChildren ( ).begin( ), concat->getChildren ( ).end( ), it2, [ ] ( const UnboundedRegExpElement < SymbolType > & a, const UnboundedRegExpElement < SymbolType > & b ) -> bool { return a == b; } ) ) {
optimized = true;
it = node.erase ( it2, it );
} else
++ it;
} else {
// check if not at the first node
if ( it == node.getChildren ( ).begin ( ) ) {
it ++;
continue;
}
auto prev = std::prev ( it );
if ( regexp::properties::RegExpEpsilon::languageContainsEpsilon ( iter->getElement ( ) ) && iter->getElement ( ) == * prev ) {
it = node.erase ( prev );
optimized = true;
} else
++ it;
}
}
return optimized;
}
/**
* optimization V9: (xy)*x = x(yx)*
* @param node UnboundedRegExpConcatenation < SymbolType > node
* @return bool true if optimization applied else false
*/
template < class SymbolType >
bool RegExpOptimize::V9( UnboundedRegExpConcatenation < SymbolType > & /*node*/ ) {
bool optimized = false;
// interpretation: if concat (C1) with iter && iteration's element is concat (C2), then:
// simultaneously iterate through C1 and C2. (axy)*axz=ax(yax)*z -> get ax that is same and relocate them...
/*for( auto it = node.getChildren().begin( ) ; it != node.getChildren().end( ) ; ) {
UnboundedRegExpIteration < SymbolType > * iter = dynamic_cast<UnboundedRegExpIteration < SymbolType > * > ( & * it );
if ( ! iter ) {
++ it;
continue;
}
UnboundedRegExpConcatenation < SymbolType > * concat = dynamic_cast<UnboundedRegExpConcatenation < SymbolType > * > ( & iter->getChild( ) );
if( ! concat ) {
++it;
continue;
}
// find range from <it+1;sth> and <concat.begin;sth> that is equal
auto c1Iter = std::next( it ), c2Iter = concat->getChildren().begin( );
while( c1Iter != node.getChildren().end() && c2Iter != concat->getChildren().end( ) && *c1Iter == * c2Iter ) {
++ c1Iter;
++ c2Iter;
}
if( c1Iter == std::next( it ) ) {
++ it;
continue;
}
// common::Streams::out << "xy" << std::endl;
// UnboundedRegExpConcatenation < SymbolType >* tmp = new UnboundedRegExpConcatenation < SymbolType >( );
// tmp->insert( tmp->getChildren().end( ), std::next( it ), c1Iter );
// common::Streams::out << RegExp( tmp ) << std::endl;
// copy the range <it;sth>, delete it and go back to the iter node
ext::ptr_vector < UnboundedRegExpElement < SymbolType > > copyRange;
copyRange.insert ( copyRange.end(), std::next( it ), c1Iter );
it = node.getChildren().erase( std::next( it ), c1Iter );
it = std::prev( it );
// insert that range before it position
node.insert( it, copyRange.begin( ), copyRange.end( ) );
// alter the iteration's concat node
copyRange.clear( );
copyRange.insert( copyRange.end(), concat->getChildren().begin( ), c2Iter );
concat->getChildren().erase( concat->getChildren().begin( ), c2Iter );
concat->insert( concat->getChildren().end(), copyRange.begin( ), copyRange.end( ) );
}*/
return optimized;
}
/**
* optimization V10: (x+y)* = (x*+y*)*
* generalized to (x+y)* = (x+y*)* = (x*+y)* = (x*+y*)*
* @param node UnboundedRegExpIteration < SymbolType > node
* @return bool true if optimization applied else false
*/
template < class SymbolType >
bool RegExpOptimize::V10( UnboundedRegExpIteration < SymbolType > & node ) {
// interpretation: if iter's child is alternation where some of its children are iteration, then they do not have to be iterations
UnboundedRegExpAlternation < SymbolType > * alt = dynamic_cast < UnboundedRegExpAlternation < SymbolType > * > ( & node.getChild ( ) );
if ( ! alt || ! any_of ( alt->begin( ), alt->end( ), [] ( const UnboundedRegExpElement < SymbolType > & a ) -> bool { return dynamic_cast < const UnboundedRegExpIteration < SymbolType > * > ( & a ); } ) )
return false;
for ( auto it = alt->begin( ); it != alt->end ( ); ++ it ) {
if ( dynamic_cast < UnboundedRegExpIteration < SymbolType > * > ( & * it ) )
alt->setChild ( std::move ( static_cast < UnboundedRegExpIteration < SymbolType > & > ( * it ).getChild ( ) ), it );
}
return true;
}
/**
* optimization X1: a* + \e = a*
* @param node UnboundedRegExpAlternation < SymbolType > node
* @return bool true if optimization applied else false
*/
template < class SymbolType >
bool RegExpOptimize::X1( UnboundedRegExpAlternation < SymbolType > & node ) {
// theorem: In regexp like a* + \e, \e is described twice, first in a*, second in \e.
// therefore we can delete the \e as it is redundant
auto iter = find_if ( node.begin( ), node.end( ), [] ( const UnboundedRegExpElement < SymbolType > & a ) -> bool { return dynamic_cast < const UnboundedRegExpIteration < SymbolType > * > ( & a );} );
auto eps = find_if ( node.begin( ), node.end( ), [] ( const UnboundedRegExpElement < SymbolType > & a ) -> bool { return dynamic_cast < const UnboundedRegExpEpsilon < SymbolType > * > ( & a );} );
if( iter != node.end( ) && eps != node.end( ) ) {
node.erase( eps );
return true;
}
return false;
}
} /* namespace simplify */
} /* namespace regexp */