Skip to content
Snippets Groups Projects
Commit cc4946e5 authored by Jan Trávníček
Browse files

add some regexp optimize tests

parent 2ef3d26c
No related branches found
No related tags found
No related merge requests found
......@@ -218,7 +218,7 @@ bool RegExpOptimize::A4( UnboundedRegExpAlternation & node ) {
auto cmp = [ ]( const std::smart_ptr < UnboundedRegExpElement > & a, const std::smart_ptr < UnboundedRegExpElement > & b ) -> bool { return *a == *b; };
 
size_t size = node.getChildren ( ).size ( );
std::unique ( node.getChildren ( ).begin ( ), node.getChildren ( ).end ( ), cmp);
node.getChildren ( ).erase ( std::unique ( node.getChildren ( ).begin ( ), node.getChildren ( ).end ( ), cmp), node.getChildren ( ).end( ) );
 
return size != node.getChildren ( ).size ( );
}
......@@ -556,31 +556,27 @@ bool RegExpOptimize::V4( UnboundedRegExpIteration & node ) {
* @param node UnboundedRegExpAlternation node
* @return bool true if optimization applied else false
*/
bool RegExpOptimize::V5( UnboundedRegExpAlternation & /* node */ ) {
/* bool optimized = false; */
bool RegExpOptimize::V5( UnboundedRegExpAlternation & node ) {
bool optimized = false;
 
// reinterpretation: ax*y = ay+ax*xy
// so, if we find iter, a = everything that is before it (prefix)
// x = iter's content
// behind iter must be exactly iter's content
// y = rest (suffix)
// so, if we find iter,
// a = everything that is before it (prefix)
// x = iter's content behind iter must be exactly iter's content
// y = rest (suffix)
// prefix.x*x.suffix + prefix.suffix = prefix.x*.suffix
 
/* for( auto itA = node->elements.begin( ); itA != node->elements.end( ); )
{
for( auto itA = node.getChildren().begin( ); itA != node.getChildren().end( ); ) {
UnboundedRegExpConcatenation * concat = dynamic_cast<UnboundedRegExpConcatenation*>( itA->get() );
if( ! concat )
{
itA ++;
if( ! concat ) {
++ itA;
continue;
}
 
for( auto itC = concat->elements.begin( ); itC != std::prev( concat->elements.end( ) ); )
{
for( auto itC = concat->getChildren().begin( ); itC != std::prev( concat->getChildren().end( ) ); ) {
UnboundedRegExpIteration * iter = dynamic_cast<UnboundedRegExpIteration*>( itC->get() );
if( ! iter )
{
itC ++;
if( ! iter ) {
++ itC;
continue;
}
 
......@@ -588,32 +584,19 @@ bool RegExpOptimize::V5( UnboundedRegExpAlternation & /* node */ ) {
auto itStartY = std::next( itC ); //itStartY points to y in expression x*xy
 
// if iter's element is concat
if( dynamic_cast<UnboundedRegExpConcatenation*>( iter->element.get() ) )
{
UnboundedRegExpConcatenation * iterConcat = dynamic_cast<UnboundedRegExpConcatenation*>( iter->element.get() );
// std::cout << "....." << std::endl;
// std::cout << RegExp( concat ) << std::endl;
// std::cout << RegExp( iterConcat ) << std::endl;
// UnboundedRegExpConcatenation * tmp = new UnboundedRegExpConcatenation( );
// tmp->elements.insert( tmp->elements.end( ), std::next( itC ), concat->elements.end( ) );
// std::cout << RegExp( tmp) << std::endl;
if( distance( iterConcat->elements.begin( ), iterConcat->elements.end( ) ) != distance( std::next( itC ), concat->elements.end( ) )
|| ! equal( iterConcat->elements.begin( ), iterConcat->elements.end( ), std::next( itC ),
[ ]( const std::smart_ptr < UnboundedRegExpElement > & a, const std::smart_ptr < UnboundedRegExpElement > & b ) -> bool{ return *a == *b; } ) )
{
if( dynamic_cast<UnboundedRegExpConcatenation*>( iter->getChild().get() ) ) {
UnboundedRegExpConcatenation * iterConcat = dynamic_cast<UnboundedRegExpConcatenation*>( iter->getChild().get() );
if( iterConcat->getChildren().size( ) != ( unsigned ) distance( std::next( itC ), concat->getChildren().end( ) )
|| ! equal( iterConcat->getChildren().begin( ), iterConcat->getChildren().end( ), std::next( itC ),
[ ]( const std::smart_ptr < UnboundedRegExpElement > & a, const std::smart_ptr < UnboundedRegExpElement > & b ) -> bool{ return *a == *b; } ) ) {
itC++;
continue;
}
advance( itStartY, (int)iterConcat->elements.size( ) );
}
// else
else
{
if( * iter->element != ** std::next( itC ) )
{
itC ++;
advance( itStartY, (int)iterConcat->getChildren().size( ) );
} else {
if( * iter->getChild() != ** std::next( itC ) ) {
++ itC;
continue;
}
 
......@@ -622,86 +605,60 @@ bool RegExpOptimize::V5( UnboundedRegExpAlternation & /* node */ ) {
 
// store everything before iteration as "a"
UnboundedRegExpElement * regexpA;
if( concat->elements.begin( ) == itC )
{
if( concat->getChildren().begin( ) == itC ) {
regexpA = new UnboundedRegExpEpsilon( );
}
else
{
UnboundedRegExpConcatenation * tmpA = new UnboundedRegExpConcatenation( );
tmpA->elements.insert( tmpA->elements.end( ), concat->elements.begin( ), itC );
regexpA = optimize( tmpA );
tmpA->elements.clear( );
delete tmpA;
} else {
UnboundedRegExpConcatenation tmpA;
tmpA.insert( tmpA.getChildren().end( ), concat->getChildren().begin( ), itC );
regexpA = optimizeInner( tmpA );
}
 
// store everything behind iteration's followup element as "y"
UnboundedRegExpElement * regexpY;
if( itStartY == concat->elements.end( ) )
{
if( itStartY == concat->getChildren().end( ) ) {
regexpY = new UnboundedRegExpEpsilon( );
}
else
{
UnboundedRegExpConcatenation* tmpY = new UnboundedRegExpConcatenation( );
tmpY->elements.insert( tmpY->elements.end( ), itStartY, concat->elements.end( ) );
regexpY = optimize( tmpY );
tmpY->elements.clear( );
delete tmpY;
} else {
UnboundedRegExpConcatenation tmpY;
tmpY.insert( tmpY.getChildren().end( ), itStartY, concat->getChildren().end( ) );
regexpY = optimizeInner( tmpY );
}
 
// concatenate "a" and "y" and see if they exist somewhere in parent alternation ( node->elements )
UnboundedRegExpConcatenation* tmpAY = new UnboundedRegExpConcatenation( );
tmpAY->elements.push_back( std::smart_ptr < UnboundedRegExpElement > ( regexpA ) );
tmpAY->elements.push_back( std::smart_ptr < UnboundedRegExpElement > ( regexpY ) );
UnboundedRegExpElement * regexpAY = optimize( tmpAY );
regexpA = tmpAY->elements[0].release();
regexpY = tmpAY->elements[1].release();
tmpAY->elements.clear( );
delete tmpAY;
auto iterAY = find_if( node->elements.begin( ), node->elements.end( ), [ regexpAY ] ( const std::smart_ptr < UnboundedRegExpElement > & a ) -> bool{ return *a == *regexpAY; } );
// concatenate "a" and "y" and see if they exist somewhere in parent alternation ( node.getChildren() )
UnboundedRegExpConcatenation tmpAY;
tmpAY.pushBackChild( std::smart_ptr < UnboundedRegExpElement > ( regexpA ) );
tmpAY.pushBackChild( std::smart_ptr < UnboundedRegExpElement > ( regexpY ) );
UnboundedRegExpElement * regexpAY = optimizeInner( tmpAY );
regexpA = tmpAY.getChildren()[0].release();
regexpY = tmpAY.getChildren()[1].release();
auto iterAY = find_if( node.getChildren().begin( ), node.getChildren().end( ), [ regexpAY ] ( const std::smart_ptr < UnboundedRegExpElement > & a ) -> bool{ return *a == *regexpAY; } );
delete regexpAY;
if( iterAY == node->elements.end( ) )
{
itC ++;
if( iterAY == node.getChildren().end( ) ) {
++ itC;
delete regexpA;
delete regexpY;
 
continue;
}
 
// if AY exists, then we can simply do this:
//iterator invalidated, need to backup concat node
UnboundedRegExpElement * tmpItA = itA->get();
node->elements.erase( iterAY );
// iterator invalidated, need to recall before erase
itA = find_if( node->elements.begin( ), node->elements.end( ), [ tmpItA ]( const std::smart_ptr < UnboundedRegExpElement > & a ) -> bool { return *a == *tmpItA; } );
UnboundedRegExpConcatenation * tmpAltered = new UnboundedRegExpConcatenation( );
tmpAltered->elements.push_back( std::smart_ptr < UnboundedRegExpElement > ( regexpA ) );
tmpAltered->elements.push_back( * itC );
tmpAltered->elements.push_back( std::smart_ptr < UnboundedRegExpElement > ( regexpY ) );
UnboundedRegExpElement * regexpAltered = optimize( tmpAltered );
tmpAltered->elements.clear( );
delete tmpAltered;
UnboundedRegExpConcatenation tmpAltered;
tmpAltered.pushBackChild( std::smart_ptr < UnboundedRegExpElement > ( regexpA ) );
tmpAltered.pushBackChild( * itC );
tmpAltered.pushBackChild( std::smart_ptr < UnboundedRegExpElement > ( regexpY ) );
UnboundedRegExpElement * regexpAltered = optimizeInner( tmpAltered );
 
itA = node->elements.erase( itA );
node.setChild( std::smart_ptr < UnboundedRegExpElement > ( regexpAltered ), itA );
 
node->elements.insert( itA, std::smart_ptr < UnboundedRegExpElement > ( regexpAltered ) );
itA = node.getChildren().erase( iterAY );
 
optimized = true;
break;
}
 
itA ++;
++ itA;
}
 
return optimized; */
return false; // FIXME
return optimized;
}
 
/**
......@@ -781,7 +738,7 @@ bool RegExpOptimize::V6( UnboundedRegExpAlternation & node ) {
regexpY = optimizeInner( tmpY );
}
 
// concatenate "a" and "y" and see if they exist somewhere in parent alternation ( node->getElements() )
// concatenate "a" and "y" and see if they exist somewhere in parent alternation ( node->getChildren() )
UnboundedRegExpConcatenation tmpAY;
tmpAY.pushBackChild( std::smart_ptr < UnboundedRegExpElement > ( regexpA ) );
tmpAY.pushBackChild( std::smart_ptr < UnboundedRegExpElement > ( regexpY ) );
......@@ -799,14 +756,6 @@ bool RegExpOptimize::V6( UnboundedRegExpAlternation & node ) {
continue;
}
 
// if AY exists, then we can simply do this:
// iterator invalidated, need to backup concat node
UnboundedRegExpElement * tmpItA = itA->get();
node.getChildren().erase( iterAY );
// iterator invalidated, need to recall before erase
itA = find_if( node.getChildren().begin( ), node.getChildren().end( ), [ tmpItA ]( const std::smart_ptr < UnboundedRegExpElement > & a ) -> bool { return *a == *tmpItA; } );
UnboundedRegExpConcatenation tmpAltered;
tmpAltered.pushBackChild( std::smart_ptr < UnboundedRegExpElement > ( regexpA ) );
tmpAltered.pushBackChild( * itC );
......@@ -814,6 +763,9 @@ bool RegExpOptimize::V6( UnboundedRegExpAlternation & node ) {
UnboundedRegExpElement * regexpAltered = optimizeInner( tmpAltered );
 
node.setChild( std::smart_ptr < UnboundedRegExpElement > ( regexpAltered ), itA );
itA = node.getChildren().erase( iterAY );
optimized = true;
break;
}
......
......@@ -20,16 +20,47 @@ void RegExpOptimizeTest::tearDown() {
}
 
void RegExpOptimizeTest::testOptimize() {
{
std::string input = "a+a";
regexp::UnboundedRegExp regexp( static_cast<const regexp::UnboundedRegExp &>( alib::StringDataFactory::fromString<regexp::RegExp>(input).getData() ) );
regexp::UnboundedRegExp res = regexp::simplify::RegExpOptimize::optimize(regexp);
std::string inputRes = "a";
regexp::UnboundedRegExp regexpRes( static_cast<const regexp::UnboundedRegExp &>( alib::StringDataFactory::fromString<regexp::RegExp>(inputRes).getData() ) );
std::cout << res << std::endl;
std::cout << regexpRes << std::endl;
CPPUNIT_ASSERT ( regexpRes == res );
}
{
std::string input = "(a+a)b + (#0 b + (#0 a + (#0 b + a)))";
regexp::UnboundedRegExp regexp( static_cast<const regexp::UnboundedRegExp &>( alib::StringDataFactory::fromString<regexp::RegExp>(input).getData() ) );
 
regexp::UnboundedRegExp res = regexp::simplify::RegExpOptimize::optimize(regexp);
std::string inputRes = "a + ab";
regexp::UnboundedRegExp regexpRes( static_cast<const regexp::UnboundedRegExp &>( alib::StringDataFactory::fromString<regexp::RegExp>(inputRes).getData() ) );
std::cout << res << std::endl;
std::cout << regexpRes << std::endl;
CPPUNIT_ASSERT ( regexpRes == res );
}
{
std::string input = "a+a* (b+a)* c";
std::string input = "a z + a b* b z";
regexp::UnboundedRegExp regexp( static_cast<const regexp::UnboundedRegExp &>( alib::StringDataFactory::fromString<regexp::RegExp>(input).getData() ) );
 
regexp::UnboundedRegExp res = regexp::simplify::RegExpOptimize::optimize(regexp);
std::string inputRes = "a b* z";
regexp::UnboundedRegExp regexpRes( static_cast<const regexp::UnboundedRegExp &>( alib::StringDataFactory::fromString<regexp::RegExp>(inputRes).getData() ) );
std::cout << res << std::endl;
std::cout << regexpRes << std::endl;
CPPUNIT_ASSERT ( regexpRes == res );
}
 
}
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment