From 14d6dd7606f611231905daceb9e960c23dea94a9 Mon Sep 17 00:00:00 2001 From: Jan Travnicek <jan.travnicek@.fit.cvut.cz> Date: Mon, 18 Mar 2019 15:15:49 +0100 Subject: [PATCH] enable A8 regexp optimize --- .../src/regexp/simplify/RegExpOptimize.h | 8 +- .../simplify/RegExpOptimizeUnboundedPart.hpp | 82 +++++++++++-------- .../regexp/simplify/RegExpOptimizeTest.cpp | 12 +-- .../test-src/tests/regexpOptimizeTest.cpp | 36 ++++++++ 4 files changed, 92 insertions(+), 46 deletions(-) create mode 100644 alib2integrationtest/test-src/tests/regexpOptimizeTest.cpp diff --git a/alib2algo/src/regexp/simplify/RegExpOptimize.h b/alib2algo/src/regexp/simplify/RegExpOptimize.h index 10b969194c..414c8894c0 100644 --- a/alib2algo/src/regexp/simplify/RegExpOptimize.h +++ b/alib2algo/src/regexp/simplify/RegExpOptimize.h @@ -42,8 +42,8 @@ namespace simplify { * - A5 : -> : x(yz) = (xy)z = xyz * - A6 : -> : \ex = x\e = x * - A7 : -> : \0x = x\0 = \0 - * - A8 : -> : x( y + z ) = xy + xz - * - A9 : -> : ( x + y )z = xz + yz + * - A8 : <- : x( y + z ) = xy + xz + * - A9 : <- : ( x + y )z = xz + yz * - A10: <- : x* = \e + x*x * - A11: <- : x* = ( \e + x )* * - V1 : -> : \0* = \e @@ -123,8 +123,8 @@ private: static bool A5( regexp::UnboundedRegExpConcatenation < SymbolType > & node ); static bool A6( regexp::UnboundedRegExpConcatenation < SymbolType > & node ); static bool A7( regexp::UnboundedRegExpConcatenation < SymbolType > & node ); - static bool A8( regexp::UnboundedRegExpConcatenation < SymbolType > & node ); - static bool A9( regexp::UnboundedRegExpConcatenation < SymbolType > & node ); + static bool A8( regexp::UnboundedRegExpAlternation < SymbolType > & node ); + static bool A9( regexp::UnboundedRegExpAlternation < SymbolType > & node ); static bool A10( regexp::UnboundedRegExpAlternation < SymbolType > & node ); static bool A11( regexp::UnboundedRegExpIteration < SymbolType > & node ); static bool V1( regexp::UnboundedRegExpIteration < SymbolType > & node ); diff --git a/alib2algo/src/regexp/simplify/RegExpOptimizeUnboundedPart.hpp b/alib2algo/src/regexp/simplify/RegExpOptimizeUnboundedPart.hpp index 9ee980b87f..95d2501e16 100644 --- a/alib2algo/src/regexp/simplify/RegExpOptimizeUnboundedPart.hpp +++ b/alib2algo/src/regexp/simplify/RegExpOptimizeUnboundedPart.hpp @@ -13,12 +13,14 @@ namespace simplify { template < class SymbolType > void RegExpOptimize::optimize( UnboundedRegExpAlternation < SymbolType > & alt ) { - while ( Unbounded < SymbolType >::A10 ( alt ) || Unbounded < SymbolType >::V2 ( alt ) || Unbounded < SymbolType >::V5 ( alt ) || Unbounded < SymbolType >::V6 ( alt ) ); + while ( Unbounded < SymbolType >::A1( alt ) || Unbounded < SymbolType >::A2( alt ) || Unbounded < SymbolType >::A3( alt ) || Unbounded < SymbolType >::A4( alt ) ); + + while ( Unbounded < SymbolType >::A8( alt ) || Unbounded < SymbolType >::A9( alt ) || Unbounded < SymbolType >::A10 ( alt ) || Unbounded < SymbolType >::V2 ( alt ) || Unbounded < SymbolType >::V5 ( alt ) || Unbounded < SymbolType >::V6 ( alt ) ); for( size_t i = 0; i < alt.getChildren ( ).size ( ); i++ ) alt.setChild ( std::move ( alt.getChild ( i ) ).template accept < ext::ptr_value < regexp::UnboundedRegExpElement < SymbolType > >, RegExpOptimize::Unbounded < SymbolType > > ( true ), i ); - while ( Unbounded < SymbolType >::A1( alt ) || Unbounded < SymbolType >::A2( alt ) || Unbounded < SymbolType >::A3( alt ) || Unbounded < SymbolType >::A4( alt ) || Unbounded < SymbolType >::A10( alt ) || Unbounded < SymbolType >::V2( alt ) || Unbounded < SymbolType >::V5( alt ) || Unbounded < SymbolType >::V6( alt ) || Unbounded < SymbolType >::X1( alt ) ); + while ( Unbounded < SymbolType >::A1( alt ) || Unbounded < SymbolType >::A2( alt ) || Unbounded < SymbolType >::A3( alt ) || Unbounded < SymbolType >::A4( alt ) || Unbounded < SymbolType >::A8( alt ) || Unbounded < SymbolType >::A9( alt ) || Unbounded < SymbolType >::A10( alt ) || Unbounded < SymbolType >::V2( alt ) || Unbounded < SymbolType >::V5( alt ) || Unbounded < SymbolType >::V6( alt ) || Unbounded < SymbolType >::X1( alt ) ); for( size_t i = 0; i < alt.getChildren ( ).size ( ); i++ ) alt.setChild ( std::move ( alt.getChild ( i ) ).template accept < ext::ptr_value < regexp::UnboundedRegExpElement < SymbolType > >, RegExpOptimize::Unbounded < SymbolType > > ( false ), i ); @@ -26,12 +28,14 @@ void RegExpOptimize::optimize( UnboundedRegExpAlternation < SymbolType > & alt ) template < class SymbolType > void RegExpOptimize::optimize( UnboundedRegExpConcatenation < SymbolType > & concat ) { + while ( Unbounded < SymbolType >::A5( concat ) || Unbounded < SymbolType >::A6( concat ) || Unbounded < SymbolType >::A7( concat ) ); + while ( Unbounded < SymbolType >::V8 ( concat ) || Unbounded < SymbolType >::V8R ( concat ) || Unbounded < SymbolType >::V9( concat ) ); for( size_t i = 0; i < concat.getChildren ( ).size ( ); i++ ) concat.setChild ( std::move ( concat.getChild ( i ) ).template accept < ext::ptr_value < regexp::UnboundedRegExpElement < SymbolType > >, RegExpOptimize::Unbounded < SymbolType > > ( true ), i ); - while ( Unbounded < SymbolType >::A5( concat ) || Unbounded < SymbolType >::A6( concat ) || Unbounded < SymbolType >::A7( concat ) || Unbounded < SymbolType >::A8( concat ) || Unbounded < SymbolType >::A9( concat ) || Unbounded < SymbolType >::V8( concat ) || Unbounded < SymbolType >::V8R( concat ) || Unbounded < SymbolType >::V9( concat ) ); + while ( Unbounded < SymbolType >::A5( concat ) || Unbounded < SymbolType >::A6( concat ) || Unbounded < SymbolType >::A7( concat ) || Unbounded < SymbolType >::V8( concat ) || Unbounded < SymbolType >::V8R( concat ) || Unbounded < SymbolType >::V9( concat ) ); for( size_t i = 0; i < concat.getChildren ( ).size ( ); i++ ) concat.setChild ( std::move ( concat.getChild ( i ) ).template accept < ext::ptr_value < regexp::UnboundedRegExpElement < SymbolType > >, RegExpOptimize::Unbounded < SymbolType > > ( false ), i ); @@ -266,45 +270,51 @@ bool RegExpOptimize::Unbounded < SymbolType >::A7( UnboundedRegExpConcatenation * @return bool true if optimization applied else false */ template < class SymbolType > -bool RegExpOptimize::Unbounded < SymbolType >::A8( UnboundedRegExpConcatenation < SymbolType > & /* node */) { -/* - bool optimized = false; +bool RegExpOptimize::Unbounded < SymbolType >::A8( UnboundedRegExpAlternation < SymbolType > & node ) { + std::map < ext::reference_wrapper < UnboundedRegExpElement < SymbolType > >, ext::vector < ext::reference_wrapper < UnboundedRegExpElement < SymbolType > > > > data; - for( auto it = std::next( node->elements.begin( ) ); it != node->elements.end( ); ) - { - UnboundedRegExpAlternation < SymbolType > * alt = dynamic_cast<UnboundedRegExpAlternation < SymbolType >*>( * it ); - if( ! alt ) - { - it ++; - continue; + for ( UnboundedRegExpElement < SymbolType > & element : node ) { + UnboundedRegExpConcatenation < SymbolType > * childConcat = dynamic_cast < UnboundedRegExpConcatenation < SymbolType > * > ( & element ); + if ( childConcat ) { + data [ ext::reference_wrapper < UnboundedRegExpElement < SymbolType > > ( childConcat->getChild ( 0 ) ) ].push_back ( ext::reference_wrapper < UnboundedRegExpElement < SymbolType > > ( element ) ); + } else { + data [ ext::reference_wrapper < UnboundedRegExpElement < SymbolType > > ( element ) ].push_back ( ext::reference_wrapper < UnboundedRegExpElement < SymbolType > > ( element ) ); } + } - // take everything to the left and copy it as prefix of every element in alternation. - UnboundedRegExpConcatenation < SymbolType > leftPart; - leftPart.elements.insert( leftPart.elements.end( ), node->elements.begin( ), it ); - - for( auto altIt = alt->elements.begin( ); altIt != alt->elements.end( ); altIt ++ ) - { - UnboundedRegExpConcatenation < SymbolType > * altElem = new UnboundedRegExpConcatenation < SymbolType >( ); - altElem->elements.push_back( leftPart ); - altElem->elements.push_back( * altIt ); + if ( data.size ( ) == node.getChildren ( ).size ( ) ) + return false; - * altIt = altElem; + UnboundedRegExpAlternation < SymbolType > res; + for ( std::pair < ext::reference_wrapper < UnboundedRegExpElement < SymbolType > >, ext::vector < ext::reference_wrapper < UnboundedRegExpElement < SymbolType > > > > && entry : ext::make_mover ( data ) ) { + if ( entry.second.size ( ) == 1 ) { + res.appendElement ( std::move ( entry.second.front ( ).get ( ) ) ); + } else { + UnboundedRegExpConcatenation < SymbolType > innerConcat; + innerConcat.appendElement ( std::move ( entry.first.get ( ) ) ); + UnboundedRegExpAlternation < SymbolType > innerAlt; + for ( ext::reference_wrapper < UnboundedRegExpElement < SymbolType > > & innerEntry : entry.second ) { + UnboundedRegExpElement < SymbolType > & innerEntryElement = innerEntry.get ( ); + UnboundedRegExpConcatenation < SymbolType > * innerEntryConcat = dynamic_cast < UnboundedRegExpConcatenation < SymbolType > * > ( & innerEntryElement ); + if ( innerEntryConcat ) { + if ( innerEntryConcat->getElements ( ).size ( ) == 1 ) { + innerAlt.appendElement ( UnboundedRegExpEpsilon < SymbolType > ( ) ); + } else { + innerEntryConcat->erase ( innerEntryConcat->begin ( ) ); + innerAlt.appendElement ( std::move ( * innerEntryConcat ) ); + } + } else { + innerAlt.appendElement ( UnboundedRegExpEpsilon < SymbolType > ( ) ); + } + } + innerConcat.appendElement ( std::move ( innerAlt ) ); + res.appendElement ( Unbounded < SymbolType >::visit ( std::move ( innerConcat ), true ) ); } - - UnboundedRegExpElement < SymbolType > * optIt = optimize( * it ); - delete *it; - *it = optIt; - - it = node->elements.erase( node->elements.begin( ), it ); - - optimized = true; - it ++; } - return optimized; -*/ - return false; //TODO + node = res; + + return false; } /** @@ -313,7 +323,7 @@ bool RegExpOptimize::Unbounded < SymbolType >::A8( UnboundedRegExpConcatenation * @return bool true if optimization applied else false */ template < class SymbolType > -bool RegExpOptimize::Unbounded < SymbolType >::A9( UnboundedRegExpConcatenation < SymbolType > & /* node */) { +bool RegExpOptimize::Unbounded < SymbolType >::A9( UnboundedRegExpAlternation < SymbolType > & /* node */) { /* bool optimized = false; diff --git a/alib2algo/test-src/regexp/simplify/RegExpOptimizeTest.cpp b/alib2algo/test-src/regexp/simplify/RegExpOptimizeTest.cpp index 747477283e..cff6baacc3 100644 --- a/alib2algo/test-src/regexp/simplify/RegExpOptimizeTest.cpp +++ b/alib2algo/test-src/regexp/simplify/RegExpOptimizeTest.cpp @@ -13,7 +13,7 @@ TEST_CASE ( "RegExp Optimize", "[unit][algo][regexp][simplify]" ) { SECTION ( "Test simple axioms" ) { auto testcase = GENERATE ( std::make_pair ("a+a", "a" ), - std::make_pair ( "(a+a)b + (#0 b + (#0 a + (#0 b + a)))", "a + a b" ), + std::make_pair ( "(a+a)b + (#0 b + (#0 a + (#0 b + a)))", "a ( b + #E )" ), //std::make_pair ( "a z + a b* b z", "a b* z" ), std::make_pair ( "a***", "a*" ), std::make_pair ( "(a*+b*)*", "(a+b)*" ), @@ -95,14 +95,14 @@ TEST_CASE ( "RegExp Optimize", "[unit][algo][regexp][simplify]" ) { regexp::UnboundedRegExp < char > res = regexp::simplify::RegExpOptimize::optimize ( regexp ); + regexp::UnboundedRegExpAlternation < char > alt6; + alt6.appendElement ( b ); + alt6.appendElement ( regexp::UnboundedRegExpEpsilon < char > { } ); regexp::UnboundedRegExpConcatenation < char > con6; con6.appendElement ( a ); - con6.appendElement ( b ); - regexp::UnboundedRegExpAlternation < char > alt6; - alt6.appendElement ( a ); - alt6.appendElement ( con6 ); + con6.appendElement ( alt6 ); - regexp::UnboundedRegExp < char > regexpRes( regexp::UnboundedRegExpStructure < char > { alt6 } ); + regexp::UnboundedRegExp < char > regexpRes( regexp::UnboundedRegExpStructure < char > { con6 } ); CAPTURE ( regexp, res, regexpRes ); CHECK ( regexpRes == res ); diff --git a/alib2integrationtest/test-src/tests/regexpOptimizeTest.cpp b/alib2integrationtest/test-src/tests/regexpOptimizeTest.cpp new file mode 100644 index 0000000000..28b250f97a --- /dev/null +++ b/alib2integrationtest/test-src/tests/regexpOptimizeTest.cpp @@ -0,0 +1,36 @@ +#include <catch2/catch.hpp> +#include <alib/vector> + +#include "testing/TimeoutAqlTest.hpp" +#include "testing/TestFiles.hpp" + +const unsigned LEAF_NODES = 10; +const unsigned HEIGHT = 6; +const double ALPHABET_SIZE = 2; +const size_t ITERATIONS = 100; + +const std::string qGenRE ( ) { + std::ostringstream oss; + oss << "execute regexp::generate::RandomRegExpFactory "; + oss << "(size_t)" << rand ( ) % LEAF_NODES + 1 << " "; + oss << "(size_t)" << rand ( ) % HEIGHT + 1 << " "; + oss << "(size_t)" << ALPHABET_SIZE << " "; + oss << "(bool)false "; + return oss.str ( ); +} + +TEST_CASE ( "RE optimize test", "[integration]" ) { + static const std::string qMinimize ( "automaton::simplify::efficient::EpsilonRemoverIncoming - | automaton::determinize::Determinize - | " + "automaton::simplify::Trim - | automaton::simplify::Minimize - | automaton::simplify::Normalize -" ); + + SECTION ( "Random tests" ) { + for ( size_t i = 0; i < ITERATIONS; i++ ) { + ext::vector < std::string > qs = { + qGenRE ( ) + " > $gen", + "execute string::Compose $gen > /tmp/file", + "quit compare::AutomatonCompare <( $gen | regexp::convert::ToAutomaton - | " + qMinimize + " )" + " <( $gen | regexp::simplify::RegExpOptimize - | regexp::convert::ToAutomaton - | " + qMinimize + ")" + }; + TimeoutAqlTest ( 10s, qs ); + } + } +} -- GitLab