From 4b02ab228ab1682273613d684b8a60a102a816a7 Mon Sep 17 00:00:00 2001 From: Jan Travnicek <Jan.Travnicek@fit.cvut.cz> Date: Tue, 9 Jan 2018 11:46:09 +0100 Subject: [PATCH] introduce SuffixAutomaton as wrapper of DFA --- .../query/FullAndLinearIndexPatterns.h | 8 +- .../NonlinearFullAndLinearIndexPatterns.h | 8 +- .../indexing/ExactSuffixAutomaton.cpp | 2 +- .../indexing/ExactSuffixAutomaton.h | 8 +- .../matching/DAWGMatcherConstruction.cpp | 4 +- .../matching/DAWGMatcherConstruction.h | 4 +- .../matching/OracleMatcherConstruction.cpp | 2 +- .../query/BackwardDAWGMatching.cpp | 2 +- .../stringology/query/BackwardDAWGMatching.h | 20 +- .../query/SuffixAutomatonFactors.cpp | 2 +- .../query/SuffixAutomatonFactors.h | 21 +- .../indexing/ExactSuffixAutomatonTest.cpp | 36 +-- .../stringology/query/DAWGMatcherTest.cpp | 4 +- .../indexes/stringology/SuffixAutomaton.cpp | 22 ++ .../src/indexes/stringology/SuffixAutomaton.h | 217 ++++++++++++++++++ 15 files changed, 300 insertions(+), 60 deletions(-) create mode 100644 alib2data/src/indexes/stringology/SuffixAutomaton.cpp create mode 100644 alib2data/src/indexes/stringology/SuffixAutomaton.h diff --git a/alib2algo/src/arbology/query/FullAndLinearIndexPatterns.h b/alib2algo/src/arbology/query/FullAndLinearIndexPatterns.h index 3df6c8bc89..96c502815c 100644 --- a/alib2algo/src/arbology/query/FullAndLinearIndexPatterns.h +++ b/alib2algo/src/arbology/query/FullAndLinearIndexPatterns.h @@ -80,14 +80,14 @@ ext::set < unsigned > FullAndLinearIndexPatterns::query ( const indexes::arbolog } } - ext::vector < std::pair < unsigned, unsigned > > prevOcc = FindOccurrences < StringIndexQueryAlgo > ( fullAndLinearIndex.getStringIndex ( ) , treePatternParts [ 0 ] ); + ext::vector < std::pair < unsigned, unsigned > > prevOcc = FullAndLinearIndexPatterns::FindOccurrences < StringIndexQueryAlgo > ( fullAndLinearIndex.getStringIndex ( ) , treePatternParts [ 0 ] ); for ( unsigned i = 1; i < treePatternParts.size ( ); ++ i ) { for ( std::pair < 
unsigned, unsigned > & occurrence : prevOcc ) occurrence.second = fullAndLinearIndex.getJumps ( ) [ occurrence.second ]; if ( ! treePatternParts [ i ].empty ( ) ) - prevOcc = MergeOccurrences ( prevOcc, FindOccurrences < StringIndexQueryAlgo > ( fullAndLinearIndex.getStringIndex ( ), treePatternParts [ i ] ), rev ); + prevOcc = MergeOccurrences ( prevOcc, FullAndLinearIndexPatterns::FindOccurrences < StringIndexQueryAlgo > ( fullAndLinearIndex.getStringIndex ( ), treePatternParts [ i ] ), rev ); } ext::set < unsigned > res; @@ -113,14 +113,14 @@ ext::set < unsigned > FullAndLinearIndexPatterns::query ( const indexes::arbolog } } - ext::vector < std::pair < unsigned, unsigned > > prevOcc = FindOccurrences < StringIndexQueryAlgo > ( fullAndLinearIndex.getStringIndex ( ), treePatternParts [ 0 ] ); + ext::vector < std::pair < unsigned, unsigned > > prevOcc = FullAndLinearIndexPatterns::FindOccurrences < StringIndexQueryAlgo > ( fullAndLinearIndex.getStringIndex ( ), treePatternParts [ 0 ] ); for ( unsigned i = 1; i < treePatternParts.size ( ); ++ i ) { for ( std::pair < unsigned, unsigned > & occurrence : prevOcc ) occurrence.second = fullAndLinearIndex.getJumps ( ) [ occurrence.second ]; if ( ! 
treePatternParts [ i ].empty ( ) ) - prevOcc = MergeOccurrences ( prevOcc, FindOccurrences < StringIndexQueryAlgo > ( fullAndLinearIndex.getStringIndex ( ), treePatternParts [ i ] ), rev ); + prevOcc = MergeOccurrences ( prevOcc, FullAndLinearIndexPatterns::FindOccurrences < StringIndexQueryAlgo > ( fullAndLinearIndex.getStringIndex ( ), treePatternParts [ i ] ), rev ); } ext::set < unsigned > res; diff --git a/alib2algo/src/arbology/query/NonlinearFullAndLinearIndexPatterns.h b/alib2algo/src/arbology/query/NonlinearFullAndLinearIndexPatterns.h index e3a2cca90c..3413dd4f79 100644 --- a/alib2algo/src/arbology/query/NonlinearFullAndLinearIndexPatterns.h +++ b/alib2algo/src/arbology/query/NonlinearFullAndLinearIndexPatterns.h @@ -83,7 +83,7 @@ ext::set < unsigned > NonlinearFullAndLinearIndexPatterns::query ( const indexes } } - ext::vector < std::pair < unsigned, unsigned > > prevOcc = FindOccurrences < StringIndexQueryAlgo > ( fullAndLinearIndex.getStringIndex ( ), treePatternParts [ 0 ] ); + ext::vector < std::pair < unsigned, unsigned > > prevOcc = NonlinearFullAndLinearIndexPatterns::FindOccurrences < StringIndexQueryAlgo > ( fullAndLinearIndex.getStringIndex ( ), treePatternParts [ 0 ] ); for ( unsigned i = 1; i < treePatternParts.size ( ); ++ i ) { for ( std::pair < unsigned, unsigned > & occurrence : prevOcc ) { @@ -100,7 +100,7 @@ ext::set < unsigned > NonlinearFullAndLinearIndexPatterns::query ( const indexes ++ i; if ( ! 
treePatternParts [ i ].empty ( ) ) - prevOcc = MergeOccurrences ( prevOcc, FindOccurrences < StringIndexQueryAlgo > ( fullAndLinearIndex.getStringIndex ( ), treePatternParts [ i ] ), rev ); + prevOcc = MergeOccurrences ( prevOcc, NonlinearFullAndLinearIndexPatterns::FindOccurrences < StringIndexQueryAlgo > ( fullAndLinearIndex.getStringIndex ( ), treePatternParts [ i ] ), rev ); } ext::set < unsigned > res; @@ -132,7 +132,7 @@ ext::set < unsigned > NonlinearFullAndLinearIndexPatterns::query ( const indexes } } - ext::vector < std::pair < unsigned, unsigned > > prevOcc = FindOccurrences < StringIndexQueryAlgo > ( fullAndLinearIndex.getStringIndex ( ) , treePatternParts [ 0 ] ); + ext::vector < std::pair < unsigned, unsigned > > prevOcc = NonlinearFullAndLinearIndexPatterns::FindOccurrences < StringIndexQueryAlgo > ( fullAndLinearIndex.getStringIndex ( ), treePatternParts [ 0 ] ); for ( unsigned i = 1; i < treePatternParts.size ( ); ++ i ) { for ( std::pair < unsigned, unsigned > & occurrence : prevOcc ) { @@ -149,7 +149,7 @@ ext::set < unsigned > NonlinearFullAndLinearIndexPatterns::query ( const indexes ++ i; if ( ! 
treePatternParts [ i ].empty ( ) ) - prevOcc = MergeOccurrences ( prevOcc, FindOccurrences < StringIndexQueryAlgo > ( fullAndLinearIndex.getStringIndex ( ), treePatternParts [ i ] ), rev ); + prevOcc = MergeOccurrences ( prevOcc, NonlinearFullAndLinearIndexPatterns::FindOccurrences < StringIndexQueryAlgo > ( fullAndLinearIndex.getStringIndex ( ), treePatternParts [ i ] ), rev ); } ext::set < unsigned > res; diff --git a/alib2algo/src/stringology/indexing/ExactSuffixAutomaton.cpp b/alib2algo/src/stringology/indexing/ExactSuffixAutomaton.cpp index e0081a874e..76bd9f35be 100644 --- a/alib2algo/src/stringology/indexing/ExactSuffixAutomaton.cpp +++ b/alib2algo/src/stringology/indexing/ExactSuffixAutomaton.cpp @@ -7,6 +7,6 @@ namespace { -auto SuffixAutomatonLinearString = registration::AbstractRegister < stringology::indexing::ExactSuffixAutomaton, automaton::DFA < DefaultSymbolType, unsigned >, const string::LinearString < > & > ( stringology::indexing::ExactSuffixAutomaton::construct ); +auto SuffixAutomatonLinearString = registration::AbstractRegister < stringology::indexing::ExactSuffixAutomaton, indexes::stringology::SuffixAutomaton < >, const string::LinearString < > & > ( stringology::indexing::ExactSuffixAutomaton::construct ); } /* namespace */ diff --git a/alib2algo/src/stringology/indexing/ExactSuffixAutomaton.h b/alib2algo/src/stringology/indexing/ExactSuffixAutomaton.h index b057bc747c..64c524cfb9 100644 --- a/alib2algo/src/stringology/indexing/ExactSuffixAutomaton.h +++ b/alib2algo/src/stringology/indexing/ExactSuffixAutomaton.h @@ -5,7 +5,7 @@ #ifndef _EXACT_SUFFIX_AUTOMATON_H_ #define _EXACT_SUFFIX_AUTOMATON_H_ -#include <automaton/FSM/DFA.h> +#include <indexes/stringology/SuffixAutomaton.h> #include <string/LinearString.h> #include <global/GlobalData.h> @@ -21,12 +21,12 @@ private: public: template < class SymbolType > - static automaton::DFA < SymbolType, unsigned > construct ( const string::LinearString < SymbolType > & pattern ); + static 
indexes::stringology::SuffixAutomaton < SymbolType > construct ( const string::LinearString < SymbolType > & pattern ); }; template < class SymbolType > -automaton::DFA < SymbolType, unsigned > ExactSuffixAutomaton::construct ( const string::LinearString < SymbolType > & pattern ) { +indexes::stringology::SuffixAutomaton < SymbolType > ExactSuffixAutomaton::construct ( const string::LinearString < SymbolType > & pattern ) { automaton::DFA < SymbolType, unsigned > suffixAutomaton ( 0 ); suffixAutomaton.setInputAlphabet ( pattern.getAlphabet ( ) ); @@ -49,7 +49,7 @@ automaton::DFA < SymbolType, unsigned > ExactSuffixAutomaton::construct ( const lastState = suffixLinks [ lastState ].first; } - return suffixAutomaton; + return indexes::stringology::SuffixAutomaton < SymbolType > ( std::move ( suffixAutomaton ) ); } template < class SymbolType > diff --git a/alib2algo/src/stringology/matching/DAWGMatcherConstruction.cpp b/alib2algo/src/stringology/matching/DAWGMatcherConstruction.cpp index 90231dfb04..7c537890ff 100644 --- a/alib2algo/src/stringology/matching/DAWGMatcherConstruction.cpp +++ b/alib2algo/src/stringology/matching/DAWGMatcherConstruction.cpp @@ -10,7 +10,7 @@ namespace stringology { namespace matching { -automaton::DFA < DefaultSymbolType, unsigned > DAWGMatcherConstruction::construct ( const string::LinearString < > & pattern ) { +indexes::stringology::SuffixAutomaton < DefaultSymbolType > DAWGMatcherConstruction::construct ( const string::LinearString < > & pattern ) { auto patternData = pattern.getContent ( ); reverse ( patternData.begin ( ), patternData.end ( ) ); string::LinearString < > reversedPattern ( pattern.getAlphabet ( ), std::move ( patternData ) ); @@ -18,7 +18,7 @@ automaton::DFA < DefaultSymbolType, unsigned > DAWGMatcherConstruction::construc return stringology::indexing::ExactSuffixAutomaton::construct ( reversedPattern ); } -auto DAWGMatcherConstructionLinearString = registration::AbstractRegister < DAWGMatcherConstruction, 
automaton::DFA < DefaultSymbolType, unsigned >, const string::LinearString < > & > ( DAWGMatcherConstruction::construct ); +auto DAWGMatcherConstructionLinearString = registration::AbstractRegister < DAWGMatcherConstruction, indexes::stringology::SuffixAutomaton < DefaultSymbolType >, const string::LinearString < > & > ( DAWGMatcherConstruction::construct ); } /* namespace matching */ diff --git a/alib2algo/src/stringology/matching/DAWGMatcherConstruction.h b/alib2algo/src/stringology/matching/DAWGMatcherConstruction.h index b3f6392ee5..0ddb9eedc9 100644 --- a/alib2algo/src/stringology/matching/DAWGMatcherConstruction.h +++ b/alib2algo/src/stringology/matching/DAWGMatcherConstruction.h @@ -5,7 +5,7 @@ #ifndef DAWG_MATCHER_CONSTRUCTION_H_ #define DAWG_MATCHER_CONSTRUCTION_H_ -#include <automaton/FSM/DFA.h> +#include <indexes/stringology/SuffixAutomaton.h> #include <string/LinearString.h> namespace stringology { @@ -18,7 +18,7 @@ public: * Linear time on-line construction of minimal suffix automaton for given pattern. * @return minimal suffix automaton for given pattern. 
*/ - static automaton::DFA < DefaultSymbolType, unsigned > construct ( const string::LinearString < > & pattern ); + static indexes::stringology::SuffixAutomaton < > construct ( const string::LinearString < > & pattern ); }; diff --git a/alib2algo/src/stringology/matching/OracleMatcherConstruction.cpp b/alib2algo/src/stringology/matching/OracleMatcherConstruction.cpp index 8f27209ffa..d56866487e 100644 --- a/alib2algo/src/stringology/matching/OracleMatcherConstruction.cpp +++ b/alib2algo/src/stringology/matching/OracleMatcherConstruction.cpp @@ -15,7 +15,7 @@ automaton::DFA < DefaultSymbolType, unsigned > OracleMatcherConstruction::constr reverse ( patternData.begin ( ), patternData.end ( ) ); string::LinearString < > reversedPattern ( pattern.getAlphabet ( ), std::move ( patternData ) ); - return stringology::indexing::ExactSuffixAutomaton::construct ( reversedPattern ); + return stringology::indexing::ExactSuffixAutomaton::construct ( reversedPattern ).getAutomaton ( ); // FIXME this is not oracle automaton } auto OracleMatcherConstructionLinearString = registration::AbstractRegister < OracleMatcherConstruction, automaton::DFA < DefaultSymbolType, unsigned >, const string::LinearString < > & > ( OracleMatcherConstruction::construct ); diff --git a/alib2algo/src/stringology/query/BackwardDAWGMatching.cpp b/alib2algo/src/stringology/query/BackwardDAWGMatching.cpp index a1ea591b82..e4ff9a4195 100644 --- a/alib2algo/src/stringology/query/BackwardDAWGMatching.cpp +++ b/alib2algo/src/stringology/query/BackwardDAWGMatching.cpp @@ -9,7 +9,7 @@ namespace stringology { namespace query { -auto BackwardDAWGMatchingLinearStringLinearString = registration::AbstractRegister < BackwardDAWGMatching, ext::set < unsigned >, const string::LinearString < > &, const automaton::DFA < > & > ( BackwardDAWGMatching::match ); +auto BackwardDAWGMatchingLinearStringLinearString = registration::AbstractRegister < BackwardDAWGMatching, ext::set < unsigned >, const string::LinearString < > &, 
const indexes::stringology::SuffixAutomaton < > & > ( BackwardDAWGMatching::match ); } /* namespace query */ diff --git a/alib2algo/src/stringology/query/BackwardDAWGMatching.h b/alib2algo/src/stringology/query/BackwardDAWGMatching.h index 57b49348de..9094f72e2b 100644 --- a/alib2algo/src/stringology/query/BackwardDAWGMatching.h +++ b/alib2algo/src/stringology/query/BackwardDAWGMatching.h @@ -7,7 +7,7 @@ #include <alib/set> -#include <automaton/FSM/DFA.h> +#include <indexes/stringology/SuffixAutomaton.h> #include <string/LinearString.h> #include <stringology/properties/BackboneLength.h> @@ -25,32 +25,32 @@ public: * Search for pattern in linear string. * @return set set of occurences */ - template < class SymbolType, class StateType > - static ext::set < unsigned > match ( const string::LinearString < SymbolType > & subject, const automaton::DFA < SymbolType, StateType > & suffixAutomaton ); + template < class SymbolType > + static ext::set < unsigned > match ( const string::LinearString < SymbolType > & subject, const indexes::stringology::SuffixAutomaton < SymbolType > & suffixAutomaton ); }; -template < class SymbolType, class StateType > -ext::set < unsigned > BackwardDAWGMatching::match ( const string::LinearString < SymbolType > & subject, const automaton::DFA < SymbolType, StateType > & suffixAutomaton ) { +template < class SymbolType > +ext::set < unsigned > BackwardDAWGMatching::match ( const string::LinearString < SymbolType > & subject, const indexes::stringology::SuffixAutomaton < SymbolType > & suffixAutomaton ) { ext::set < unsigned > occ; - size_t patternSize = stringology::properties::BackboneLength::length ( suffixAutomaton ); + size_t patternSize = stringology::properties::BackboneLength::length ( suffixAutomaton.getAutomaton ( ) ); size_t subjectSize = subject.getContent ( ).size ( ); bool fail; size_t posInSubject = 0; while ( posInSubject <= subjectSize - patternSize ) { - StateType currentState = suffixAutomaton.getInitialState ( ); + unsigned 
currentState = suffixAutomaton.getAutomaton ( ).getInitialState ( ); size_t posInPattern = patternSize; size_t lastPrefixPos = posInPattern; fail = false; while ( posInPattern > 0 && ! fail ) { - auto transition = suffixAutomaton.getTransitions ( ).find ( { currentState, subject.getContent ( ).at ( posInSubject + posInPattern - 1 ) } ); + auto transition = suffixAutomaton.getAutomaton ( ).getTransitions ( ).find ( { currentState, subject.getContent ( ).at ( posInSubject + posInPattern - 1 ) } ); - if ( transition == suffixAutomaton.getTransitions ( ).end ( ) ) + if ( transition == suffixAutomaton.getAutomaton ( ).getTransitions ( ).end ( ) ) fail = true; else currentState = transition->second; @@ -58,7 +58,7 @@ ext::set < unsigned > BackwardDAWGMatching::match ( const string::LinearString < posInPattern--; // found a prefix of nonreversed pattern that does not correspond to the entire pattern - if ( ( posInPattern != 0 ) && ( suffixAutomaton.getFinalStates ( ).find ( currentState ) != suffixAutomaton.getFinalStates ( ).end ( ) ) ) + if ( ( posInPattern != 0 ) && ( suffixAutomaton.getAutomaton ( ).getFinalStates ( ).find ( currentState ) != suffixAutomaton.getAutomaton ( ).getFinalStates ( ).end ( ) ) ) lastPrefixPos = posInPattern; } diff --git a/alib2algo/src/stringology/query/SuffixAutomatonFactors.cpp b/alib2algo/src/stringology/query/SuffixAutomatonFactors.cpp index 7d457596d7..8abba44d24 100644 --- a/alib2algo/src/stringology/query/SuffixAutomatonFactors.cpp +++ b/alib2algo/src/stringology/query/SuffixAutomatonFactors.cpp @@ -12,7 +12,7 @@ namespace stringology { namespace query { -auto SuffixAutomatonFactorsLinearString = registration::AbstractRegister < SuffixAutomatonFactors, ext::set < unsigned >, const automaton::DFA < > &, const string::LinearString < > & > ( SuffixAutomatonFactors::query ); +auto SuffixAutomatonFactorsLinearString = registration::AbstractRegister < SuffixAutomatonFactors, ext::set < unsigned >, const 
indexes::stringology::SuffixAutomaton < > &, const string::LinearString < > & > ( SuffixAutomatonFactors::query ); } /* namespace query */ diff --git a/alib2algo/src/stringology/query/SuffixAutomatonFactors.h b/alib2algo/src/stringology/query/SuffixAutomatonFactors.h index 42a0340b78..978a86b65f 100644 --- a/alib2algo/src/stringology/query/SuffixAutomatonFactors.h +++ b/alib2algo/src/stringology/query/SuffixAutomatonFactors.h @@ -8,7 +8,7 @@ #ifndef SUFFIX_AUTOMATON_FACTORS_H_ #define SUFFIX_AUTOMATON_FACTORS_H_ -#include <automaton/FSM/DFA.h> +#include <indexes/stringology/SuffixAutomaton.h> #include <string/LinearString.h> #include <automaton/run/Run.h> @@ -32,31 +32,30 @@ public: * @param string string to query by * @return occurences of factors */ - template < class SymbolType, class StateType > - static ext::set < unsigned > query ( const automaton::DFA < SymbolType, StateType > & suffixAutomaton, const string::LinearString < SymbolType > & string ); - + template < class SymbolType > + static ext::set < unsigned > query ( const indexes::stringology::SuffixAutomaton < SymbolType > & suffixAutomaton, const string::LinearString < SymbolType > & string ); }; -template < class SymbolType, class StateType > -ext::set < unsigned > SuffixAutomatonFactors::query ( const automaton::DFA < SymbolType, StateType > & suffixAutomaton, const string::LinearString < SymbolType > & string ) { - ext::tuple < bool, StateType, ext::set < unsigned > > run = automaton::run::Run::calculateState ( suffixAutomaton, string ); +template < class SymbolType > +ext::set < unsigned > SuffixAutomatonFactors::query ( const indexes::stringology::SuffixAutomaton < SymbolType > & suffixAutomaton, const string::LinearString < SymbolType > & string ) { + ext::tuple < bool, unsigned, ext::set < unsigned > > run = automaton::run::Run::calculateState ( suffixAutomaton.getAutomaton ( ), string ); if ( ! 
std::get < 0 > ( run ) ) return { }; - std::deque < std::pair < StateType, unsigned > > open = { { std::get < 1 > ( run ), 0u } }; + std::deque < std::pair < unsigned, unsigned > > open = { { std::get < 1 > ( run ), 0u } }; ext::vector < unsigned > tmp; unsigned max = 0; while ( ! open.empty ( ) ) { - std::pair < StateType, unsigned > cur = std::move ( open.back ( ) ); + std::pair < unsigned, unsigned > cur = std::move ( open.back ( ) ); open.pop_back ( ); - if ( suffixAutomaton.getFinalStates ( ).count ( cur.first ) ) + if ( suffixAutomaton.getAutomaton ( ).getFinalStates ( ).count ( cur.first ) ) tmp.push_back ( cur.second ); if ( cur.second > max ) max = cur.second; - for ( const auto & transition : suffixAutomaton.getTransitionsFromState ( cur.first ) ) + for ( const auto & transition : suffixAutomaton.getAutomaton ( ).getTransitionsFromState ( cur.first ) ) open.emplace_back ( transition.second, cur.second + 1 ); } diff --git a/alib2algo/test-src/stringology/indexing/ExactSuffixAutomatonTest.cpp b/alib2algo/test-src/stringology/indexing/ExactSuffixAutomatonTest.cpp index 61587da32c..d5f5c279f0 100644 --- a/alib2algo/test-src/stringology/indexing/ExactSuffixAutomatonTest.cpp +++ b/alib2algo/test-src/stringology/indexing/ExactSuffixAutomatonTest.cpp @@ -16,29 +16,31 @@ void ExactSuffixAutomatonTest::testSuffixAutomatonConstruction ( ) { string::LinearString < > pattern ( "atatac" ); - automaton::DFA < DefaultSymbolType, unsigned > suffixAutomaton = stringology::indexing::ExactSuffixAutomaton::construct ( pattern ); + indexes::stringology::SuffixAutomaton < > suffixAutomaton = stringology::indexing::ExactSuffixAutomaton::construct ( pattern ); - automaton::DFA < DefaultSymbolType, unsigned > refSuffixAutomaton ( 0 ); + automaton::DFA < DefaultSymbolType, unsigned > tmp ( 0 ); - refSuffixAutomaton.setInputAlphabet ( pattern.getAlphabet ( ) ); + tmp.setInputAlphabet ( pattern.getAlphabet ( ) ); for ( unsigned i = 1; i <= 6; ++i ) { - refSuffixAutomaton.addState ( i 
); + tmp.addState ( i ); } - refSuffixAutomaton.addFinalState ( 0 ); - refSuffixAutomaton.addFinalState ( 6 ); - - refSuffixAutomaton.addTransition ( 0, DefaultSymbolType ( 'a' ), 1 ); - refSuffixAutomaton.addTransition ( 0, DefaultSymbolType ( 't' ), 2 ); - refSuffixAutomaton.addTransition ( 0, DefaultSymbolType ( 'c' ), 6 ); - refSuffixAutomaton.addTransition ( 1, DefaultSymbolType ( 't' ), 2 ); - refSuffixAutomaton.addTransition ( 1, DefaultSymbolType ( 'c' ), 6 ); - refSuffixAutomaton.addTransition ( 2, DefaultSymbolType ( 'a' ), 3 ); - refSuffixAutomaton.addTransition ( 3, DefaultSymbolType ( 't' ), 4 ); - refSuffixAutomaton.addTransition ( 3, DefaultSymbolType ( 'c' ), 6 ); - refSuffixAutomaton.addTransition ( 4, DefaultSymbolType ( 'a' ), 5 ); - refSuffixAutomaton.addTransition ( 5, DefaultSymbolType ( 'c' ), 6 ); + tmp.addFinalState ( 0 ); + tmp.addFinalState ( 6 ); + + tmp.addTransition ( 0, DefaultSymbolType ( 'a' ), 1 ); + tmp.addTransition ( 0, DefaultSymbolType ( 't' ), 2 ); + tmp.addTransition ( 0, DefaultSymbolType ( 'c' ), 6 ); + tmp.addTransition ( 1, DefaultSymbolType ( 't' ), 2 ); + tmp.addTransition ( 1, DefaultSymbolType ( 'c' ), 6 ); + tmp.addTransition ( 2, DefaultSymbolType ( 'a' ), 3 ); + tmp.addTransition ( 3, DefaultSymbolType ( 't' ), 4 ); + tmp.addTransition ( 3, DefaultSymbolType ( 'c' ), 6 ); + tmp.addTransition ( 4, DefaultSymbolType ( 'a' ), 5 ); + tmp.addTransition ( 5, DefaultSymbolType ( 'c' ), 6 ); + + indexes::stringology::SuffixAutomaton < > refSuffixAutomaton ( std::move ( tmp ) ); std::cout << suffixAutomaton << std::endl; std::cout << refSuffixAutomaton << std::endl; diff --git a/alib2algo/test-src/stringology/query/DAWGMatcherTest.cpp b/alib2algo/test-src/stringology/query/DAWGMatcherTest.cpp index 254c39a8ca..0aafbfbb75 100644 --- a/alib2algo/test-src/stringology/query/DAWGMatcherTest.cpp +++ b/alib2algo/test-src/stringology/query/DAWGMatcherTest.cpp @@ -38,7 +38,7 @@ void DAWGMatcherTest::testBackwardDAWGMatching ( ) { 
for(size_t i = 0; i < subjects.size(); ++i) { string::LinearString < > subject ( subjects[i] ); string::LinearString < > pattern ( patterns[i] ); - automaton::DFA < DefaultSymbolType, unsigned > suffixAutomaton = stringology::matching::DAWGMatcherConstruction::construct ( pattern ); + indexes::stringology::SuffixAutomaton < > suffixAutomaton = stringology::matching::DAWGMatcherConstruction::construct ( pattern ); ext::set < unsigned > res = stringology::query::BackwardDAWGMatching::match ( subject, suffixAutomaton ); std::cout << subjects[i] << ' ' << patterns[i] << ' ' << res << std::endl; CPPUNIT_ASSERT ( res == expectedOccs[i] ); @@ -46,7 +46,7 @@ void DAWGMatcherTest::testBackwardDAWGMatching ( ) { auto longSubject = string::generate::RandomStringFactory::generateLinearString (64 * 64, 512, false, true); auto longPattern = string::generate::RandomSubstringFactory::generateSubstring(64 * 5, longSubject ); - automaton::DFA < DefaultSymbolType, unsigned > suffixAutomaton = stringology::matching::DAWGMatcherConstruction::construct ( longPattern ); + indexes::stringology::SuffixAutomaton < > suffixAutomaton = stringology::matching::DAWGMatcherConstruction::construct ( longPattern ); ext::set < unsigned > res = stringology::query::BackwardDAWGMatching::match ( longSubject, suffixAutomaton ); ext::set < unsigned > resRef = stringology::exact::ExactFactorMatch::match ( longSubject, longPattern ); std::cout << "long: " << res << std::endl; diff --git a/alib2data/src/indexes/stringology/SuffixAutomaton.cpp b/alib2data/src/indexes/stringology/SuffixAutomaton.cpp new file mode 100644 index 0000000000..691a72c799 --- /dev/null +++ b/alib2data/src/indexes/stringology/SuffixAutomaton.cpp @@ -0,0 +1,22 @@ +/* + * SuffixAutomaton.cpp + * + * Created on: Jan 8, 2017 + * Author: Jan Travnicek + */ + +#include "SuffixAutomaton.h" + +#include <registration/ValuePrinterRegistration.hpp> +#include <registration/XmlRegistration.hpp> + +namespace { + +static auto valuePrinter = 
registration::ValuePrinterRegister < indexes::stringology::SuffixAutomaton < > > ( ); + +static auto xmlWrite = registration::XmlWriterRegister < indexes::stringology::SuffixAutomaton < > > ( ); +static auto xmlRead = registration::XmlReaderRegister < indexes::stringology::SuffixAutomaton < > > ( ); + +static auto xmlGroup = registration::XmlRegisterTypeInGroup < object::Object, indexes::stringology::SuffixAutomaton < > > ( ); + +} /* namespace */ diff --git a/alib2data/src/indexes/stringology/SuffixAutomaton.h b/alib2data/src/indexes/stringology/SuffixAutomaton.h new file mode 100644 index 0000000000..40a9b597aa --- /dev/null +++ b/alib2data/src/indexes/stringology/SuffixAutomaton.h @@ -0,0 +1,217 @@ +/* + * SuffixAutomaton.h + * + * Created on: Jan 8, 2017 + * Author: Jan Travnicek + */ + +#ifndef SUFFIX_AUTOMATON_H_ +#define SUFFIX_AUTOMATON_H_ + +#include <alib/string> +#include <alib/set> +#include <alib/iostream> +#include <sstream> + +#include <common/DefaultSymbolType.h> + +#include <object/UniqueObject.h> +#include <object/ObjectBase.h> + +#include <sax/FromXMLParserHelper.h> +#include <core/xmlApi.hpp> +#include <primitive/Unsigned.h> +#include <primitive/xml/Unsigned.h> + +#include <alphabet/common/SymbolNormalize.h> + +#include <automaton/FSM/DFA.h> + +namespace indexes { + +namespace stringology { + +class GeneralAlphabet; + +/** + * Suffix automaton index: a wrapper that stores a deterministic finite automaton + * ( automaton::DFA < SymbolType, unsigned > ) representing the suffix automaton. 
+ */ +template < class SymbolType = DefaultSymbolType > +class SuffixAutomaton final : public object::ObjectBase { +protected: + automaton::DFA < SymbolType, unsigned > m_automaton; + +public: + /** + * @copydoc SuffixTrieNode::clone() const + */ + virtual ObjectBase * clone ( ) const; + + /** + * @copydoc SuffixTrieNode::plunder() const + */ + virtual ObjectBase * plunder ( ) &&; + + explicit SuffixAutomaton ( automaton::DFA < SymbolType, unsigned > automaton ); + + /** + * @return the wrapped deterministic finite automaton + */ + const automaton::DFA < SymbolType, unsigned > & getAutomaton ( ) const &; + + automaton::DFA < SymbolType, unsigned > && getAutomaton ( ) &&; + + const ext::set < SymbolType > & getAlphabet ( ) const & { + return m_automaton.getInputAlphabet ( ); + } + + ext::set < SymbolType > && getAlphabet ( ) && { + return std::move ( m_automaton ).getInputAlphabet ( ); + } + + /** + * Removes a symbol from the input alphabet of the wrapped automaton + * @param symbol the symbol to remove from the alphabet + */ + bool removeSymbolFromAlphabet ( const SymbolType & symbol ) { + return m_automaton.removeInputSymbol ( symbol ); + } + + /** + * Prints a textual representation of the index to the output stream. 
+ * @param out output stream to print to + * @note the object itself is what gets printed; there is no tree argument + */ + virtual void operator >>( std::ostream & out ) const; + + virtual int compare ( const ObjectBase & other ) const { + if ( ext::type_index ( typeid ( * this ) ) == ext::type_index ( typeid ( other ) ) ) return this->compare ( ( decltype ( * this ) )other ); + + return ext::type_index ( typeid ( * this ) ) - ext::type_index ( typeid ( other ) ); + } + + virtual int compare ( const SuffixAutomaton & other ) const; + + virtual explicit operator std::string ( ) const; + + virtual object::ObjectBase * inc ( ) &&; + + typedef SuffixAutomaton < > normalized_type; +}; + +} /* namespace stringology */ + +} /* namespace indexes */ + +namespace indexes { + +namespace stringology { + +template < class SymbolType > +SuffixAutomaton < SymbolType >::SuffixAutomaton ( automaton::DFA < SymbolType, unsigned > automaton ) : m_automaton ( std::move ( automaton ) ) { +} + +template < class SymbolType > +object::ObjectBase * SuffixAutomaton < SymbolType >::clone ( ) const { + return new SuffixAutomaton ( * this ); +} + +template < class SymbolType > +object::ObjectBase * SuffixAutomaton < SymbolType >::plunder ( ) && { + return new SuffixAutomaton ( std::move ( * this ) ); +} + +template < class SymbolType > +const automaton::DFA < SymbolType, unsigned > & SuffixAutomaton < SymbolType >::getAutomaton ( ) const & { + return m_automaton; +} + +template < class SymbolType > +automaton::DFA < SymbolType, unsigned > && SuffixAutomaton < SymbolType >::getAutomaton ( ) && { + return std::move ( m_automaton ); +} + +template < class SymbolType > +void SuffixAutomaton < SymbolType >::operator >>( std::ostream & out ) const { + out << "(SuffixAutomaton " << this->m_automaton << ")"; +} + +template < class SymbolType > +int SuffixAutomaton < SymbolType >::compare ( const SuffixAutomaton & other ) const { + auto first = ext::tie ( getAutomaton ( ) ); + auto second = ext::tie ( other.getAutomaton ( ) ); + + 
static ext::compare < decltype ( first ) > comp; + + return comp ( first, second ); +} + +template < class SymbolType > +SuffixAutomaton < SymbolType >::operator std::string ( ) const { + std::stringstream ss; + ss << * this; + return ss.str ( ); +} + +template < class SymbolType > +object::ObjectBase* SuffixAutomaton < SymbolType >::inc() && { + return new object::UniqueObject(object::Object(std::move(*this)), primitive::Integer(0)); +} + +} /* namespace stringology */ + +} /* namespace indexes */ + +namespace core { + +template < class SymbolType > +struct normalize < indexes::stringology::SuffixAutomaton < SymbolType >, typename std::enable_if < ! std::is_same < indexes::stringology::SuffixAutomaton < SymbolType >, indexes::stringology::SuffixAutomaton < > >::value >::type > { + static indexes::stringology::SuffixAutomaton < > eval ( indexes::stringology::SuffixAutomaton < SymbolType > && value ) { + ext::set < DefaultSymbolType > alphabet = alphabet::SymbolNormalize::normalizeAlphabet ( std::move ( value ).getAlphabet ( ) ); + ext::vector < DefaultSymbolType > string = alphabet::SymbolNormalize::normalizeSymbols ( std::move ( value ).getString ( ) ); + + return indexes::stringology::SuffixAutomaton < > ( std::move ( alphabet ), std::move ( value ).getData ( ), std::move ( string ) ); + } +}; + +template < class SymbolType > +struct xmlApi < indexes::stringology::SuffixAutomaton < SymbolType > > { + static indexes::stringology::SuffixAutomaton < SymbolType > parse ( ext::deque < sax::Token >::iterator & input ); + static bool first ( const ext::deque < sax::Token >::const_iterator & input ); + static const std::string & xmlTagName ( ); + static void compose ( ext::deque < sax::Token > & output, const indexes::stringology::SuffixAutomaton < SymbolType > & data ); +}; + +template < class SymbolType > +indexes::stringology::SuffixAutomaton < SymbolType > xmlApi < indexes::stringology::SuffixAutomaton < SymbolType > >::parse ( ext::deque < sax::Token >::iterator & 
input ) { + sax::FromXMLParserHelper::popToken ( input, sax::Token::TokenType::START_ELEMENT, xmlTagName ( ) ); + automaton::DFA < SymbolType, unsigned > automaton = core::xmlApi < automaton::DFA < SymbolType, unsigned > >::parse ( input ); + indexes::stringology::SuffixAutomaton < SymbolType > res ( std::move ( automaton ) ); + + sax::FromXMLParserHelper::popToken ( input, sax::Token::TokenType::END_ELEMENT, xmlTagName ( ) ); + return res; +} + +template < class SymbolType > +bool xmlApi < indexes::stringology::SuffixAutomaton < SymbolType > >::first ( const ext::deque < sax::Token >::const_iterator & input ) { + return sax::FromXMLParserHelper::isToken ( input, sax::Token::TokenType::START_ELEMENT, xmlTagName ( ) ); +} + +template < class SymbolType > +const std::string & xmlApi < indexes::stringology::SuffixAutomaton < SymbolType > >::xmlTagName ( ) { + static std::string xmlTagName = "SuffixAutomaton"; + + return xmlTagName; +} + +template < class SymbolType > +void xmlApi < indexes::stringology::SuffixAutomaton < SymbolType > >::compose ( ext::deque < sax::Token > & output, const indexes::stringology::SuffixAutomaton < SymbolType > & index ) { + output.emplace_back ( xmlTagName ( ), sax::Token::TokenType::START_ELEMENT ); + core::xmlApi < automaton::DFA < SymbolType, unsigned > >::compose ( output, index.getAutomaton ( ) ); + output.emplace_back ( xmlTagName ( ), sax::Token::TokenType::END_ELEMENT ); +} + +} /* namespace core */ + +#endif /* SUFFIX_AUTOMATON_H_ */ -- GitLab