diff --git a/alib2algo/src/stringology/exact/BackwardDAWGMatching.cpp b/alib2algo/src/stringology/exact/BackwardDAWGMatching.cpp new file mode 100644 index 0000000000000000000000000000000000000000..1bfbaa363c9a57feeade893e1f35307d34ae1df3 --- /dev/null +++ b/alib2algo/src/stringology/exact/BackwardDAWGMatching.cpp @@ -0,0 +1,79 @@ +/* + * Author: Radovan Cerveny + */ + +#include "BackwardDAWGMatching.hpp" +#include "SuffixAutomaton.hpp" + +#include <exception/AlibException.h> +#include <string/LinearString.h> +#include <alphabet/Symbol.h> + +#include <algorithm> +#include <map> +#include <bitset> +#include <measure> + +namespace stringology { + +namespace exact { + +std::set < unsigned > BackwardDAWGMatching::match ( const string::String & subject, const string::String & pattern ) { + return getInstance ( ).dispatch ( subject.getData ( ), pattern.getData ( ) ); +} + +std::set < unsigned > BackwardDAWGMatching::match ( const string::LinearString & subject, const string::LinearString & pattern ) { + std::set < unsigned > occ; + + measurements::start ( "Preprocess", measurements::Type::PREPROCESS ); + + auto patternData = pattern.getContent ( ); + + reverse ( patternData.begin ( ), patternData.end ( ) ); + + const string::LinearString reversedPattern ( std::move ( patternData ) ); + + automaton::DFA suffixAutomaton = SuffixAutomaton::construct ( reversedPattern ); + + measurements::end ( ); + + measurements::start ( "Algorithm", measurements::Type::ALGORITHM ); + + const automaton::State failState = automaton::State ( -1 ); + + size_t posInSubject = 0; + + while ( posInSubject <= subject.getContent ( ).size ( ) - pattern.getContent ( ).size ( ) ) { + + automaton::State currentState = suffixAutomaton.getInitialState ( ); + + size_t posInPattern = reversedPattern.getContent ( ).size ( ); + + while ( posInPattern > 0 && currentState != failState ) { + auto transition = suffixAutomaton.getTransitions ( ).find ( { currentState, subject.getContent ( ).at ( posInSubject + posInPattern - 1 ) } ); + + if ( transition == suffixAutomaton.getTransitions ( ).end ( ) ) + currentState = failState; + else + currentState = transition->second; + + posInPattern--; + } + + if ( currentState != failState ) + // Yay, there is match!!! + occ.insert ( posInSubject ); + + posInSubject += posInPattern + 1; + } + + measurements::end ( ); + + return occ; +} + +auto BackwardDAWGMatchingLinearStringLinearString = BackwardDAWGMatching::RegistratorWrapper < std::set < unsigned >, string::LinearString, string::LinearString > ( BackwardDAWGMatching::getInstance ( ), BackwardDAWGMatching::match ); + +} /* namespace exact */ + +} /* namespace stringology */ diff --git a/alib2algo/src/stringology/exact/BackwardDAWGMatching.hpp b/alib2algo/src/stringology/exact/BackwardDAWGMatching.hpp new file mode 100644 index 0000000000000000000000000000000000000000..b31431784a559e4365c01a522e08f54cf08dc386 --- /dev/null +++ b/alib2algo/src/stringology/exact/BackwardDAWGMatching.hpp @@ -0,0 +1,43 @@ +/* + * Author: Radovan Cerveny + */ + +#ifndef STRINGOLOGY_BACKWARD_DAWG_MATCHING_HPP__ +#define STRINGOLOGY_BACKWARD_DAWG_MATCHING_HPP__ + +#include <string/String.h> +#include <string/StringFeatures.h> +#include <core/multipleDispatch.hpp> + +#include <set> + +namespace stringology { + +namespace exact { + +/** + * Implementation of Backward DAWG Matching. + */ +class BackwardDAWGMatching : public std::DoubleDispatch < std::set < unsigned >, string::StringBase, string::StringBase > { +private: +public: + /** + * Search for pattern in linear string. + * @return set set of occurences + */ + static std::set < unsigned > match ( const string::String & subject, const string::String & pattern ); + static std::set < unsigned > match ( const string::LinearString & subject, const string::LinearString & pattern ); + + static BackwardDAWGMatching & getInstance ( ) { + static BackwardDAWGMatching res; + + return res; + } + +}; + +} /* namespace exact */ + +} /* namespace stringology */ + +#endif /* STRINGOLOGY_BACKWARD_DAWG_MATCHING_HPP__ */ diff --git a/alib2algo/src/stringology/exact/BackwardNondeterministicDAWGMatching.cpp b/alib2algo/src/stringology/exact/BackwardNondeterministicDAWGMatching.cpp new file mode 100644 index 0000000000000000000000000000000000000000..c4cea5124cd8109ee436bf673751e6a1c990d1c9 --- /dev/null +++ b/alib2algo/src/stringology/exact/BackwardNondeterministicDAWGMatching.cpp @@ -0,0 +1,131 @@ +/* + * Author: Radovan Cerveny + */ + +#include "BackwardNondeterministicDAWGMatching.hpp" + +#include <exception/AlibException.h> +#include <string/LinearString.h> +#include <alphabet/Symbol.h> + +#include <map> +#include <bitset> +#include <measure> + +namespace stringology { + +namespace exact { + +template < size_t BitmaskBitCount > +std::set < unsigned > BackwardNondeterministicDAWGMatching::matchTemplate ( const string::String & subject, const string::String & pattern ) { + return getInstance ( ).dispatch ( subject.getData ( ), pattern.getData ( ) ); +} + +template < size_t BitmaskBitCount > +std::set < unsigned > BackwardNondeterministicDAWGMatching::matchTemplate ( const string::LinearString & subject, const string::LinearString & pattern ) { + std::set < unsigned > occ; + + // Setup helper variables + using BitmaskType = std::bitset < BitmaskBitCount >; + bool patternIsLong = BitmaskBitCount < pattern.getContent ( ).size ( ); + size_t bitmaskLength = patternIsLong ? BitmaskBitCount : pattern.getContent ( ).size ( ); + + measurements::start ( "Preprocess", measurements::Type::PREPROCESS ); + + std::map < alphabet::Symbol, BitmaskType > symbolBitmaskLookupTable; + + // Initialize the bitmasks with zeros for each symbol in the alphabet + for ( const auto & symbol : pattern.getAlphabet ( ) ) + symbolBitmaskLookupTable[symbol] = BitmaskType ( 0 ); + + // Mark the position in the bitmask for each symbol in the pattern + for ( size_t i = 0; i < bitmaskLength; i++ ) + symbolBitmaskLookupTable[pattern.getContent ( ).at ( i )].set ( bitmaskLength - i - 1 ); + + measurements::end ( ); + + measurements::start ( "Algorithm", measurements::Type::ALGORITHM ); + + size_t posInSubject = 0; + BitmaskType currentBitmask; + + while ( posInSubject <= subject.getContent ( ).size ( ) - pattern.getContent ( ).size ( ) ) { + size_t posInPattern = bitmaskLength; + size_t lastPosOfFactor = bitmaskLength; + + // Set the bitmask to all ones + currentBitmask.set ( ); + + while ( posInPattern > 0 && currentBitmask.any ( ) ) { + currentBitmask &= symbolBitmaskLookupTable[subject.getContent ( ).at ( posInSubject + posInPattern - 1 )]; + posInPattern--; + + // Test whether the most significant bit is set + if ( currentBitmask.test ( bitmaskLength - 1 ) ) { + if ( posInPattern > 0 ) { + lastPosOfFactor = posInPattern; + } else { + if ( !patternIsLong ) { + // Yay, there is match!!! + occ.insert ( posInSubject ); + } else { + // if the pattern is longer then BitmaskBitCount characters switch to brute force check + size_t k = bitmaskLength; + + while ( k < pattern.getContent ( ).size ( ) && pattern.getContent ( ).at ( k ) == subject.getContent ( ).at ( posInSubject + k ) ) k++; + + if ( k == pattern.getContent ( ).size ( ) ) + // Yay, there is match!!! + occ.insert ( posInSubject ); + } + } + } + + currentBitmask <<= 1; + } + + posInSubject += lastPosOfFactor; + } + + measurements::end ( ); + + return occ; +} + +std::set < unsigned > BackwardNondeterministicDAWGMatching::match ( const string::String & subject, const string::String & pattern ) { + return BackwardNondeterministicDAWGMatching::match32 ( subject, pattern ); +} + +std::set < unsigned > BackwardNondeterministicDAWGMatching::match ( const string::LinearString & subject, const string::LinearString & pattern ) { + return BackwardNondeterministicDAWGMatching::match32 ( subject, pattern ); +} + +std::set < unsigned > BackwardNondeterministicDAWGMatching::match32 ( const string::String & subject, const string::String & pattern ) { + return BackwardNondeterministicDAWGMatching::matchTemplate < 32 > ( subject, pattern ); +} + +std::set < unsigned > BackwardNondeterministicDAWGMatching::match32 ( const string::LinearString & subject, const string::LinearString & pattern ) { + return BackwardNondeterministicDAWGMatching::matchTemplate < 32 > ( subject, pattern ); +} + +std::set < unsigned > BackwardNondeterministicDAWGMatching::match64 ( const string::String & subject, const string::String & pattern ) { + return BackwardNondeterministicDAWGMatching::matchTemplate < 64 > ( subject, pattern ); +} + +std::set < unsigned > BackwardNondeterministicDAWGMatching::match64 ( const string::LinearString & subject, const string::LinearString & pattern ) { + return BackwardNondeterministicDAWGMatching::matchTemplate < 64 > ( subject, pattern ); +} + +std::set < unsigned > BackwardNondeterministicDAWGMatching::match128 ( const string::String & subject, const string::String & pattern ) { + return BackwardNondeterministicDAWGMatching::matchTemplate < 128 > ( subject, pattern ); +} + +std::set < unsigned > BackwardNondeterministicDAWGMatching::match128 ( const string::LinearString & subject, const string::LinearString & pattern ) { + return BackwardNondeterministicDAWGMatching::matchTemplate < 128 > ( subject, pattern ); +} + +auto BackwardNondeterministicDAWGMatchingLinearStringLinearString = BackwardNondeterministicDAWGMatching::RegistratorWrapper < std::set < unsigned >, string::LinearString, string::LinearString > ( BackwardNondeterministicDAWGMatching::getInstance ( ), BackwardNondeterministicDAWGMatching::match ); + +} /* namespace exact */ + +} /* namespace stringology */ diff --git a/alib2algo/src/stringology/exact/BackwardNondeterministicDAWGMatching.hpp b/alib2algo/src/stringology/exact/BackwardNondeterministicDAWGMatching.hpp new file mode 100644 index 0000000000000000000000000000000000000000..6ebe51d72a1fe0a41e90ddb996df5742185fe046 --- /dev/null +++ b/alib2algo/src/stringology/exact/BackwardNondeterministicDAWGMatching.hpp @@ -0,0 +1,60 @@ +/* + * Author: Radovan Cerveny + */ + +#ifndef STRINGOLOGY_BACKWARD_NONDETERMINISTIC_DAWG_MATCHING_HPP__ +#define STRINGOLOGY_BACKWARD_NONDETERMINISTIC_DAWG_MATCHING_HPP__ + +#include <string/String.h> +#include <string/StringFeatures.h> +#include <core/multipleDispatch.hpp> + +#include <set> + +namespace stringology { + +namespace exact { + +/** + * Implementation of Backward Nondeterministic DAWG Matching using bit parallelism with 32/64/128bit bitmask and brute force switch for longer patterns. + */ +class BackwardNondeterministicDAWGMatching : public std::DoubleDispatch < std::set < unsigned >, string::StringBase, string::StringBase > { +private: + /** + * Search for pattern in linear string. + * @return set set of occurences + */ + template <size_t BitmaskBitCount> + static std::set < unsigned > matchTemplate ( const string::String & subject, const string::String & pattern ); + + template <size_t BitmaskBitCount > + static std::set < unsigned > matchTemplate ( const string::LinearString & subject, const string::LinearString & pattern ); +public: + + // Defaults to 32 bits + static std::set < unsigned > match ( const string::String & subject, const string::String & pattern ); + static std::set < unsigned > match ( const string::LinearString & subject, const string::LinearString & pattern ); + + static std::set < unsigned > match32 ( const string::String & subject, const string::String & pattern ); + static std::set < unsigned > match32 ( const string::LinearString & subject, const string::LinearString & pattern ); + + static std::set < unsigned > match64 ( const string::String & subject, const string::String & pattern ); + static std::set < unsigned > match64 ( const string::LinearString & subject, const string::LinearString & pattern ); + + static std::set < unsigned > match128 ( const string::String & subject, const string::String & pattern ); + static std::set < unsigned > match128 ( const string::LinearString & subject, const string::LinearString & pattern ); + + + static BackwardNondeterministicDAWGMatching & getInstance ( ) { + static BackwardNondeterministicDAWGMatching res; + + return res; + } + +}; + +} /* namespace exact */ + +} /* namespace stringology */ + +#endif /* STRINGOLOGY_BACKWARD_NONDETERMINISTIC_DAWG_MATCHING_HPP__ */ diff --git a/alib2algo/src/stringology/exact/BackwardOracleMatching.cpp b/alib2algo/src/stringology/exact/BackwardOracleMatching.cpp new file mode 100644 index 0000000000000000000000000000000000000000..41ca21fcc1224eafdf98942361e5853f06d720c6 --- /dev/null +++ b/alib2algo/src/stringology/exact/BackwardOracleMatching.cpp @@ -0,0 +1,79 @@ +/* + * Author: Radovan Cerveny + */ + +#include "BackwardOracleMatching.hpp" +#include "FactorOracleAutomaton.hpp" + +#include <exception/AlibException.h> +#include <string/LinearString.h> +#include <alphabet/Symbol.h> + +#include <algorithm> +#include <map> +#include <bitset> +#include <measure> + +namespace stringology { + +namespace exact { + +std::set < unsigned > BackwardOracleMatching::match ( const string::String & subject, const string::String & pattern ) { + return getInstance ( ).dispatch ( subject.getData ( ), pattern.getData ( ) ); +} + +std::set < unsigned > BackwardOracleMatching::match ( const string::LinearString & subject, const string::LinearString & pattern ) { + std::set < unsigned > occ; + + measurements::start ( "Preprocess", measurements::Type::PREPROCESS ); + + auto patternData = pattern.getContent ( ); + + reverse ( patternData.begin ( ), patternData.end ( ) ); + + const string::LinearString reversedPattern ( std::move ( patternData ) ); + + automaton::DFA factorOracle = FactorOracleAutomaton::construct ( reversedPattern ); + + measurements::end ( ); + + measurements::start ( "Algorithm", measurements::Type::ALGORITHM ); + + const automaton::State failState = automaton::State ( -1 ); + + size_t posInSubject = 0; + + while ( posInSubject <= subject.getContent ( ).size ( ) - pattern.getContent ( ).size ( ) ) { + + automaton::State currentState = factorOracle.getInitialState ( ); + + size_t posInPattern = reversedPattern.getContent ( ).size ( ); + + while ( posInPattern > 0 && currentState != failState ) { + auto transition = factorOracle.getTransitions ( ).find ( { currentState, subject.getContent ( ).at ( posInSubject + posInPattern - 1 ) } ); + + if ( transition == factorOracle.getTransitions ( ).end ( ) ) + currentState = failState; + else + currentState = transition->second; + + posInPattern--; + } + + if ( currentState != failState ) + // Yay, there is match!!! + occ.insert ( posInSubject ); + + posInSubject += posInPattern + 1; + } + + measurements::end ( ); + + return occ; +} + +auto BackwardOracleMatchingLinearStringLinearString = BackwardOracleMatching::RegistratorWrapper < std::set < unsigned >, string::LinearString, string::LinearString > ( BackwardOracleMatching::getInstance ( ), BackwardOracleMatching::match ); + +} /* namespace exact */ + +} /* namespace stringology */ diff --git a/alib2algo/src/stringology/exact/BackwardOracleMatching.hpp b/alib2algo/src/stringology/exact/BackwardOracleMatching.hpp new file mode 100644 index 0000000000000000000000000000000000000000..9bc1cf873b1503c84625c74903035be49da33986 --- /dev/null +++ b/alib2algo/src/stringology/exact/BackwardOracleMatching.hpp @@ -0,0 +1,43 @@ +/* + * Author: Radovan Cerveny + */ + +#ifndef STRINGOLOGY_BACKWARD_ORACLE_MATCHING_HPP__ +#define STRINGOLOGY_BACKWARD_ORACLE_MATCHING_HPP__ + +#include <string/String.h> +#include <string/StringFeatures.h> +#include <core/multipleDispatch.hpp> + +#include <set> + +namespace stringology { + +namespace exact { + +/** + * Implementation of Backward Oracle Matching. + */ +class BackwardOracleMatching : public std::DoubleDispatch < std::set < unsigned >, string::StringBase, string::StringBase > { +private: +public: + /** + * Search for pattern in linear string. + * @return set set of occurences + */ + static std::set < unsigned > match ( const string::String & subject, const string::String & pattern ); + static std::set < unsigned > match ( const string::LinearString & subject, const string::LinearString & pattern ); + + static BackwardOracleMatching & getInstance ( ) { + static BackwardOracleMatching res; + + return res; + } + +}; + +} /* namespace exact */ + +} /* namespace stringology */ + +#endif /* STRINGOLOGY_BACKWARD_ORACLE_MATCHING_HPP__ */ diff --git a/alib2algo/src/stringology/exact/FactorOracleAutomaton.cpp b/alib2algo/src/stringology/exact/FactorOracleAutomaton.cpp new file mode 100644 index 0000000000000000000000000000000000000000..bb5e031909d19aa8d0c066c19a20ec147c12d4a6 --- /dev/null +++ b/alib2algo/src/stringology/exact/FactorOracleAutomaton.cpp @@ -0,0 +1,59 @@ +/* + * Author: Radovan Cerveny + */ + +#include "FactorOracleAutomaton.hpp" +#include <exception/AlibException.h> +#include <string/LinearString.h> + +namespace stringology { + +namespace exact { + +automaton::Automaton FactorOracleAutomaton::construct ( const string::String & pattern ) { + return getInstance ( ).dispatch ( pattern.getData ( ) ); +} + +automaton::DFA FactorOracleAutomaton::construct ( const string::LinearString & pattern ) { + automaton::DFA oracle ( automaton::State ( 0 ) ); + + std::map < automaton::State, automaton::State > supplyFunction { { automaton::State ( 0 ), automaton::State ( -1 ) } }; + + oracle.setInputAlphabet ( pattern.getAlphabet ( ) ); + + for ( const alphabet::Symbol & symbol : pattern.getContent ( ) ) + oracleAddLetter ( oracle, symbol, supplyFunction ); + + return oracle; +} + +void FactorOracleAutomaton::oracleAddLetter ( automaton::DFA & oracle, const alphabet::Symbol & symbol, std::map < automaton::State, automaton::State > & supplyFunction ) { + int m = ( int ) oracle.getStates ( ).size ( ) - 1; + + automaton::State lastState ( m ); + automaton::State newState ( m + 1 ); + + oracle.addState ( newState ); + oracle.addFinalState ( newState ); + + oracle.addTransition ( lastState, symbol, newState ); + automaton::State kState = supplyFunction.find( lastState ) -> second; + + while ( kState != automaton::State ( -1 ) && oracle.getTransitions ( ).find ( { kState, symbol } ) == oracle.getTransitions ( ).end ( ) ) { + oracle.addTransition ( kState, symbol, newState ); + kState = supplyFunction.find( kState ) -> second; + } + + automaton::State supplyState = automaton::State ( 0 ); + + if ( kState != automaton::State ( -1 ) ) + supplyState = oracle.getTransitions ( ).find( { kState, symbol } ) -> second; + + supplyFunction.insert( { newState, supplyState } ); +} + +auto FactorOracleAutomatonLinearString = FactorOracleAutomaton::RegistratorWrapper < automaton::DFA, string::LinearString > ( FactorOracleAutomaton::getInstance ( ), FactorOracleAutomaton::construct ); + +} /* namespace exact */ + +} /* namespace stringology */ diff --git a/alib2algo/src/stringology/exact/FactorOracleAutomaton.hpp b/alib2algo/src/stringology/exact/FactorOracleAutomaton.hpp new file mode 100644 index 0000000000000000000000000000000000000000..043eab4a78797ea9d763104dfee0888701cbbe94 --- /dev/null +++ b/alib2algo/src/stringology/exact/FactorOracleAutomaton.hpp @@ -0,0 +1,43 @@ +/* + * Author: Radovan Cerveny + */ + +#ifndef FACTOR_ORACLE_AUTOMATON_HPP__ +#define FACTOR_ORACLE_AUTOMATON_HPP__ + +#include <automaton/Automaton.h> +#include <automaton/FSM/DFA.h> +#include <string/LinearString.h> +#include <string/String.h> +#include <core/multipleDispatch.hpp> + +namespace stringology { + +namespace exact { + +class FactorOracleAutomaton : public std::SingleDispatch < automaton::Automaton, string::StringBase > { +private: + static void oracleAddLetter ( automaton::DFA & oracle, const alphabet::Symbol & symbol, std::map < automaton::State, automaton::State > & supplyFunction ); + +public: + /** + * Constructs factor oracle automaton for given pattern. + * @return factor oracle automaton for given pattern + */ + static automaton::Automaton construct ( const string::String & pattern ); + + static automaton::DFA construct ( const string::LinearString & pattern ); + + static FactorOracleAutomaton & getInstance ( ) { + static FactorOracleAutomaton res; + + return res; + } + +}; + +} /* namespace exact */ + +} /* namespace stringology */ + +#endif /* FACTOR_ORACLE_AUTOMATON_HPP__ */ diff --git a/alib2algo/src/stringology/exact/SuffixAutomaton.cpp b/alib2algo/src/stringology/exact/SuffixAutomaton.cpp new file mode 100644 index 0000000000000000000000000000000000000000..190e23d41af434ec22fce58855209d141a868dd7 --- /dev/null +++ b/alib2algo/src/stringology/exact/SuffixAutomaton.cpp @@ -0,0 +1,130 @@ +/* + * Author: Radovan Cerveny + */ + +#include "SuffixAutomaton.hpp" +#include <exception/AlibException.h> +#include <string/Epsilon.h> +#include <label/LabelSetLabel.h> +#include "../../automaton/determinize/Determinize.h" +#include "../../automaton/simplify/Minimize.h" +#include "../../automaton/simplify/EpsilonRemoverIncoming.h" + +namespace stringology { + +namespace exact { + +automaton::Automaton SuffixAutomaton::naiveConstruct ( const string::String & pattern ) { + return getInstance ( ).dispatch ( pattern.getData ( ) ); +} + +automaton::DFA SuffixAutomaton::naiveConstruct ( const string::LinearString & pattern ) { + automaton::EpsilonNFA nfaSuffixAutomaton ( automaton::State ( 0 ) ); + + nfaSuffixAutomaton.setInputAlphabet ( pattern.getAlphabet ( ) ); + + int i = 0; + + for ( const alphabet::Symbol & symbol : pattern.getContent ( ) ) { + i++; + nfaSuffixAutomaton.addState ( automaton::State ( i ) ); + nfaSuffixAutomaton.addTransition ( automaton::State ( i - 1 ), symbol, automaton::State ( i ) ); + nfaSuffixAutomaton.addTransition ( automaton::State ( 0 ), automaton::State ( i ) ); + } + + nfaSuffixAutomaton.addFinalState ( automaton::State ( i ) ); + + automaton::DFA minimalSuffixAutomaton = automaton::simplify::Minimize::minimize ( automaton::determinize::Determinize::determinize ( automaton::simplify::EpsilonRemoverIncoming::remove ( nfaSuffixAutomaton ) ) ); + + automaton::State failState = automaton::State ( label::Label ( label::LabelSetLabel ( { } ) ) ); + + auto transitionsToFailState = minimalSuffixAutomaton.getTransitionsToState ( failState ); + + for ( const auto & transition : transitionsToFailState ) + minimalSuffixAutomaton.removeTransition ( transition.first.first, transition.first.second, transition.second ); + + minimalSuffixAutomaton.removeState ( failState ); + + return minimalSuffixAutomaton; +} + +automaton::Automaton SuffixAutomaton::construct ( const string::String & pattern ) { + return getInstance ( ).dispatch ( pattern.getData ( ) ); +} + +automaton::DFA SuffixAutomaton::construct ( const string::LinearString & pattern ) { + automaton::DFA suffixAutomaton ( automaton::State ( 0 ) ); + + suffixAutomaton.setInputAlphabet ( pattern.getAlphabet ( ) ); + + std::map < automaton::State, std::pair < automaton::State, int > > suffixLinks = { { automaton::State ( 0 ), { automaton::State ( -1 ), 0 } } }; + automaton::State lastState ( 0 ); + + for ( const alphabet::Symbol & symbol : pattern.getContent ( ) ) + suffixAutomatonAddSymbol ( suffixAutomaton, symbol, suffixLinks, lastState ); + + while ( lastState != automaton::State ( -1 ) ) { + suffixAutomaton.addFinalState ( lastState ); + lastState = suffixLinks.find ( lastState )->second.first; + } + + return suffixAutomaton; +} + +void SuffixAutomaton::suffixAutomatonAddSymbol ( automaton::DFA & suffixAutomaton, const alphabet::Symbol & symbol, std::map < automaton::State, std::pair < automaton::State, int > > & suffixLinks, automaton::State & lastState ) { + + automaton::State newState ( ( int ) suffixAutomaton.getStates ( ).size ( ) ); + + suffixAutomaton.addState ( newState ); + + int lastSuffixLength = suffixLinks.find ( lastState )->second.second; + + suffixLinks.insert ( { newState, { automaton::State ( -1 ), lastSuffixLength + 1 } } ); + + automaton::State kState = lastState; + + while ( kState != automaton::State ( -1 ) && suffixAutomaton.getTransitions ( ).find ( { kState, symbol } ) == suffixAutomaton.getTransitions ( ).end ( ) ) { + suffixAutomaton.addTransition ( kState, symbol, newState ); + kState = suffixLinks.find ( kState )->second.first; + } + + if ( kState == automaton::State ( -1 ) ) { + suffixLinks.find ( newState )->second.first = automaton::State ( 0 ); + } else { + automaton::State qState = suffixAutomaton.getTransitions ( ).find ( { kState, symbol } )->second; + + int kSuffixLength = suffixLinks.find ( kState )->second.second; + int qSuffixLength = suffixLinks.find ( qState )->second.second; + + if ( kSuffixLength + 1 == qSuffixLength ) { + suffixLinks.find ( newState )->second.first = qState; + } else { + + automaton::State cloneState ( ( int ) suffixAutomaton.getStates ( ).size ( ) ); + suffixAutomaton.addState ( cloneState ); + + suffixLinks.insert ( { cloneState, { suffixLinks.find ( qState )->second.first, kSuffixLength + 1 } } ); + + for ( const auto & transition : suffixAutomaton.getTransitionsFromState ( qState ) ) + suffixAutomaton.addTransition ( cloneState, transition.first.second, transition.second ); + + while ( kState != automaton::State ( -1 ) + && suffixAutomaton.getTransitions ( ).find ( { kState, symbol } ) != suffixAutomaton.getTransitions ( ).end ( ) + && suffixAutomaton.getTransitions ( ).find ( { kState, symbol } )->second == qState ) { + suffixAutomaton.removeTransition ( kState, symbol, qState ); + suffixAutomaton.addTransition ( kState, symbol, cloneState ); + kState = suffixLinks.find ( kState )->second.first; + } + + suffixLinks.find ( qState )->second.first = cloneState; + suffixLinks.find ( newState )->second.first = cloneState; + } + } + lastState = newState; +} + +auto SuffixAutomatonLinearString = SuffixAutomaton::RegistratorWrapper < automaton::DFA, string::LinearString > ( SuffixAutomaton::getInstance ( ), SuffixAutomaton::construct ); + +} /* namespace exact */ + +} /* namespace stringology */ diff --git a/alib2algo/src/stringology/exact/SuffixAutomaton.hpp b/alib2algo/src/stringology/exact/SuffixAutomaton.hpp new file mode 100644 index 0000000000000000000000000000000000000000..ca033a5135d0246531186d33cb978701e4caa7b3 --- /dev/null +++ b/alib2algo/src/stringology/exact/SuffixAutomaton.hpp @@ -0,0 +1,52 @@ +/* + * Author: Radovan Cerveny + */ + +#ifndef SUFFIX_AUTOMATON_HPP_ +#define SUFFIX_AUTOMATON_HPP_ + +#include <automaton/Automaton.h> +#include <automaton/FSM/DFA.h> +#include <automaton/FSM/EpsilonNFA.h> +#include <string/LinearString.h> +#include <string/String.h> +#include <core/multipleDispatch.hpp> + +namespace stringology { + +namespace exact { + +class SuffixAutomaton : public std::SingleDispatch < automaton::Automaton, string::StringBase > { +private: + static void suffixAutomatonAddSymbol ( automaton::DFA & suffixAutomaton, const alphabet::Symbol & symbol, std::map < automaton::State, std::pair<automaton::State, int > > & suffixLinks, automaton::State & lastState ); + +public: + /** + * Naive construction of minimal suffix automaton for given pattern - EpsNFA -> NFA -> DFA -> minDFA -> removeErrorState. + * @return minimal suffix automaton for given pattern. + */ + static automaton::Automaton naiveConstruct ( const string::String & pattern ); + + static automaton::DFA naiveConstruct ( const string::LinearString & pattern ); + + /** + * Linear time on-line construction of minimal suffix automaton for given pattern. + * @return minimal suffix automaton for given pattern. + */ + static automaton::Automaton construct ( const string::String & pattern ); + + static automaton::DFA construct ( const string::LinearString & pattern ); + + static SuffixAutomaton & getInstance ( ) { + static SuffixAutomaton res; + + return res; + } + +}; + +} /* namespace exact */ + +} /* namespace stringology */ + +#endif /* SUFFIX_AUTOMATON_HPP_ */ diff --git a/alib2algo/test-src/stringology/exact/BackwardNondeterministicDAWGMatchingTest.cpp b/alib2algo/test-src/stringology/exact/BackwardNondeterministicDAWGMatchingTest.cpp new file mode 100644 index 0000000000000000000000000000000000000000..4059bf1f7b8c3d3ff325720331c1a2ae0f6dd398 --- /dev/null +++ b/alib2algo/test-src/stringology/exact/BackwardNondeterministicDAWGMatchingTest.cpp @@ -0,0 +1,56 @@ +#include "BackwardNondeterministicDAWGMatchingTest.h" + +#include "string/String.h" +#include "stringology/exact/BackwardNondeterministicDAWGMatching.hpp" + +#include "string/generate/RandomStringFactory.h" +#include "string/generate/RandomSubstringFactory.h" + +#define CPPUNIT_IMPLY( x, y ) CPPUNIT_ASSERT ( !( x ) || ( y ) ) + +CPPUNIT_TEST_SUITE_NAMED_REGISTRATION ( BackwardNondeterministicDAWGMatchingTest, "stringology" ); +CPPUNIT_TEST_SUITE_REGISTRATION ( BackwardNondeterministicDAWGMatchingTest ); + +void BackwardNondeterministicDAWGMatchingTest::setUp ( ) { +} + +void BackwardNondeterministicDAWGMatchingTest::tearDown ( ) { +} + +void BackwardNondeterministicDAWGMatchingTest::testBNDM ( ) { + + std::vector<std::string> subjects; + std::vector<std::string> patterns; + std::vector<std::set<unsigned>> expectedOccs; + + subjects.push_back("a"); patterns.push_back("a"); expectedOccs.push_back({0}); + subjects.push_back("a"); patterns.push_back("b"); expectedOccs.push_back({}); + subjects.push_back("alfalfalfa"); patterns.push_back("alfalfalfa"); expectedOccs.push_back({0}); + subjects.push_back("alfalfalfa"); patterns.push_back("blfalfalfa"); expectedOccs.push_back({}); + subjects.push_back("alfalfalfa"); patterns.push_back("alfalfalfb"); expectedOccs.push_back({}); + subjects.push_back("alfalfalfaalfalfalfaalfalfalfaalfalfalfaalfalfalfaalfalfalfaalfa"); patterns.push_back("alfalfalfaalfalfalfaalfalfalfaalfalfalfaalfalfalfaalfalfalfaalfa"); expectedOccs.push_back({0}); + subjects.push_back("alfalfalfaalfalfalfaabfalfalfaalfalfalfaalfalfalfaalfalfalfaalfa"); patterns.push_back("alfalfalfaalfalfalfaalfalfalfaalfalfalfaalfalfalfaalfalfalfaalfa"); expectedOccs.push_back({}); + subjects.push_back("alfalfalfaalfalfalfaalfalfalfaalfalfalfaalfalfalfaalfalfalfaalfaa"); patterns.push_back("alfalfalfaalfalfalfaalfalfalfaalfalfalfaalfalfalfaalfalfalfaalfaa"); expectedOccs.push_back({0}); + subjects.push_back("atggccttgcc"); patterns.push_back("gcc"); expectedOccs.push_back({3,8}); + subjects.push_back("aaaaaaaaaa"); patterns.push_back("a"); expectedOccs.push_back({0,1,2,3,4,5,6,7,8,9}); + + + for(size_t i = 0; i < subjects.size(); ++i) { + string::String subject = string::stringFrom ( subjects[i] ); + string::String pattern = string::stringFrom ( patterns[i] ); + std::set < unsigned > res = stringology::exact::BackwardNondeterministicDAWGMatching::match ( subject, pattern ); + CPPUNIT_ASSERT ( res == expectedOccs[i] ); + res = stringology::exact::BackwardNondeterministicDAWGMatching::match64 ( subject, pattern ); + CPPUNIT_ASSERT ( res == expectedOccs[i] ); + res = stringology::exact::BackwardNondeterministicDAWGMatching::match128 ( subject, pattern ); + CPPUNIT_ASSERT ( res == expectedOccs[i] ); + std::cout << subjects[i] << ' ' << patterns[i] << ' ' << res << std::endl; + } + + auto longSubject = string::generate::RandomStringFactory::generateLinearString (64 * 64 * 64, 512, false, true); + auto longPattern = string::generate::RandomSubstringFactory::generateSubstring(64 * 32 * 32, longSubject ); + std::set < unsigned > res = stringology::exact::BackwardNondeterministicDAWGMatching::match ( longSubject, longPattern ); + std::cout << "long: " << res << std::endl; + CPPUNIT_ASSERT ( res.size() > 0 ); + +} diff --git a/alib2algo/test-src/stringology/exact/BackwardNondeterministicDAWGMatchingTest.h b/alib2algo/test-src/stringology/exact/BackwardNondeterministicDAWGMatchingTest.h new file mode 100644 index 0000000000000000000000000000000000000000..016be2e42a20aca90297cf6438516c885f9bd402 --- /dev/null +++ b/alib2algo/test-src/stringology/exact/BackwardNondeterministicDAWGMatchingTest.h @@ -0,0 +1,18 @@ +#ifndef BACKWARD_NONDETERMINISTIC_DAWG_MATCHING_TEST_H_ +#define BACKWARD_NONDETERMINISTIC_DAWG_MATCHING_TEST_H_ + +#include <cppunit/extensions/HelperMacros.h> + +class BackwardNondeterministicDAWGMatchingTest : public CppUnit::TestFixture { + CPPUNIT_TEST_SUITE ( BackwardNondeterministicDAWGMatchingTest ); + CPPUNIT_TEST ( testBNDM ); + CPPUNIT_TEST_SUITE_END ( ); + +public: + void setUp ( ); + void tearDown ( ); + + void testBNDM ( ); +}; + +#endif // BACKWARD_NONDETERMINISTIC_DAWG_MATCHING_TEST_H_ diff --git a/alib2algo/test-src/stringology/exact/FactorOracleAutomatonTest.cpp b/alib2algo/test-src/stringology/exact/FactorOracleAutomatonTest.cpp new file mode 100644 index 0000000000000000000000000000000000000000..fa23d8bf8e673dfe736c8470b3cc367778d888fc --- /dev/null +++ b/alib2algo/test-src/stringology/exact/FactorOracleAutomatonTest.cpp @@ -0,0 +1,81 @@ +#include "FactorOracleAutomatonTest.h" + +#include "string/LinearString.h" +#include "stringology/exact/FactorOracleAutomaton.hpp" +#include "stringology/exact/BackwardOracleMatching.hpp" + +#include "string/generate/RandomStringFactory.h" +#include "string/generate/RandomSubstringFactory.h" + +#define CPPUNIT_IMPLY( x, y ) CPPUNIT_ASSERT ( !( x ) || ( y ) ) + +CPPUNIT_TEST_SUITE_NAMED_REGISTRATION ( FactorOracleAutomatonTest, "stringology" ); +CPPUNIT_TEST_SUITE_REGISTRATION ( FactorOracleAutomatonTest ); + +void FactorOracleAutomatonTest::setUp ( ) { +} + +void FactorOracleAutomatonTest::tearDown ( ) { +} + +void FactorOracleAutomatonTest::testFactorOracleConstruction ( ) { + + string::LinearString pattern ( "atatac" ); + + automaton::DFA oracle = stringology::exact::FactorOracleAutomaton::construct ( pattern ); + + automaton::DFA refOracle ( automaton::State ( 0 ) ); + + refOracle.setInputAlphabet ( pattern.getAlphabet ( ) ); + + for ( int i = 1; i <= 6; ++i ) { + refOracle.addState ( automaton::State ( i ) ); + refOracle.addFinalState ( automaton::State ( i ) ); + } + + refOracle.addTransition ( automaton::State ( 0 ), alphabet::symbolFrom ( 'a' ), automaton::State ( 1 ) ); + refOracle.addTransition ( automaton::State ( 0 ), alphabet::symbolFrom ( 't' ), automaton::State ( 2 ) ); + refOracle.addTransition ( automaton::State ( 0 ), alphabet::symbolFrom ( 'c' ), automaton::State ( 6 ) ); + refOracle.addTransition ( automaton::State ( 1 ), alphabet::symbolFrom ( 't' ), automaton::State ( 2 ) ); + refOracle.addTransition ( automaton::State ( 1 ), alphabet::symbolFrom ( 'c' ), automaton::State ( 6 ) ); + refOracle.addTransition ( automaton::State ( 2 ), alphabet::symbolFrom ( 'a' ), automaton::State ( 3 ) ); + refOracle.addTransition ( automaton::State ( 3 ), alphabet::symbolFrom ( 't' ), automaton::State ( 4 ) ); + refOracle.addTransition ( automaton::State ( 3 ), alphabet::symbolFrom ( 'c' ), automaton::State ( 6 ) ); + refOracle.addTransition ( automaton::State ( 4 ), alphabet::symbolFrom ( 'a' ), automaton::State ( 5 ) ); + refOracle.addTransition ( automaton::State ( 5 ), alphabet::symbolFrom ( 'c' ), automaton::State ( 6 ) ); + + CPPUNIT_ASSERT ( oracle == refOracle ); +} + +void FactorOracleAutomatonTest::testBackwardOracleMatching ( ) { + std::vector<std::string> subjects; + std::vector<std::string> patterns; + std::vector<std::set<unsigned>> expectedOccs; + + subjects.push_back("a"); patterns.push_back("a"); expectedOccs.push_back({0}); + subjects.push_back("a"); patterns.push_back("b"); expectedOccs.push_back({}); + subjects.push_back("alfalfalfa"); patterns.push_back("alfalfalfa"); expectedOccs.push_back({0}); + subjects.push_back("alfalfalfa"); patterns.push_back("blfalfalfa"); expectedOccs.push_back({}); + subjects.push_back("alfalfalfa"); patterns.push_back("alfalfalfb"); expectedOccs.push_back({}); + subjects.push_back("alfalfalfaalfalfalfaalfalfalfaalfalfalfaalfalfalfaalfalfalfaalfa"); patterns.push_back("alfalfalfaalfalfalfaalfalfalfaalfalfalfaalfalfalfaalfalfalfaalfa"); expectedOccs.push_back({0}); + subjects.push_back("alfalfalfaalfalfalfaabfalfalfaalfalfalfaalfalfalfaalfalfalfaalfa"); patterns.push_back("alfalfalfaalfalfalfaalfalfalfaalfalfalfaalfalfalfaalfalfalfaalfa"); expectedOccs.push_back({}); + subjects.push_back("alfalfalfaalfalfalfaalfalfalfaalfalfalfaalfalfalfaalfalfalfaalfaa"); patterns.push_back("alfalfalfaalfalfalfaalfalfalfaalfalfalfaalfalfalfaalfalfalfaalfaa"); expectedOccs.push_back({0}); + subjects.push_back("atggccttgcc"); patterns.push_back("gcc"); expectedOccs.push_back({3,8}); + subjects.push_back("aaaaaaaaaa"); patterns.push_back("a"); expectedOccs.push_back({0,1,2,3,4,5,6,7,8,9}); + + + for(size_t i = 0; i < subjects.size(); ++i) { + string::String subject = string::stringFrom ( subjects[i] ); + string::String pattern = string::stringFrom ( patterns[i] ); + std::set < unsigned > res = stringology::exact::BackwardOracleMatching::match ( subject, pattern ); + std::cout << subjects[i] << ' ' << patterns[i] << ' ' << res << std::endl; + CPPUNIT_ASSERT ( res == expectedOccs[i] ); + } + + auto longSubject = string::generate::RandomStringFactory::generateLinearString (64 * 64, 512, false, true); + auto longPattern = string::generate::RandomSubstringFactory::generateSubstring(64 * 32, longSubject ); + std::set < unsigned > res = stringology::exact::BackwardOracleMatching::match ( longSubject, longPattern ); + std::cout << "long: " << res << std::endl; + CPPUNIT_ASSERT ( res.size() > 0 ); + +} diff --git a/alib2algo/test-src/stringology/exact/FactorOracleAutomatonTest.h b/alib2algo/test-src/stringology/exact/FactorOracleAutomatonTest.h new file mode 100644 index 0000000000000000000000000000000000000000..b3b4bb947fe0819897488c23b459e430449af3be --- /dev/null +++ b/alib2algo/test-src/stringology/exact/FactorOracleAutomatonTest.h @@ -0,0 +1,20 @@ +#ifndef FACTOR_ORACLE_AUTOMATON_TEST_HPP_ +#define FACTOR_ORACLE_AUTOMATON_TEST_HPP_ + +#include <cppunit/extensions/HelperMacros.h> + +class FactorOracleAutomatonTest : public CppUnit::TestFixture { + CPPUNIT_TEST_SUITE ( FactorOracleAutomatonTest ); + CPPUNIT_TEST ( testFactorOracleConstruction ); + CPPUNIT_TEST ( testBackwardOracleMatching ); + CPPUNIT_TEST_SUITE_END ( ); + +public: + void setUp ( ); + void tearDown ( ); + + void testFactorOracleConstruction ( ); + void testBackwardOracleMatching ( ); +}; + +#endif // FACTOR_ORACLE_AUTOMATON_TEST_HPP_ diff --git a/alib2algo/test-src/stringology/exact/SuffixAutomatonTest.cpp b/alib2algo/test-src/stringology/exact/SuffixAutomatonTest.cpp new file mode 100644 index 0000000000000000000000000000000000000000..bee259d485a53c7d043efa24ba968400637df461 --- /dev/null +++ b/alib2algo/test-src/stringology/exact/SuffixAutomatonTest.cpp @@ -0,0 +1,84 @@ +#include "SuffixAutomatonTest.h" + +#include "string/LinearString.h" +#include "stringology/exact/SuffixAutomaton.hpp" +#include "stringology/exact/BackwardDAWGMatching.hpp" + +#include "string/generate/RandomStringFactory.h" +#include "string/generate/RandomSubstringFactory.h" + +#define CPPUNIT_IMPLY( x, y ) CPPUNIT_ASSERT ( !( x ) || ( y ) ) + +CPPUNIT_TEST_SUITE_NAMED_REGISTRATION ( SuffixAutomatonTest, "stringology" ); +CPPUNIT_TEST_SUITE_REGISTRATION ( SuffixAutomatonTest ); + +void SuffixAutomatonTest::setUp ( ) { +} + +void SuffixAutomatonTest::tearDown ( ) { +} + +void SuffixAutomatonTest::testSuffixAutomatonConstruction ( ) { + + string::LinearString pattern ( "atatac" ); + + automaton::DFA suffixAutomaton = stringology::exact::SuffixAutomaton::construct ( pattern ); + + automaton::DFA refSuffixAutomaton ( automaton::State ( 0 ) ); + + refSuffixAutomaton.setInputAlphabet ( pattern.getAlphabet ( ) ); + + for ( int i = 1; i <= 6; ++i ) { + refSuffixAutomaton.addState ( automaton::State ( i ) ); + } + + refSuffixAutomaton.addFinalState ( automaton::State ( 0 ) ); + refSuffixAutomaton.addFinalState ( automaton::State ( 6 ) ); + + refSuffixAutomaton.addTransition ( automaton::State ( 0 ), alphabet::symbolFrom ( 'a' ), automaton::State ( 1 ) ); + refSuffixAutomaton.addTransition ( automaton::State ( 0 ), alphabet::symbolFrom ( 't' ), automaton::State ( 2 ) ); + refSuffixAutomaton.addTransition ( automaton::State ( 0 ), alphabet::symbolFrom ( 'c' ), automaton::State ( 6 ) ); + refSuffixAutomaton.addTransition ( automaton::State ( 1 ), alphabet::symbolFrom ( 't' ), automaton::State ( 2 ) ); + refSuffixAutomaton.addTransition ( automaton::State ( 1 ), alphabet::symbolFrom ( 'c' ), automaton::State ( 6 ) ); + refSuffixAutomaton.addTransition ( automaton::State ( 2 ), alphabet::symbolFrom ( 'a' ), automaton::State ( 3 ) ); + refSuffixAutomaton.addTransition ( automaton::State ( 3 ), alphabet::symbolFrom ( 't' ), automaton::State ( 4 ) ); + refSuffixAutomaton.addTransition ( automaton::State ( 3 ), alphabet::symbolFrom ( 'c' ), automaton::State ( 6 ) ); + refSuffixAutomaton.addTransition ( automaton::State ( 4 ), alphabet::symbolFrom ( 'a' ), automaton::State ( 5 ) ); + refSuffixAutomaton.addTransition ( automaton::State ( 5 ), alphabet::symbolFrom ( 'c' ), automaton::State ( 6 ) ); + + CPPUNIT_ASSERT ( suffixAutomaton == refSuffixAutomaton ); +} + +void SuffixAutomatonTest::testBackwardDAWGMatching ( ) { + std::vector<std::string> subjects; + std::vector<std::string> patterns; + std::vector<std::set<unsigned>> expectedOccs; + + subjects.push_back("a"); patterns.push_back("a"); expectedOccs.push_back({0}); + subjects.push_back("a"); patterns.push_back("b"); expectedOccs.push_back({}); + subjects.push_back("alfalfalfa"); patterns.push_back("alfalfalfa"); expectedOccs.push_back({0}); + subjects.push_back("alfalfalfa"); patterns.push_back("blfalfalfa"); expectedOccs.push_back({}); + subjects.push_back("alfalfalfa"); patterns.push_back("alfalfalfb"); expectedOccs.push_back({}); + subjects.push_back("alfalfalfaalfalfalfaalfalfalfaalfalfalfaalfalfalfaalfalfalfaalfa"); patterns.push_back("alfalfalfaalfalfalfaalfalfalfaalfalfalfaalfalfalfaalfalfalfaalfa"); expectedOccs.push_back({0}); + subjects.push_back("alfalfalfaalfalfalfaabfalfalfaalfalfalfaalfalfalfaalfalfalfaalfa"); patterns.push_back("alfalfalfaalfalfalfaalfalfalfaalfalfalfaalfalfalfaalfalfalfaalfa"); expectedOccs.push_back({}); + subjects.push_back("alfalfalfaalfalfalfaalfalfalfaalfalfalfaalfalfalfaalfalfalfaalfaa"); patterns.push_back("alfalfalfaalfalfalfaalfalfalfaalfalfalfaalfalfalfaalfalfalfaalfaa"); expectedOccs.push_back({0}); + subjects.push_back("atggccttgcc"); patterns.push_back("gcc"); expectedOccs.push_back({3,8}); + subjects.push_back("aaaaaaaaaa"); patterns.push_back("a"); expectedOccs.push_back({0,1,2,3,4,5,6,7,8,9}); + + + for(size_t i = 0; i < subjects.size(); ++i) { + string::String subject = string::stringFrom ( subjects[i] ); + string::String pattern = string::stringFrom ( patterns[i] ); + std::set < unsigned > res = stringology::exact::BackwardDAWGMatching::match ( subject, pattern ); + std::cout << subjects[i] << ' ' << patterns[i] << ' ' << res << std::endl; + CPPUNIT_ASSERT ( res == expectedOccs[i] ); + } + + auto longSubject = string::generate::RandomStringFactory::generateLinearString (64 * 64, 512, false, true); + auto longPattern = string::generate::RandomSubstringFactory::generateSubstring(64 * 32, longSubject ); + std::set < unsigned > res = stringology::exact::BackwardDAWGMatching::match ( longSubject, longPattern ); + std::cout << "long: " << res << std::endl; + CPPUNIT_ASSERT ( res.size() > 0 ); + +} + diff --git a/alib2algo/test-src/stringology/exact/SuffixAutomatonTest.h b/alib2algo/test-src/stringology/exact/SuffixAutomatonTest.h new file mode 100644 index 0000000000000000000000000000000000000000..2154792b1d41231713d550cd40625914be44a553 --- /dev/null +++ b/alib2algo/test-src/stringology/exact/SuffixAutomatonTest.h @@ -0,0 +1,21 @@ +#ifndef SUFFIX_AUTOMATON_TEST_HPP_ +#define SUFFIX_AUTOMATON_TEST_HPP_ + +#include <cppunit/extensions/HelperMacros.h> + +class SuffixAutomatonTest : public CppUnit::TestFixture { + CPPUNIT_TEST_SUITE ( SuffixAutomatonTest ); + CPPUNIT_TEST ( testSuffixAutomatonConstruction ); + CPPUNIT_TEST ( testBackwardDAWGMatching ); + CPPUNIT_TEST_SUITE_END ( ); + +public: + void setUp ( ); + void tearDown ( ); + + void testSuffixAutomatonConstruction ( ); + void testBackwardDAWGMatching ( ); + +}; + +#endif // SUFFIX_AUTOMATON_TEST_HPP_ diff --git a/alib2measurepp/src/processor/MeasurementProcessor.cpp b/alib2measurepp/src/processor/MeasurementProcessor.cpp index 6e8343b2994e26714ee02ce5d5fae4f42b122efa..bdb321bff5f9c6294669cc8d38fb0b46cc09dc9b 100644 --- a/alib2measurepp/src/processor/MeasurementProcessor.cpp +++ b/alib2measurepp/src/processor/MeasurementProcessor.cpp @@ -7,20 +7,20 @@ namespace measurements { -MeasurementProvisionerResults MeasurementProcessor::process ( const MeasurementProvisionerResults & mpr, const MeasurementProcessorFilter & filter ) { +MeasurementProvisionerResults MeasurementProcessor::process ( const MeasurementProvisionerResults& mpr, const MeasurementProcessorFilter& filter ) { MeasurementProvisionerResults newmpr; - for ( const MPRInputResult & mprir : mpr.inputResults ) { + for ( const MPRInputResult& mprir : mpr.inputResults ) { MPRInputResult newmprir; newmprir.inputs = mprir.inputs; - for ( const MPRPipelineResult & mprpr : mprir.pipelineResults ) { + for ( const MPRPipelineResult& mprpr : mprir.pipelineResults ) { MPRPipelineResult newmprpr; newmprpr.pipelineStatus = mprpr.pipelineStatus; - for ( const MPRPipelineCommandResult & mprpcr : mprpr.commandResults ) { + for ( const MPRPipelineCommandResult& mprpcr : mprpr.commandResults ) { - // filter out commands + // filter out commands if ( !filter.matches ( MeasurementProcessorFilter::FilterType::COMMAND, mprpcr.command ) ) continue; MPRPipelineCommandResult newmprpcr; @@ -28,7 +28,8 @@ MeasurementProvisionerResults MeasurementProcessor::process ( const MeasurementP newmprpcr.measurementResults = processMeasurementResults ( mprpcr.measurementResults, filter ); - newmprpr.commandResults.push_back ( std::move ( newmprpcr ) ); + if ( newmprpcr.measurementResults.frames.size ( ) > 1 ) + newmprpr.commandResults.push_back ( std::move ( newmprpcr ) ); } if ( newmprpr.commandResults.size ( ) ) @@ -42,12 +43,12 @@ MeasurementProvisionerResults MeasurementProcessor::process ( const MeasurementP return newmpr; } -MeasurementResults MeasurementProcessor::processMeasurementResults ( const MeasurementResults & mr, const MeasurementProcessorFilter & filter ) { +MeasurementResults MeasurementProcessor::processMeasurementResults ( const MeasurementResults& mr, const MeasurementProcessorFilter& filter ) { MeasurementResults newmr; - std::function < void ( unsigned, unsigned ) > dfsLambda; + std::function< void(unsigned, unsigned)> dfsLambda; - dfsLambda = [&] ( unsigned idx, unsigned newmrParentIdx ) { + dfsLambda = [&]( unsigned idx, unsigned newmrParentIdx ) { if ( idx == 0 ) { newmr.frames.push_back ( mr.frames[0] ); newmr.frames[0].subIdxs.clear ( ); @@ -81,7 +82,7 @@ MeasurementResults MeasurementProcessor::processMeasurementResults ( const Measu return newmr; } -std::string MeasurementProcessor::output ( const MeasurementProvisionerResults & mpr, const MeasurementProcessorOutput & output ) { +std::string MeasurementProcessor::output ( const MeasurementProvisionerResults& mpr, const MeasurementProcessorOutput& output ) { std::stringstream ss; MeasurementProcessor::output ( ss, mpr, output ); @@ -89,7 +90,7 @@ std::string MeasurementProcessor::output ( const MeasurementProvisionerResults & return ss.str ( ); } -void MeasurementProcessor::output ( std::ostream & os, const MeasurementProvisionerResults & mpr, const MeasurementProcessorOutput & output ) { +void MeasurementProcessor::output ( std::ostream& os, const MeasurementProvisionerResults& mpr, const MeasurementProcessorOutput& output ) { switch ( output.outputFormat ) { case MeasurementProcessorOutput::OutputFormat::XML: MeasurementProcessorOutput::outputXml ( os, mpr, output ); @@ -108,5 +109,4 @@ void MeasurementProcessor::output ( std::ostream & os, const MeasurementProvisio break; } } - } diff --git a/astringology2/src/astringology.cpp b/astringology2/src/astringology.cpp index 42e9daa01cfdfb214a072522564d73aa5c4f769e..1c6abe9209e547a876ada420bd07b3556b8a5f17 100644 --- a/astringology2/src/astringology.cpp +++ b/astringology2/src/astringology.cpp @@ -23,11 +23,16 @@ #include <stringology/exact/BoyerMooreHorspool.h> #include <stringology/exact/ReversedBoyerMooreHorspool.h> #include <stringology/exact/DeadZoneUsingBadCharacterShift.h> +#include <stringology/exact/BackwardNondeterministicDAWGMatching.hpp> +#include <stringology/exact/BackwardOracleMatching.hpp> +#include <stringology/exact/BackwardDAWGMatching.hpp> #include <stringology/exact/ExactMatchingAutomaton.h> #include <stringology/exact/ExactFactorAutomaton.h> #include <stringology/exact/ExactSubsequenceAutomaton.h> #include <stringology/exact/ExactNondeterministicSubsequenceAutomaton.h> #include <stringology/exact/ExactMultiNondeterministicSubsequenceAutomaton.h> +#include <stringology/exact/FactorOracleAutomaton.hpp> +#include <stringology/exact/SuffixAutomaton.hpp> #include <stringology/exact/BorderArray.h> #include <stringology/indexing/SuffixTrie.h> @@ -41,10 +46,15 @@ int main ( int argc, char * argv[] ) { allowed.push_back ( "exactSubsequenceAutomaton" ); allowed.push_back ( "exactNondeterministicSubsequenceAutomaton" ); allowed.push_back ( "exactMultiNondeterministicSubsequenceAutomaton" ); + allowed.push_back ( "factorOracleAutomaton" ); + allowed.push_back ( "suffixAutomaton" ); allowed.push_back ( "exactFactorMatch" ); allowed.push_back ( "boyerMooreHorspool" ); allowed.push_back ( "reversedBoyerMooreHorspool" ); allowed.push_back ( "deadZoneUsingBadCharacterShift" ); + allowed.push_back ( "backwardNondeterministicDAWGMatching" ); + allowed.push_back ( "backwardOracleMatching" ); + allowed.push_back ( "backwardDAWGMatching" ); allowed.push_back ( "borderArray" ); allowed.push_back ( "suffixTrie" ); TCLAP::ValuesConstraint < std::string > allowedVals ( allowed ); @@ -125,6 +135,45 @@ int main ( int argc, char * argv[] ) { measurements::end ( ); measurements::start ( "Output write", measurements::Type::AUXILIARY ); + alib::XmlDataFactory::toStdout ( res ); + } else if ( algorithm.getValue ( ) == "backwardNondeterministicDAWGMatching" ) { + string::String subject = alib::XmlDataFactory::fromTokens < string::String > ( std::move ( sax::FromXMLParserHelper::parseInput(true, subjectInput).front ( ) ) ); + string::String pattern = alib::XmlDataFactory::fromTokens < string::String > ( std::move ( sax::FromXMLParserHelper::parseInput(true, patternInput).front ( ) ) ); + + measurements::end ( ); + measurements::start ( "Algorithm", measurements::Type::MAIN ); + + std::set < unsigned > res = stringology::exact::BackwardNondeterministicDAWGMatching::match ( subject, pattern ); + + measurements::end ( ); + measurements::start ( "Output write", measurements::Type::AUXILIARY ); + + alib::XmlDataFactory::toStdout ( res ); + } else if ( algorithm.getValue ( ) == "backwardOracleMatching" ) { + string::String subject = alib::XmlDataFactory::fromTokens < string::String > ( std::move ( sax::FromXMLParserHelper::parseInput(true, subjectInput).front ( ) ) ); + string::String pattern = alib::XmlDataFactory::fromTokens < string::String > ( std::move ( sax::FromXMLParserHelper::parseInput(true, patternInput).front ( ) ) ); + + measurements::end ( ); + measurements::start ( "Algorithm", measurements::Type::MAIN ); + + std::set < unsigned > res = stringology::exact::BackwardOracleMatching::match ( subject, pattern ); + + measurements::end ( ); + measurements::start ( "Output write", measurements::Type::AUXILIARY ); + + alib::XmlDataFactory::toStdout ( res ); + } else if ( algorithm.getValue ( ) == "backwardDAWGMatching" ) { + string::String subject = alib::XmlDataFactory::fromTokens < string::String > ( std::move ( sax::FromXMLParserHelper::parseInput(true, subjectInput).front ( ) ) ); + string::String pattern = alib::XmlDataFactory::fromTokens < string::String > ( std::move ( sax::FromXMLParserHelper::parseInput(true, patternInput).front ( ) ) ); + + measurements::end ( ); + measurements::start ( "Algorithm", measurements::Type::MAIN ); + + std::set < unsigned > res = stringology::exact::BackwardDAWGMatching::match ( subject, pattern ); + + measurements::end ( ); + measurements::start ( "Output write", measurements::Type::AUXILIARY ); + alib::XmlDataFactory::toStdout ( res ); } else if ( algorithm.getValue ( ) == "exactMatchingAutomaton" ) { string::String pattern = alib::XmlDataFactory::fromTokens < string::String > ( std::move ( sax::FromXMLParserHelper::parseInput(true, patternInput).front ( ) ) ); @@ -188,6 +237,30 @@ int main ( int argc, char * argv[] ) { measurements::end ( ); measurements::start ( "Output write", measurements::Type::AUXILIARY ); + alib::XmlDataFactory::toStdout ( automaton ); + } else if ( algorithm.getValue ( ) == "factorOracleAutomaton" ) { + string::String pattern = alib::XmlDataFactory::fromTokens < string::String > ( std::move ( sax::FromXMLParserHelper::parseInput(true, patternInput).front ( ) ) ); + + measurements::end ( ); + measurements::start ( "Algorithm", measurements::Type::MAIN ); + + automaton::Automaton automaton = stringology::exact::FactorOracleAutomaton::construct ( pattern ); + + measurements::end ( ); + measurements::start ( "Output write", measurements::Type::AUXILIARY ); + + alib::XmlDataFactory::toStdout ( automaton ); + } else if ( algorithm.getValue ( ) == "suffixAutomaton" ) { + string::String pattern = alib::XmlDataFactory::fromTokens < string::String > ( std::move ( sax::FromXMLParserHelper::parseInput(true, patternInput).front ( ) ) ); + + measurements::end ( ); + measurements::start ( "Algorithm", measurements::Type::MAIN ); + + automaton::Automaton automaton = stringology::exact::SuffixAutomaton::construct ( pattern ); + + measurements::end ( ); + measurements::start ( "Output write", measurements::Type::AUXILIARY ); + alib::XmlDataFactory::toStdout ( automaton ); } else if ( algorithm.getValue ( ) == "borderArray" ) { string::String subject = alib::XmlDataFactory::fromTokens < string::String > ( std::move ( sax::FromXMLParserHelper::parseInput(true, subjectInput).front ( ) ) ); diff --git a/examples2/measurements/ameasure.xml b/examples2/measurements/ameasure.xml index ce91fe20147b2d3e980357e684341c131f93c6e7..214512ab5d8df23e27383f61e63f46c61ad160d6 100644 --- a/examples2/measurements/ameasure.xml +++ b/examples2/measurements/ameasure.xml @@ -1,7 +1,7 @@ <?xml version="1.0"?> <MeasurementProvisioner> <Environment> - <PipelineIterations>3</PipelineIterations> + <PipelineIterations>10</PipelineIterations> </Environment> <InputData> <InputBatch> @@ -28,5 +28,14 @@ <Pipeline> <Command alias="deadzone" >./astringology2 -a deadZoneUsingBadCharacterShift -s $1 -p $2</Command> </Pipeline> + <Pipeline> + <Command alias="bndm" >./astringology2 -a backwardNondeterministicDAWGMatching -s $1 -p $2</Command> + </Pipeline> + <Pipeline> + <Command alias="bdm" >./astringology2 -a backwardDAWGMatching -s $1 -p $2</Command> + </Pipeline> + <Pipeline> + <Command alias="bom" >./astringology2 -a backwardOracleMatching -s $1 -p $2</Command> + </Pipeline> </Pipelines> </MeasurementProvisioner> \ No newline at end of file