From e41c9c0c9c3108a7ca7807c6c50fb9d0c82ce166 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Radovan=20=C4=8Cerven=C3=BD?= <radovan.cerveny@gmail.com>
Date: Sat, 2 Apr 2016 15:30:54 +0200
Subject: [PATCH] suffix automaton construction fix, backward dawg matching implemented

---
 .../exact/BackwardDAWGMatching.cpp            | 90 +++++++++++++++++++
 .../exact/BackwardDAWGMatching.hpp            | 48 ++++++++++
 .../src/stringology/exact/SuffixAutomaton.cpp | 10 ++-
 .../exact/FactorOracleAutomatonTest.cpp       |  4 +-
 .../stringology/exact/SuffixAutomatonTest.cpp | 36 ++++++++
 .../stringology/exact/SuffixAutomatonTest.h   |  3 +
 6 files changed, 187 insertions(+), 4 deletions(-)
 create mode 100644 alib2algo/src/stringology/exact/BackwardDAWGMatching.cpp
 create mode 100644 alib2algo/src/stringology/exact/BackwardDAWGMatching.hpp

diff --git a/alib2algo/src/stringology/exact/BackwardDAWGMatching.cpp b/alib2algo/src/stringology/exact/BackwardDAWGMatching.cpp
new file mode 100644
index 0000000000..1bfbaa363c
--- /dev/null
+++ b/alib2algo/src/stringology/exact/BackwardDAWGMatching.cpp
@@ -0,0 +1,90 @@
+/*
+ * Author: Radovan Cerveny
+ */
+
+#include "BackwardDAWGMatching.hpp"
+#include "SuffixAutomaton.hpp"
+
+#include <exception/AlibException.h>
+#include <string/LinearString.h>
+#include <alphabet/Symbol.h>
+
+#include <algorithm>
+#include <map>
+#include <bitset>
+#include <measure>
+
+namespace stringology {
+
+namespace exact {
+
+// Generic entry point: dispatches on the data of both strings to the overload registered below.
+std::set < unsigned > BackwardDAWGMatching::match ( const string::String & subject, const string::String & pattern ) {
+	return getInstance ( ).dispatch ( subject.getData ( ), pattern.getData ( ) );
+}
+
+// Backward DAWG matching: every window of the subject is read from right to left
+// through the suffix automaton built for the reversed pattern.
+std::set < unsigned > BackwardDAWGMatching::match ( const string::LinearString & subject, const string::LinearString & pattern ) {
+	std::set < unsigned > occ;
+
+	measurements::start ( "Preprocess", measurements::Type::PREPROCESS );
+
+	auto patternData = pattern.getContent ( );
+
+	std::reverse ( patternData.begin ( ), patternData.end ( ) );
+
+	const string::LinearString reversedPattern ( std::move ( patternData ) );
+
+	automaton::DFA suffixAutomaton = SuffixAutomaton::construct ( reversedPattern );
+
+	measurements::end ( );
+
+	measurements::start ( "Algorithm", measurements::Type::ALGORITHM );
+
+	const automaton::State failState = automaton::State ( -1 );
+
+	const size_t subjectSize = subject.getContent ( ).size ( );
+	const size_t patternSize = pattern.getContent ( ).size ( );
+
+	size_t posInSubject = 0;
+
+	// Written as an addition on purpose: subjectSize - patternSize would wrap around
+	// (size_t is unsigned) whenever the pattern is longer than the subject.
+	while ( posInSubject + patternSize <= subjectSize ) {
+
+		automaton::State currentState = suffixAutomaton.getInitialState ( );
+
+		size_t posInPattern = patternSize;
+
+		// Read the window backwards until it is exhausted or the automaton has no transition.
+		while ( posInPattern > 0 && currentState != failState ) {
+			auto transition = suffixAutomaton.getTransitions ( ).find ( { currentState, subject.getContent ( ).at ( posInSubject + posInPattern - 1 ) } );
+
+			if ( transition == suffixAutomaton.getTransitions ( ).end ( ) )
+				currentState = failState;
+			else
+				currentState = transition->second;
+
+			posInPattern--;
+		}
+
+		if ( currentState != failState )
+			// Yay, there is match!!!
+			occ.insert ( posInSubject );
+
+		// After a failure no occurrence can start at or before the symbol that was read last;
+		// after a full match posInPattern is 0, so the window advances by one.
+		posInSubject += posInPattern + 1;
+	}
+
+	measurements::end ( );
+
+	return occ;
+}
+
+auto BackwardDAWGMatchingLinearStringLinearString = BackwardDAWGMatching::RegistratorWrapper < std::set < unsigned >, string::LinearString, string::LinearString > ( BackwardDAWGMatching::getInstance ( ), BackwardDAWGMatching::match );
+
+} /* namespace exact */
+
+} /* namespace stringology */
diff --git a/alib2algo/src/stringology/exact/BackwardDAWGMatching.hpp b/alib2algo/src/stringology/exact/BackwardDAWGMatching.hpp
new file mode 100644
index 0000000000..382fa52c9f
--- /dev/null
+++ b/alib2algo/src/stringology/exact/BackwardDAWGMatching.hpp
@@ -0,0 +1,48 @@
+/*
+ * Author: Radovan Cerveny
+ */
+
+#ifndef STRINGOLOGY_BACKWARD_DAWG_MATCHING_HPP__
+#define STRINGOLOGY_BACKWARD_DAWG_MATCHING_HPP__
+
+#include <string/String.h>
+#include <string/StringFeatures.h>
+#include <core/multipleDispatch.hpp>
+
+#include <set>
+
+namespace stringology {
+
+namespace exact {
+
+/**
+ * Implementation of Backward DAWG Matching.
+ */
+class BackwardDAWGMatching : public std::DoubleDispatch < std::set < unsigned >, string::StringBase, string::StringBase > {
+private:
+public:
+	/**
+	 * Search for all occurrences of the pattern in the subject.
+	 * @param subject string that is searched
+	 * @param pattern string that is searched for
+	 * @return set of positions in the subject at which an occurrence of the pattern starts
+	 */
+	static std::set < unsigned > match ( const string::String & subject, const string::String & pattern );
+	static std::set < unsigned > match ( const string::LinearString & subject, const string::LinearString & pattern );
+
+	/**
+	 * @return the single instance on which the type-specific overloads are registered
+	 */
+	static BackwardDAWGMatching & getInstance ( ) {
+		static BackwardDAWGMatching res;
+
+		return res;
+	}
+
+};
+
+} /* namespace exact */
+
+} /* namespace stringology */
+
+#endif /* STRINGOLOGY_BACKWARD_DAWG_MATCHING_HPP__ */
diff --git a/alib2algo/src/stringology/exact/SuffixAutomaton.cpp b/alib2algo/src/stringology/exact/SuffixAutomaton.cpp
index 2ea1125669..190e23d41a 100644
--- a/alib2algo/src/stringology/exact/SuffixAutomaton.cpp
+++ b/alib2algo/src/stringology/exact/SuffixAutomaton.cpp
@@ -108,13 +108,19 @@ void SuffixAutomaton::suffixAutomatonAddSymbol ( automaton::DFA & suffixAutomato
 			for ( const auto & transition : suffixAutomaton.getTransitionsFromState ( qState ) )
 				suffixAutomaton.addTransition ( cloneState, transition.first.second, transition.second );
 
-			while ( kState != automaton::State ( -1 ) && suffixAutomaton.getTransitions ( ).find ( { kState, symbol } ) != suffixAutomaton.getTransitions ( ).end ( ) && suffixAutomaton.getTransitions ( ).find ( { kState, symbol } )->second == qState ) {
+			while ( kState != automaton::State ( -1 )
+				&& suffixAutomaton.getTransitions ( ).find ( { kState, symbol } ) != suffixAutomaton.getTransitions ( ).end ( )
+				&& suffixAutomaton.getTransitions ( ).find ( { kState, symbol } )->second == qState ) {
+
 				suffixAutomaton.removeTransition ( kState, symbol, qState );
 				suffixAutomaton.addTransition ( kState, symbol, cloneState );
 				kState = suffixLinks.find ( kState )->second.first;
 			}
+
+			// Redirect the suffix links of qState and newState to the clone.
+			suffixLinks.find ( qState )->second.first = cloneState;
+			suffixLinks.find ( newState )->second.first = cloneState;
 		}
 	}
-
 	lastState = newState;
 }
diff --git a/alib2algo/test-src/stringology/exact/FactorOracleAutomatonTest.cpp b/alib2algo/test-src/stringology/exact/FactorOracleAutomatonTest.cpp
index eeac98a506..fa23d8bf8e 100644
--- a/alib2algo/test-src/stringology/exact/FactorOracleAutomatonTest.cpp
+++ b/alib2algo/test-src/stringology/exact/FactorOracleAutomatonTest.cpp
@@ -72,8 +72,8 @@ void FactorOracleAutomatonTest::testBackwardOracleMatching ( ) {
 		CPPUNIT_ASSERT ( res == expectedOccs[i] );
 	}
 
-	auto longSubject = string::generate::RandomStringFactory::generateLinearString (64 * 64 * 64, 512, false, true);
-	auto longPattern = string::generate::RandomSubstringFactory::generateSubstring(64 * 32 * 32, longSubject );
+	auto longSubject = string::generate::RandomStringFactory::generateLinearString (64 * 64, 512, false, true);
+	auto longPattern = string::generate::RandomSubstringFactory::generateSubstring(64 * 32, longSubject );
 	std::set < unsigned > res = stringology::exact::BackwardOracleMatching::match ( longSubject, longPattern );
 	std::cout << "long: " << res << std::endl;
 	CPPUNIT_ASSERT ( res.size() > 0 );
diff --git a/alib2algo/test-src/stringology/exact/SuffixAutomatonTest.cpp b/alib2algo/test-src/stringology/exact/SuffixAutomatonTest.cpp
index be2bbe4416..bee259d485 100644
--- a/alib2algo/test-src/stringology/exact/SuffixAutomatonTest.cpp
+++ b/alib2algo/test-src/stringology/exact/SuffixAutomatonTest.cpp
@@ -2,6 +2,7 @@
 
 #include "string/LinearString.h"
 #include "stringology/exact/SuffixAutomaton.hpp"
+#include "stringology/exact/BackwardDAWGMatching.hpp"
 
 #include "string/generate/RandomStringFactory.h"
 #include "string/generate/RandomSubstringFactory.h"
@@ -47,3 +48,38 @@ void SuffixAutomatonTest::testSuffixAutomatonConstruction ( ) {
 
 	CPPUNIT_ASSERT ( suffixAutomaton == refSuffixAutomaton );
 }
+
+void SuffixAutomatonTest::testBackwardDAWGMatching ( ) {
+	std::vector<std::string> subjects;
+	std::vector<std::string> patterns;
+	std::vector<std::set<unsigned>> expectedOccs;
+
+	subjects.push_back("a"); patterns.push_back("a"); expectedOccs.push_back({0});
+	subjects.push_back("a"); patterns.push_back("b"); expectedOccs.push_back({});
+	subjects.push_back("a"); patterns.push_back("aa"); expectedOccs.push_back({}); // pattern longer than subject
+	subjects.push_back("alfalfalfa"); patterns.push_back("alfalfalfa"); expectedOccs.push_back({0});
+	subjects.push_back("alfalfalfa"); patterns.push_back("blfalfalfa"); expectedOccs.push_back({});
+	subjects.push_back("alfalfalfa"); patterns.push_back("alfalfalfb"); expectedOccs.push_back({});
+	subjects.push_back("alfalfalfaalfalfalfaalfalfalfaalfalfalfaalfalfalfaalfalfalfaalfa"); patterns.push_back("alfalfalfaalfalfalfaalfalfalfaalfalfalfaalfalfalfaalfalfalfaalfa"); expectedOccs.push_back({0});
+	subjects.push_back("alfalfalfaalfalfalfaabfalfalfaalfalfalfaalfalfalfaalfalfalfaalfa"); patterns.push_back("alfalfalfaalfalfalfaalfalfalfaalfalfalfaalfalfalfaalfalfalfaalfa"); expectedOccs.push_back({});
+	subjects.push_back("alfalfalfaalfalfalfaalfalfalfaalfalfalfaalfalfalfaalfalfalfaalfaa"); patterns.push_back("alfalfalfaalfalfalfaalfalfalfaalfalfalfaalfalfalfaalfalfalfaalfaa"); expectedOccs.push_back({0});
+	subjects.push_back("atggccttgcc"); patterns.push_back("gcc"); expectedOccs.push_back({3,8});
+	subjects.push_back("aaaaaaaaaa"); patterns.push_back("a"); expectedOccs.push_back({0,1,2,3,4,5,6,7,8,9});
+
+
+	for(size_t i = 0; i < subjects.size(); ++i) {
+		string::String subject = string::stringFrom ( subjects[i] );
+		string::String pattern = string::stringFrom ( patterns[i] );
+		std::set < unsigned > res = stringology::exact::BackwardDAWGMatching::match ( subject, pattern );
+		std::cout << subjects[i] << ' ' << patterns[i] << ' ' << res << std::endl;
+		CPPUNIT_ASSERT ( res == expectedOccs[i] );
+	}
+
+	auto longSubject = string::generate::RandomStringFactory::generateLinearString (64 * 64, 512, false, true);
+	auto longPattern = string::generate::RandomSubstringFactory::generateSubstring(64 * 32, longSubject );
+	std::set < unsigned > res = stringology::exact::BackwardDAWGMatching::match ( longSubject, longPattern );
+	std::cout << "long: " << res << std::endl;
+	CPPUNIT_ASSERT ( res.size() > 0 );
+
+}
+
diff --git a/alib2algo/test-src/stringology/exact/SuffixAutomatonTest.h b/alib2algo/test-src/stringology/exact/SuffixAutomatonTest.h
index edfc02045c..2154792b1d 100644
--- a/alib2algo/test-src/stringology/exact/SuffixAutomatonTest.h
+++ b/alib2algo/test-src/stringology/exact/SuffixAutomatonTest.h
@@ -6,6 +6,7 @@
 class SuffixAutomatonTest : public CppUnit::TestFixture {
 	CPPUNIT_TEST_SUITE ( SuffixAutomatonTest );
 	CPPUNIT_TEST ( testSuffixAutomatonConstruction );
+	CPPUNIT_TEST ( testBackwardDAWGMatching );
 	CPPUNIT_TEST_SUITE_END ( );
 
 public:
@@ -13,6 +14,8 @@ public:
 	void tearDown ( );
 
 	void testSuffixAutomatonConstruction ( );
+	void testBackwardDAWGMatching ( );
+
 };
 
 #endif // SUFFIX_AUTOMATON_TEST_HPP_
-- 
GitLab