Skip to content
Snippets Groups Projects
Commit e41c9c0c authored by Radovan Červený's avatar Radovan Červený
Browse files

suffix automaton construction fix, backward dawg matching implemented

parent ba0c2010
Branches
Tags
1 merge request: !16 BP_cervera3 - algorithms
/*
* Author: Radovan Cerveny
*/
#include "BackwardDAWGMatching.hpp"
#include "SuffixAutomaton.hpp"
#include <exception/AlibException.h>
#include <string/LinearString.h>
#include <alphabet/Symbol.h>
#include <algorithm>
#include <map>
#include <bitset>
#include <measure>
namespace stringology {
namespace exact {
/**
 * Generic entry point: takes the data of both string wrappers and hands them
 * to the dispatcher instance, which selects the concrete match overload.
 *
 * @param subject string that is searched
 * @param pattern string that is looked for
 * @return whatever the dispatched overload returns
 */
std::set < unsigned > BackwardDAWGMatching::match ( const string::String & subject, const string::String & pattern ) {
    const auto & subjectData = subject.getData ( );
    const auto & patternData = pattern.getData ( );

    return getInstance ( ).dispatch ( subjectData, patternData );
}
/**
 * Backward DAWG matching on linear strings.
 *
 * A window of pattern length is slid over the subject. Each window is read
 * from its last symbol towards its first through the suffix automaton that is
 * constructed from the reversed pattern. A window whose symbols are all
 * accepted is reported; otherwise the window is moved just past the symbol on
 * which the automaton had no transition.
 *
 * @param subject string that is searched
 * @param pattern string that is looked for
 * @return start indices (in subject) of all reported windows; empty when the
 *         pattern is longer than the subject
 */
std::set < unsigned > BackwardDAWGMatching::match ( const string::LinearString & subject, const string::LinearString & pattern ) {
    std::set < unsigned > occ;

    measurements::start ( "Preprocess", measurements::Type::PREPROCESS );
    auto patternData = pattern.getContent ( );
    // Remember the length now; patternData is moved from below.
    const size_t patternSize = patternData.size ( );
    std::reverse ( patternData.begin ( ), patternData.end ( ) );
    const string::LinearString reversedPattern ( std::move ( patternData ) );
    automaton::DFA suffixAutomaton = SuffixAutomaton::construct ( reversedPattern );
    measurements::end ( );

    measurements::start ( "Algorithm", measurements::Type::ALGORITHM );
    const automaton::State failState = automaton::State ( -1 );

    // Loop-invariant lookups hoisted out of the scanning loops.
    const auto & subjectData = subject.getContent ( );
    const auto & transitions = suffixAutomaton.getTransitions ( );
    const size_t subjectSize = subjectData.size ( );

    // subjectSize - patternSize is unsigned: without this guard a pattern longer
    // than the subject wraps around to a huge bound and the first at ( ) throws.
    if ( patternSize <= subjectSize ) {
        const size_t lastWindowStart = subjectSize - patternSize;
        size_t posInSubject = 0;

        while ( posInSubject <= lastWindowStart ) {
            automaton::State currentState = suffixAutomaton.getInitialState ( );
            size_t posInPattern = patternSize;

            // Read the window right to left until it is exhausted or the automaton fails.
            while ( posInPattern > 0 && currentState != failState ) {
                auto transition = transitions.find ( { currentState, subjectData.at ( posInSubject + posInPattern - 1 ) } );

                if ( transition == transitions.end ( ) )
                    currentState = failState;
                else
                    currentState = transition->second;

                posInPattern--;
            }

            // The whole window was consumed without failing: occurrence found.
            if ( currentState != failState )
                occ.insert ( posInSubject );

            // After a failure posInPattern is the offset of the failing symbol, so the
            // next window starts right behind it; after a match the shift is 1.
            posInSubject += posInPattern + 1;
        }
    }

    measurements::end ( );
    return occ;
}
// Namespace-scope object built from the dispatcher instance and the LinearString/LinearString
// overload of match; presumably its construction registers that overload for dispatch — confirm
// against RegistratorWrapper in core/multipleDispatch.hpp.
auto BackwardDAWGMatchingLinearStringLinearString = BackwardDAWGMatching::RegistratorWrapper < std::set < unsigned >, string::LinearString, string::LinearString > ( BackwardDAWGMatching::getInstance ( ), BackwardDAWGMatching::match );
} /* namespace exact */
} /* namespace stringology */
/*
* Author: Radovan Cerveny
*/
// The guard is named after this header. The previous macro name referred to
// Backward Oracle Matching, which risks one header being skipped when both are
// included, and it contained a reserved double underscore.
#ifndef STRINGOLOGY_BACKWARD_DAWG_MATCHING_HPP_
#define STRINGOLOGY_BACKWARD_DAWG_MATCHING_HPP_
#include <string/String.h>
#include <string/StringFeatures.h>
#include <core/multipleDispatch.hpp>
#include <set>
namespace stringology {
namespace exact {
/**
 * Implementation of Backward DAWG Matching.
 *
 * The class is a double-dispatch object over two string arguments that
 * yields a set of unsigned values.
 */
class BackwardDAWGMatching : public std::DoubleDispatch < std::set < unsigned >, string::StringBase, string::StringBase > {
public:
    /**
     * Search for pattern in subject, for any pair of string kinds.
     * @param subject string that is searched
     * @param pattern string that is looked for
     * @return set of occurrences
     */
    static std::set < unsigned > match ( const string::String & subject, const string::String & pattern );

    /**
     * Search for pattern in linear string.
     * @param subject linear string that is searched
     * @param pattern linear string that is looked for
     * @return set of occurrences
     */
    static std::set < unsigned > match ( const string::LinearString & subject, const string::LinearString & pattern );

    /**
     * Access to the single dispatcher object, held as a function-local static.
     * @return reference to that object
     */
    static BackwardDAWGMatching & getInstance ( ) {
        static BackwardDAWGMatching res;
        return res;
    }
};
} /* namespace exact */
} /* namespace stringology */
#endif /* STRINGOLOGY_BACKWARD_DAWG_MATCHING_HPP_ */
...@@ -108,13 +108,18 @@ void SuffixAutomaton::suffixAutomatonAddSymbol ( automaton::DFA & suffixAutomato ...@@ -108,13 +108,18 @@ void SuffixAutomaton::suffixAutomatonAddSymbol ( automaton::DFA & suffixAutomato
for ( const auto & transition : suffixAutomaton.getTransitionsFromState ( qState ) ) for ( const auto & transition : suffixAutomaton.getTransitionsFromState ( qState ) )
suffixAutomaton.addTransition ( cloneState, transition.first.second, transition.second ); suffixAutomaton.addTransition ( cloneState, transition.first.second, transition.second );
   
while ( kState != automaton::State ( -1 ) && suffixAutomaton.getTransitions ( ).find ( { kState, symbol } ) != suffixAutomaton.getTransitions ( ).end ( ) && suffixAutomaton.getTransitions ( ).find ( { kState, symbol } )->second == qState ) { while ( kState != automaton::State ( -1 )
&& suffixAutomaton.getTransitions ( ).find ( { kState, symbol } ) != suffixAutomaton.getTransitions ( ).end ( )
&& suffixAutomaton.getTransitions ( ).find ( { kState, symbol } )->second == qState ) {
suffixAutomaton.removeTransition ( kState, symbol, qState );
suffixAutomaton.addTransition ( kState, symbol, cloneState ); suffixAutomaton.addTransition ( kState, symbol, cloneState );
kState = suffixLinks.find ( kState )->second.first; kState = suffixLinks.find ( kState )->second.first;
} }
suffixLinks.find ( qState )->second.first = cloneState;
suffixLinks.find ( newState )->second.first = cloneState;
} }
} }
lastState = newState; lastState = newState;
} }
   
......
...@@ -72,8 +72,8 @@ void FactorOracleAutomatonTest::testBackwardOracleMatching ( ) { ...@@ -72,8 +72,8 @@ void FactorOracleAutomatonTest::testBackwardOracleMatching ( ) {
CPPUNIT_ASSERT ( res == expectedOccs[i] ); CPPUNIT_ASSERT ( res == expectedOccs[i] );
} }
   
auto longSubject = string::generate::RandomStringFactory::generateLinearString (64 * 64 * 64, 512, false, true); auto longSubject = string::generate::RandomStringFactory::generateLinearString (64 * 64, 512, false, true);
auto longPattern = string::generate::RandomSubstringFactory::generateSubstring(64 * 32 * 32, longSubject ); auto longPattern = string::generate::RandomSubstringFactory::generateSubstring(64 * 32, longSubject );
std::set < unsigned > res = stringology::exact::BackwardOracleMatching::match ( longSubject, longPattern ); std::set < unsigned > res = stringology::exact::BackwardOracleMatching::match ( longSubject, longPattern );
std::cout << "long: " << res << std::endl; std::cout << "long: " << res << std::endl;
CPPUNIT_ASSERT ( res.size() > 0 ); CPPUNIT_ASSERT ( res.size() > 0 );
......
...@@ -2,6 +2,7 @@ ...@@ -2,6 +2,7 @@
   
#include "string/LinearString.h" #include "string/LinearString.h"
#include "stringology/exact/SuffixAutomaton.hpp" #include "stringology/exact/SuffixAutomaton.hpp"
#include "stringology/exact/BackwardDAWGMatching.hpp"
   
#include "string/generate/RandomStringFactory.h" #include "string/generate/RandomStringFactory.h"
#include "string/generate/RandomSubstringFactory.h" #include "string/generate/RandomSubstringFactory.h"
...@@ -47,3 +48,37 @@ void SuffixAutomatonTest::testSuffixAutomatonConstruction ( ) { ...@@ -47,3 +48,37 @@ void SuffixAutomatonTest::testSuffixAutomatonConstruction ( ) {
   
CPPUNIT_ASSERT ( suffixAutomaton == refSuffixAutomaton ); CPPUNIT_ASSERT ( suffixAutomaton == refSuffixAutomaton );
} }
void SuffixAutomatonTest::testBackwardDAWGMatching ( ) {
std::vector<std::string> subjects;
std::vector<std::string> patterns;
std::vector<std::set<unsigned>> expectedOccs;
subjects.push_back("a"); patterns.push_back("a"); expectedOccs.push_back({0});
subjects.push_back("a"); patterns.push_back("b"); expectedOccs.push_back({});
subjects.push_back("alfalfalfa"); patterns.push_back("alfalfalfa"); expectedOccs.push_back({0});
subjects.push_back("alfalfalfa"); patterns.push_back("blfalfalfa"); expectedOccs.push_back({});
subjects.push_back("alfalfalfa"); patterns.push_back("alfalfalfb"); expectedOccs.push_back({});
subjects.push_back("alfalfalfaalfalfalfaalfalfalfaalfalfalfaalfalfalfaalfalfalfaalfa"); patterns.push_back("alfalfalfaalfalfalfaalfalfalfaalfalfalfaalfalfalfaalfalfalfaalfa"); expectedOccs.push_back({0});
subjects.push_back("alfalfalfaalfalfalfaabfalfalfaalfalfalfaalfalfalfaalfalfalfaalfa"); patterns.push_back("alfalfalfaalfalfalfaalfalfalfaalfalfalfaalfalfalfaalfalfalfaalfa"); expectedOccs.push_back({});
subjects.push_back("alfalfalfaalfalfalfaalfalfalfaalfalfalfaalfalfalfaalfalfalfaalfaa"); patterns.push_back("alfalfalfaalfalfalfaalfalfalfaalfalfalfaalfalfalfaalfalfalfaalfaa"); expectedOccs.push_back({0});
subjects.push_back("atggccttgcc"); patterns.push_back("gcc"); expectedOccs.push_back({3,8});
subjects.push_back("aaaaaaaaaa"); patterns.push_back("a"); expectedOccs.push_back({0,1,2,3,4,5,6,7,8,9});
for(size_t i = 0; i < subjects.size(); ++i) {
string::String subject = string::stringFrom ( subjects[i] );
string::String pattern = string::stringFrom ( patterns[i] );
std::set < unsigned > res = stringology::exact::BackwardDAWGMatching::match ( subject, pattern );
std::cout << subjects[i] << ' ' << patterns[i] << ' ' << res << std::endl;
CPPUNIT_ASSERT ( res == expectedOccs[i] );
}
auto longSubject = string::generate::RandomStringFactory::generateLinearString (64 * 64, 512, false, true);
auto longPattern = string::generate::RandomSubstringFactory::generateSubstring(64 * 32, longSubject );
std::set < unsigned > res = stringology::exact::BackwardDAWGMatching::match ( longSubject, longPattern );
std::cout << "long: " << res << std::endl;
CPPUNIT_ASSERT ( res.size() > 0 );
}
...@@ -6,6 +6,7 @@ ...@@ -6,6 +6,7 @@
class SuffixAutomatonTest : public CppUnit::TestFixture { class SuffixAutomatonTest : public CppUnit::TestFixture {
CPPUNIT_TEST_SUITE ( SuffixAutomatonTest ); CPPUNIT_TEST_SUITE ( SuffixAutomatonTest );
CPPUNIT_TEST ( testSuffixAutomatonConstruction ); CPPUNIT_TEST ( testSuffixAutomatonConstruction );
CPPUNIT_TEST ( testBackwardDAWGMatching );
CPPUNIT_TEST_SUITE_END ( ); CPPUNIT_TEST_SUITE_END ( );
   
public: public:
...@@ -13,6 +14,8 @@ public: ...@@ -13,6 +14,8 @@ public:
void tearDown ( ); void tearDown ( );
   
void testSuffixAutomatonConstruction ( ); void testSuffixAutomatonConstruction ( );
void testBackwardDAWGMatching ( );
}; };
   
#endif // SUFFIX_AUTOMATON_TEST_HPP_ #endif // SUFFIX_AUTOMATON_TEST_HPP_
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment