From 16bede71112a8475ede5c23c9873f0012851bbbe Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Radovan=20=C4=8Cerven=C3=BD?= <radovan.cerveny@gmail.com>
Date: Fri, 1 Apr 2016 15:58:41 +0200
Subject: [PATCH] implemented backward nondeterministic dawg matching algo

---
Review note: match ( ) now returns an empty set for an empty pattern and for a
pattern longer than the subject. The blob ids on the index lines below were not
regenerated after the review changes.

 .../BackwardNondeterministicDAWGMatching.cpp  | 121 ++++++++++++++++++
 .../BackwardNondeterministicDAWGMatching.hpp  |  46 +++++++
 ...ckwardNondeterministicDAWGMatchingTest.cpp |  54 ++++++++
 ...BackwardNondeterministicDAWGMatchingTest.h |  18 +++
 astringology2/src/astringology.cpp            |  15 +++
 5 files changed, 254 insertions(+)
 create mode 100644 alib2algo/src/stringology/exact/BackwardNondeterministicDAWGMatching.cpp
 create mode 100644 alib2algo/src/stringology/exact/BackwardNondeterministicDAWGMatching.hpp
 create mode 100644 alib2algo/test-src/stringology/exact/BackwardNondeterministicDAWGMatchingTest.cpp
 create mode 100644 alib2algo/test-src/stringology/exact/BackwardNondeterministicDAWGMatchingTest.h

diff --git a/alib2algo/src/stringology/exact/BackwardNondeterministicDAWGMatching.cpp b/alib2algo/src/stringology/exact/BackwardNondeterministicDAWGMatching.cpp
new file mode 100644
index 0000000000..89a808fcb9
--- /dev/null
+++ b/alib2algo/src/stringology/exact/BackwardNondeterministicDAWGMatching.cpp
@@ -0,0 +1,121 @@
+/*
+ * Author: Radovan Cerveny
+ */
+
+#include "BackwardNondeterministicDAWGMatching.hpp"
+
+#include <exception/AlibException.h>
+#include <string/LinearString.h>
+#include <alphabet/Symbol.h>
+
+#include <map>
+#include <measure>
+
+namespace stringology {
+
+namespace exact {
+
+/**
+ * Hands the data of both strings to the double dispatch instance, which calls the overload registered for them.
+ */
+std::set < unsigned > BackwardNondeterministicDAWGMatching::match ( const string::String & subject, const string::String & pattern ) {
+	return getInstance ( ).dispatch ( subject.getData ( ), pattern.getData ( ) );
+}
+
+/**
+ * Bit-parallel backward matching of pattern in subject using a bitmask of at most 64 bits.
+ * A pattern longer than 64 symbols is located by its first 64 symbols; its remaining symbols are compared one by one.
+ *
+ * @param subject string to search in
+ * @param pattern string to search for
+ * @return set of positions in subject where an occurrence of pattern starts; empty if pattern is empty or longer than subject
+ */
+std::set < unsigned > BackwardNondeterministicDAWGMatching::match ( const string::LinearString & subject, const string::LinearString & pattern ) {
+	using Bitmask = unsigned long long int;
+
+	std::set < unsigned > occ;
+
+	const auto & subjectContent = subject.getContent ( );
+	const auto & patternContent = pattern.getContent ( );
+
+	// An empty pattern would shift by ( 0 - 1 ) bits below and a subject shorter than the pattern would wrap
+	// the unsigned bound of the search loop, so neither is searched; returning here keeps the measurement frames balanced
+	if ( patternContent.size ( ) == 0 || subjectContent.size ( ) < patternContent.size ( ) )
+		return occ;
+
+	measurements::start ( "Preprocess", measurements::Type::PREPROCESS );
+
+	std::map < alphabet::Symbol, Bitmask > symbolBitmaskLookupTable;
+
+	// Setup helper variables
+	const bool longPattern = 64 < patternContent.size ( );
+	const size_t bitmaskLength = longPattern ? 64 : patternContent.size ( );
+
+	const Bitmask highestBitBitmask = 1ULL << ( bitmaskLength - 1 );
+	// With a 64 bit window the shift yields zero and the subtraction wraps around to all ones
+	const Bitmask allOnesBitmask = ( highestBitBitmask << 1 ) - 1;
+
+	// Initialize the bitmasks with zeros for each symbol in the alphabet
+	for ( const auto & symbol : pattern.getAlphabet ( ) )
+		symbolBitmaskLookupTable[symbol] = 0ULL;
+
+	// Mark the position in the bitmask for each symbol in the pattern
+	for ( size_t i = 0; i < bitmaskLength; i++ )
+		symbolBitmaskLookupTable[patternContent.at ( i )] |= 1ULL << ( bitmaskLength - i - 1 );
+
+	measurements::end ( );
+
+	measurements::start ( "Algorithm", measurements::Type::ALGORITHM );
+
+	size_t posInSubject = 0;
+
+	// Compared as a sum so that the unsigned arithmetic can not wrap around
+	while ( posInSubject + patternContent.size ( ) <= subjectContent.size ( ) ) {
+		size_t posInPattern = bitmaskLength;
+		size_t lastPosOfFactor = bitmaskLength;
+		Bitmask currentBitmask = ~0ULL;
+
+		while ( currentBitmask != 0ULL ) {
+			currentBitmask = currentBitmask & symbolBitmaskLookupTable[subjectContent.at ( posInSubject + posInPattern - 1 )];
+			posInPattern--;
+
+			// The highest bit is set when the part of the window read so far equals a prefix of the pattern
+			if ( ( currentBitmask & highestBitBitmask ) != 0 ) {
+				if ( posInPattern > 0 ) {
+					lastPosOfFactor = posInPattern;
+				} else {
+					if ( !longPattern ) {
+						// Yay, there is match!!!
+						occ.insert ( posInSubject );
+					} else {
+						// if the pattern is longer than 64 characters switch to brute force check
+						size_t k = bitmaskLength;
+
+						while ( k < patternContent.size ( ) && patternContent.at ( k ) == subjectContent.at ( posInSubject + k ) ) k++;
+
+						if ( k == patternContent.size ( ) )
+							// Yay, there is match!!!
+							occ.insert ( posInSubject );
+					}
+				}
+			}
+
+			currentBitmask <<= 1;
+
+			// We need to trim excess ones in case the pattern is shorter than 64 characters
+			currentBitmask &= allOnesBitmask;
+		}
+
+		posInSubject += lastPosOfFactor;
+	}
+
+	measurements::end ( );
+
+	return occ;
+}
+
+auto BackwardNondeterministicDAWGMatchingLinearStringLinearString = BackwardNondeterministicDAWGMatching::RegistratorWrapper < std::set < unsigned >, string::LinearString, string::LinearString > ( BackwardNondeterministicDAWGMatching::getInstance ( ), BackwardNondeterministicDAWGMatching::match );
+
+} /* namespace exact */
+
+} /* namespace stringology */
diff --git a/alib2algo/src/stringology/exact/BackwardNondeterministicDAWGMatching.hpp b/alib2algo/src/stringology/exact/BackwardNondeterministicDAWGMatching.hpp
new file mode 100644
index 0000000000..9426e0d154
--- /dev/null
+++ b/alib2algo/src/stringology/exact/BackwardNondeterministicDAWGMatching.hpp
@@ -0,0 +1,46 @@
+
+/*
+ * Author: Radovan Cerveny
+ */
+
+#ifndef _STRINGOLOGY_BACKWARD_NONDETERMINISTIC_DAWG_MATCHING_H_
+#define _STRINGOLOGY_BACKWARD_NONDETERMINISTIC_DAWG_MATCHING_H_
+
+#include <string/String.h>
+#include <string/StringFeatures.h>
+#include <core/multipleDispatch.hpp>
+
+#include <set>
+
+namespace stringology {
+
+namespace exact {
+
+/**
+ * Implementation of Backward Nondeterministic DAWG Matching using bit parallelism with 64bit bitmask and brute force switch for longer patterns.
+ */
+class BackwardNondeterministicDAWGMatching : public std::DoubleDispatch < std::set < unsigned >, string::StringBase, string::StringBase > {
+public:
+	/**
+	 * Search for pattern in subject.
+	 * @param subject string to search in
+	 * @param pattern string to search for
+	 * @return set of positions in subject where an occurrence of pattern starts
+	 */
+	static std::set < unsigned > match ( const string::String & subject, const string::String & pattern );
+
+	static std::set < unsigned > match ( const string::LinearString & subject, const string::LinearString & pattern );
+
+	static BackwardNondeterministicDAWGMatching & getInstance ( ) {
+		static BackwardNondeterministicDAWGMatching res;
+
+		return res;
+	}
+
+};
+
+} /* namespace exact */
+
+} /* namespace stringology */
+
+#endif /* _STRINGOLOGY_BACKWARD_NONDETERMINISTIC_DAWG_MATCHING_H_ */
diff --git a/alib2algo/test-src/stringology/exact/BackwardNondeterministicDAWGMatchingTest.cpp b/alib2algo/test-src/stringology/exact/BackwardNondeterministicDAWGMatchingTest.cpp
new file mode 100644
index 0000000000..65d8c4ad06
--- /dev/null
+++ b/alib2algo/test-src/stringology/exact/BackwardNondeterministicDAWGMatchingTest.cpp
@@ -0,0 +1,54 @@
+#include "BackwardNondeterministicDAWGMatchingTest.h"
+
+#include "string/String.h"
+#include "stringology/exact/BackwardNondeterministicDAWGMatching.hpp"
+
+#include "string/generate/RandomStringFactory.h"
+#include "string/generate/RandomSubstringFactory.h"
+
+#define CPPUNIT_IMPLY( x, y ) CPPUNIT_ASSERT ( !( x ) || ( y ) )
+
+CPPUNIT_TEST_SUITE_NAMED_REGISTRATION ( BackwardNondeterministicDAWGMatchingTest, "stringology" );
+CPPUNIT_TEST_SUITE_REGISTRATION ( BackwardNondeterministicDAWGMatchingTest );
+
+void BackwardNondeterministicDAWGMatchingTest::setUp ( ) {
+}
+
+void BackwardNondeterministicDAWGMatchingTest::tearDown ( ) {
+}
+
+void BackwardNondeterministicDAWGMatchingTest::testBNDM ( ) {
+
+	std::vector<std::string> subjects;
+	std::vector<std::string> patterns;
+	std::vector<std::set<unsigned>> expectedOccs;
+
+	subjects.push_back("a"); patterns.push_back("a"); expectedOccs.push_back({0});
+	subjects.push_back("a"); patterns.push_back("b"); expectedOccs.push_back({});
+	subjects.push_back("a"); patterns.push_back("ab"); expectedOccs.push_back({});
+	subjects.push_back("alfalfalfa"); patterns.push_back("alfalfalfa"); expectedOccs.push_back({0});
+	subjects.push_back("alfalfalfa"); patterns.push_back("blfalfalfa"); expectedOccs.push_back({});
+	subjects.push_back("alfalfalfa"); patterns.push_back("alfalfalfb"); expectedOccs.push_back({});
+	subjects.push_back("alfalfalfaalfalfalfaalfalfalfaalfalfalfaalfalfalfaalfalfalfaalfa"); patterns.push_back("alfalfalfaalfalfalfaalfalfalfaalfalfalfaalfalfalfaalfalfalfaalfa"); expectedOccs.push_back({0});
+	subjects.push_back("alfalfalfaalfalfalfaabfalfalfaalfalfalfaalfalfalfaalfalfalfaalfa"); patterns.push_back("alfalfalfaalfalfalfaalfalfalfaalfalfalfaalfalfalfaalfalfalfaalfa"); expectedOccs.push_back({});
+	subjects.push_back("alfalfalfaalfalfalfaalfalfalfaalfalfalfaalfalfalfaalfalfalfaalfaa"); patterns.push_back("alfalfalfaalfalfalfaalfalfalfaalfalfalfaalfalfalfaalfalfalfaalfaa"); expectedOccs.push_back({0});
+	subjects.push_back("atggccttgcc"); patterns.push_back("gcc"); expectedOccs.push_back({3,8});
+	subjects.push_back("aaaaaaaaaa"); patterns.push_back("a"); expectedOccs.push_back({0,1,2,3,4,5,6,7,8,9});
+
+
+	for(size_t i = 0; i < subjects.size(); ++i) {
+		string::String subject = string::stringFrom ( subjects[i] );
+		string::String pattern = string::stringFrom ( patterns[i] );
+		std::set < unsigned > res = stringology::exact::BackwardNondeterministicDAWGMatching::match ( subject, pattern );
+
+		std::cout << subjects[i] << ' ' << patterns[i] << ' ' << res << std::endl;
+		CPPUNIT_ASSERT ( res == expectedOccs[i] );
+	}
+
+	auto longSubject = string::generate::RandomStringFactory::generateLinearString (64 * 64 * 64, 512, false, true);
+	auto longPattern = string::generate::RandomSubstringFactory::generateSubstring(64 * 32 * 32, longSubject );
+	std::set < unsigned > res = stringology::exact::BackwardNondeterministicDAWGMatching::match ( longSubject, longPattern );
+	std::cout << "long: " << res << std::endl;
+	CPPUNIT_ASSERT ( res.size() > 0 );
+
+}
diff --git a/alib2algo/test-src/stringology/exact/BackwardNondeterministicDAWGMatchingTest.h b/alib2algo/test-src/stringology/exact/BackwardNondeterministicDAWGMatchingTest.h
new file mode 100644
index 0000000000..6d1116d691
--- /dev/null
+++ b/alib2algo/test-src/stringology/exact/BackwardNondeterministicDAWGMatchingTest.h
@@ -0,0 +1,18 @@
+#ifndef BACKWARD_NONDETERMINISTIC_DAWG_MATCHING_TEST
+#define BACKWARD_NONDETERMINISTIC_DAWG_MATCHING_TEST
+
+#include <cppunit/extensions/HelperMacros.h>
+
+class BackwardNondeterministicDAWGMatchingTest : public CppUnit::TestFixture {
+	CPPUNIT_TEST_SUITE ( BackwardNondeterministicDAWGMatchingTest );
+	CPPUNIT_TEST ( testBNDM );
+	CPPUNIT_TEST_SUITE_END ( );
+
+public:
+	void setUp ( );
+	void tearDown ( );
+
+	void testBNDM ( );
+};
+
+#endif // BACKWARD_NONDETERMINISTIC_DAWG_MATCHING_TEST
diff --git a/astringology2/src/astringology.cpp b/astringology2/src/astringology.cpp
index 42e9daa01c..52352d0cca 100644
--- a/astringology2/src/astringology.cpp
+++ b/astringology2/src/astringology.cpp
@@ -23,6 +23,7 @@
 #include <stringology/exact/BoyerMooreHorspool.h>
 #include <stringology/exact/ReversedBoyerMooreHorspool.h>
 #include <stringology/exact/DeadZoneUsingBadCharacterShift.h>
+#include <stringology/exact/BackwardNondeterministicDAWGMatching.hpp>
 #include <stringology/exact/ExactMatchingAutomaton.h>
 #include <stringology/exact/ExactFactorAutomaton.h>
 #include <stringology/exact/ExactSubsequenceAutomaton.h>
@@ -45,6 +46,7 @@ int main ( int argc, char * argv[] ) {
 		allowed.push_back ( "boyerMooreHorspool" );
 		allowed.push_back ( "reversedBoyerMooreHorspool" );
 		allowed.push_back ( "deadZoneUsingBadCharacterShift" );
+		allowed.push_back ( "backwardNondeterministicDAWGMatching" );
 		allowed.push_back ( "borderArray" );
 		allowed.push_back ( "suffixTrie" );
 		TCLAP::ValuesConstraint < std::string > allowedVals ( allowed );
@@ -125,6 +127,19 @@ int main ( int argc, char * argv[] ) {
 
 			measurements::end ( );
 			measurements::start ( "Output write", measurements::Type::AUXILIARY );
+			alib::XmlDataFactory::toStdout ( res );
+		} else if ( algorithm.getValue ( ) == "backwardNondeterministicDAWGMatching" ) {
+			string::String subject = alib::XmlDataFactory::fromTokens < string::String > ( std::move ( sax::FromXMLParserHelper::parseInput(true, subjectInput).front ( ) ) );
+			string::String pattern = alib::XmlDataFactory::fromTokens < string::String > ( std::move ( sax::FromXMLParserHelper::parseInput(true, patternInput).front ( ) ) );
+
+			measurements::end ( );
+			measurements::start ( "Algorithm", measurements::Type::MAIN );
+
+			std::set < unsigned > res = stringology::exact::BackwardNondeterministicDAWGMatching::match ( subject, pattern );
+
+			measurements::end ( );
+			measurements::start ( "Output write", measurements::Type::AUXILIARY );
 			alib::XmlDataFactory::toStdout ( res );
 		} else if ( algorithm.getValue ( ) == "exactMatchingAutomaton" ) {
 			string::String pattern = alib::XmlDataFactory::fromTokens < string::String > ( std::move ( sax::FromXMLParserHelper::parseInput(true, patternInput).front ( ) ) );
 
-- 
GitLab