From 16bede71112a8475ede5c23c9873f0012851bbbe Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Radovan=20=C4=8Cerven=C3=BD?= <radovan.cerveny@gmail.com>
Date: Fri, 1 Apr 2016 15:58:41 +0200
Subject: [PATCH] implemented backward nondeterministic dawg matching algo

---
 .../BackwardNondeterministicDAWGMatching.cpp  | 103 ++++++++++++++++++
 .../BackwardNondeterministicDAWGMatching.hpp  |  44 ++++++++
 ...ckwardNondeterministicDAWGMatchingTest.cpp |  53 +++++++++
 ...BackwardNondeterministicDAWGMatchingTest.h |  18 +++
 astringology2/src/astringology.cpp            |  15 +++
 5 files changed, 233 insertions(+)
 create mode 100644 alib2algo/src/stringology/exact/BackwardNondeterministicDAWGMatching.cpp
 create mode 100644 alib2algo/src/stringology/exact/BackwardNondeterministicDAWGMatching.hpp
 create mode 100644 alib2algo/test-src/stringology/exact/BackwardNondeterministicDAWGMatchingTest.cpp
 create mode 100644 alib2algo/test-src/stringology/exact/BackwardNondeterministicDAWGMatchingTest.h

diff --git a/alib2algo/src/stringology/exact/BackwardNondeterministicDAWGMatching.cpp b/alib2algo/src/stringology/exact/BackwardNondeterministicDAWGMatching.cpp
new file mode 100644
index 0000000000..89a808fcb9
--- /dev/null
+++ b/alib2algo/src/stringology/exact/BackwardNondeterministicDAWGMatching.cpp
@@ -0,0 +1,103 @@
+/*
+ * Author: Radovan Cerveny
+ */
+
+#include "BackwardNondeterministicDAWGMatching.hpp"
+
+#include <exception/AlibException.h>
+#include <string/LinearString.h>
+#include <alphabet/Symbol.h>
+
+#include <map>
+#include <measure>
+
+namespace stringology {
+
+namespace exact {
+
+std::set < unsigned > BackwardNondeterministicDAWGMatching::match ( const string::String & subject, const string::String & pattern ) { // entry point for wrapped strings
+    return getInstance ( ).dispatch ( subject.getData ( ), pattern.getData ( ) ); // hands the data of both strings to the dispatcher instance; presumably resolved by their concrete types - confirm
+}
+
+/**
+ * Finds every occurrence of pattern in subject with bit-parallel BNDM. Only the first 64 pattern symbols are tracked in the
+ * bitmask; the tail of a longer pattern is compared symbol by symbol. An empty pattern occurs at every position 0..|subject|.
+ * @return set of positions in subject at which an occurrence of pattern starts (empty if pattern is longer than subject)
+ */
+std::set < unsigned > BackwardNondeterministicDAWGMatching::match ( const string::LinearString & subject, const string::LinearString & pattern ) {
+    using Bitmask = unsigned long long int;
+    const auto & subjectContent = subject.getContent ( );
+    const auto & patternContent = pattern.getContent ( );
+    const size_t subjectLength = subjectContent.size ( );
+    const size_t patternLength = patternContent.size ( );
+    std::set < unsigned > occ;
+
+     // Without this guard the highest-bit mask below would be shifted by ( 0 - 1 ) bits, which is undefined
+    if ( patternLength == 0 ) {
+        for ( size_t pos = 0; pos <= subjectLength; pos++ ) occ.insert ( pos );
+        return occ;
+    }
+
+    measurements::start ( "Preprocess", measurements::Type::PREPROCESS );
+     // At most 64 symbols fit into one bitmask; for exactly 64 the shifted highest bit is 0 and 0 - 1 wraps to all ones
+    const bool longPattern = 64 < patternLength;
+    const size_t bitmaskLength = longPattern ? 64 : patternLength;
+    const Bitmask highestBitBitmask = 1ULL << ( bitmaskLength - 1 );
+    const Bitmask allOnesBitmask = ( highestBitBitmask << 1 ) - 1;
+
+     // Initialize the bitmasks with zeros for each symbol in the alphabet
+    std::map < alphabet::Symbol, Bitmask > symbolBitmaskLookupTable;
+    for ( const auto & symbol : pattern.getAlphabet ( ) )
+        symbolBitmaskLookupTable[symbol] = 0ULL;
+
+     // Mark the position in the bitmask for each symbol in the pattern, first symbol in the highest bit
+    for ( size_t i = 0; i < bitmaskLength; i++ )
+        symbolBitmaskLookupTable[patternContent.at ( i )] |= 1ULL << ( bitmaskLength - i - 1 );
+
+    measurements::end ( );
+    measurements::start ( "Algorithm", measurements::Type::ALGORITHM );
+
+    size_t posInSubject = 0;
+     // Compared as a sum: subjectLength - patternLength would wrap around for a pattern longer than the subject
+    while ( posInSubject + patternLength <= subjectLength ) {
+        size_t posInPattern = bitmaskLength;
+        size_t lastPosOfFactor = bitmaskLength;
+        Bitmask currentBitmask = ~0ULL;
+
+        while ( currentBitmask != 0ULL ) {
+             // Symbols missing from the table get a zero mask from operator[] and end the scan of this window
+            currentBitmask &= symbolBitmaskLookupTable[subjectContent.at ( posInSubject + posInPattern - 1 )];
+            posInPattern--;
+
+            if ( ( currentBitmask & highestBitBitmask ) != 0 ) {
+                if ( posInPattern > 0 ) {
+                    lastPosOfFactor = posInPattern;
+                } else if ( !longPattern ) {
+                     // Yay, there is match!!!
+                    occ.insert ( posInSubject );
+                } else {
+                     // if the pattern is longer than 64 characters switch to brute force check
+                    size_t k = bitmaskLength;
+                    while ( k < patternLength && patternContent.at ( k ) == subjectContent.at ( posInSubject + k ) ) k++;
+                    if ( k == patternLength )
+                         // Yay, there is match!!!
+                        occ.insert ( posInSubject );
+                }
+            }
+
+             // We need to trim excess ones in case the pattern is shorter than 64 characters
+            currentBitmask = ( currentBitmask << 1 ) & allOnesBitmask;
+        }
+
+        posInSubject += lastPosOfFactor;
+    }
+
+    measurements::end ( );
+    return occ;
+}
+
+auto BackwardNondeterministicDAWGMatchingLinearStringLinearString = BackwardNondeterministicDAWGMatching::RegistratorWrapper < std::set < unsigned >, string::LinearString, string::LinearString > ( BackwardNondeterministicDAWGMatching::getInstance ( ), BackwardNondeterministicDAWGMatching::match ); // NOTE(review): presumably registers the LinearString/LinearString overload of match with the dispatcher instance - confirm against RegistratorWrapper
+
+} /* namespace exact */
+
+} /* namespace stringology */
diff --git a/alib2algo/src/stringology/exact/BackwardNondeterministicDAWGMatching.hpp b/alib2algo/src/stringology/exact/BackwardNondeterministicDAWGMatching.hpp
new file mode 100644
index 0000000000..9426e0d154
--- /dev/null
+++ b/alib2algo/src/stringology/exact/BackwardNondeterministicDAWGMatching.hpp
@@ -0,0 +1,44 @@
+
+/*
+ * Author: Radovan Cerveny
+ */
+
+#ifndef _STRINGOLOGY_BACKWARD_NONDETERMINISTIC_DAWG_MATCHING_H_
+#define _STRINGOLOGY_BACKWARD_NONDETERMINISTIC_DAWG_MATCHING_H_
+
+#include <string/String.h>
+#include <string/StringFeatures.h>
+#include <core/multipleDispatch.hpp>
+
+#include <set>
+
+namespace stringology {
+
+namespace exact {
+
+/**
+ * Implementation of Backward Nondeterministic DAWG Matching using bit parallelism with a 64 bit bitmask and a brute force switch for longer patterns.
+ */
+class BackwardNondeterministicDAWGMatching : public std::DoubleDispatch < std::set < unsigned >, string::StringBase, string::StringBase > {
+public:
+    /**
+     * Search for pattern in subject; passes the data of both strings on to the dispatcher instance.
+     * @return set of occurrences (positions in subject)
+     */
+    static std::set < unsigned > match ( const string::String & subject, const string::String & pattern );
+    /** Search for pattern in a linear string; returns the set of occurrences (positions in subject). */
+    static std::set < unsigned > match ( const string::LinearString & subject, const string::LinearString & pattern );
+
+    static BackwardNondeterministicDAWGMatching & getInstance ( ) {
+        static BackwardNondeterministicDAWGMatching res; // function-local static, so every call returns the same object
+
+        return res;
+    }
+
+};
+
+} /* namespace exact */
+
+} /* namespace stringology */
+
+#endif /* _STRINGOLOGY_BACKWARD_NONDETERMINISTIC_DAWG_MATCHING_H_ */
diff --git a/alib2algo/test-src/stringology/exact/BackwardNondeterministicDAWGMatchingTest.cpp b/alib2algo/test-src/stringology/exact/BackwardNondeterministicDAWGMatchingTest.cpp
new file mode 100644
index 0000000000..65d8c4ad06
--- /dev/null
+++ b/alib2algo/test-src/stringology/exact/BackwardNondeterministicDAWGMatchingTest.cpp
@@ -0,0 +1,53 @@
+#include "BackwardNondeterministicDAWGMatchingTest.h"
+
+#include "string/String.h"
+#include "stringology/exact/BackwardNondeterministicDAWGMatching.hpp"
+
+#include "string/generate/RandomStringFactory.h"
+#include "string/generate/RandomSubstringFactory.h"
+
+#define CPPUNIT_IMPLY( x, y )    CPPUNIT_ASSERT ( !( x ) || ( y ) )
+
+CPPUNIT_TEST_SUITE_NAMED_REGISTRATION ( BackwardNondeterministicDAWGMatchingTest, "stringology" );
+CPPUNIT_TEST_SUITE_REGISTRATION ( BackwardNondeterministicDAWGMatchingTest );
+
+void BackwardNondeterministicDAWGMatchingTest::setUp ( ) { // empty: testBNDM builds all of its inputs itself
+}
+
+void BackwardNondeterministicDAWGMatchingTest::tearDown ( ) { // empty: setUp prepares nothing that would need releasing
+}
+
+void BackwardNondeterministicDAWGMatchingTest::testBNDM ( ) {
+    // Fixed inputs: subject, pattern and the set of occurrences the matcher is expected to report
+    struct TestCase { std::string subject; std::string pattern; std::set < unsigned > expectedOccs; };
+
+    std::vector < TestCase > testCases = {
+        { "a", "a", { 0 } },
+        { "a", "b", { } },
+        { "alfalfalfa", "alfalfalfa", { 0 } },
+        { "alfalfalfa", "blfalfalfa", { } },
+        { "alfalfalfa", "alfalfalfb", { } },
+        { "alfalfalfaalfalfalfaalfalfalfaalfalfalfaalfalfalfaalfalfalfaalfa", "alfalfalfaalfalfalfaalfalfalfaalfalfalfaalfalfalfaalfalfalfaalfa", { 0 } },
+        { "alfalfalfaalfalfalfaabfalfalfaalfalfalfaalfalfalfaalfalfalfaalfa", "alfalfalfaalfalfalfaalfalfalfaalfalfalfaalfalfalfaalfalfalfaalfa", { } },
+        { "alfalfalfaalfalfalfaalfalfalfaalfalfalfaalfalfalfaalfalfalfaalfaa", "alfalfalfaalfalfalfaalfalfalfaalfalfalfaalfalfalfaalfalfalfaalfaa", { 0 } },
+        { "atggccttgcc", "gcc", { 3, 8 } },
+        { "aaaaaaaaaa", "a", { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9 } },
+    };
+
+    for ( TestCase & testCase : testCases ) {
+        string::String subject = string::stringFrom ( testCase.subject );
+        string::String pattern = string::stringFrom ( testCase.pattern );
+        std::set < unsigned > res = stringology::exact::BackwardNondeterministicDAWGMatching::match ( subject, pattern );
+
+        std::cout << testCase.subject << ' ' << testCase.pattern << ' ' << res << std::endl;
+        CPPUNIT_ASSERT ( res == testCase.expectedOccs );
+    }
+
+    // Generated subject with a pattern produced from it by RandomSubstringFactory; only a non-empty result is required
+    auto longSubject = string::generate::RandomStringFactory::generateLinearString (64 * 64 * 64, 512, false, true);
+    auto longPattern = string::generate::RandomSubstringFactory::generateSubstring(64 * 32 * 32, longSubject );
+    std::set < unsigned > res = stringology::exact::BackwardNondeterministicDAWGMatching::match ( longSubject, longPattern );
+    std::cout << "long: " << res << std::endl;
+    CPPUNIT_ASSERT ( res.size() > 0 );
+
+}
diff --git a/alib2algo/test-src/stringology/exact/BackwardNondeterministicDAWGMatchingTest.h b/alib2algo/test-src/stringology/exact/BackwardNondeterministicDAWGMatchingTest.h
new file mode 100644
index 0000000000..6d1116d691
--- /dev/null
+++ b/alib2algo/test-src/stringology/exact/BackwardNondeterministicDAWGMatchingTest.h
@@ -0,0 +1,18 @@
+#ifndef BACKWARD_NONDETERMINISTIC_DAWG_MATCHING_TEST
+#define BACKWARD_NONDETERMINISTIC_DAWG_MATCHING_TEST
+
+#include <cppunit/extensions/HelperMacros.h>
+
+// CppUnit fixture with a single test, testBNDM, for the BackwardNondeterministicDAWGMatching matcher
+class BackwardNondeterministicDAWGMatchingTest : public CppUnit::TestFixture {
+    CPPUNIT_TEST_SUITE ( BackwardNondeterministicDAWGMatchingTest );
+    CPPUNIT_TEST ( testBNDM );
+    CPPUNIT_TEST_SUITE_END ( );
+public:
+    void setUp ( );
+    void tearDown ( );
+    // Runs the matcher on fixed subject/pattern pairs and on one long generated input
+    void testBNDM ( );
+};
+
+#endif // BACKWARD_NONDETERMINISTIC_DAWG_MATCHING_TEST
diff --git a/astringology2/src/astringology.cpp b/astringology2/src/astringology.cpp
index 42e9daa01c..52352d0cca 100644
--- a/astringology2/src/astringology.cpp
+++ b/astringology2/src/astringology.cpp
@@ -23,6 +23,7 @@
 #include <stringology/exact/BoyerMooreHorspool.h>
 #include <stringology/exact/ReversedBoyerMooreHorspool.h>
 #include <stringology/exact/DeadZoneUsingBadCharacterShift.h>
+#include <stringology/exact/BackwardNondeterministicDAWGMatching.hpp>
 #include <stringology/exact/ExactMatchingAutomaton.h>
 #include <stringology/exact/ExactFactorAutomaton.h>
 #include <stringology/exact/ExactSubsequenceAutomaton.h>
@@ -45,6 +46,7 @@ int main ( int argc, char * argv[] ) {
 		allowed.push_back ( "boyerMooreHorspool" );
 		allowed.push_back ( "reversedBoyerMooreHorspool" );
 		allowed.push_back ( "deadZoneUsingBadCharacterShift" );
+		allowed.push_back ( "backwardNondeterministicDAWGMatching" );
 		allowed.push_back ( "borderArray" );
 		allowed.push_back ( "suffixTrie" );
 		TCLAP::ValuesConstraint < std::string > allowedVals ( allowed );
@@ -125,6 +127,19 @@ int main ( int argc, char * argv[] ) {
 			measurements::end ( );
 			measurements::start ( "Output write", measurements::Type::AUXILIARY );
 
+			alib::XmlDataFactory::toStdout ( res );
+		} else if ( algorithm.getValue ( ) == "backwardNondeterministicDAWGMatching" ) {
+			string::String subject = alib::XmlDataFactory::fromTokens < string::String > ( std::move ( sax::FromXMLParserHelper::parseInput(true, subjectInput).front ( ) ) );
+			string::String pattern = alib::XmlDataFactory::fromTokens < string::String > ( std::move ( sax::FromXMLParserHelper::parseInput(true, patternInput).front ( ) ) );
+
+			measurements::end ( );
+			measurements::start ( "Algorithm", measurements::Type::MAIN );
+
+			std::set < unsigned > res = stringology::exact::BackwardNondeterministicDAWGMatching::match ( subject, pattern );
+
+			measurements::end ( );
+			measurements::start ( "Output write", measurements::Type::AUXILIARY );
+
 			alib::XmlDataFactory::toStdout ( res );
 		} else if ( algorithm.getValue ( ) == "exactMatchingAutomaton" ) {
 			string::String pattern = alib::XmlDataFactory::fromTokens < string::String > ( std::move ( sax::FromXMLParserHelper::parseInput(true, patternInput).front ( ) ) );
-- 
GitLab