From b9bfb0bbb05264d4cf9ff96b96f638a9d991b85c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Radovan=20=C4=8Cerven=C3=BD?= <radovan.cerveny@gmail.com>
Date: Fri, 1 Apr 2016 18:19:30 +0200
Subject: [PATCH] somewhat cleaner implementation of BNDM using bitset

---
 .../BackwardNondeterministicDAWGMatching.cpp  | 82 +++++++++++++------
 .../BackwardNondeterministicDAWGMatching.hpp  | 20 ++++-
 ...ckwardNondeterministicDAWGMatchingTest.cpp |  7 +-
 3 files changed, 78 insertions(+), 31 deletions(-)

diff --git a/alib2algo/src/stringology/exact/BackwardNondeterministicDAWGMatching.cpp b/alib2algo/src/stringology/exact/BackwardNondeterministicDAWGMatching.cpp
index 89a808fcb9..f170318045 100644
--- a/alib2algo/src/stringology/exact/BackwardNondeterministicDAWGMatching.cpp
+++ b/alib2algo/src/stringology/exact/BackwardNondeterministicDAWGMatching.cpp
@@ -9,68 +9,67 @@
 #include <alphabet/Symbol.h>
 
 #include <map>
+#include <bitset>
 #include <measure>
 
 namespace stringology {
 
 namespace exact {
 
-std::set < unsigned > BackwardNondeterministicDAWGMatching::match ( const string::String & subject, const string::String & pattern ) {
+template < size_t BitmaskBitCount > // NOTE(review): BitmaskBitCount is not used by this overload; dispatch ( ) presumably invokes the registered callback (match, i.e. the 32 bit variant), so match64 / match128 on string::String may not use the requested width - confirm
+std::set < unsigned > BackwardNondeterministicDAWGMatching::matchTemplate ( const string::String & subject, const string::String & pattern ) {
     return getInstance ( ).dispatch ( subject.getData ( ), pattern.getData ( ) );
 }
 
-std::set < unsigned > BackwardNondeterministicDAWGMatching::match ( const string::LinearString & subject, const string::LinearString & pattern ) {
-    using Bitmask = unsigned long long int;
-
+template < size_t BitmaskBitCount >
+std::set < unsigned > BackwardNondeterministicDAWGMatching::matchTemplate ( const string::LinearString & subject, const string::LinearString & pattern ) {
     std::set < unsigned > occ;
 
-    measurements::start ( "Preprocess", measurements::Type::PREPROCESS );
-
-    std::map < alphabet::Symbol, Bitmask > symbolBitmaskLookupTable;
-    bool longPattern;
-    size_t bitmaskLength;
-    Bitmask highestBitBitmask;
-    Bitmask allOnesBitmask;
-
      // Setup helper variables
-    longPattern = 64 < pattern.getContent ( ).size ( );
-    bitmaskLength = longPattern ? 64 : pattern.getContent ( ).size ( );
+    using BitmaskType = std::bitset < BitmaskBitCount >;
+    bool patternIsLong = BitmaskBitCount < pattern.getContent ( ).size ( );
+    size_t bitmaskLength = patternIsLong ? BitmaskBitCount : pattern.getContent ( ).size ( );
+
+    measurements::start ( "Preprocess", measurements::Type::PREPROCESS );
 
-    highestBitBitmask = 1ULL << ( bitmaskLength - 1 );
-    allOnesBitmask = ( highestBitBitmask << 1 ) - 1;
+    std::map < alphabet::Symbol, BitmaskType > symbolBitmaskLookupTable;
 
      // Initialize the bitmasks with zeros for each symbol in the alphabet
     for ( const auto & symbol : pattern.getAlphabet ( ) )
-        symbolBitmaskLookupTable[symbol] = 0ULL;
+        symbolBitmaskLookupTable[symbol] = BitmaskType ( 0 );
 
      // Mark the position in the bitmask for each symbol in the pattern
     for ( size_t i = 0; i < bitmaskLength; i++ )
-        symbolBitmaskLookupTable[pattern.getContent ( ).at ( i )] |= 1ULL << ( bitmaskLength - i - 1 );
+        symbolBitmaskLookupTable[pattern.getContent ( ).at ( i )].set ( bitmaskLength - i - 1 );
 
     measurements::end ( );
 
     measurements::start ( "Algorithm", measurements::Type::ALGORITHM );
 
     size_t posInSubject = 0;
+    BitmaskType currentBitmask;
 
     while ( posInSubject <= subject.getContent ( ).size ( ) - pattern.getContent ( ).size ( ) ) {
         size_t posInPattern = bitmaskLength;
         size_t lastPosOfFactor = bitmaskLength;
-        Bitmask currentBitmask = ~0ULL;
 
-        while ( currentBitmask != 0ULL ) {
-            currentBitmask = currentBitmask & symbolBitmaskLookupTable[subject.getContent ( ).at ( posInSubject + posInPattern - 1 )];
+         // Set the bitmask to all ones
+        currentBitmask.set ( );
+
+        while ( posInPattern > 0 && currentBitmask.any ( ) ) {
+            currentBitmask &= symbolBitmaskLookupTable[subject.getContent ( ).at ( posInSubject + posInPattern - 1 )];
             posInPattern--;
 
-            if ( ( currentBitmask & highestBitBitmask ) != 0 ) {
+             // Test whether the most significant bit is set
+            if ( currentBitmask.test ( bitmaskLength - 1 ) ) {
                 if ( posInPattern > 0 ) {
                     lastPosOfFactor = posInPattern;
                 } else {
-                    if ( !longPattern ) {
+                    if ( !patternIsLong ) {
                          // Yay, there is match!!!
                         occ.insert ( posInSubject );
                     } else {
-                         // if the pattern is longer then 64 characters switch to brute force check
+                         // if the pattern is longer than BitmaskBitCount symbols, switch to brute force check of the remainder
                         size_t k = bitmaskLength;
 
                         while ( k < pattern.getContent ( ).size ( ) && pattern.getContent ( ).at ( k ) == subject.getContent ( ).at ( posInSubject + k ) ) k++;
@@ -83,9 +82,6 @@ std::set < unsigned > BackwardNondeterministicDAWGMatching::match ( const string
             }
 
             currentBitmask <<= 1;
-
-             // We need to trim excess ones in case the pattern is shorter then 64 characters
-            currentBitmask &= allOnesBitmask;
         }
 
         posInSubject += lastPosOfFactor;
@@ -96,6 +92,38 @@ std::set < unsigned > BackwardNondeterministicDAWGMatching::match ( const string
     return occ;
 }
 
+std::set < unsigned > BackwardNondeterministicDAWGMatching::match ( const string::String & subject, const string::String & pattern ) {
+    return match32 ( subject, pattern ); // default entry point: forwards to the 32 bit variant
+}
+
+std::set < unsigned > BackwardNondeterministicDAWGMatching::match ( const string::LinearString & subject, const string::LinearString & pattern ) {
+    return match32 ( subject, pattern ); // default entry point: forwards to the 32 bit variant
+}
+
+std::set < unsigned > BackwardNondeterministicDAWGMatching::match32 ( const string::String & subject, const string::String & pattern ) {
+    return matchTemplate < 32 > ( subject, pattern ); // String overload of the template, instantiated for 32 bits
+}
+
+std::set < unsigned > BackwardNondeterministicDAWGMatching::match32 ( const string::LinearString & subject, const string::LinearString & pattern ) {
+    return matchTemplate < 32 > ( subject, pattern ); // bitset matcher with a 32 bit wide bitmask
+}
+
+std::set < unsigned > BackwardNondeterministicDAWGMatching::match64 ( const string::String & subject, const string::String & pattern ) {
+    return matchTemplate < 64 > ( subject, pattern ); // NOTE(review): the String overload of matchTemplate goes through dispatch ( ), whose registered callback is match (32 bit) - confirm 64 bits are really used
+}
+
+std::set < unsigned > BackwardNondeterministicDAWGMatching::match64 ( const string::LinearString & subject, const string::LinearString & pattern ) {
+    return matchTemplate < 64 > ( subject, pattern ); // bitset matcher with a 64 bit wide bitmask
+}
+
+std::set < unsigned > BackwardNondeterministicDAWGMatching::match128 ( const string::String & subject, const string::String & pattern ) {
+    return matchTemplate < 128 > ( subject, pattern ); // NOTE(review): the String overload of matchTemplate goes through dispatch ( ), whose registered callback is match (32 bit) - confirm 128 bits are really used
+}
+
+std::set < unsigned > BackwardNondeterministicDAWGMatching::match128 ( const string::LinearString & subject, const string::LinearString & pattern ) {
+    return matchTemplate < 128 > ( subject, pattern ); // bitset matcher with a 128 bit wide bitmask
+}
+
 auto BackwardNondeterministicDAWGMatchingLinearStringLinearString = BackwardNondeterministicDAWGMatching::RegistratorWrapper < std::set < unsigned >, string::LinearString, string::LinearString > ( BackwardNondeterministicDAWGMatching::getInstance ( ), BackwardNondeterministicDAWGMatching::match );
 
 } /* namespace exact */
diff --git a/alib2algo/src/stringology/exact/BackwardNondeterministicDAWGMatching.hpp b/alib2algo/src/stringology/exact/BackwardNondeterministicDAWGMatching.hpp
index 9426e0d154..5b43fe87b1 100644
--- a/alib2algo/src/stringology/exact/BackwardNondeterministicDAWGMatching.hpp
+++ b/alib2algo/src/stringology/exact/BackwardNondeterministicDAWGMatching.hpp
@@ -20,15 +20,31 @@ namespace exact {
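- * Implementation of Backward Nondeterministic DAWG Matching using bit parallelism with 64bit bitmask and brute force switch for longer patterns.
+ * Implementation of Backward Nondeterministic DAWG Matching using bit parallelism with a 32, 64 or 128 bit bitmask (match uses 32 bits) and brute force switch for longer patterns.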
  */
 class BackwardNondeterministicDAWGMatching : public std::DoubleDispatch < std::set < unsigned >, string::StringBase, string::StringBase > {
-public:
+private:
     /**
      * Search for pattern in linear string.
      * @return set set of occurences
      */
-    static std::set < unsigned > match ( const string::String & subject, const string::String & pattern );
+    template <size_t BitmaskBitCount>
+    static std::set < unsigned > matchTemplate ( const string::String & subject, const string::String & pattern );
 
+    template <size_t BitmaskBitCount >
+    static std::set < unsigned > matchTemplate ( const string::LinearString & subject, const string::LinearString & pattern );
+public:
+
+    static std::set < unsigned > match ( const string::String & subject, const string::String & pattern );
     static std::set < unsigned > match ( const string::LinearString & subject, const string::LinearString & pattern );
 
+    static std::set < unsigned > match32 ( const string::String & subject, const string::String & pattern );
+    static std::set < unsigned > match32 ( const string::LinearString & subject, const string::LinearString & pattern );
+
+    static std::set < unsigned > match64 ( const string::String & subject, const string::String & pattern );
+    static std::set < unsigned > match64 ( const string::LinearString & subject, const string::LinearString & pattern );
+
+    static std::set < unsigned > match128 ( const string::String & subject, const string::String & pattern );
+    static std::set < unsigned > match128 ( const string::LinearString & subject, const string::LinearString & pattern );
+
+
     static BackwardNondeterministicDAWGMatching & getInstance ( ) {
         static BackwardNondeterministicDAWGMatching res;
 
diff --git a/alib2algo/test-src/stringology/exact/BackwardNondeterministicDAWGMatchingTest.cpp b/alib2algo/test-src/stringology/exact/BackwardNondeterministicDAWGMatchingTest.cpp
index 65d8c4ad06..4059bf1f7b 100644
--- a/alib2algo/test-src/stringology/exact/BackwardNondeterministicDAWGMatchingTest.cpp
+++ b/alib2algo/test-src/stringology/exact/BackwardNondeterministicDAWGMatchingTest.cpp
@@ -39,9 +39,12 @@ void BackwardNondeterministicDAWGMatchingTest::testBNDM ( ) {
         string::String subject = string::stringFrom ( subjects[i] );
         string::String pattern = string::stringFrom ( patterns[i] );
         std::set < unsigned > res = stringology::exact::BackwardNondeterministicDAWGMatching::match ( subject, pattern );
-
-        std::cout << subjects[i] << ' ' << patterns[i] << ' ' << res << std::endl;
         CPPUNIT_ASSERT ( res == expectedOccs[i] );
+        res = stringology::exact::BackwardNondeterministicDAWGMatching::match64 ( subject, pattern );
+        CPPUNIT_ASSERT ( res == expectedOccs[i] );
+        res = stringology::exact::BackwardNondeterministicDAWGMatching::match128 ( subject, pattern );
+        CPPUNIT_ASSERT ( res == expectedOccs[i] );
+        std::cout << subjects[i] << ' ' << patterns[i] << ' ' << res << std::endl;
     }
 
     auto longSubject = string::generate::RandomStringFactory::generateLinearString (64 * 64 * 64, 512, false, true);
-- 
GitLab