From b9bfb0bbb05264d4cf9ff96b96f638a9d991b85c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Radovan=20=C4=8Cerven=C3=BD?= <radovan.cerveny@gmail.com> Date: Fri, 1 Apr 2016 18:19:30 +0200 Subject: [PATCH] somewhat cleaner implementation of BNDM using bitset --- .../BackwardNondeterministicDAWGMatching.cpp | 82 +++++++++++++------ .../BackwardNondeterministicDAWGMatching.hpp | 20 ++++- ...ckwardNondeterministicDAWGMatchingTest.cpp | 7 +- 3 files changed, 78 insertions(+), 31 deletions(-) diff --git a/alib2algo/src/stringology/exact/BackwardNondeterministicDAWGMatching.cpp b/alib2algo/src/stringology/exact/BackwardNondeterministicDAWGMatching.cpp index 89a808fcb9..f170318045 100644 --- a/alib2algo/src/stringology/exact/BackwardNondeterministicDAWGMatching.cpp +++ b/alib2algo/src/stringology/exact/BackwardNondeterministicDAWGMatching.cpp @@ -9,68 +9,67 @@ #include <alphabet/Symbol.h> #include <map> +#include <bitset> #include <measure> namespace stringology { namespace exact { -std::set < unsigned > BackwardNondeterministicDAWGMatching::match ( const string::String & subject, const string::String & pattern ) { +template < size_t BitmaskBitCount > +std::set < unsigned > BackwardNondeterministicDAWGMatching::matchTemplate ( const string::String & subject, const string::String & pattern ) { return getInstance ( ).dispatch ( subject.getData ( ), pattern.getData ( ) ); } -std::set < unsigned > BackwardNondeterministicDAWGMatching::match ( const string::LinearString & subject, const string::LinearString & pattern ) { - using Bitmask = unsigned long long int; - +template < size_t BitmaskBitCount > +std::set < unsigned > BackwardNondeterministicDAWGMatching::matchTemplate ( const string::LinearString & subject, const string::LinearString & pattern ) { std::set < unsigned > occ; - measurements::start ( "Preprocess", measurements::Type::PREPROCESS ); - - std::map < alphabet::Symbol, Bitmask > symbolBitmaskLookupTable; - bool longPattern; - size_t bitmaskLength; - Bitmask highestBitBitmask; - Bitmask allOnesBitmask; - // Setup helper variables - longPattern = 64 < pattern.getContent ( ).size ( ); - bitmaskLength = longPattern ? 64 : pattern.getContent ( ).size ( ); + using BitmaskType = std::bitset < BitmaskBitCount >; + bool patternIsLong = BitmaskBitCount < pattern.getContent ( ).size ( ); + size_t bitmaskLength = patternIsLong ? BitmaskBitCount : pattern.getContent ( ).size ( ); + + measurements::start ( "Preprocess", measurements::Type::PREPROCESS ); - highestBitBitmask = 1ULL << ( bitmaskLength - 1 ); - allOnesBitmask = ( highestBitBitmask << 1 ) - 1; + std::map < alphabet::Symbol, BitmaskType > symbolBitmaskLookupTable; // Initialize the bitmasks with zeros for each symbol in the alphabet for ( const auto & symbol : pattern.getAlphabet ( ) ) - symbolBitmaskLookupTable[symbol] = 0ULL; + symbolBitmaskLookupTable[symbol] = BitmaskType ( 0 ); // Mark the position in the bitmask for each symbol in the pattern for ( size_t i = 0; i < bitmaskLength; i++ ) - symbolBitmaskLookupTable[pattern.getContent ( ).at ( i )] |= 1ULL << ( bitmaskLength - i - 1 ); + symbolBitmaskLookupTable[pattern.getContent ( ).at ( i )].set ( bitmaskLength - i - 1 ); measurements::end ( ); measurements::start ( "Algorithm", measurements::Type::ALGORITHM ); size_t posInSubject = 0; + BitmaskType currentBitmask; while ( posInSubject <= subject.getContent ( ).size ( ) - pattern.getContent ( ).size ( ) ) { size_t posInPattern = bitmaskLength; size_t lastPosOfFactor = bitmaskLength; - Bitmask currentBitmask = ~0ULL; - while ( currentBitmask != 0ULL ) { - currentBitmask = currentBitmask & symbolBitmaskLookupTable[subject.getContent ( ).at ( posInSubject + posInPattern - 1 )]; + // Set the bitmask to all ones + currentBitmask.set ( ); + + while ( posInPattern > 0 && currentBitmask.any ( ) ) { + currentBitmask &= symbolBitmaskLookupTable[subject.getContent ( ).at ( posInSubject + posInPattern - 1 )]; posInPattern--; - if ( ( currentBitmask & highestBitBitmask ) != 0 ) { + // Test whether the most significant bit is set + if ( currentBitmask.test ( bitmaskLength - 1 ) ) { if ( posInPattern > 0 ) { lastPosOfFactor = posInPattern; } else { - if ( !longPattern ) { + if ( !patternIsLong ) { // Yay, there is match!!! occ.insert ( posInSubject ); } else { - // if the pattern is longer then 64 characters switch to brute force check + // if the pattern is longer then BITMASK_BIT_COUNT characters switch to brute force check size_t k = bitmaskLength; while ( k < pattern.getContent ( ).size ( ) && pattern.getContent ( ).at ( k ) == subject.getContent ( ).at ( posInSubject + k ) ) k++; @@ -83,9 +82,6 @@ std::set < unsigned > BackwardNondeterministicDAWGMatching::match ( const string } currentBitmask <<= 1; - - // We need to trim excess ones in case the pattern is shorter then 64 characters - currentBitmask &= allOnesBitmask; } posInSubject += lastPosOfFactor; @@ -96,6 +92,38 @@ std::set < unsigned > BackwardNondeterministicDAWGMatching::match ( const string return occ; } +std::set < unsigned > BackwardNondeterministicDAWGMatching::match ( const string::String & subject, const string::String & pattern ) { + return BackwardNondeterministicDAWGMatching::match32 ( subject, pattern ); +} + +std::set < unsigned > BackwardNondeterministicDAWGMatching::match ( const string::LinearString & subject, const string::LinearString & pattern ) { + return BackwardNondeterministicDAWGMatching::match32 ( subject, pattern ); +} + +std::set < unsigned > BackwardNondeterministicDAWGMatching::match32 ( const string::String & subject, const string::String & pattern ) { + return BackwardNondeterministicDAWGMatching::matchTemplate < 32 > ( subject, pattern ); +} + +std::set < unsigned > BackwardNondeterministicDAWGMatching::match32 ( const string::LinearString & subject, const string::LinearString & pattern ) { + return BackwardNondeterministicDAWGMatching::matchTemplate < 32 > ( subject, pattern ); +} + +std::set < unsigned > BackwardNondeterministicDAWGMatching::match64 ( const string::String & subject, const string::String & pattern ) { + return BackwardNondeterministicDAWGMatching::matchTemplate < 64 > ( subject, pattern ); +} + +std::set < unsigned > BackwardNondeterministicDAWGMatching::match64 ( const string::LinearString & subject, const string::LinearString & pattern ) { + return BackwardNondeterministicDAWGMatching::matchTemplate < 64 > ( subject, pattern ); +} + +std::set < unsigned > BackwardNondeterministicDAWGMatching::match128 ( const string::String & subject, const string::String & pattern ) { + return BackwardNondeterministicDAWGMatching::matchTemplate < 128 > ( subject, pattern ); +} + +std::set < unsigned > BackwardNondeterministicDAWGMatching::match128 ( const string::LinearString & subject, const string::LinearString & pattern ) { + return BackwardNondeterministicDAWGMatching::matchTemplate < 128 > ( subject, pattern ); +} + auto BackwardNondeterministicDAWGMatchingLinearStringLinearString = BackwardNondeterministicDAWGMatching::RegistratorWrapper < std::set < unsigned >, string::LinearString, string::LinearString > ( BackwardNondeterministicDAWGMatching::getInstance ( ), BackwardNondeterministicDAWGMatching::match ); } /* namespace exact */ diff --git a/alib2algo/src/stringology/exact/BackwardNondeterministicDAWGMatching.hpp b/alib2algo/src/stringology/exact/BackwardNondeterministicDAWGMatching.hpp index 9426e0d154..5b43fe87b1 100644 --- a/alib2algo/src/stringology/exact/BackwardNondeterministicDAWGMatching.hpp +++ b/alib2algo/src/stringology/exact/BackwardNondeterministicDAWGMatching.hpp @@ -20,15 +20,31 @@ namespace exact { * Implementation of Backward Nondeterministic DAWG Matching using bit parallelism with 64bit bitmask and brute force switch for longer patterns. */ class BackwardNondeterministicDAWGMatching : public std::DoubleDispatch < std::set < unsigned >, string::StringBase, string::StringBase > { -public: +private: /** * Search for pattern in linear string. * @return set set of occurences */ - static std::set < unsigned > match ( const string::String & subject, const string::String & pattern ); + template <size_t BitmaskBitCount> + static std::set < unsigned > matchTemplate ( const string::String & subject, const string::String & pattern ); + template <size_t BitmaskBitCount > + static std::set < unsigned > matchTemplate ( const string::LinearString & subject, const string::LinearString & pattern ); +public: + + static std::set < unsigned > match ( const string::String & subject, const string::String & pattern ); static std::set < unsigned > match ( const string::LinearString & subject, const string::LinearString & pattern ); + static std::set < unsigned > match32 ( const string::String & subject, const string::String & pattern ); + static std::set < unsigned > match32 ( const string::LinearString & subject, const string::LinearString & pattern ); + + static std::set < unsigned > match64 ( const string::String & subject, const string::String & pattern ); + static std::set < unsigned > match64 ( const string::LinearString & subject, const string::LinearString & pattern ); + + static std::set < unsigned > match128 ( const string::String & subject, const string::String & pattern ); + static std::set < unsigned > match128 ( const string::LinearString & subject, const string::LinearString & pattern ); + + static BackwardNondeterministicDAWGMatching & getInstance ( ) { static BackwardNondeterministicDAWGMatching res; diff --git a/alib2algo/test-src/stringology/exact/BackwardNondeterministicDAWGMatchingTest.cpp b/alib2algo/test-src/stringology/exact/BackwardNondeterministicDAWGMatchingTest.cpp index 65d8c4ad06..4059bf1f7b 100644 --- a/alib2algo/test-src/stringology/exact/BackwardNondeterministicDAWGMatchingTest.cpp +++ b/alib2algo/test-src/stringology/exact/BackwardNondeterministicDAWGMatchingTest.cpp @@ -39,9 +39,12 @@ void BackwardNondeterministicDAWGMatchingTest::testBNDM ( ) { string::String subject = string::stringFrom ( subjects[i] ); string::String pattern = string::stringFrom ( patterns[i] ); std::set < unsigned > res = stringology::exact::BackwardNondeterministicDAWGMatching::match ( subject, pattern ); - - std::cout << subjects[i] << ' ' << patterns[i] << ' ' << res << std::endl; CPPUNIT_ASSERT ( res == expectedOccs[i] ); + res = stringology::exact::BackwardNondeterministicDAWGMatching::match64 ( subject, pattern ); + CPPUNIT_ASSERT ( res == expectedOccs[i] ); + res = stringology::exact::BackwardNondeterministicDAWGMatching::match128 ( subject, pattern ); + CPPUNIT_ASSERT ( res == expectedOccs[i] ); + std::cout << subjects[i] << ' ' << patterns[i] << ' ' << res << std::endl; } auto longSubject = string::generate::RandomStringFactory::generateLinearString (64 * 64 * 64, 512, false, true); -- GitLab