/* * Author: Radovan Cerveny */ #include "BackwardNondeterministicDAWGMatching.hpp" #include <string/LinearString.h> #include <alphabet/Symbol.h> #include <map> #include <bitset> #include <measure> namespace stringology { namespace exact { template < size_t BitmaskBitCount > std::set < unsigned > BackwardNondeterministicDAWGMatching::matchTemplate ( const string::String & subject, const string::String & pattern ) { return dispatch ( subject.getData ( ), pattern.getData ( ) ); } template < size_t BitmaskBitCount > std::set < unsigned > BackwardNondeterministicDAWGMatching::matchTemplate ( const string::LinearString < > & subject, const string::LinearString < > & pattern ) { std::set < unsigned > occ; // Setup helper variables using BitmaskType = std::bitset < BitmaskBitCount >; bool patternIsLong = BitmaskBitCount < pattern.getContent ( ).size ( ); size_t bitmaskLength = patternIsLong ? BitmaskBitCount : pattern.getContent ( ).size ( ); measurements::start ( "Preprocess", measurements::Type::PREPROCESS ); std::map < DefaultSymbolType, BitmaskType > symbolBitmaskLookupTable; // Initialize the bitmasks with zeros for each symbol in the alphabet for ( const auto & symbol : pattern.getAlphabet ( ) ) symbolBitmaskLookupTable[symbol] = BitmaskType ( 0 ); // Mark the position in the bitmask for each symbol in the pattern for ( size_t i = 0; i < bitmaskLength; i++ ) symbolBitmaskLookupTable[pattern.getContent ( ).at ( i )].set ( bitmaskLength - i - 1 ); measurements::end ( ); measurements::start ( "Algorithm", measurements::Type::ALGORITHM ); size_t posInSubject = 0; BitmaskType currentBitmask; while ( posInSubject <= subject.getContent ( ).size ( ) - pattern.getContent ( ).size ( ) ) { size_t posInPattern = bitmaskLength; size_t lastPosOfFactor = bitmaskLength; // Set the bitmask to all ones currentBitmask.set ( ); while ( posInPattern > 0 && currentBitmask.any ( ) ) { currentBitmask &= symbolBitmaskLookupTable[subject.getContent ( ).at ( posInSubject + posInPattern - 1 )]; posInPattern--; // Test whether the most significant bit is set if ( currentBitmask.test ( bitmaskLength - 1 ) ) { if ( posInPattern > 0 ) { lastPosOfFactor = posInPattern; } else { if ( !patternIsLong ) { // Yay, there is match!!! occ.insert ( posInSubject ); } else { // if the pattern is longer then BitmaskBitCount characters switch to brute force check size_t k = bitmaskLength; while ( k < pattern.getContent ( ).size ( ) && pattern.getContent ( ).at ( k ) == subject.getContent ( ).at ( posInSubject + k ) ) k++; if ( k == pattern.getContent ( ).size ( ) ) // Yay, there is match!!! occ.insert ( posInSubject ); } } } currentBitmask <<= 1; } posInSubject += lastPosOfFactor; } measurements::end ( ); return occ; } std::set < unsigned > BackwardNondeterministicDAWGMatching::match ( const string::String & subject, const string::String & pattern ) { return BackwardNondeterministicDAWGMatching::match32 ( subject, pattern ); } std::set < unsigned > BackwardNondeterministicDAWGMatching::match ( const string::LinearString < > & subject, const string::LinearString < > & pattern ) { return BackwardNondeterministicDAWGMatching::match32 ( subject, pattern ); } std::set < unsigned > BackwardNondeterministicDAWGMatching::match32 ( const string::String & subject, const string::String & pattern ) { return BackwardNondeterministicDAWGMatching::matchTemplate < 32 > ( subject, pattern ); } std::set < unsigned > BackwardNondeterministicDAWGMatching::match32 ( const string::LinearString < > & subject, const string::LinearString < > & pattern ) { return BackwardNondeterministicDAWGMatching::matchTemplate < 32 > ( subject, pattern ); } std::set < unsigned > BackwardNondeterministicDAWGMatching::match64 ( const string::String & subject, const string::String & pattern ) { return BackwardNondeterministicDAWGMatching::matchTemplate < 64 > ( subject, pattern ); } std::set < unsigned > BackwardNondeterministicDAWGMatching::match64 ( const string::LinearString < > & subject, const string::LinearString < > & pattern ) { return BackwardNondeterministicDAWGMatching::matchTemplate < 64 > ( subject, pattern ); } std::set < unsigned > BackwardNondeterministicDAWGMatching::match128 ( const string::String & subject, const string::String & pattern ) { return BackwardNondeterministicDAWGMatching::matchTemplate < 128 > ( subject, pattern ); } std::set < unsigned > BackwardNondeterministicDAWGMatching::match128 ( const string::LinearString < > & subject, const string::LinearString < > & pattern ) { return BackwardNondeterministicDAWGMatching::matchTemplate < 128 > ( subject, pattern ); } auto BackwardNondeterministicDAWGMatchingLinearStringLinearString = BackwardNondeterministicDAWGMatching::RegistratorWrapper < std::set < unsigned >, string::LinearString < >, string::LinearString < > > ( BackwardNondeterministicDAWGMatching::match ); } /* namespace exact */ } /* namespace stringology */