Newer
Older
/*
* Author: Radovan Cerveny
*/
#include "BackwardNondeterministicDAWGMatching.hpp"
#include <string/LinearString.h>
#include <alphabet/Symbol.h>
#include <map>
#include <measure>
namespace stringology {
namespace exact {
template < size_t BitmaskBitCount >
std::set < unsigned > BackwardNondeterministicDAWGMatching::matchTemplate ( const string::String & subject, const string::String & pattern ) {
return dispatch ( subject.getData ( ), pattern.getData ( ) );
template < size_t BitmaskBitCount >
std::set < unsigned > BackwardNondeterministicDAWGMatching::matchTemplate ( const string::LinearString < > & subject, const string::LinearString < > & pattern ) {
std::set < unsigned > occ;
// Setup helper variables
using BitmaskType = std::bitset < BitmaskBitCount >;
bool patternIsLong = BitmaskBitCount < pattern.getContent ( ).size ( );
size_t bitmaskLength = patternIsLong ? BitmaskBitCount : pattern.getContent ( ).size ( );
measurements::start ( "Preprocess", measurements::Type::PREPROCESS );
std::map < DefaultSymbolType, BitmaskType > symbolBitmaskLookupTable;
// Initialize the bitmasks with zeros for each symbol in the alphabet
for ( const auto & symbol : pattern.getAlphabet ( ) )
symbolBitmaskLookupTable[symbol] = BitmaskType ( 0 );
// Mark the position in the bitmask for each symbol in the pattern
for ( size_t i = 0; i < bitmaskLength; i++ )
symbolBitmaskLookupTable[pattern.getContent ( ).at ( i )].set ( bitmaskLength - i - 1 );
measurements::end ( );
measurements::start ( "Algorithm", measurements::Type::ALGORITHM );
size_t posInSubject = 0;
BitmaskType currentBitmask;
while ( posInSubject <= subject.getContent ( ).size ( ) - pattern.getContent ( ).size ( ) ) {
size_t posInPattern = bitmaskLength;
size_t lastPosOfFactor = bitmaskLength;
// Set the bitmask to all ones
currentBitmask.set ( );
while ( posInPattern > 0 && currentBitmask.any ( ) ) {
currentBitmask &= symbolBitmaskLookupTable[subject.getContent ( ).at ( posInSubject + posInPattern - 1 )];
posInPattern--;
// Test whether the most significant bit is set
if ( currentBitmask.test ( bitmaskLength - 1 ) ) {
if ( posInPattern > 0 ) {
lastPosOfFactor = posInPattern;
} else {
if ( !patternIsLong ) {
// Yay, there is match!!!
occ.insert ( posInSubject );
} else {
// if the pattern is longer then BitmaskBitCount characters switch to brute force check
size_t k = bitmaskLength;
while ( k < pattern.getContent ( ).size ( ) && pattern.getContent ( ).at ( k ) == subject.getContent ( ).at ( posInSubject + k ) ) k++;
if ( k == pattern.getContent ( ).size ( ) )
// Yay, there is match!!!
occ.insert ( posInSubject );
}
}
}
currentBitmask <<= 1;
}
posInSubject += lastPosOfFactor;
}
measurements::end ( );
return occ;
}
std::set < unsigned > BackwardNondeterministicDAWGMatching::match ( const string::String & subject, const string::String & pattern ) {
return BackwardNondeterministicDAWGMatching::match32 ( subject, pattern );
}
std::set < unsigned > BackwardNondeterministicDAWGMatching::match ( const string::LinearString < > & subject, const string::LinearString < > & pattern ) {
return BackwardNondeterministicDAWGMatching::match32 ( subject, pattern );
}
std::set < unsigned > BackwardNondeterministicDAWGMatching::match32 ( const string::String & subject, const string::String & pattern ) {
return BackwardNondeterministicDAWGMatching::matchTemplate < 32 > ( subject, pattern );
}
std::set < unsigned > BackwardNondeterministicDAWGMatching::match32 ( const string::LinearString < > & subject, const string::LinearString < > & pattern ) {
return BackwardNondeterministicDAWGMatching::matchTemplate < 32 > ( subject, pattern );
}
std::set < unsigned > BackwardNondeterministicDAWGMatching::match64 ( const string::String & subject, const string::String & pattern ) {
return BackwardNondeterministicDAWGMatching::matchTemplate < 64 > ( subject, pattern );
}
std::set < unsigned > BackwardNondeterministicDAWGMatching::match64 ( const string::LinearString < > & subject, const string::LinearString < > & pattern ) {
return BackwardNondeterministicDAWGMatching::matchTemplate < 64 > ( subject, pattern );
}
std::set < unsigned > BackwardNondeterministicDAWGMatching::match128 ( const string::String & subject, const string::String & pattern ) {
return BackwardNondeterministicDAWGMatching::matchTemplate < 128 > ( subject, pattern );
}
std::set < unsigned > BackwardNondeterministicDAWGMatching::match128 ( const string::LinearString < > & subject, const string::LinearString < > & pattern ) {
return BackwardNondeterministicDAWGMatching::matchTemplate < 128 > ( subject, pattern );
}
auto BackwardNondeterministicDAWGMatchingLinearStringLinearString = BackwardNondeterministicDAWGMatching::RegistratorWrapper < std::set < unsigned >, string::LinearString < >, string::LinearString < > > ( BackwardNondeterministicDAWGMatching::match );
} /* namespace exact */
} /* namespace stringology */