Skip to content
Snippets Groups Projects
BackwardNondeterministicDAWGMatching.cpp 5.4 KiB
Newer Older
/*
 * Author: Radovan Cerveny
 */

#include "BackwardNondeterministicDAWGMatching.hpp"

#include <string/LinearString.h>
#include <alphabet/Symbol.h>

#include <map>
#include <measure>

namespace stringology {

namespace exact {

template < size_t BitmaskBitCount >
std::set < unsigned > BackwardNondeterministicDAWGMatching::matchTemplate ( const string::String & subject, const string::String & pattern ) {
    return dispatch ( subject.getData ( ), pattern.getData ( ) );
template < size_t BitmaskBitCount >
std::set < unsigned > BackwardNondeterministicDAWGMatching::matchTemplate ( const string::LinearString < > & subject, const string::LinearString < > & pattern ) {
    std::set < unsigned > occ;

     // Setup helper variables
    using BitmaskType = std::bitset < BitmaskBitCount >;
    bool patternIsLong = BitmaskBitCount < pattern.getContent ( ).size ( );
    size_t bitmaskLength = patternIsLong ? BitmaskBitCount : pattern.getContent ( ).size ( );

    measurements::start ( "Preprocess", measurements::Type::PREPROCESS );
    std::map < DefaultSymbolType, BitmaskType > symbolBitmaskLookupTable;

     // Initialize the bitmasks with zeros for each symbol in the alphabet
    for ( const auto & symbol : pattern.getAlphabet ( ) )
        symbolBitmaskLookupTable[symbol] = BitmaskType ( 0 );

     // Mark the position in the bitmask for each symbol in the pattern
    for ( size_t i = 0; i < bitmaskLength; i++ )
        symbolBitmaskLookupTable[pattern.getContent ( ).at ( i )].set ( bitmaskLength - i - 1 );

    measurements::end ( );

    measurements::start ( "Algorithm", measurements::Type::ALGORITHM );

    size_t posInSubject = 0;
    BitmaskType currentBitmask;

    while ( posInSubject <= subject.getContent ( ).size ( ) - pattern.getContent ( ).size ( ) ) {
        size_t posInPattern = bitmaskLength;
        size_t lastPosOfFactor = bitmaskLength;

         // Set the bitmask to all ones
        currentBitmask.set ( );

        while ( posInPattern > 0 && currentBitmask.any ( ) ) {
            currentBitmask &= symbolBitmaskLookupTable[subject.getContent ( ).at ( posInSubject + posInPattern - 1 )];
             // Test whether the most significant bit is set
            if ( currentBitmask.test ( bitmaskLength - 1 ) ) {
                if ( posInPattern > 0 ) {
                    lastPosOfFactor = posInPattern;
                } else {
                         // Yay, there is match!!!
                        occ.insert ( posInSubject );
                    } else {
Radovan Červený's avatar
Radovan Červený committed
                         // if the pattern is longer then BitmaskBitCount characters switch to brute force check
                        size_t k = bitmaskLength;

                        while ( k < pattern.getContent ( ).size ( ) && pattern.getContent ( ).at ( k ) == subject.getContent ( ).at ( posInSubject + k ) ) k++;

                        if ( k == pattern.getContent ( ).size ( ) )
                             // Yay, there is match!!!
                            occ.insert ( posInSubject );
                    }
                }
            }

            currentBitmask <<= 1;
        }

        posInSubject += lastPosOfFactor;
    }

    measurements::end ( );

    return occ;
}

std::set < unsigned > BackwardNondeterministicDAWGMatching::match ( const string::String & subject, const string::String & pattern ) {
    return BackwardNondeterministicDAWGMatching::match32 ( subject, pattern );
}

std::set < unsigned > BackwardNondeterministicDAWGMatching::match ( const string::LinearString < > & subject, const string::LinearString < > & pattern ) {
    return BackwardNondeterministicDAWGMatching::match32 ( subject, pattern );
}

std::set < unsigned > BackwardNondeterministicDAWGMatching::match32 ( const string::String & subject, const string::String & pattern ) {
    return BackwardNondeterministicDAWGMatching::matchTemplate < 32 > ( subject, pattern );
}

std::set < unsigned > BackwardNondeterministicDAWGMatching::match32 ( const string::LinearString < > & subject, const string::LinearString < > & pattern ) {
    return BackwardNondeterministicDAWGMatching::matchTemplate < 32 > ( subject, pattern );
}

std::set < unsigned > BackwardNondeterministicDAWGMatching::match64 ( const string::String & subject, const string::String & pattern ) {
    return BackwardNondeterministicDAWGMatching::matchTemplate < 64 > ( subject, pattern );
}

std::set < unsigned > BackwardNondeterministicDAWGMatching::match64 ( const string::LinearString < > & subject, const string::LinearString < > & pattern ) {
    return BackwardNondeterministicDAWGMatching::matchTemplate < 64 > ( subject, pattern );
}

std::set < unsigned > BackwardNondeterministicDAWGMatching::match128 ( const string::String & subject, const string::String & pattern ) {
    return BackwardNondeterministicDAWGMatching::matchTemplate < 128 > ( subject, pattern );
}

std::set < unsigned > BackwardNondeterministicDAWGMatching::match128 ( const string::LinearString < > & subject, const string::LinearString < > & pattern ) {
    return BackwardNondeterministicDAWGMatching::matchTemplate < 128 > ( subject, pattern );
}

auto BackwardNondeterministicDAWGMatchingLinearStringLinearString = BackwardNondeterministicDAWGMatching::RegistratorWrapper < std::set < unsigned >, string::LinearString < >, string::LinearString < > > ( BackwardNondeterministicDAWGMatching::match );

} /* namespace exact */

} /* namespace stringology */