Skip to content
Snippets Groups Projects
BackwardNondeterministicDAWGMatching.cpp 5.35 KiB
Newer Older
  • Learn to ignore specific revisions
  • /*
     * Author: Radovan Cerveny
     */
    
    #include "BackwardNondeterministicDAWGMatching.hpp"
    
    #include <string/LinearString.h>
    #include <alphabet/Symbol.h>
    
    #include <map>
    
    #include <measure>
    
    namespace stringology {
    
    namespace exact {
    
    
    template < size_t BitmaskBitCount >
    std::set < unsigned > BackwardNondeterministicDAWGMatching::matchTemplate ( const string::String & subject, const string::String & pattern ) {
    
        return dispatch ( subject.getData ( ), pattern.getData ( ) );
    
    template < size_t BitmaskBitCount >
    std::set < unsigned > BackwardNondeterministicDAWGMatching::matchTemplate ( const string::LinearString & subject, const string::LinearString & pattern ) {
    
        std::set < unsigned > occ;
    
         // Setup helper variables
    
        using BitmaskType = std::bitset < BitmaskBitCount >;
        bool patternIsLong = BitmaskBitCount < pattern.getContent ( ).size ( );
        size_t bitmaskLength = patternIsLong ? BitmaskBitCount : pattern.getContent ( ).size ( );
    
        measurements::start ( "Preprocess", measurements::Type::PREPROCESS );
    
        std::map < alphabet::Symbol, BitmaskType > symbolBitmaskLookupTable;
    
    
         // Initialize the bitmasks with zeros for each symbol in the alphabet
        for ( const auto & symbol : pattern.getAlphabet ( ) )
    
            symbolBitmaskLookupTable[symbol] = BitmaskType ( 0 );
    
    
         // Mark the position in the bitmask for each symbol in the pattern
        for ( size_t i = 0; i < bitmaskLength; i++ )
    
            symbolBitmaskLookupTable[pattern.getContent ( ).at ( i )].set ( bitmaskLength - i - 1 );
    
    
        measurements::end ( );
    
        measurements::start ( "Algorithm", measurements::Type::ALGORITHM );
    
        size_t posInSubject = 0;
    
        BitmaskType currentBitmask;
    
    
        while ( posInSubject <= subject.getContent ( ).size ( ) - pattern.getContent ( ).size ( ) ) {
            size_t posInPattern = bitmaskLength;
            size_t lastPosOfFactor = bitmaskLength;
    
    
             // Set the bitmask to all ones
            currentBitmask.set ( );
    
            while ( posInPattern > 0 && currentBitmask.any ( ) ) {
                currentBitmask &= symbolBitmaskLookupTable[subject.getContent ( ).at ( posInSubject + posInPattern - 1 )];
    
                 // Test whether the most significant bit is set
                if ( currentBitmask.test ( bitmaskLength - 1 ) ) {
    
                    if ( posInPattern > 0 ) {
                        lastPosOfFactor = posInPattern;
                    } else {
    
                             // Yay, there is match!!!
                            occ.insert ( posInSubject );
                        } else {
    
    Radovan Červený's avatar
    Radovan Červený committed
                             // if the pattern is longer then BitmaskBitCount characters switch to brute force check
    
                            size_t k = bitmaskLength;
    
                            while ( k < pattern.getContent ( ).size ( ) && pattern.getContent ( ).at ( k ) == subject.getContent ( ).at ( posInSubject + k ) ) k++;
    
                            if ( k == pattern.getContent ( ).size ( ) )
                                 // Yay, there is match!!!
                                occ.insert ( posInSubject );
                        }
                    }
                }
    
                currentBitmask <<= 1;
            }
    
            posInSubject += lastPosOfFactor;
        }
    
        measurements::end ( );
    
        return occ;
    }
    
    
    std::set < unsigned > BackwardNondeterministicDAWGMatching::match ( const string::String & subject, const string::String & pattern ) {
        return BackwardNondeterministicDAWGMatching::match32 ( subject, pattern );
    }
    
    std::set < unsigned > BackwardNondeterministicDAWGMatching::match ( const string::LinearString & subject, const string::LinearString & pattern ) {
        return BackwardNondeterministicDAWGMatching::match32 ( subject, pattern );
    }
    
    std::set < unsigned > BackwardNondeterministicDAWGMatching::match32 ( const string::String & subject, const string::String & pattern ) {
        return BackwardNondeterministicDAWGMatching::matchTemplate < 32 > ( subject, pattern );
    }
    
    std::set < unsigned > BackwardNondeterministicDAWGMatching::match32 ( const string::LinearString & subject, const string::LinearString & pattern ) {
        return BackwardNondeterministicDAWGMatching::matchTemplate < 32 > ( subject, pattern );
    }
    
    std::set < unsigned > BackwardNondeterministicDAWGMatching::match64 ( const string::String & subject, const string::String & pattern ) {
        return BackwardNondeterministicDAWGMatching::matchTemplate < 64 > ( subject, pattern );
    }
    
    std::set < unsigned > BackwardNondeterministicDAWGMatching::match64 ( const string::LinearString & subject, const string::LinearString & pattern ) {
        return BackwardNondeterministicDAWGMatching::matchTemplate < 64 > ( subject, pattern );
    }
    
    std::set < unsigned > BackwardNondeterministicDAWGMatching::match128 ( const string::String & subject, const string::String & pattern ) {
        return BackwardNondeterministicDAWGMatching::matchTemplate < 128 > ( subject, pattern );
    }
    
    std::set < unsigned > BackwardNondeterministicDAWGMatching::match128 ( const string::LinearString & subject, const string::LinearString & pattern ) {
        return BackwardNondeterministicDAWGMatching::matchTemplate < 128 > ( subject, pattern );
    }
    
    
    auto BackwardNondeterministicDAWGMatchingLinearStringLinearString = BackwardNondeterministicDAWGMatching::RegistratorWrapper < std::set < unsigned >, string::LinearString, string::LinearString > ( BackwardNondeterministicDAWGMatching::match );
    
    
    } /* namespace exact */
    
    } /* namespace stringology */