Skip to content
Snippets Groups Projects
Commit 1d66e564 authored by Jan Trávníček's avatar Jan Trávníček
Browse files

remove BNDM from experimental

parent 2af48bbe
No related branches found
No related tags found
No related merge requests found
Pipeline #
/*
* Author: Radovan Cerveny
*/
#include "BackwardNondeterministicDAWGMatching.hpp"
#include <string/LinearString.h>
#include <alphabet/Symbol.h>
#include <map>
#include <bitset>
#include <measure>
namespace stringology {
namespace exact {
/**
 * Generic-string overload of the templated matcher.
 *
 * Hands the data objects of both strings over to dispatch ( ) and returns
 * its result unchanged.
 *
 * NOTE(review): BitmaskBitCount is not referenced in this body, so the
 * requested bitmask width does not appear to be passed on to whatever
 * dispatch ( ) ends up calling -- confirm that this is intended.
 *
 * @param subject string to search in
 * @param pattern string to search for
 * @return result of dispatch ( ) for the two data objects
 */
template < size_t BitmaskBitCount >
std::set < unsigned > BackwardNondeterministicDAWGMatching::matchTemplate ( const string::String & subject, const string::String & pattern ) {
return dispatch ( subject.getData ( ), pattern.getData ( ) );
}
/**
 * Backward Nondeterministic DAWG Matching (BNDM) of a pattern in a linear string.
 *
 * The first min ( |pattern|, BitmaskBitCount ) symbols of the pattern are
 * searched with bit parallelism. When the pattern is longer than the bitmask,
 * every candidate found this way is confirmed by comparing the remaining
 * symbols one by one.
 *
 * Edge cases:
 *  - a pattern longer than the subject cannot occur, the result is empty;
 *  - an empty pattern occurs at every position 0 .. |subject|.
 * Both cases return before any measurement frame is opened.
 *
 * @tparam BitmaskBitCount width of the bitmask used by the bit-parallel phase
 * @param subject string to search in
 * @param pattern string to search for
 * @return start positions of all occurrences of pattern in subject
 */
template < size_t BitmaskBitCount >
std::set < unsigned > BackwardNondeterministicDAWGMatching::matchTemplate ( const string::LinearString < > & subject, const string::LinearString < > & pattern ) {
	static_assert ( BitmaskBitCount > 0, "the bitmask has to hold at least one bit" );

	std::set < unsigned > occ;

	const auto & subjectContent = subject.getContent ( );
	const auto & patternContent = pattern.getContent ( );
	const size_t subjectSize = subjectContent.size ( );
	const size_t patternSize = patternContent.size ( );

	// Sizes are unsigned: without this guard subjectSize - patternSize below
	// would wrap around and the search would read past the end of the subject.
	if ( patternSize > subjectSize )
		return occ;

	// An empty pattern produces a zero shift, so the search loop below would
	// never advance. Report it at every position instead.
	if ( patternSize == 0 ) {
		for ( size_t position = 0; position <= subjectSize; ++position )
			occ.insert ( position );

		return occ;
	}

	// Setup helper variables
	using BitmaskType = std::bitset < BitmaskBitCount >;
	const bool patternIsLong = BitmaskBitCount < patternSize;
	const size_t bitmaskLength = patternIsLong ? BitmaskBitCount : patternSize;

	measurements::start ( "Preprocess", measurements::Type::PREPROCESS );

	std::map < DefaultSymbolType, BitmaskType > symbolBitmaskLookupTable;

	// Initialize the bitmasks with zeros for each symbol in the alphabet
	for ( const auto & symbol : pattern.getAlphabet ( ) )
		symbolBitmaskLookupTable[symbol] = BitmaskType ( 0 );

	// Mark the position in the bitmask for each symbol in the pattern
	// (the first pattern symbol ends up in the most significant used bit)
	for ( size_t i = 0; i < bitmaskLength; i++ )
		symbolBitmaskLookupTable[patternContent.at ( i )].set ( bitmaskLength - i - 1 );

	measurements::end ( );

	measurements::start ( "Algorithm", measurements::Type::ALGORITHM );

	// Safe: patternSize <= subjectSize is guaranteed by the guard above.
	const size_t lastWindowStart = subjectSize - patternSize;
	size_t posInSubject = 0;
	BitmaskType currentBitmask;

	while ( posInSubject <= lastWindowStart ) {
		size_t posInPattern = bitmaskLength;
		size_t lastPosOfFactor = bitmaskLength;

		// Set the bitmask to all ones
		currentBitmask.set ( );

		// Read the window backwards while the suffix read so far is still a factor of the pattern
		while ( posInPattern > 0 && currentBitmask.any ( ) ) {
			// A symbol that does not occur in the pattern has an all-zero mask;
			// find ( ) keeps the table unchanged for such symbols.
			const auto maskIter = symbolBitmaskLookupTable.find ( subjectContent.at ( posInSubject + posInPattern - 1 ) );

			if ( maskIter == symbolBitmaskLookupTable.end ( ) )
				currentBitmask.reset ( );
			else
				currentBitmask &= maskIter->second;

			posInPattern--;

			// Test whether the most significant bit is set, i.e. a prefix of the pattern was read
			if ( currentBitmask.test ( bitmaskLength - 1 ) ) {
				if ( posInPattern > 0 ) {
					// Remember the prefix, it determines the next safe shift
					lastPosOfFactor = posInPattern;
				} else if ( !patternIsLong ) {
					// The whole pattern fits into the bitmask: match
					occ.insert ( posInSubject );
				} else {
					// The pattern is longer than BitmaskBitCount symbols: verify the rest by brute force
					size_t k = bitmaskLength;

					while ( k < patternSize && patternContent.at ( k ) == subjectContent.at ( posInSubject + k ) )
						k++;

					if ( k == patternSize )
						occ.insert ( posInSubject );
				}
			}

			currentBitmask <<= 1;
		}

		// lastPosOfFactor >= 1 here, so the window always advances
		posInSubject += lastPosOfFactor;
	}

	measurements::end ( );

	return occ;
}
/**
 * Default matcher for generic strings; delegates to the 32 bit variant.
 *
 * @param subject string to search in
 * @param pattern string to search for
 * @return result of match32 for the same arguments
 */
std::set < unsigned > BackwardNondeterministicDAWGMatching::match ( const string::String & subject, const string::String & pattern ) {
	return match32 ( subject, pattern );
}
/**
 * Default matcher for linear strings; delegates to the 32 bit variant.
 *
 * @param subject string to search in
 * @param pattern string to search for
 * @return result of match32 for the same arguments
 */
std::set < unsigned > BackwardNondeterministicDAWGMatching::match ( const string::LinearString < > & subject, const string::LinearString < > & pattern ) {
	return match32 ( subject, pattern );
}
/**
 * Generic-string matcher instantiated with a 32 bit bitmask.
 *
 * @param subject string to search in
 * @param pattern string to search for
 * @return result of matchTemplate < 32 > for the same arguments
 */
std::set < unsigned > BackwardNondeterministicDAWGMatching::match32 ( const string::String & subject, const string::String & pattern ) {
	return matchTemplate < 32 > ( subject, pattern );
}
/**
 * Linear-string matcher instantiated with a 32 bit bitmask.
 *
 * @param subject string to search in
 * @param pattern string to search for
 * @return result of matchTemplate < 32 > for the same arguments
 */
std::set < unsigned > BackwardNondeterministicDAWGMatching::match32 ( const string::LinearString < > & subject, const string::LinearString < > & pattern ) {
	return matchTemplate < 32 > ( subject, pattern );
}
/**
 * Generic-string matcher instantiated with a 64 bit bitmask.
 *
 * @param subject string to search in
 * @param pattern string to search for
 * @return result of matchTemplate < 64 > for the same arguments
 */
std::set < unsigned > BackwardNondeterministicDAWGMatching::match64 ( const string::String & subject, const string::String & pattern ) {
	return matchTemplate < 64 > ( subject, pattern );
}
/**
 * Linear-string matcher instantiated with a 64 bit bitmask.
 *
 * @param subject string to search in
 * @param pattern string to search for
 * @return result of matchTemplate < 64 > for the same arguments
 */
std::set < unsigned > BackwardNondeterministicDAWGMatching::match64 ( const string::LinearString < > & subject, const string::LinearString < > & pattern ) {
	return matchTemplate < 64 > ( subject, pattern );
}
/**
 * Generic-string matcher instantiated with a 128 bit bitmask.
 *
 * @param subject string to search in
 * @param pattern string to search for
 * @return result of matchTemplate < 128 > for the same arguments
 */
std::set < unsigned > BackwardNondeterministicDAWGMatching::match128 ( const string::String & subject, const string::String & pattern ) {
	return matchTemplate < 128 > ( subject, pattern );
}
/**
 * Linear-string matcher instantiated with a 128 bit bitmask.
 *
 * @param subject string to search in
 * @param pattern string to search for
 * @return result of matchTemplate < 128 > for the same arguments
 */
std::set < unsigned > BackwardNondeterministicDAWGMatching::match128 ( const string::LinearString < > & subject, const string::LinearString < > & pattern ) {
	return matchTemplate < 128 > ( subject, pattern );
}
// Namespace-scope object built from the ( LinearString, LinearString ) overload of match.
// NOTE(review): presumably its construction registers that overload as the
// callback used by dispatch ( ) for two linear strings -- confirm against RegistratorWrapper.
auto BackwardNondeterministicDAWGMatchingLinearStringLinearString = BackwardNondeterministicDAWGMatching::RegistratorWrapper < std::set < unsigned >, string::LinearString < >, string::LinearString < > > ( BackwardNondeterministicDAWGMatching::match );
} /* namespace exact */
} /* namespace stringology */
/*
* Author: Radovan Cerveny
*/
#ifndef STRINGOLOGY_BACKWARD_NONDETERMINISTIC_DAWG_MATCHING_HPP__
#define STRINGOLOGY_BACKWARD_NONDETERMINISTIC_DAWG_MATCHING_HPP__
#include <string/String.h>
#include <string/StringFeatures.h>
#include <core/multipleDispatch.hpp>
#include <set>
namespace stringology {
namespace exact {
/**
 * Implementation of Backward Nondeterministic DAWG Matching (BNDM) using bit parallelism
 * with a 32/64/128 bit bitmask and a switch to brute force checking for longer patterns.
 */
class BackwardNondeterministicDAWGMatching : public std::DoubleDispatch < BackwardNondeterministicDAWGMatching, std::set < unsigned >, const string::StringBase &, const string::StringBase & > {
private:
/**
 * Search for pattern in a generic string.
 * @tparam BitmaskBitCount number of bits of the bitmask
 * @param subject string to search in
 * @param pattern string to search for
 * @return set of occurrences
 */
template <size_t BitmaskBitCount>
static std::set < unsigned > matchTemplate ( const string::String & subject, const string::String & pattern );
/**
 * Search for pattern in a linear string.
 * @tparam BitmaskBitCount number of bits of the bitmask
 * @param subject string to search in
 * @param pattern string to search for
 * @return set of occurrences
 */
template <size_t BitmaskBitCount >
static std::set < unsigned > matchTemplate ( const string::LinearString < > & subject, const string::LinearString < > & pattern );
public:
// match defaults to the 32 bit bitmask
static std::set < unsigned > match ( const string::String & subject, const string::String & pattern );
static std::set < unsigned > match ( const string::LinearString < > & subject, const string::LinearString < > & pattern );
// Variants with an explicitly chosen bitmask width (32, 64 or 128 bits)
static std::set < unsigned > match32 ( const string::String & subject, const string::String & pattern );
static std::set < unsigned > match32 ( const string::LinearString < > & subject, const string::LinearString < > & pattern );
static std::set < unsigned > match64 ( const string::String & subject, const string::String & pattern );
static std::set < unsigned > match64 ( const string::LinearString < > & subject, const string::LinearString < > & pattern );
static std::set < unsigned > match128 ( const string::String & subject, const string::String & pattern );
static std::set < unsigned > match128 ( const string::LinearString < > & subject, const string::LinearString < > & pattern );
};
} /* namespace exact */
} /* namespace stringology */
#endif /* STRINGOLOGY_BACKWARD_NONDETERMINISTIC_DAWG_MATCHING_HPP__ */
#include "BackwardNondeterministicDAWGMatchingTest.h"
#include "string/String.h"
#include "stringology/exact/BackwardNondeterministicDAWGMatching.hpp"
#include "stringology/exact/ExactFactorMatch.h"
#include "string/generate/RandomStringFactory.h"
#include "string/generate/RandomSubstringFactory.h"
CPPUNIT_TEST_SUITE_NAMED_REGISTRATION ( BackwardNondeterministicDAWGMatchingTest, "stringology" );
CPPUNIT_TEST_SUITE_REGISTRATION ( BackwardNondeterministicDAWGMatchingTest );
// Fixture set-up hook; intentionally empty, the test builds all of its data itself.
void BackwardNondeterministicDAWGMatchingTest::setUp ( ) {
}
// Fixture tear-down hook; intentionally empty, there is nothing to release.
void BackwardNondeterministicDAWGMatchingTest::tearDown ( ) {
}
/**
 * Runs match, match64 and match128 on a table of hand-written subject/pattern
 * pairs with known occurrences, then compares match against ExactFactorMatch
 * on a randomly generated subject and a substring drawn from it.
 */
void BackwardNondeterministicDAWGMatchingTest::testBNDM ( ) {
	// Three parallel tables: row i holds subject, pattern and expected occurrences of case i.
	std::vector < std::string > subjects;
	std::vector < std::string > patterns;
	std::vector < std::set < unsigned > > expectedOccs;

	// Appends one row to all three tables.
	auto addCase = [ & ] ( const std::string & subjectText, const std::string & patternText, const std::set < unsigned > & occurrences ) {
		subjects.push_back ( subjectText );
		patterns.push_back ( patternText );
		expectedOccs.push_back ( occurrences );
	};

	addCase ( "a", "a", { 0 } );
	addCase ( "a", "b", { } );
	addCase ( "alfalfalfa", "alfalfalfa", { 0 } );
	addCase ( "alfalfalfa", "blfalfalfa", { } );
	addCase ( "alfalfalfa", "alfalfalfb", { } );
	// 64 and 65 symbol patterns
	addCase ( "alfalfalfaalfalfalfaalfalfalfaalfalfalfaalfalfalfaalfalfalfaalfa", "alfalfalfaalfalfalfaalfalfalfaalfalfalfaalfalfalfaalfalfalfaalfa", { 0 } );
	addCase ( "alfalfalfaalfalfalfaabfalfalfaalfalfalfaalfalfalfaalfalfalfaalfa", "alfalfalfaalfalfalfaalfalfalfaalfalfalfaalfalfalfaalfalfalfaalfa", { } );
	addCase ( "alfalfalfaalfalfalfaalfalfalfaalfalfalfaalfalfalfaalfalfalfaalfaa", "alfalfalfaalfalfalfaalfalfalfaalfalfalfaalfalfalfaalfalfalfaalfaa", { 0 } );
	addCase ( "atggccttgcc", "gcc", { 3, 8 } );
	addCase ( "aaaaaaaaaa", "a", { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9 } );

	for ( size_t i = 0; i != subjects.size ( ); ++i ) {
		string::String subject = string::stringFrom ( subjects[i] );
		string::String pattern = string::stringFrom ( patterns[i] );

		std::set < unsigned > res = stringology::exact::BackwardNondeterministicDAWGMatching::match ( subject, pattern );
		CPPUNIT_ASSERT ( res == expectedOccs[i] );

		res = stringology::exact::BackwardNondeterministicDAWGMatching::match64 ( subject, pattern );
		CPPUNIT_ASSERT ( res == expectedOccs[i] );

		res = stringology::exact::BackwardNondeterministicDAWGMatching::match128 ( subject, pattern );
		CPPUNIT_ASSERT ( res == expectedOccs[i] );

		std::cout << subjects[i] << ' ' << patterns[i] << ' ' << res << std::endl;
	}

	// Randomly generated input, checked against the reference matcher.
	auto longSubject = string::generate::RandomStringFactory::generateLinearString ( 64 * 64 * 64, 512, false, true );
	auto longPattern = string::generate::RandomSubstringFactory::generateSubstring ( 64 * 32 * 32, longSubject );

	std::set < unsigned > res = stringology::exact::BackwardNondeterministicDAWGMatching::match ( longSubject, longPattern );
	std::set < unsigned > resRef = stringology::exact::ExactFactorMatch::match ( longSubject, longPattern );

	std::cout << "long: " << res << std::endl;
	CPPUNIT_ASSERT ( res == resRef);
}
#ifndef BACKWARD_NONDETERMINISTIC_DAWG_MATCHING_TEST_H_
#define BACKWARD_NONDETERMINISTIC_DAWG_MATCHING_TEST_H_
#include <cppunit/extensions/HelperMacros.h>
// CppUnit fixture for the BNDM matcher; the suite consists of the single test testBNDM.
class BackwardNondeterministicDAWGMatchingTest : public CppUnit::TestFixture {
CPPUNIT_TEST_SUITE ( BackwardNondeterministicDAWGMatchingTest );
CPPUNIT_TEST ( testBNDM );
CPPUNIT_TEST_SUITE_END ( );
public:
// Fixture hooks, declared here and defined in the test source file.
void setUp ( );
void tearDown ( );
// Test case registered in the suite above.
void testBNDM ( );
};
#endif // BACKWARD_NONDETERMINISTIC_DAWG_MATCHING_TEST_H_
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment