From 2af48bbe9e6bb36f2db30ac4d0990c17ac9784e1 Mon Sep 17 00:00:00 2001 From: Jan Travnicek <Jan.Travnicek@fit.cvut.cz> Date: Mon, 1 May 2017 23:29:56 +0200 Subject: [PATCH] BNDM matching from experimental --- .../matching/BNDMMatcherConstruction.cpp | 24 ++ .../matching/BNDMMatcherConstruction.h | 58 +++++ .../matching/WideBNDMMatcherConstruction.cpp | 24 ++ .../matching/WideBNDMMatcherConstruction.h | 56 ++++ .../src/stringology/query/BNDMOccurrences.cpp | 24 ++ .../src/stringology/query/BNDMOccurrences.h | 101 ++++++++ .../stringology/query/WideBNDMOccurrences.cpp | 24 ++ .../stringology/query/WideBNDMOccurrences.h | 94 +++++++ ...ckwardNondeterministicDAWGMatchingTest.cpp | 58 +++++ ...BackwardNondeterministicDAWGMatchingTest.h | 18 ++ .../src/indexes/stringology/BNDMMatcher.cpp | 14 + .../src/indexes/stringology/BNDMMatcher.h | 243 ++++++++++++++++++ alib2std/src/extensions/vector.hpp | 29 ++- astringology2/src/astringology.cpp | 24 +- tests.astringology.sh | 1 + 15 files changed, 780 insertions(+), 12 deletions(-) create mode 100644 alib2algo/src/stringology/matching/BNDMMatcherConstruction.cpp create mode 100644 alib2algo/src/stringology/matching/BNDMMatcherConstruction.h create mode 100644 alib2algo/src/stringology/matching/WideBNDMMatcherConstruction.cpp create mode 100644 alib2algo/src/stringology/matching/WideBNDMMatcherConstruction.h create mode 100644 alib2algo/src/stringology/query/BNDMOccurrences.cpp create mode 100644 alib2algo/src/stringology/query/BNDMOccurrences.h create mode 100644 alib2algo/src/stringology/query/WideBNDMOccurrences.cpp create mode 100644 alib2algo/src/stringology/query/WideBNDMOccurrences.h create mode 100644 alib2algo/test-src/stringology/matching/BackwardNondeterministicDAWGMatchingTest.cpp create mode 100644 alib2algo/test-src/stringology/matching/BackwardNondeterministicDAWGMatchingTest.h create mode 100644 alib2data/src/indexes/stringology/BNDMMatcher.cpp create mode 100644 alib2data/src/indexes/stringology/BNDMMatcher.h 
diff --git a/alib2algo/src/stringology/matching/BNDMMatcherConstruction.cpp b/alib2algo/src/stringology/matching/BNDMMatcherConstruction.cpp new file mode 100644 index 0000000000..01c752df57 --- /dev/null +++ b/alib2algo/src/stringology/matching/BNDMMatcherConstruction.cpp @@ -0,0 +1,24 @@ +/* + * BNDMMatcherConstruction.cpp + * + * Created on: 6. 2. 2017 + * Author: Jan Travnicek + */ + +#include "BNDMMatcherConstruction.h" + +#include <string/LinearString.h> + +namespace stringology { + +namespace matching { + +indexes::stringology::BNDMMatcher < DefaultSymbolType > BNDMMatcherConstruction::construct ( const string::String & string ) { + return dispatch ( string.getData ( ) ); +} + +auto BNDMIndexConstructionLinearString = BNDMMatcherConstruction::RegistratorWrapper < indexes::stringology::BNDMMatcher < >, string::LinearString < > > ( BNDMMatcherConstruction::construct ); + +} /* namespace matching */ + +} /* namespace stringology */ diff --git a/alib2algo/src/stringology/matching/BNDMMatcherConstruction.h b/alib2algo/src/stringology/matching/BNDMMatcherConstruction.h new file mode 100644 index 0000000000..974ff4910b --- /dev/null +++ b/alib2algo/src/stringology/matching/BNDMMatcherConstruction.h @@ -0,0 +1,58 @@ +/* + * BNDMMatcherConstruction.h + * + * Created on: 6. 2. 2017 + * Author: Jan Travnicek + */ + +#ifndef BNDM_MATCHER_CONSTRUCTION_H_ +#define BNDM_MATCHER_CONSTRUCTION_H_ + +#include <indexes/stringology/BNDMMatcher.h> +#include <string/String.h> +#include <string/LinearString.h> +#include <core/multipleDispatch.hpp> +#include <exception/CommonException.h> + +namespace stringology { + +namespace matching { + +/** + * Constructs a bit parallel index for given string. 
+ * + */ + +class BNDMMatcherConstruction : public std::SingleDispatch < BNDMMatcherConstruction, indexes::stringology::BNDMMatcher < >, const string::StringBase & > { +public: + /** + * Builds the BNDM matcher of a pattern + * @param string pattern to compute the per-symbol bit masks for + * @return matcher holding the alphabet, bit masks of the first BitmaskBitCount pattern symbols and the whole pattern content + */ + static indexes::stringology::BNDMMatcher < > construct ( const string::String & string ); + + template < class SymbolType, size_t BitmaskBitCount = 64 > + static indexes::stringology::BNDMMatcher < SymbolType, BitmaskBitCount > construct ( const string::LinearString < SymbolType > & string ); + +}; + +template < class SymbolType, size_t BitmaskBitCount > +indexes::stringology::BNDMMatcher < SymbolType, BitmaskBitCount > BNDMMatcherConstruction::construct ( const string::LinearString < SymbolType > & w ) { + size_t bitmaskLength = std::min ( w.getContent ( ).size ( ), BitmaskBitCount ); + + std::map < SymbolType, std::bitset < BitmaskBitCount > > res; + for ( const SymbolType & symbol : w.getAlphabet ( ) ) + res [ symbol ] = std::bitset < BitmaskBitCount > ( 0 ); + + for ( unsigned i = 0; i < bitmaskLength; ++i ) + res [ w.getContent ( ) [ i ] ] [ bitmaskLength - i - 1 ] = true; + + return indexes::stringology::BNDMMatcher < SymbolType, BitmaskBitCount > ( w.getAlphabet ( ), res, w.getContent ( ) ); +} + +} /* namespace matching */ + +} /* namespace stringology */ + +#endif /* BNDM_MATCHER_CONSTRUCTION_H_ */ diff --git a/alib2algo/src/stringology/matching/WideBNDMMatcherConstruction.cpp b/alib2algo/src/stringology/matching/WideBNDMMatcherConstruction.cpp new file mode 100644 index 0000000000..1357d72957 --- /dev/null +++ b/alib2algo/src/stringology/matching/WideBNDMMatcherConstruction.cpp @@ -0,0 +1,24 @@ +/* + * WideBNDMMatcherConstruction.cpp + * + * Created on: 6. 2. 
2017 + * Author: Jan Travnicek + */ + +#include "WideBNDMMatcherConstruction.h" + +#include <string/LinearString.h> + +namespace stringology { + +namespace matching { + +indexes::stringology::BitParallelIndex < DefaultSymbolType > WideBNDMMatcherConstruction::construct ( const string::String & string ) { + return dispatch ( string.getData ( ) ); +} + +auto WideBNDMIndexConstructionLinearString = WideBNDMMatcherConstruction::RegistratorWrapper < indexes::stringology::BitParallelIndex < DefaultSymbolType >, string::LinearString < > > ( WideBNDMMatcherConstruction::construct ); + +} /* namespace matching */ + +} /* namespace stringology */ diff --git a/alib2algo/src/stringology/matching/WideBNDMMatcherConstruction.h b/alib2algo/src/stringology/matching/WideBNDMMatcherConstruction.h new file mode 100644 index 0000000000..f5ccf345af --- /dev/null +++ b/alib2algo/src/stringology/matching/WideBNDMMatcherConstruction.h @@ -0,0 +1,56 @@ +/* + * WideBNDMMatcherConstruction.h + * + * Created on: 6. 2. 2017 + * Author: Jan Travnicek + */ + +#ifndef WIDE_BNDM_MATCHER_CONSTRUCTION_H_ +#define WIDE_BNDM_MATCHER_CONSTRUCTION_H_ + +#include <indexes/stringology/BitParallelIndex.h> +#include <string/String.h> +#include <string/LinearString.h> +#include <core/multipleDispatch.hpp> +#include <exception/CommonException.h> + +namespace stringology { + +namespace matching { + +/** + * Constructs a bit parallel index for given string. 
+ * + */ + +class WideBNDMMatcherConstruction : public std::SingleDispatch < WideBNDMMatcherConstruction, indexes::stringology::BitParallelIndex < DefaultSymbolType >, const string::StringBase & > { +public: + /** + * Builds a bit parallel index of the reversed pattern + * @param string pattern whose alphabet symbols get one bit vector each + * @return index holding the alphabet and, per symbol, a bit vector as long as the pattern + */ + static indexes::stringology::BitParallelIndex < DefaultSymbolType > construct ( const string::String & string ); + + template < class SymbolType > + static indexes::stringology::BitParallelIndex < SymbolType > construct ( const string::LinearString < SymbolType > & string ); + +}; + +template < class SymbolType > +indexes::stringology::BitParallelIndex < SymbolType > WideBNDMMatcherConstruction::construct ( const string::LinearString < SymbolType > & w ) { + std::map < SymbolType, std::vector < bool > > res; + for ( const SymbolType & symbol : w.getAlphabet ( ) ) + res [ symbol ].resize ( w.getContent ( ).size ( ) ); + + for ( unsigned i = 0; i < w.getContent ( ).size ( ); ++i ) + res [ w.getContent ( ) [ i ] ] [ w.getContent ( ).size ( ) - i - 1 ] = true; + + return indexes::stringology::BitParallelIndex < SymbolType > ( w.getAlphabet ( ), res ); +} + +} /* namespace matching */ + +} /* namespace stringology */ + +#endif /* WIDE_BNDM_MATCHER_CONSTRUCTION_H_ */ diff --git a/alib2algo/src/stringology/query/BNDMOccurrences.cpp b/alib2algo/src/stringology/query/BNDMOccurrences.cpp new file mode 100644 index 0000000000..d184d80aa4 --- /dev/null +++ b/alib2algo/src/stringology/query/BNDMOccurrences.cpp @@ -0,0 +1,24 @@ +/* + * BNDMOccurrences.cpp + * + * Created on: 2. 1. 
2017 + * Author: Jan Travnicek + */ + +#include "BNDMOccurrences.h" + +#include <string/LinearString.h> + +namespace stringology { + +namespace query { + +std::set < unsigned > BNDMOccurrences::query ( const indexes::stringology::BNDMMatcher < > & pattern, const string::String & subject ) { + return dispatch ( pattern, subject.getData ( ) ); +} + +auto bndmOccurrencesLinearString = BNDMOccurrences::RegistratorWrapper < std::set < unsigned >, string::LinearString < > > ( BNDMOccurrences::query ); + +} /* namespace query */ + +} /* namespace stringology */ diff --git a/alib2algo/src/stringology/query/BNDMOccurrences.h b/alib2algo/src/stringology/query/BNDMOccurrences.h new file mode 100644 index 0000000000..77252196b8 --- /dev/null +++ b/alib2algo/src/stringology/query/BNDMOccurrences.h @@ -0,0 +1,101 @@ +/* + * BNDMOccurrences.h + * + * Created on: 2. 1. 2017 + * Author: Jan Travnicek + */ + +#ifndef BNDM_OCCURRENCES_H_ +#define BNDM_OCCURRENCES_H_ + +#include <indexes/stringology/BNDMMatcher.h> +#include <string/String.h> +#include <string/LinearString.h> +#include <core/multipleDispatch.hpp> +#include <global/GlobalData.h> + +#include <foreach> + +namespace stringology { + +namespace query { + +/** + * Based on backward nondeterministic dawg matching. 
+ * + */ + +class BNDMOccurrences : public std::SingleDispatchFirstStaticParam < BNDMOccurrences, std::set < unsigned >, const indexes::stringology::BNDMMatcher < > &, const string::StringBase & > { + +public: + /** + * Finds the occurrences of a BNDM-preprocessed pattern + * @param pattern matcher built from the searched pattern + * @param subject string to search in + * @return start positions of the pattern in the subject; none if the pattern is empty or longer than the subject + */ + static std::set < unsigned > query ( const indexes::stringology::BNDMMatcher < DefaultSymbolType > & pattern, const string::String & subject ); + + template < class SymbolType, size_t BitmaskBitCount > + static std::set < unsigned > query ( const indexes::stringology::BNDMMatcher < SymbolType, BitmaskBitCount > & pattern, const string::LinearString < SymbolType > & subject ); + +}; + +template < class SymbolType, size_t BitmaskBitCount > +std::set < unsigned > BNDMOccurrences::query ( const indexes::stringology::BNDMMatcher < SymbolType, BitmaskBitCount > & pattern, const string::LinearString < SymbolType > & subject ) { + + std::set < unsigned > occ; + + size_t patternLength = pattern.getString ( ).size ( ); + size_t subjectLength = subject.getContent ( ).size ( ); + size_t posInSubject = 0; + size_t bitmaskLength = std::min ( BitmaskBitCount, patternLength ); + + std::bitset < BitmaskBitCount > currentBitmask; + // bound checked by addition: subjectLength - patternLength is unsigned and wraps for a pattern longer than the subject; an empty pattern would shift by 0 forever + while ( patternLength > 0 && posInSubject + patternLength <= subjectLength ) { + size_t posInPattern = bitmaskLength; + size_t lastPosOfFactor = bitmaskLength; + + // Set the bitmask to all ones + currentBitmask.set ( ); + + while ( posInPattern > 0 && currentBitmask.any ( ) ) { + typename std::map < SymbolType, std::bitset < BitmaskBitCount > >::const_iterator symbolVectorIter = pattern.getData ( ).find ( subject.getContent ( ).at ( posInSubject + posInPattern - 1 ) ); + if ( symbolVectorIter == pattern.getData ( ).end ( ) ) + break; + + currentBitmask &= symbolVectorIter->second; + posInPattern--; + + // Test whether the most significant bit is set + if ( currentBitmask.test ( bitmaskLength - 1 ) ) { + // and we didn't process 
all symbols of the pattern + if ( posInPattern > 0 ) + lastPosOfFactor = posInPattern; + else { + size_t k = bitmaskLength; + + // pattern longer than the bitmask: verify its remaining symbols one by one + while ( k < patternLength && pattern.getString ( ).at ( k ) == subject.getContent ( ).at ( posInSubject + k ) ) k++; + + if ( k == patternLength ) + // Yay, there is match!!! + occ.insert ( posInSubject ); + } + } + + currentBitmask <<= 1; + } + + posInSubject += lastPosOfFactor; + } + + return occ; +} + +} /* namespace query */ + +} /* namespace stringology */ + +#endif /* BNDM_OCCURRENCES_H_ */ diff --git a/alib2algo/src/stringology/query/WideBNDMOccurrences.cpp b/alib2algo/src/stringology/query/WideBNDMOccurrences.cpp new file mode 100644 index 0000000000..e028f64898 --- /dev/null +++ b/alib2algo/src/stringology/query/WideBNDMOccurrences.cpp @@ -0,0 +1,24 @@ +/* + * WideBNDMOccurrences.cpp + * + * Created on: 2. 1. 
2017 + * Author: Jan Travnicek + */ + +#include "WideBNDMOccurrences.h" + +#include <string/LinearString.h> + +namespace stringology { + +namespace query { + +std::set < unsigned > WideBNDMOccurrences::query ( const indexes::stringology::BitParallelIndex < DefaultSymbolType > & pattern, const string::String & subject ) { + return dispatch ( pattern, subject.getData ( ) ); +} + +auto wideBNDMOccurrencesLinearString = WideBNDMOccurrences::RegistratorWrapper < std::set < unsigned >, string::LinearString < > > ( WideBNDMOccurrences::query ); + +} /* namespace query */ + +} /* namespace stringology */ diff --git a/alib2algo/src/stringology/query/WideBNDMOccurrences.h b/alib2algo/src/stringology/query/WideBNDMOccurrences.h new file mode 100644 index 0000000000..94f632d9cc --- /dev/null +++ b/alib2algo/src/stringology/query/WideBNDMOccurrences.h @@ -0,0 +1,94 @@ +/* + * WideBNDMOccurrences.h + * + * Created on: 2. 1. 2017 + * Author: Jan Travnicek + */ + +#ifndef WIDE_BNDM_OCCURRENCES_H_ +#define WIDE_BNDM_OCCURRENCES_H_ + +#include <indexes/stringology/BitParallelIndex.h> +#include <string/String.h> +#include <string/LinearString.h> +#include <core/multipleDispatch.hpp> +#include <global/GlobalData.h> + +#include <foreach> + +namespace stringology { + +namespace query { + +/** + * Based on backward nondeterministic dawg matching. + * + */ + +class WideBNDMOccurrences : public std::SingleDispatchFirstStaticParam < WideBNDMOccurrences, std::set < unsigned >, const indexes::stringology::BitParallelIndex < DefaultSymbolType > &, const string::StringBase & > { + +public: + /** + * Finds the occurrences of a pattern given as a bit parallel index + * @param pattern bit parallel index built from the searched pattern + * @param subject string to search in + * @return start positions of the pattern in the subject; none if the index is empty, the pattern is empty or longer than the subject + */ + static std::set < unsigned > query ( const indexes::stringology::BitParallelIndex < DefaultSymbolType > & pattern, const string::String & subject ); + + template < class SymbolType > + static std::set < unsigned > query ( const indexes::stringology::BitParallelIndex < SymbolType > & pattern, const string::LinearString < SymbolType > & subject ); + +}; + +template < class SymbolType > +std::set < unsigned > WideBNDMOccurrences::query ( const indexes::stringology::BitParallelIndex < SymbolType > & pattern, const string::LinearString < SymbolType > & subject ) { + + std::set < unsigned > occ; + // the pattern length is read from the first bit vector; an index without any vector is treated as an empty pattern instead of dereferencing end ( ) + size_t patternLength = pattern.getData ( ).empty ( ) ? 0 : pattern.getData ( ).begin ( )->second.size ( ); + size_t subjectLength = subject.getContent ( ).size ( ); + size_t posInSubject = 0; + + std::vector < bool > currentBitmask; + currentBitmask.resize ( patternLength ); + // bound checked by addition: subjectLength - patternLength is unsigned and wraps for a pattern longer than the subject; an empty pattern would shift by 0 forever + while ( patternLength > 0 && posInSubject + patternLength <= subjectLength ) { + size_t posInPattern = patternLength; + size_t lastPosOfFactor = patternLength; + + // Set the bitmask to all ones + std::fill ( currentBitmask ); + + while ( posInPattern > 0 && std::any ( currentBitmask ) ) { + typename std::map < SymbolType, std::vector < bool > >::const_iterator 
symbolVectorIter = pattern.getData ( ).find ( subject.getContent ( ).at ( posInSubject + posInPattern - 1 ) ); + if ( symbolVectorIter == pattern.getData ( ).end ( ) ) + break; + + currentBitmask &= symbolVectorIter->second; + posInPattern--; + + // Test whether the most significant bit is set + if ( currentBitmask [ patternLength - 1 ] ) { + // and we didn't process all symbols of the pattern + if ( posInPattern > 0 ) + lastPosOfFactor = posInPattern; + else /* posInPattern == 0 */ + // Yay, there is match!!! + occ.insert ( posInSubject ); + } + + currentBitmask <<= 1; + } + + posInSubject += lastPosOfFactor; + } + + return occ; +} + +} /* namespace query */ + +} /* namespace stringology */ + +#endif /* WIDE_BNDM_OCCURRENCES_H_ */ diff --git a/alib2algo/test-src/stringology/matching/BackwardNondeterministicDAWGMatchingTest.cpp b/alib2algo/test-src/stringology/matching/BackwardNondeterministicDAWGMatchingTest.cpp new file mode 100644 index 0000000000..e07b3baaf8 --- /dev/null +++ b/alib2algo/test-src/stringology/matching/BackwardNondeterministicDAWGMatchingTest.cpp @@ -0,0 +1,58 @@ +#include "BackwardNondeterministicDAWGMatchingTest.h" + +#include <string/String.h> +#include <stringology/matching/WideBNDMMatcherConstruction.h> +#include <stringology/query/WideBNDMOccurrences.h> +#include <stringology/matching/BNDMMatcherConstruction.h> +#include <stringology/query/BNDMOccurrences.h> +#include <stringology/exact/ExactFactorMatch.h> + +#include <string/generate/RandomStringFactory.h> +#include <string/generate/RandomSubstringFactory.h> + +#include <primitive/Character.h> + +CPPUNIT_TEST_SUITE_NAMED_REGISTRATION ( BackwardNondeterministicDAWGMatchingTest, "stringology" ); +CPPUNIT_TEST_SUITE_REGISTRATION ( BackwardNondeterministicDAWGMatchingTest ); + +void BackwardNondeterministicDAWGMatchingTest::setUp ( ) { +} + +void BackwardNondeterministicDAWGMatchingTest::tearDown ( ) { +} + +void BackwardNondeterministicDAWGMatchingTest::testBNDM ( ) { + 
std::vector<std::string> subjects; + std::vector<std::string> patterns; + std::vector<std::set<unsigned>> expectedOccs; + + subjects.push_back("a"); patterns.push_back("a"); expectedOccs.push_back({0}); + subjects.push_back("a"); patterns.push_back("b"); expectedOccs.push_back({}); + subjects.push_back("alfalfalfa"); patterns.push_back("alfalfalfa"); expectedOccs.push_back({0}); + subjects.push_back("alfalfalfa"); patterns.push_back("blfalfalfa"); expectedOccs.push_back({}); + subjects.push_back("alfalfalfa"); patterns.push_back("alfalfalfb"); expectedOccs.push_back({}); + subjects.push_back("alfalfalfaalfalfalfaalfalfalfaalfalfalfaalfalfalfaalfalfalfaalfa"); patterns.push_back("alfalfalfaalfalfalfaalfalfalfaalfalfalfaalfalfalfaalfalfalfaalfa"); expectedOccs.push_back({0}); + subjects.push_back("alfalfalfaalfalfalfaabfalfalfaalfalfalfaalfalfalfaalfalfalfaalfa"); patterns.push_back("alfalfalfaalfalfalfaalfalfalfaalfalfalfaalfalfalfaalfalfalfaalfa"); expectedOccs.push_back({}); + subjects.push_back("alfalfalfaalfalfalfaalfalfalfaalfalfalfaalfalfalfaalfalfalfaalfaa"); patterns.push_back("alfalfalfaalfalfalfaalfalfalfaalfalfalfaalfalfalfaalfalfalfaalfaa"); expectedOccs.push_back({0}); + subjects.push_back("atggccttgcc"); patterns.push_back("gcc"); expectedOccs.push_back({3,8}); + subjects.push_back("aaaaaaaaaa"); patterns.push_back("a"); expectedOccs.push_back({0,1,2,3,4,5,6,7,8,9}); + + for(size_t i = 0; i < subjects.size(); ++i) { + indexes::stringology::BitParallelIndex < char > bndmPattern1 = stringology::matching::WideBNDMMatcherConstruction::construct ( string::LinearString < char > ( patterns[i] ) ); + indexes::stringology::BNDMMatcher < char > bndmPattern2 = stringology::matching::BNDMMatcherConstruction::construct ( string::LinearString < char > ( patterns[i] ) ); + std::set < unsigned > res1 = stringology::query::WideBNDMOccurrences::query ( bndmPattern1, string::LinearString < char > ( subjects[i] ) ); + std::set < unsigned > res2 = 
stringology::query::BNDMOccurrences::query ( bndmPattern2, string::LinearString < char > ( subjects[i] ) ); + CPPUNIT_ASSERT ( res1 == expectedOccs[i] ); + CPPUNIT_ASSERT ( res2 == expectedOccs[i] ); + std::cout << subjects[i] << ' ' << patterns[i] << ' ' << res1 << std::endl; + } + + auto longSubject = string::generate::RandomStringFactory::generateLinearString (64 * 64 * 64, 512, false, true); + auto longPattern = string::generate::RandomSubstringFactory::generateSubstring(64 * 32 * 32, longSubject ); + indexes::stringology::BNDMMatcher < > pattern = stringology::matching::BNDMMatcherConstruction::construct ( longPattern ); + std::set < unsigned > res = stringology::query::BNDMOccurrences::query ( pattern, longSubject ); + std::set < unsigned > ref = stringology::exact::ExactFactorMatch::match ( longSubject, longPattern ); + std::cout << "long: " << res << std::endl; + CPPUNIT_ASSERT ( res == ref); +} diff --git a/alib2algo/test-src/stringology/matching/BackwardNondeterministicDAWGMatchingTest.h b/alib2algo/test-src/stringology/matching/BackwardNondeterministicDAWGMatchingTest.h new file mode 100644 index 0000000000..9d50ac034b --- /dev/null +++ b/alib2algo/test-src/stringology/matching/BackwardNondeterministicDAWGMatchingTest.h @@ -0,0 +1,18 @@ +#ifndef BACKWARD_NONDETERMINISTIC_DAWG_MATCHING_TEST_H_ +#define BACKWARD_NONDETERMINISTIC_DAWG_MATCHING_TEST_H_ + +#include <cppunit/extensions/HelperMacros.h> + +class BackwardNondeterministicDAWGMatchingTest : public CppUnit::TestFixture { + CPPUNIT_TEST_SUITE ( BackwardNondeterministicDAWGMatchingTest ); + CPPUNIT_TEST ( testBNDM ); + CPPUNIT_TEST_SUITE_END ( ); + +public: + void setUp ( ); + void tearDown ( ); + + void testBNDM ( ); +}; + +#endif // BACKWARD_NONDETERMINISTIC_DAWG_MATCHING_TEST_H_ diff --git a/alib2data/src/indexes/stringology/BNDMMatcher.cpp b/alib2data/src/indexes/stringology/BNDMMatcher.cpp new file mode 100644 index 0000000000..091929d0e4 --- /dev/null +++ 
b/alib2data/src/indexes/stringology/BNDMMatcher.cpp @@ -0,0 +1,14 @@ +/* + * BNDMMatcher.cpp + * + * Created on: Jan 8, 2017 + * Author: Jan Travnicek + */ + +#include "BNDMMatcher.h" + +namespace alib { + +auto bndmMatcherParserRegister = xmlApi < alib::Object >::ParserRegister < indexes::stringology::BNDMMatcher < > > ( ); + +} /* namespace alib */ diff --git a/alib2data/src/indexes/stringology/BNDMMatcher.h b/alib2data/src/indexes/stringology/BNDMMatcher.h new file mode 100644 index 0000000000..ea82fa48a5 --- /dev/null +++ b/alib2data/src/indexes/stringology/BNDMMatcher.h @@ -0,0 +1,243 @@ +/* + * BNDMMatcher.h + * + * Created on: Jan 8, 2017 + * Author: Jan Travnicek + */ + +#ifndef BNDM_MATCHER_H_ +#define BNDM_MATCHER_H_ + +#include <string> +#include <iostream> +#include <sstream> + +#include <common/DefaultSymbolType.h> + +#include <core/components.hpp> +#include <exception/CommonException.h> + +#include <object/UniqueObject.h> +#include <object/ObjectBase.h> + +#include <sax/FromXMLParserHelper.h> +#include <core/xmlApi.hpp> + +#include <container/ObjectsSet.h> +#include <container/ObjectsMap.h> +#include <container/ObjectsVector.h> +#include <container/ObjectsBitset.h> +#include <primitive/Bool.h> +#include <bitset> + +#include <alphabet/common/SymbolNormalize.h> + +namespace indexes { + +namespace stringology { + +class GeneralAlphabet; + +/** + * Pattern preprocessed for BNDM matching: a bit mask of BitmaskBitCount bits per symbol, + * stored together with the alphabet and the pattern itself. 
+ */ +template < class SymbolType = DefaultSymbolType, size_t BitmaskBitCount = 64 > +class BNDMMatcher : public alib::ObjectBase, public std::Components < BNDMMatcher < SymbolType, BitmaskBitCount >, SymbolType, std::tuple < GeneralAlphabet >, std::tuple < > > { +protected: + std::map < SymbolType, std::bitset < BitmaskBitCount > > m_vectors; + std::vector < SymbolType > m_string; + +public: + /** + * @return newly allocated copy of this matcher + */ + virtual ObjectBase * clone ( ) const; + + /** + * @return newly allocated matcher that took over (moved) the content of this one + */ + virtual ObjectBase * plunder ( ) &&; + + explicit BNDMMatcher ( std::set < SymbolType > alphabet, std::map < SymbolType, std::bitset < BitmaskBitCount > > vectors, std::vector < SymbolType > string ); + + /** + * @return bit masks of the pattern indexed by symbol + */ + const std::map < SymbolType, std::bitset < BitmaskBitCount > > & getData ( ) const; + + const std::vector < SymbolType > & getString ( ) const; + + const std::set < SymbolType > & getAlphabet ( ) const { + return this->template accessComponent < GeneralAlphabet > ( ).get ( ); + } + + /** + * Sets the bit vector for given symbol + * @param symbol symbol whose bit vector is stored or replaced by data + */ + void setBitVectorForSymbol ( SymbolType symbol, std::bitset < BitmaskBitCount > data ); + + /** + * Removes symbol from the alphabet of the matcher + * @param symbol removed symbol from the alphabet + */ + bool removeSymbolFromAlphabet ( const SymbolType & symbol ) { + return this->template accessComponent < GeneralAlphabet > ( ).remove ( symbol ); + } + + /** + * Prints a textual representation of the matcher to the output stream. 
+ * @param out output stream to print to + * (only the bit vectors are printed, not the alphabet or the pattern) + */ + virtual void operator >>( std::ostream & out ) const; + + virtual int compare ( const ObjectBase & other ) const { + if ( std::type_index ( typeid ( * this ) ) == std::type_index ( typeid ( other ) ) ) return this->compare ( ( decltype ( * this ) )other ); + + return std::type_index ( typeid ( * this ) ) - std::type_index ( typeid ( other ) ); + } + + virtual int compare ( const BNDMMatcher & other ) const; + + virtual explicit operator std::string ( ) const; + + static const std::string & getXmlTagName() { + static std::string xmlTagName = "BNDMMatcher"; + + return xmlTagName; + } + + static BNDMMatcher parse ( std::deque < sax::Token >::iterator & input ); + + void compose ( std::deque < sax::Token > & out ) const; + + virtual alib::ObjectBase * inc ( ) &&; + + virtual ObjectBase * normalize ( ) && { + if ( typeid ( BNDMMatcher < DefaultSymbolType, BitmaskBitCount > ) == typeid ( BNDMMatcher < SymbolType, BitmaskBitCount > ) ) + return this; + // the normalized matcher keeps BitmaskBitCount: the masks moved below are std::bitset < BitmaskBitCount > + std::set < DefaultSymbolType > alphabet = alphabet::SymbolNormalize::normalizeAlphabet ( std::move ( this->template accessComponent < GeneralAlphabet > ( ).get ( ) ) ); + + std::map < DefaultSymbolType, std::bitset < BitmaskBitCount > > vectors; + for ( std::pair < SymbolType, std::bitset < BitmaskBitCount > > && vector : std::make_moveable_map ( m_vectors ) ) + vectors.insert ( std::make_pair ( alphabet::SymbolNormalize::normalizeSymbol ( std::move ( vector.first ) ), std::move ( vector.second ) ) ); + + std::vector < DefaultSymbolType > string = alphabet::SymbolNormalize::normalizeSymbols ( std::move ( m_string ) ); + + return new BNDMMatcher < DefaultSymbolType, BitmaskBitCount > ( std::move ( alphabet ), std::move ( vectors ), std::move ( string ) ); + } +}; + +} /* namespace stringology */ + +} /* namespace indexes */ + +namespace indexes { + +namespace stringology { + +template < class SymbolType, size_t BitmaskBitCount > +BNDMMatcher < SymbolType, BitmaskBitCount >::BNDMMatcher ( std::set < SymbolType > 
alphabet, std::map < SymbolType, std::bitset < BitmaskBitCount > > vectors, std::vector < SymbolType > string ) : std::Components < BNDMMatcher, SymbolType, std::tuple < GeneralAlphabet >, std::tuple < > > ( std::make_tuple ( std::move ( alphabet ) ), std::tuple < > ( ) ), m_vectors ( std::move ( vectors ) ), m_string ( std::move ( string ) ) { +} + +template < class SymbolType, size_t BitmaskBitCount > +alib::ObjectBase * BNDMMatcher < SymbolType, BitmaskBitCount >::clone ( ) const { + return new BNDMMatcher ( * this ); +} + +template < class SymbolType, size_t BitmaskBitCount > +alib::ObjectBase * BNDMMatcher < SymbolType, BitmaskBitCount >::plunder ( ) && { + return new BNDMMatcher ( std::move ( * this ) ); +} + +template < class SymbolType, size_t BitmaskBitCount > +const std::map < SymbolType, std::bitset < BitmaskBitCount > > & BNDMMatcher < SymbolType, BitmaskBitCount >::getData ( ) const { + return m_vectors; +} + +template < class SymbolType, size_t BitmaskBitCount > +const std::vector < SymbolType > & BNDMMatcher < SymbolType, BitmaskBitCount >::getString ( ) const { + return m_string; +} + +template < class SymbolType, size_t BitmaskBitCount > +void BNDMMatcher < SymbolType, BitmaskBitCount >::setBitVectorForSymbol ( SymbolType symbol, std::bitset < BitmaskBitCount > data ) { + this->m_vectors [ symbol ] = std::move ( data ); +} + +template < class SymbolType, size_t BitmaskBitCount > +void BNDMMatcher < SymbolType, BitmaskBitCount >::operator >>( std::ostream & out ) const { + out << "(BNDMMatcher " << this->m_vectors << ")"; +} + +template < class SymbolType, size_t BitmaskBitCount > +int BNDMMatcher < SymbolType, BitmaskBitCount >::compare ( const BNDMMatcher & other ) const { + auto first = std::tie ( getData ( ), getAlphabet ( ), getString ( ) ); + auto second = std::tie ( other.getData ( ), other.getAlphabet ( ), other.getString ( ) ); + // the pattern takes part too: a mask has only BitmaskBitCount bits, so masks alone cannot tell longer patterns apart + static std::compare < decltype ( first ) > comp; + + return comp ( first, second ); +} + +template < class SymbolType, size_t BitmaskBitCount > 
+BNDMMatcher < SymbolType, BitmaskBitCount >::operator std::string ( ) const { + std::stringstream ss; + ss << * this; + return ss.str ( ); +} + +template < class SymbolType, size_t BitmaskBitCount > +BNDMMatcher < SymbolType, BitmaskBitCount > BNDMMatcher < SymbolType, BitmaskBitCount >::parse ( std::deque < sax::Token >::iterator & input ) { + sax::FromXMLParserHelper::popToken ( input, sax::Token::TokenType::START_ELEMENT, BNDMMatcher::getXmlTagName() ); + std::set < SymbolType > alphabet = alib::xmlApi < std::set < SymbolType > >::parse ( input ); + std::map < SymbolType, std::bitset < BitmaskBitCount > > data = alib::xmlApi < std::map < SymbolType, std::bitset < BitmaskBitCount > > >::parse ( input ); + std::vector < SymbolType > string = alib::xmlApi < std::vector < SymbolType > >::parse ( input ); + BNDMMatcher < SymbolType, BitmaskBitCount > res ( std::move ( alphabet ), std::move ( data ), std::move ( string ) ); + + sax::FromXMLParserHelper::popToken ( input, sax::Token::TokenType::END_ELEMENT, BNDMMatcher::getXmlTagName() ); + return res; +} + +template < class SymbolType, size_t BitmaskBitCount > +void BNDMMatcher < SymbolType, BitmaskBitCount >::compose ( std::deque < sax::Token > & out ) const { + out.emplace_back ( BNDMMatcher::getXmlTagName(), sax::Token::TokenType::START_ELEMENT ); + alib::xmlApi < std::set < SymbolType > >::compose ( out, getAlphabet ( ) ); + alib::xmlApi < std::map < SymbolType, std::bitset < BitmaskBitCount > > >::compose ( out, getData ( ) ); + alib::xmlApi < std::vector < SymbolType > >::compose ( out, getString ( ) ); + out.emplace_back ( BNDMMatcher::getXmlTagName(), sax::Token::TokenType::END_ELEMENT ); +} + +template < class SymbolType, size_t BitmaskBitCount > +alib::ObjectBase* BNDMMatcher < SymbolType, BitmaskBitCount >::inc() && { + return new alib::UniqueObject(alib::Object(std::move(*this)), primitive::Integer(0)); +} + +} /* namespace stringology */ + +} /* namespace indexes */ + +namespace std { + +template < class 
SymbolType, size_t BitmaskBitCount > +class ComponentConstraint < indexes::stringology::BNDMMatcher < SymbolType, BitmaskBitCount >, SymbolType, indexes::stringology::GeneralAlphabet > { +public: + static bool used ( const indexes::stringology::BNDMMatcher < SymbolType, BitmaskBitCount > & index, const SymbolType & symbol ) { + const std::map < SymbolType, std::bitset < BitmaskBitCount > > & content = index.getData ( ); + return content.find( symbol ) != content.end(); + } + + static bool available ( const indexes::stringology::BNDMMatcher < SymbolType, BitmaskBitCount > &, const SymbolType & ) { + return true; + } + + static void valid ( const indexes::stringology::BNDMMatcher < SymbolType, BitmaskBitCount > &, const SymbolType & ) { + } +}; + +} /* namespace std */ + +#endif /* BNDM_MATCHER_H_ */ diff --git a/alib2std/src/extensions/vector.hpp b/alib2std/src/extensions/vector.hpp index 52c4af268c..09bb51c568 100644 --- a/alib2std/src/extensions/vector.hpp +++ b/alib2std/src/extensions/vector.hpp @@ -218,9 +218,8 @@ vector < bool, Ts ... > & operator >>= ( vector < bool, Ts ... > & A, size_t dis auto itAReverse = A.end ( ) - 1; // upper part of the last word in the vector can contain some garbage so it needs to be cleared - vectorBoolInternalType maskTopWord = getMask ( sizeWithin ); - if ( maskTopWord != 0 ) - * ( itAReverse._M_p ) &= maskTopWord; + if ( sizeWithin != 0 ) + * ( itAReverse._M_p ) &= getMask ( sizeWithin ); // simulate behavior of reverse iterator while ( itAReverse >= A.begin ( ) ) { @@ -254,10 +253,26 @@ bool any ( const vector < bool, Ts ... > & v ) { if ( sizeWithin == 0 ) return * itV._M_p != 0; - else { - vectorBoolInternalType maskTopWord = getMask ( sizeWithin ); - return ( * itV._M_p & maskTopWord ) != 0; - } + else + return ( * itV._M_p & getMask ( sizeWithin ) ) != 0; +} + +template < class ... Ts > +void fill ( vector < bool, Ts ... > & v ) { + typename vector < bool, Ts ... 
>::iterator itV = v.begin ( ); + + // c++ implementation-specific: sets whole storage words through _M_p (unused bits of the last word included), hence v is taken by mutable reference + while ( itV < v.end ( ) ) + * ( itV._M_p ++ ) = ~ vectorBoolInternalType { }; +} + +template < class ... Ts > +void clear ( vector < bool, Ts ... > & v ) { + typename vector < bool, Ts ... >::iterator itV = v.begin ( ); + + // c++ implementation-specific: zeroes whole storage words through _M_p, hence v is taken by mutable reference + while ( itV < v.end ( ) ) + * ( itV._M_p ++ ) = 0; } } /* namespace std */ diff --git a/astringology2/src/astringology.cpp b/astringology2/src/astringology.cpp index 9d2f3fa84e..4c770213af 100644 --- a/astringology2/src/astringology.cpp +++ b/astringology2/src/astringology.cpp @@ -24,7 +24,7 @@ #include <stringology/exact/BoyerMoore.h> #include <stringology/exact/ReversedBoyerMooreHorspool.h> #include <stringology/exact/DeadZoneUsingBadCharacterShift.h> -#include <stringology/exact/BackwardNondeterministicDAWGMatching.hpp> +#include <stringology/query/BNDMOccurrences.h> #include <stringology/exact/BackwardOracleMatching.h> #include <stringology/exact/BackwardDAWGMatching.h> #include <stringology/exact/ExactMatchingAutomaton.h> @@ -39,6 +39,7 @@ #include <stringology/indexing/PositionHeapNaive.h> #include <stringology/indexing/SuffixArrayNaive.h> #include <stringology/indexing/BitParallelIndexConstruction.h> +#include <stringology/matching/BNDMMatcherConstruction.h> #include <stringology/indexing/CompressedBitParallelIndexConstruction.h> int main ( int argc, char * argv[] ) { @@ -61,12 +62,13 @@ int main ( int argc, char * argv[] ) { allowed.push_back ( "boyerMoore" ); allowed.push_back ( "reversedBoyerMooreHorspool" ); allowed.push_back ( "deadZoneUsingBadCharacterShift" ); - allowed.push_back ( "backwardNondeterministicDAWGMatching" ); + allowed.push_back ( "bndmOccurrences" ); allowed.push_back ( "backwardOracleMatching" ); allowed.push_back ( "backwardDAWGMatching" ); allowed.push_back ( "suffixTrie" ); allowed.push_back ( "positionHeap" ); allowed.push_back ( "bitParallelIndex" ); + allowed.push_back ( "bndmMatcher" ); 
allowed.push_back ( "compressedBitParallelIndex" ); allowed.push_back ( "suffixArray" ); @@ -165,14 +167,14 @@ int main ( int argc, char * argv[] ) { measurements::start ( "Output write", measurements::Type::AUXILIARY ); alib::XmlDataFactory::toStdout ( res ); - } else if ( algorithm.getValue ( ) == "backwardNondeterministicDAWGMatching" ) { + } else if ( algorithm.getValue ( ) == "bndmOccurrences" ) { string::String subject = alib::XmlDataFactory::fromTokens ( std::move ( sax::FromXMLParserHelper::parseInput(true, subjectInput).front ( ) ) ); - string::String pattern = alib::XmlDataFactory::fromTokens ( std::move ( sax::FromXMLParserHelper::parseInput(true, patternInput).front ( ) ) ); + indexes::stringology::BNDMMatcher < DefaultSymbolType > pattern = alib::XmlDataFactory::fromTokens ( std::move ( sax::FromXMLParserHelper::parseInput(true, patternInput).front ( ) ) ); measurements::end ( ); measurements::start ( "Algorithm", measurements::Type::MAIN ); - std::set < unsigned > res = stringology::exact::BackwardNondeterministicDAWGMatching::match ( subject, pattern ); + std::set < unsigned > res = stringology::query::BNDMOccurrences::query ( pattern, subject ); measurements::end ( ); measurements::start ( "Output write", measurements::Type::AUXILIARY ); @@ -341,6 +343,18 @@ int main ( int argc, char * argv[] ) { measurements::start ( "Output write", measurements::Type::AUXILIARY ); alib::XmlDataFactory::toStdout ( bitParallelIndex ); + } else if ( algorithm.getValue ( ) == "bndmMatcher" ) { + string::String pattern = alib::XmlDataFactory::fromTokens ( std::move ( sax::FromXMLParserHelper::parseInput(true, patternInput).front ( ) ) ); + + measurements::end ( ); + measurements::start ( "Algorithm", measurements::Type::MAIN ); + + indexes::stringology::BNDMMatcher < DefaultSymbolType > bndmMatcher = stringology::matching::BNDMMatcherConstruction::construct ( pattern ); + + measurements::end ( ); + measurements::start ( "Output write", measurements::Type::AUXILIARY ); 
+ + alib::XmlDataFactory::toStdout ( bndmMatcher ); } else if ( algorithm.getValue ( ) == "compressedBitParallelIndex" ) { string::String subject = alib::XmlDataFactory::fromTokens ( std::move ( sax::FromXMLParserHelper::parseInput(true, subjectInput).front ( ) ) ); diff --git a/tests.astringology.sh b/tests.astringology.sh index 33c521d1fe..711615434a 100755 --- a/tests.astringology.sh +++ b/tests.astringology.sh @@ -212,6 +212,7 @@ function runTest { clearResults } +runTest "BNDM Matcher" "./astringology2 -a bndmMatcher -p \"\$PATTERN_FILE\" | ./astringology2 -a bndmOccurrences -p - -s \"\$SUBJECT_FILE\" | ./astat2 -p size" runTest "Exact Boyer Moore" "./astringology2 -a boyerMoore -s \"\$SUBJECT_FILE\" -p <(./aaccess2 --string alphabet -o add -i \"\$PATTERN_FILE\" -a <(./aaccess2 --string alphabet -o get -i \"\$SUBJECT_FILE\")) | ./astat2 -p size" runTest "Compressed Bit Parallelism Factors" "./astringology2 -a compressedBitParallelIndex -s \"\$SUBJECT_FILE\" | ./aquery2 -q compressedBitParallelismFactors -p \"\$PATTERN_FILE\" | ./astat2 -p size" runTest "Bit Parallelism Factors" "./astringology2 -a bitParallelIndex -s \"\$SUBJECT_FILE\" | ./aquery2 -q bitParallelismFactors -p \"\$PATTERN_FILE\" | ./astat2 -p size" -- GitLab