From 9489b6a42c40abfeee4af25ef5db84941da45a48 Mon Sep 17 00:00:00 2001 From: Jan Travnicek <Jan.Travnicek@fit.cvut.cz> Date: Fri, 24 Jan 2020 10:36:27 +0200 Subject: [PATCH] Quantum Leap string pattern matching algorithm --- .../QuickSearchBadCharacterShiftTable.cpp | 15 ---- .../properties/QuickSearchShiftTable.cpp | 15 ++++ ...erShiftTable.h => QuickSearchShiftTable.h} | 14 ++-- .../ReversedQuickSearchShiftTable.cpp | 15 ++++ .../ReversedQuickSearchShiftTable.h | 55 ++++++++++++++ .../QuantumLeapUsingQuickSearchShift.cpp | 15 ++++ .../exact/QuantumLeapUsingQuickSearchShift.h | 74 +++++++++++++++++++ alib2algo/src/stringology/exact/QuickSearch.h | 4 +- .../test-src/tests/exactMatching.cpp | 2 + 9 files changed, 185 insertions(+), 24 deletions(-) delete mode 100644 alib2algo/src/string/properties/QuickSearchBadCharacterShiftTable.cpp create mode 100644 alib2algo/src/string/properties/QuickSearchShiftTable.cpp rename alib2algo/src/string/properties/{QuickSearchBadCharacterShiftTable.h => QuickSearchShiftTable.h} (66%) create mode 100644 alib2algo/src/string/properties/ReversedQuickSearchShiftTable.cpp create mode 100644 alib2algo/src/string/properties/ReversedQuickSearchShiftTable.h create mode 100644 alib2algo/src/stringology/exact/QuantumLeapUsingQuickSearchShift.cpp create mode 100644 alib2algo/src/stringology/exact/QuantumLeapUsingQuickSearchShift.h diff --git a/alib2algo/src/string/properties/QuickSearchBadCharacterShiftTable.cpp b/alib2algo/src/string/properties/QuickSearchBadCharacterShiftTable.cpp deleted file mode 100644 index cf39042760..0000000000 --- a/alib2algo/src/string/properties/QuickSearchBadCharacterShiftTable.cpp +++ /dev/null @@ -1,15 +0,0 @@ -/* - * QuickSearchBadCharacterShiftTable.cpp - * - * Created on: 23. 2. 
2018 - * Author: Michal Cvach - */ - -#include "QuickSearchBadCharacterShiftTable.h" -#include <registration/AlgoRegistration.hpp> - -namespace { - -auto QuickSearchBadCharacterShiftTableLinearString = registration::AbstractRegister < string::properties::QuickSearchBadCharacterShiftTable, ext::map < DefaultSymbolType, size_t >, const string::LinearString < > & > ( string::properties::QuickSearchBadCharacterShiftTable::qsbcs ); - -} /* namespace */ diff --git a/alib2algo/src/string/properties/QuickSearchShiftTable.cpp b/alib2algo/src/string/properties/QuickSearchShiftTable.cpp new file mode 100644 index 0000000000..079eaa4ebe --- /dev/null +++ b/alib2algo/src/string/properties/QuickSearchShiftTable.cpp @@ -0,0 +1,15 @@ +/* + * QuickSearchShiftTable.cpp + * + * Created on: 23. 2. 2018 + * Author: Michal Cvach + */ + +#include "QuickSearchShiftTable.h" +#include <registration/AlgoRegistration.hpp> + +namespace { + +auto QuickSearchShiftTableLinearString = registration::AbstractRegister < string::properties::QuickSearchShiftTable, ext::map < DefaultSymbolType, size_t >, const string::LinearString < > & > ( string::properties::QuickSearchShiftTable::qss ); + +} /* namespace */ diff --git a/alib2algo/src/string/properties/QuickSearchBadCharacterShiftTable.h b/alib2algo/src/string/properties/QuickSearchShiftTable.h similarity index 66% rename from alib2algo/src/string/properties/QuickSearchBadCharacterShiftTable.h rename to alib2algo/src/string/properties/QuickSearchShiftTable.h index cba6760ab4..0e186613d8 100644 --- a/alib2algo/src/string/properties/QuickSearchBadCharacterShiftTable.h +++ b/alib2algo/src/string/properties/QuickSearchShiftTable.h @@ -1,12 +1,12 @@ /* - * QuickSearchBadCharacterShiftTable.h + * QuickSearchShiftTable.h * * Created on: 23. 2. 
2018 * Author: Michal Cvach */ -#ifndef _STRINGOLOGY_QUICK_SEARCH_BAD_CHARACTER_SHIFT_TABLE_H_ -#define _STRINGOLOGY_QUICK_SEARCH_BAD_CHARACTER_SHIFT_TABLE_H_ +#ifndef _STRINGOLOGY_QUICK_SEARCH_SHIFT_TABLE_H_ +#define _STRINGOLOGY_QUICK_SEARCH_SHIFT_TABLE_H_ #include <set> #include <map> @@ -20,19 +20,19 @@ namespace properties { /** * Computation of BCS table for the QuickSearch algorithm, as presented in the Daniel M. Sunday article. */ -class QuickSearchBadCharacterShiftTable { +class QuickSearchShiftTable { public: /** * Creates a bad character shift table which can be later used for the QuickSearch algorithm. * @return the BCS table in form of a map where key is the character from an alphabet and value is the shift. */ template < class SymbolType > - static ext::map < SymbolType, size_t > qsbcs ( const string::LinearString < SymbolType > & pattern ); + static ext::map < SymbolType, size_t > qss ( const string::LinearString < SymbolType > & pattern ); }; template < class SymbolType > -ext::map<SymbolType, size_t> QuickSearchBadCharacterShiftTable::qsbcs(const string::LinearString < SymbolType >& pattern) { +ext::map<SymbolType, size_t> QuickSearchShiftTable::qss(const string::LinearString < SymbolType >& pattern) { ext::map<SymbolType, size_t> bcs; /* Initialization of BCS. */ @@ -50,4 +50,4 @@ ext::map<SymbolType, size_t> QuickSearchBadCharacterShiftTable::qsbcs(const stri } /* namespace string */ -#endif /* _STRINGOLOGY_QUICK_SEARCH_BAD_CHARACTER_SHIFT_TABLE_H_ */ +#endif /* _STRINGOLOGY_QUICK_SEARCH_SHIFT_TABLE_H_ */ diff --git a/alib2algo/src/string/properties/ReversedQuickSearchShiftTable.cpp b/alib2algo/src/string/properties/ReversedQuickSearchShiftTable.cpp new file mode 100644 index 0000000000..9a99278fc8 --- /dev/null +++ b/alib2algo/src/string/properties/ReversedQuickSearchShiftTable.cpp @@ -0,0 +1,15 @@ +/* + * ReversedQuickSearchShiftTable.cpp + * + * Created on: 24. 1. 
2020 + * Author: Jan Travnicek + */ + +#include "ReversedQuickSearchShiftTable.h" +#include <registration/AlgoRegistration.hpp> + +namespace { + +auto ReversedQuickSearchShiftTableLinearString = registration::AbstractRegister < string::properties::ReversedQuickSearchShiftTable, ext::map < DefaultSymbolType, size_t >, const string::LinearString < > & > ( string::properties::ReversedQuickSearchShiftTable::rqss ); + +} /* namespace */ diff --git a/alib2algo/src/string/properties/ReversedQuickSearchShiftTable.h b/alib2algo/src/string/properties/ReversedQuickSearchShiftTable.h new file mode 100644 index 0000000000..cd28cfe4f6 --- /dev/null +++ b/alib2algo/src/string/properties/ReversedQuickSearchShiftTable.h @@ -0,0 +1,55 @@ +/* + * ReversedQuickSearchShiftTable.h + * + * Created on: 24. 1. 2020 + * Author: Jan Travnicek + */ + +#ifndef _STRINGOLOGY_REVERSED_QUICK_SEARCH_SHIFT_TABLE_H_ +#define _STRINGOLOGY_REVERSED_QUICK_SEARCH_SHIFT_TABLE_H_ + +#include <alib/set> +#include <alib/map> + +#include <string/LinearString.h> + +namespace string { + +namespace properties { + +/** + * Computation of the reversed Quick Search shift table: each symbol occurring in the pattern is mapped to the 1-based position of its leftmost occurrence. + * Symbols of the pattern's alphabet that do not occur in the pattern are mapped to the pattern length plus one. + */ +class ReversedQuickSearchShiftTable { +public: + /** + * Creates the reversed Quick Search shift table for the given pattern. + * @return the table in form of a map where key is a symbol of the pattern's alphabet and value is the shift. + */ + template < class SymbolType > + static ext::map < SymbolType, size_t > rqss ( const string::LinearString < SymbolType > & pattern ); + +}; + +template < class SymbolType > +ext::map < SymbolType, size_t > ReversedQuickSearchShiftTable::rqss ( const string::LinearString < SymbolType > & pattern ) { + const ext::set < SymbolType > & alphabet = pattern.getAlphabet ( ); + ext::map < SymbolType, size_t > bcs; + + /* Initialization of BCS to the length of the needle plus one. 
*/ + for ( const auto & symbol : alphabet ) + bcs.insert ( std::make_pair ( symbol, pattern.getContent ( ).size ( ) + 1 ) ); + + /* Filling out BCS from right to left, so the leftmost occurrence of each symbol is the one that stays. */ + for ( size_t i = pattern.getContent ( ).size ( ); i > 0; i-- ) + bcs[pattern.getContent ( ).at ( i - 1 )] = i; + + return bcs; +} + +} /* namespace properties */ + +} /* namespace string */ + +#endif /* _STRINGOLOGY_REVERSED_QUICK_SEARCH_SHIFT_TABLE_H_ */ diff --git a/alib2algo/src/stringology/exact/QuantumLeapUsingQuickSearchShift.cpp b/alib2algo/src/stringology/exact/QuantumLeapUsingQuickSearchShift.cpp new file mode 100644 index 0000000000..2982f6c378 --- /dev/null +++ b/alib2algo/src/stringology/exact/QuantumLeapUsingQuickSearchShift.cpp @@ -0,0 +1,15 @@ +/* + * QuantumLeapUsingQuickSearchShift.cpp + * + * Created on: 24. 1. 2020 + * Author: Jan Travnicek + */ + +#include "QuantumLeapUsingQuickSearchShift.h" +#include <registration/AlgoRegistration.hpp> + +namespace { + +auto QuantumLeapUsingQuickSearchShiftLinearStringLinearString = registration::AbstractRegister < stringology::exact::QuantumLeapUsingQuickSearchShift, ext::set < unsigned >, const string::LinearString < > &, const string::LinearString < > &, size_t > ( stringology::exact::QuantumLeapUsingQuickSearchShift::match ); + +} /* namespace */ diff --git a/alib2algo/src/stringology/exact/QuantumLeapUsingQuickSearchShift.h b/alib2algo/src/stringology/exact/QuantumLeapUsingQuickSearchShift.h new file mode 100644 index 0000000000..cce556d466 --- /dev/null +++ b/alib2algo/src/stringology/exact/QuantumLeapUsingQuickSearchShift.h @@ -0,0 +1,74 @@ +/* + * QuantumLeapUsingQuickSearchShift.h + * + * Created on: 24. 1. 
2020 + * Author: Jan Travnicek + */ + +#ifndef _QUANTUM_LEAP_USING_QUICK_SEARCH_SHIFT_H_ +#define _QUANTUM_LEAP_USING_QUICK_SEARCH_SHIFT_H_ + +#include <alib/set> +#include <alib/map> + +#include <string/LinearString.h> + +#include <string/properties/QuickSearchShiftTable.h> +#include <string/properties/ReversedQuickSearchShiftTable.h> + +namespace stringology { + +namespace exact { + +/** + * Implementation of Quantum Leap matching using the Quick Search shift forwards and the reversed Quick Search shift backwards + */ +class QuantumLeapUsingQuickSearchShift { +public: + /** + * Search for all occurrences of pattern in linear string; z is the leap distance, z == 0 falls back to plain Quick Search shifting. + * @return set of starting positions of the occurrences + */ + template < class SymbolType > + static ext::set < unsigned > match ( const string::LinearString < SymbolType > & string, const string::LinearString < SymbolType > & pattern, size_t z ); +}; + +template < class SymbolType > +ext::set < unsigned > QuantumLeapUsingQuickSearchShift::match ( const string::LinearString < SymbolType > & string, const string::LinearString < SymbolType > & pattern, size_t z ) { + ext::set < unsigned > occ; + ext::map < SymbolType, size_t > fqss = string::properties::QuickSearchShiftTable::qss ( pattern ); // NOTE: the subjects alphabet must be a subset or equal to the pattern + ext::map < SymbolType, size_t > bqss = string::properties::ReversedQuickSearchShiftTable::rqss ( pattern ); // NOTE: the subjects alphabet must be a subset or equal to the pattern + for ( const SymbolType & symbol : pattern.getAlphabet ( ) ) { + bqss [ symbol ] = z - bqss [ symbol ]; // distance from the current offset to the backward-shifted window; wraps to a huge value when z is smaller than the table entry, in which case the leap below is never taken + } + + size_t haystack_offset = 0; + + while ( haystack_offset + pattern.getContent ( ).size ( ) <= string.getContent ( ).size ( ) ) { + size_t i = 0; + while ( i < pattern.getContent ( ).size ( ) && string.getContent ( ) [ haystack_offset + i ] == pattern.getContent ( ) [ i ] ) + i ++; + + // Yay, there is match!!! 
+ if ( i == pattern.getContent ( ).size ( ) ) occ.insert ( haystack_offset ); + + if ( haystack_offset + pattern.getContent().size() == string.getContent().size() ) { // this is needed only because there is no terminating character in the string ... + break; // Here we don't do any more shifts if the pattern is already aligned at the utter end of the text + } + + size_t shf = fqss [ string.getContent ( ) [ haystack_offset + pattern.getContent ( ).size ( ) ] ]; + + size_t shb = z; + if ( haystack_offset + z - 1 < string.getContent ( ).size ( ) ) // this condition is needed because at worst MAX ( z - m, 0 ) additional characters are needed in the subject after its end + shb = bqss [ string.getContent ( ) [ haystack_offset + z - 1 ] ]; + haystack_offset += ( z > 0 && shf > shb ) ? z : shf; // a leap of zero would never advance the window, so z == 0 always takes the forward shift + } + + return occ; +} + +} /* namespace exact */ + +} /* namespace stringology */ + +#endif /* _QUANTUM_LEAP_USING_QUICK_SEARCH_SHIFT_H_ */ diff --git a/alib2algo/src/stringology/exact/QuickSearch.h b/alib2algo/src/stringology/exact/QuickSearch.h index af47eb9305..32e10365ea 100644 --- a/alib2algo/src/stringology/exact/QuickSearch.h +++ b/alib2algo/src/stringology/exact/QuickSearch.h @@ -14,7 +14,7 @@ #include <string/LinearString.h> -#include <string/properties/QuickSearchBadCharacterShiftTable.h> +#include <string/properties/QuickSearchShiftTable.h> #include <global/GlobalData.h> @@ -41,7 +41,7 @@ ext::set<unsigned> QuickSearch::match(const string::LinearString < SymbolType >& ext::set<unsigned> occ; measurements::start ( "Preprocess", measurements::Type::PREPROCESS ); - ext::map<SymbolType, size_t> bcs = string::properties::QuickSearchBadCharacterShiftTable::qsbcs(pattern); //NOTE: the subjects alphabet must be a subset or equal to the pattern + ext::map<SymbolType, size_t> bcs = string::properties::QuickSearchShiftTable::qss(pattern); //NOTE: the subjects alphabet must be a subset or equal to the pattern measurements::end ( ); if(common::GlobalData::verbose) diff --git 
a/alib2integrationtest/test-src/tests/exactMatching.cpp b/alib2integrationtest/test-src/tests/exactMatching.cpp index 8266d4a16e..9f1421d952 100644 --- a/alib2integrationtest/test-src/tests/exactMatching.cpp +++ b/alib2integrationtest/test-src/tests/exactMatching.cpp @@ -1,5 +1,6 @@ #include <catch2/catch.hpp> #include <alib/vector> +#include <alib/string> #include "testing/TimeoutAqlTest.hpp" #include "testing/TestFiles.hpp" @@ -31,6 +32,7 @@ TEST_CASE ( "ExactMatching", "[integration]" ) { std::make_tuple ( "Exact Reversed Boyer Moore Horspool", " stringology::exact::ReversedBoyerMooreHorspool $subject $pattern", true ), std::make_tuple ( "Quick Search", "stringology::exact::QuickSearch $subject $pattern", true ), std::make_tuple ( "Exact Dead Zone Using Bad Character Shift", "stringology::exact::DeadZoneUsingBadCharacterShift $subject $pattern", true ), + std::make_tuple ( "Exact Quantum Leap Using Quick Search Shift", "stringology::exact::QuantumLeapUsingQuickSearchShift $subject $pattern " + ext::to_string ( 2 * PATTERN_SIZE ), true ), std::make_tuple ( "Exact Matching Automaton", "automaton::run::Occurrences <(stringology::matching::ExactMatchingAutomaton $pattern | automaton::determinize::Determinize -) $subject", true ), std::make_tuple ( "DAWG Factors", "stringology::indexing::ExactSuffixAutomaton $subject | stringology::query::SuffixAutomatonFactors - $pattern", false ), std::make_tuple ( "BNDM Matcher", "stringology::matching::BNDMMatcherConstruction $pattern | stringology::query::BNDMOccurrences - $subject", false ), -- GitLab