From 81ffaab4670fadca6405eef716ac8d227c92a88e Mon Sep 17 00:00:00 2001
From: Michal Cvach <cvachmic@fit.cvut.cz>
Date: Sun, 11 Mar 2018 16:28:50 +0100
Subject: [PATCH] Added Quick Search exact stringology matching algorithm.

---
 .../QuickSearchBadCharacterShiftTable.cpp     | 19 +++++
 .../QuickSearchBadCharacterShiftTable.h       | 56 +++++++++++++
 .../src/stringology/exact/QuickSearch.cpp     | 19 +++++
 alib2algo/src/stringology/exact/QuickSearch.h | 79 +++++++++++++++++++
 astringology2/src/astringology.cpp            |  9 +++
 tests.astringology.sh                         |  1 +
 6 files changed, 183 insertions(+)
 create mode 100644 alib2algo/src/string/properties/QuickSearchBadCharacterShiftTable.cpp
 create mode 100644 alib2algo/src/string/properties/QuickSearchBadCharacterShiftTable.h
 create mode 100644 alib2algo/src/stringology/exact/QuickSearch.cpp
 create mode 100644 alib2algo/src/stringology/exact/QuickSearch.h

diff --git a/alib2algo/src/string/properties/QuickSearchBadCharacterShiftTable.cpp b/alib2algo/src/string/properties/QuickSearchBadCharacterShiftTable.cpp
new file mode 100644
index 0000000000..d92d82fce5
--- /dev/null
+++ b/alib2algo/src/string/properties/QuickSearchBadCharacterShiftTable.cpp
@@ -0,0 +1,19 @@
+/*
+ * QuickSearchBadCharacterShiftTable.cpp
+ *
+ * Created on: 23. 2. 2018
+ * Author: Michal Cvach
+ */
+
+#include "QuickSearchBadCharacterShiftTable.h"
+#include <registration/AlgoRegistration.hpp>
+
+namespace string {
+
+namespace properties {
+
+auto QuickSearchBadCharacterShiftTableLinearString = registration::AbstractRegister < QuickSearchBadCharacterShiftTable, ext::map < DefaultSymbolType, size_t >, const string::LinearString < > & > ( QuickSearchBadCharacterShiftTable::qsbcs );
+
+} /* namespace properties */
+
+} /* namespace string */
diff --git a/alib2algo/src/string/properties/QuickSearchBadCharacterShiftTable.h b/alib2algo/src/string/properties/QuickSearchBadCharacterShiftTable.h
new file mode 100644
index 0000000000..cba6760ab4
--- /dev/null
+++ b/alib2algo/src/string/properties/QuickSearchBadCharacterShiftTable.h
@@ -0,0 +1,56 @@
+/*
+ * QuickSearchBadCharacterShiftTable.h
+ *
+ * Created on: 23. 2. 2018
+ * Author: Michal Cvach
+ */
+
+#ifndef _STRINGOLOGY_QUICK_SEARCH_BAD_CHARACTER_SHIFT_TABLE_H_
+#define _STRINGOLOGY_QUICK_SEARCH_BAD_CHARACTER_SHIFT_TABLE_H_
+
+#include <set>
+#include <map>
+
+#include <string/LinearString.h>
+
+namespace string {
+
+namespace properties {
+
+/**
+* Computation of BCS table for the QuickSearch algorithm, as presented in the Daniel M. Sunday article.
+*/
+class QuickSearchBadCharacterShiftTable {
+public:
+	/**
+	 * Creates a bad character shift table which can be later used for the QuickSearch algorithm.
+	 * @param pattern the pattern whose alphabet and content define the table
+	 * @return the BCS table in form of a map where key is the character from an alphabet and value is the shift.
+	 */
+	template < class SymbolType >
+	static ext::map < SymbolType, size_t > qsbcs ( const string::LinearString < SymbolType > & pattern );
+
+};
+
+template < class SymbolType >
+ext::map<SymbolType, size_t> QuickSearchBadCharacterShiftTable::qsbcs(const string::LinearString < SymbolType >& pattern) {
+	const auto & content = pattern.getContent ( );
+	const size_t patternSize = content.size ( );
+	ext::map<SymbolType, size_t> bcs;
+
+	/* Initialization of BCS: every symbol of the pattern's alphabet starts with the shift (pattern size + 1). */
+	for(const SymbolType & symbol : pattern.getAlphabet ( ) )
+		bcs.insert(std::make_pair(symbol, patternSize + 1));
+
+	/* Filling out BCS: a later occurrence of a symbol overwrites the shift stored for an earlier one. */
+	for(size_t i = 0; i < patternSize; i++)
+		bcs [ content [ i ] ] = patternSize - i;
+
+	return bcs;
+}
+
+} /* namespace properties */
+
+} /* namespace string */
+
+#endif /* _STRINGOLOGY_QUICK_SEARCH_BAD_CHARACTER_SHIFT_TABLE_H_ */
diff --git a/alib2algo/src/stringology/exact/QuickSearch.cpp b/alib2algo/src/stringology/exact/QuickSearch.cpp
new file mode 100644
index 0000000000..edc3f10b1c
--- /dev/null
+++ b/alib2algo/src/stringology/exact/QuickSearch.cpp
@@ -0,0 +1,19 @@
+/*
+ * QuickSearch.cpp
+ *
+ * Created on: 23. 2. 2018
+ * Author: Michal Cvach
+ */
+
+#include "QuickSearch.h"
+#include <registration/AlgoRegistration.hpp>
+
+namespace stringology {
+
+namespace exact {
+
+auto QuickSearchLinearString = registration::AbstractRegister < QuickSearch, ext::set < unsigned >, const string::LinearString < > &, const string::LinearString < > & > ( QuickSearch::match );
+
+} /* namespace exact */
+
+} /* namespace stringology */
diff --git a/alib2algo/src/stringology/exact/QuickSearch.h b/alib2algo/src/stringology/exact/QuickSearch.h
new file mode 100644
index 0000000000..aecf370b88
--- /dev/null
+++ b/alib2algo/src/stringology/exact/QuickSearch.h
@@ -0,0 +1,79 @@
+/*
+ * QuickSearch.h
+ *
+ * Created on: 23. 2. 2018
+ * Author: Michal Cvach
+ */
+
+#ifndef _STRINGOLOGY_QUICK_SEARCH_H_
+#define _STRINGOLOGY_QUICK_SEARCH_H_
+
+#include <set>
+#include <map>
+#include <measure>
+
+#include <string/LinearString.h>
+
+#include <string/properties/QuickSearchBadCharacterShiftTable.h>
+
+namespace stringology {
+
+namespace exact {
+
+/**
+* Implementation of the QuickSearch substring matching algorithm as presented in the Daniel M. Sunday article.
+*/
+class QuickSearch {
+public:
+	/**
+	 * Search for pattern in linear string.
+	 * @param subject the string that is searched
+	 * @param pattern the string that is looked for
+	 * @return set of occurrences, i.e. the indexes in subject at which a match of pattern starts
+	 */
+	template < class SymbolType >
+	static ext::set < unsigned > match ( const string::LinearString < SymbolType > & subject, const string::LinearString < SymbolType > & pattern );
+
+};
+
+template < class SymbolType >
+ext::set<unsigned> QuickSearch::match(const string::LinearString < SymbolType >& subject, const string::LinearString < SymbolType >& pattern) {
+	ext::set<unsigned> occ;
+
+	measurements::start ( "Preprocess", measurements::Type::PREPROCESS );
+	const ext::map<SymbolType, size_t> bcs = string::properties::QuickSearchBadCharacterShiftTable::qsbcs(pattern);
+	measurements::end ( );
+
+	measurements::start ( "Algorithm", measurements::Type::ALGORITHM );
+	const auto & text = subject.getContent ( );
+	const auto & needle = pattern.getContent ( );
+	size_t i = 0;
+	size_t j;
+	while( i + needle.size() <= text.size() ) {
+		for ( j = 0; j < needle.size(); j++ )
+			if ( needle[j] != text[i+j])
+				break;
+
+		if ( j == needle.size ( ) ) {
+			occ.insert(i);
+		}
+
+		if ( i + needle.size() == text.size() ) {
+			break; // Here we don't do any more shifts if the pattern is already aligned at the utter end of the text
+		}
+
+		/* The symbol right behind the window selects the shift. A symbol without a table entry gets the same shift
+		 * as an alphabet symbol absent from the pattern; operator [ ] would insert a zero shift and loop forever. */
+		const auto shift = bcs.find ( text [ i + needle.size ( ) ] );
+		i += shift != bcs.end ( ) ? shift->second : needle.size ( ) + 1;
+	}
+	measurements::end ( );
+
+	return occ;
+}
+
+} /* namespace exact */
+
+} /* namespace stringology */
+
+#endif /* _STRINGOLOGY_QUICK_SEARCH_H_ */
diff --git a/astringology2/src/astringology.cpp b/astringology2/src/astringology.cpp
index 6e615893f6..5b93d30954 100644
--- a/astringology2/src/astringology.cpp
+++ b/astringology2/src/astringology.cpp
@@ -27,6 +27,7 @@ int main ( int argc, char * argv[] ) {
 		allowed.push_back ( "boyerMooreHorspool" );
 		allowed.push_back ( "boyerMoore" );
 		allowed.push_back ( "reversedBoyerMooreHorspool" );
+		allowed.push_back ( "quickSearch" );
 		allowed.push_back ( "deadZoneUsingBadCharacterShift" );
 		allowed.push_back ( "exactMatchingAutomaton" );
 
@@ -49,6 +50,7 @@ int main ( int argc, char * argv[] ) {
 
 		allowed.push_back ( "borderArray" );
 		allowed.push_back ( "badCharacterShiftTable" );
+		allowed.push_back ( "quickSearchBadCharacterShiftTable" );
 		allowed.push_back ( "goodSuffixShiftTable" );
 
 		TCLAP::ValuesConstraint < std::string > allowedVals ( allowed );
@@ -84,6 +86,7 @@ int main ( int argc, char * argv[] ) {
 			|| algorithm.getValue ( ) == "boyerMooreHorspool"
 			|| algorithm.getValue ( ) == "boyerMoore"
 			|| algorithm.getValue ( ) == "reversedBoyerMooreHorspool"
+			|| algorithm.getValue ( ) == "quickSearch"
 			|| algorithm.getValue ( ) == "deadZoneUsingBadCharacterShift"
 			|| algorithm.getValue ( ) == "bndmOccurrences"
 			|| algorithm.getValue ( ) == "backwardOracleMatching"
@@ -115,6 +118,7 @@ int main ( int argc, char * argv[] ) {
 			|| algorithm.getValue ( ) == "boyerMooreHorspool"
 			|| algorithm.getValue ( ) == "boyerMoore"
 			|| algorithm.getValue ( ) == "reversedBoyerMooreHorspool"
+			|| algorithm.getValue ( ) == "quickSearch"
 			|| algorithm.getValue ( ) == "deadZoneUsingBadCharacterShift"
 			|| algorithm.getValue ( ) == "bndmOccurrences"
 			|| algorithm.getValue ( ) == "backwardOracleMatching"
@@ -124,6 +128,7 @@ int main ( int argc, char * argv[] ) {
 			|| algorithm.getValue ( ) == "exactNondeterministicSuffixAutomaton"
 			|| algorithm.getValue ( ) == "bndmMatcher"
 			|| algorithm.getValue ( ) == "badCharacterShiftTable"
+			|| algorithm.getValue ( ) == "quickSearchBadCharacterShiftTable"
 			|| algorithm.getValue ( ) == "goodSuffixShiftTable" ) {
 			std::string input;
 			if ( patternInput.getValue ( ).size ( ) == 0 )
@@ -163,6 +168,8 @@ int main ( int argc, char * argv[] ) {
 			cliCommand = "execute stringology::exact::BoyerMoore $subject $pattern > $output";
 		} else if ( algorithm.getValue ( ) == "reversedBoyerMooreHorspool" ) {
 			cliCommand = "execute stringology::exact::ReversedBoyerMooreHorspool $subject $pattern > $output";
+		} else if ( algorithm.getValue ( ) == "quickSearch" ) {
+			cliCommand = "execute stringology::exact::QuickSearch $subject $pattern > $output";
 		} else if ( algorithm.getValue ( ) == "deadZoneUsingBadCharacterShift" ) {
 			cliCommand = "execute stringology::exact::DeadZoneUsingBadCharacterShift $subject $pattern > $output";
 
@@ -207,6 +214,8 @@ int main ( int argc, char * argv[] ) {
 			cliCommand = "execute string::properties::BorderArray $subject > $output";
 		} else if ( algorithm.getValue ( ) == "badCharacterShiftTable" ) {
 			cliCommand = "execute string::properties::BadCharacterShiftTable $pattern > $output";
+		} else if ( algorithm.getValue ( ) == "quickSearchBadCharacterShiftTable" ) {
+			cliCommand = "execute string::properties::QuickSearchBadCharacterShiftTable $pattern > $output";
 		} else if ( algorithm.getValue ( ) == "goodSuffixShiftTable" ) {
 			cliCommand = "execute string::properties::GoodSuffixShiftTable $pattern > $output";
 		} else {
diff --git a/tests.astringology.sh b/tests.astringology.sh
index 56110840da..5d8dd19307 100755
--- a/tests.astringology.sh
+++ b/tests.astringology.sh
@@ -222,5 +222,6 @@ runTest "Suffix Array Factors" "./astringology2 -a suffixArray -s \"\$SUBJECT_FI
 runTest "Suffix Trie Factors" "./astringology2 -a suffixTrie -s \"\$SUBJECT_FILE\" | ./aquery2 -q suffixTrieFactors -p \"\$PATTERN_FILE\" | ./astat2 -p size"
 runTest "Exact Boyer Moore Horspool" "./astringology2 -a boyerMooreHorspool -s \"\$SUBJECT_FILE\" -p <(./aaccess2 --string alphabet -o add -i \"\$PATTERN_FILE\" -c <(./aaccess2 --string alphabet -o get -i \"\$SUBJECT_FILE\")) | ./astat2 -p size"
 runTest "Exact Reversed Boyer Moore Horspool" "./astringology2 -a reversedBoyerMooreHorspool -s \"\$SUBJECT_FILE\" -p <(./aaccess2 --string alphabet -o add -i \"\$PATTERN_FILE\" -c <(./aaccess2 --string alphabet -o get -i \"\$SUBJECT_FILE\")) | ./astat2 -p size"
+runTest "Quick Search" "./astringology2 -a quickSearch -s \"\$SUBJECT_FILE\" -p <(./aaccess2 --string alphabet -o add -i \"\$PATTERN_FILE\" -c <(./aaccess2 --string alphabet -o get -i \"\$SUBJECT_FILE\")) | ./astat2 -p size"
 runTest "Exact Matching Automaton" "./arun2 -t occurrences -a <(./astringology2 -a exactMatchingAutomaton -p <(./aaccess2 --string alphabet -o add -i \"\$PATTERN_FILE\" -c <(./aaccess2 --string alphabet -o get -i \"\$SUBJECT_FILE\")) | ./adeterminize2) -i \"\$SUBJECT_FILE\" | ./astat2 -p size"
 runTest "Exact Dead Zone Using Bad Character Shift" "./astringology2 -a deadZoneUsingBadCharacterShift -s \"\$SUBJECT_FILE\" -p <(./aaccess2 --string alphabet -o add -i \"\$PATTERN_FILE\" -c <(./aaccess2 --string alphabet -o get -i \"\$SUBJECT_FILE\")) | ./astat2 -p size"
-- 
GitLab