From 81ffaab4670fadca6405eef716ac8d227c92a88e Mon Sep 17 00:00:00 2001
From: Michal Cvach <cvachmic@fit.cvut.cz>
Date: Sun, 11 Mar 2018 16:28:50 +0100
Subject: [PATCH] Added Quick Search exact stringology matching algorithm.

---
 .../QuickSearchBadCharacterShiftTable.cpp     | 19 +++++
 .../QuickSearchBadCharacterShiftTable.h       | 53 ++++++++++++++
 .../src/stringology/exact/QuickSearch.cpp     | 19 +++++
 alib2algo/src/stringology/exact/QuickSearch.h | 72 +++++++++++++++++++
 astringology2/src/astringology.cpp            |  9 +++
 tests.astringology.sh                         |  1 +
 6 files changed, 173 insertions(+)
 create mode 100644 alib2algo/src/string/properties/QuickSearchBadCharacterShiftTable.cpp
 create mode 100644 alib2algo/src/string/properties/QuickSearchBadCharacterShiftTable.h
 create mode 100644 alib2algo/src/stringology/exact/QuickSearch.cpp
 create mode 100644 alib2algo/src/stringology/exact/QuickSearch.h

diff --git a/alib2algo/src/string/properties/QuickSearchBadCharacterShiftTable.cpp b/alib2algo/src/string/properties/QuickSearchBadCharacterShiftTable.cpp
new file mode 100644
index 0000000000..d92d82fce5
--- /dev/null
+++ b/alib2algo/src/string/properties/QuickSearchBadCharacterShiftTable.cpp
@@ -0,0 +1,19 @@
+/*
+ * QuickSearchBadCharacterShiftTable.cpp
+ *
+ *  Created on: 23. 2. 2018
+ *	  Author: Michal Cvach
+ */
+
+#include "QuickSearchBadCharacterShiftTable.h"
+#include <registration/AlgoRegistration.hpp>
+
+namespace string {
+
+namespace properties {
+// Registers QuickSearchBadCharacterShiftTable::qsbcs for a default-symbol LinearString pattern; registered result type is ext::map < DefaultSymbolType, size_t >.
+auto QuickSearchBadCharacterShiftTableLinearString = registration::AbstractRegister < QuickSearchBadCharacterShiftTable, ext::map < DefaultSymbolType, size_t >, const string::LinearString < > & > ( QuickSearchBadCharacterShiftTable::qsbcs );
+
+} /* namespace properties */
+
+} /* namespace string */
diff --git a/alib2algo/src/string/properties/QuickSearchBadCharacterShiftTable.h b/alib2algo/src/string/properties/QuickSearchBadCharacterShiftTable.h
new file mode 100644
index 0000000000..cba6760ab4
--- /dev/null
+++ b/alib2algo/src/string/properties/QuickSearchBadCharacterShiftTable.h
@@ -0,0 +1,53 @@
+/*
+ * QuickSearchBadCharacterShiftTable.h
+ *
+ *  Created on: 23. 2. 2018
+ *	  Author: Michal Cvach
+ */
+
+#ifndef _STRINGOLOGY_QUICK_SEARCH_BAD_CHARACTER_SHIFT_TABLE_H_
+#define _STRINGOLOGY_QUICK_SEARCH_BAD_CHARACTER_SHIFT_TABLE_H_
+
+#include <set>
+#include <map>
+
+#include <string/LinearString.h>
+
+namespace string {
+
+namespace properties {
+
+/**
+* Builds the bad character shift (BCS) table used by the QuickSearch algorithm, as presented in the Daniel M. Sunday article.
+*/
+class QuickSearchBadCharacterShiftTable {
+public:
+	/**
+	 * Computes the QuickSearch shift for every symbol of the pattern's alphabet and content.
+	 * @param pattern the pattern the table is built for
+	 * @return map from symbol to shift: pattern length + 1 for alphabet symbols absent from the pattern, otherwise pattern length minus the index of the symbol's last occurrence
+	 */
+	template < class SymbolType >
+	static ext::map < SymbolType, size_t > qsbcs ( const string::LinearString < SymbolType > & pattern );
+};
+
+template < class SymbolType >
+ext::map<SymbolType, size_t> QuickSearchBadCharacterShiftTable::qsbcs(const string::LinearString < SymbolType >& pattern) {
+	const size_t length = pattern.getContent ( ).size ( );
+	ext::map<SymbolType, size_t> shifts;
+
+	/* Symbols that never occur in the pattern keep this maximal shift. */
+	for ( const SymbolType & symbol : pattern.getAlphabet ( ) )
+		shifts.insert ( std::make_pair ( symbol, length + 1 ) );
+
+	/* Later occurrences overwrite earlier ones, so the rightmost occurrence decides the shift. */
+	for ( size_t position = 0; position < length; ++position )
+		shifts [ pattern.getContent ( ) [ position ] ] = length - position;
+	return shifts;
+}
+
+} /* namespace properties */
+
+} /* namespace string */
+
+#endif /* _STRINGOLOGY_QUICK_SEARCH_BAD_CHARACTER_SHIFT_TABLE_H_ */
diff --git a/alib2algo/src/stringology/exact/QuickSearch.cpp b/alib2algo/src/stringology/exact/QuickSearch.cpp
new file mode 100644
index 0000000000..edc3f10b1c
--- /dev/null
+++ b/alib2algo/src/stringology/exact/QuickSearch.cpp
@@ -0,0 +1,19 @@
+/*
+ * QuickSearch.cpp
+ *
+ *  Created on: 23. 2. 2018
+ *	  Author: Michal Cvach
+ */
+
+#include "QuickSearch.h"
+#include <registration/AlgoRegistration.hpp>
+
+namespace stringology {
+
+namespace exact {
+// Registers QuickSearch::match for two default-symbol LinearStrings (subject, then pattern); registered result type is ext::set < unsigned >.
+auto QuickSearchLinearString = registration::AbstractRegister < QuickSearch, ext::set < unsigned >, const string::LinearString < > &, const string::LinearString < > & > ( QuickSearch::match );
+
+} /* namespace exact */
+
+} /* namespace stringology */
diff --git a/alib2algo/src/stringology/exact/QuickSearch.h b/alib2algo/src/stringology/exact/QuickSearch.h
new file mode 100644
index 0000000000..aecf370b88
--- /dev/null
+++ b/alib2algo/src/stringology/exact/QuickSearch.h
@@ -0,0 +1,72 @@
+/*
+ * QuickSearch.h
+ *
+ *  Created on: 23. 2. 2018
+ *	  Author: Michal Cvach
+ */
+
+#ifndef _STRINGOLOGY_QUICK_SEARCH_H_
+#define _STRINGOLOGY_QUICK_SEARCH_H_
+
+#include <set>
+#include <map>
+#include <measure>
+
+#include <string/LinearString.h>
+
+#include <string/properties/QuickSearchBadCharacterShiftTable.h>
+
+namespace stringology {
+
+namespace exact {
+
+/**
+* Implementation of the QuickSearch substring matching algorithm as presented in the Daniel M. Sunday article.
+*/
+class QuickSearch {
+public:
+	/**
+	 * Finds all occurrences of pattern in subject.
+	 * Subject symbols missing from the shift table are treated as not occurring in the pattern.
+	 * @param subject the string to search in
+	 * @param pattern the string to search for
+	 * @return set of indices into subject at which an occurrence of pattern starts
+	 */
+	template < class SymbolType >
+	static ext::set < unsigned > match ( const string::LinearString < SymbolType > & subject, const string::LinearString < SymbolType > & pattern );
+};
+
+template < class SymbolType >
+ext::set<unsigned> QuickSearch::match(const string::LinearString < SymbolType >& subject, const string::LinearString < SymbolType >& pattern) {
+	ext::set<unsigned> occ;
+
+	measurements::start ( "Preprocess", measurements::Type::PREPROCESS );
+	const ext::map<SymbolType, size_t> bcs = string::properties::QuickSearchBadCharacterShiftTable::qsbcs(pattern);
+	measurements::end ( );
+
+	measurements::start ( "Algorithm", measurements::Type::ALGORITHM );
+	const size_t patternSize = pattern.getContent().size();
+	const size_t subjectSize = subject.getContent().size();
+	for ( size_t i = 0; i + patternSize <= subjectSize; ) {
+		size_t j = 0;
+		while ( j < patternSize && pattern.getContent()[j] == subject.getContent()[i+j] )
+			++j;
+		if ( j == patternSize )
+			occ.insert(i);
+		if ( i + patternSize == subjectSize )
+			break; // the window already ends at the end of the subject, no symbol is left to drive a shift
+
+		// A symbol absent from the table cannot occur in the pattern, so skip past it; operator[] would insert a zero shift and never advance.
+		const auto shift = bcs.find ( subject.getContent()[i+patternSize] );
+		i += ( shift != bcs.end ( ) ) ? shift->second : patternSize + 1;
+	}
+	measurements::end ( );
+
+	return occ;
+}
+
+} /* namespace exact */
+
+} /* namespace stringology */
+
+#endif /* _STRINGOLOGY_QUICK_SEARCH_H_ */
diff --git a/astringology2/src/astringology.cpp b/astringology2/src/astringology.cpp
index 6e615893f6..5b93d30954 100644
--- a/astringology2/src/astringology.cpp
+++ b/astringology2/src/astringology.cpp
@@ -27,6 +27,7 @@ int main ( int argc, char * argv[] ) {
 		allowed.push_back ( "boyerMooreHorspool" );
 		allowed.push_back ( "boyerMoore" );
 		allowed.push_back ( "reversedBoyerMooreHorspool" );
+		allowed.push_back ( "quickSearch" );
 		allowed.push_back ( "deadZoneUsingBadCharacterShift" );
 
 		allowed.push_back ( "exactMatchingAutomaton" );
@@ -49,6 +50,7 @@ int main ( int argc, char * argv[] ) {
 
 		allowed.push_back ( "borderArray" );
 		allowed.push_back ( "badCharacterShiftTable" );
+		allowed.push_back ( "quickSearchBadCharacterShiftTable");
 		allowed.push_back ( "goodSuffixShiftTable" );
 		TCLAP::ValuesConstraint < std::string > allowedVals ( allowed );
 
@@ -84,6 +86,7 @@ int main ( int argc, char * argv[] ) {
 		  || algorithm.getValue ( ) == "boyerMooreHorspool"
 		  || algorithm.getValue ( ) == "boyerMoore"
 		  || algorithm.getValue ( ) == "reversedBoyerMooreHorspool"
+		  || algorithm.getValue ( ) == "quickSearch"
 		  || algorithm.getValue ( ) == "deadZoneUsingBadCharacterShift"
 		  || algorithm.getValue ( ) == "bndmOccurrences"
 		  || algorithm.getValue ( ) == "backwardOracleMatching"
@@ -115,6 +118,7 @@ int main ( int argc, char * argv[] ) {
 		  || algorithm.getValue ( ) == "boyerMooreHorspool"
 		  || algorithm.getValue ( ) == "boyerMoore"
 		  || algorithm.getValue ( ) == "reversedBoyerMooreHorspool"
+		  || algorithm.getValue ( ) == "quickSearch"
 		  || algorithm.getValue ( ) == "deadZoneUsingBadCharacterShift"
 		  || algorithm.getValue ( ) == "bndmOccurrences"
 		  || algorithm.getValue ( ) == "backwardOracleMatching"
@@ -124,6 +128,7 @@ int main ( int argc, char * argv[] ) {
 		  || algorithm.getValue ( ) == "exactNondeterministicSuffixAutomaton"
 		  || algorithm.getValue ( ) == "bndmMatcher"
 		  || algorithm.getValue ( ) == "badCharacterShiftTable"
+		  || algorithm.getValue ( ) == "quickSearchBadCharacterShiftTable"
 		  || algorithm.getValue ( ) == "goodSuffixShiftTable" ) {
 			std::string input;
 			if ( patternInput.getValue ( ).size ( ) == 0 )
@@ -163,6 +168,8 @@ int main ( int argc, char * argv[] ) {
 			cliCommand = "execute stringology::exact::BoyerMoore $subject $pattern > $output";
 		} else if ( algorithm.getValue ( ) == "reversedBoyerMooreHorspool" ) {
 			cliCommand = "execute stringology::exact::ReversedBoyerMooreHorspool $subject $pattern > $output";
+		} else if ( algorithm.getValue ( ) == "quickSearch" ) {
+			cliCommand = "execute stringology::exact::QuickSearch $subject $pattern > $output";
 		} else if ( algorithm.getValue ( ) == "deadZoneUsingBadCharacterShift" ) {
 			cliCommand = "execute stringology::exact::DeadZoneUsingBadCharacterShift $subject $pattern > $output";
 
@@ -207,6 +214,8 @@ int main ( int argc, char * argv[] ) {
 			cliCommand = "execute string::properties::BorderArray $subject > $output";
 		} else if ( algorithm.getValue ( ) == "badCharacterShiftTable" ) {
 			cliCommand = "execute string::properties::BadCharacterShiftTable $pattern > $output";
+		} else if ( algorithm.getValue ( ) == "quickSearchBadCharacterShiftTable" ) {
+			cliCommand = "execute string::properties::QuickSearchBadCharacterShiftTable $pattern > $output";
 		} else if ( algorithm.getValue ( ) == "goodSuffixShiftTable" ) {
 			cliCommand = "execute string::properties::GoodSuffixShiftTable $pattern > $output";
 		} else {
diff --git a/tests.astringology.sh b/tests.astringology.sh
index 56110840da..5d8dd19307 100755
--- a/tests.astringology.sh
+++ b/tests.astringology.sh
@@ -222,5 +222,6 @@ runTest "Suffix Array Factors" "./astringology2 -a suffixArray -s \"\$SUBJECT_FI
 runTest "Suffix Trie Factors" "./astringology2 -a suffixTrie -s \"\$SUBJECT_FILE\" | ./aquery2 -q suffixTrieFactors -p \"\$PATTERN_FILE\" | ./astat2 -p size"
 runTest "Exact Boyer Moore Horspool" "./astringology2 -a boyerMooreHorspool -s \"\$SUBJECT_FILE\" -p <(./aaccess2 --string alphabet -o add -i \"\$PATTERN_FILE\" -c <(./aaccess2 --string alphabet -o get -i \"\$SUBJECT_FILE\")) | ./astat2 -p size"
 runTest "Exact Reversed Boyer Moore Horspool" "./astringology2 -a reversedBoyerMooreHorspool -s \"\$SUBJECT_FILE\" -p <(./aaccess2 --string alphabet -o add -i \"\$PATTERN_FILE\" -c <(./aaccess2 --string alphabet -o get -i \"\$SUBJECT_FILE\")) | ./astat2 -p size"
+runTest "Quick Search" "./astringology2 -a quickSearch -s \"\$SUBJECT_FILE\" -p <(./aaccess2 --string alphabet -o add -i \"\$PATTERN_FILE\" -c <(./aaccess2 --string alphabet -o get -i \"\$SUBJECT_FILE\")) | ./astat2 -p size"
 runTest "Exact Matching Automaton" "./arun2 -t occurrences -a <(./astringology2 -a exactMatchingAutomaton -p <(./aaccess2 --string alphabet -o add -i \"\$PATTERN_FILE\" -c <(./aaccess2 --string alphabet -o get -i \"\$SUBJECT_FILE\")) | ./adeterminize2) -i \"\$SUBJECT_FILE\" | ./astat2 -p size"
 runTest "Exact Dead Zone Using Bad Character Shift" "./astringology2 -a deadZoneUsingBadCharacterShift -s \"\$SUBJECT_FILE\" -p <(./aaccess2 --string alphabet -o add -i \"\$PATTERN_FILE\" -c <(./aaccess2 --string alphabet -o get -i \"\$SUBJECT_FILE\")) | ./astat2 -p size"
-- 
GitLab