diff --git a/aarbology2/src/aarbology.cpp b/aarbology2/src/aarbology.cpp index b46c3ae537f12bac270b2ff8fe4ba4c32fd567b1..de1f1b84b56de96dcb599b2f2178ed00bba694f1 100644 --- a/aarbology2/src/aarbology.cpp +++ b/aarbology2/src/aarbology.cpp @@ -27,6 +27,7 @@ int main ( int argc, char * argv[] ) { allowed.push_back ( "exactPatternMatch" ); allowed.push_back ( "boyerMooreHorspool" ); allowed.push_back ( "reversedBoyerMooreHorspool" ); + allowed.push_back ( "quickSearch" ); allowed.push_back ( "knuthMorrisPratt" ); allowed.push_back ( "deadZoneUsingBadCharacterShiftAndBorderArray" ); allowed.push_back ( "exactSubtreeMatchingAutomaton" ); @@ -89,6 +90,7 @@ int main ( int argc, char * argv[] ) { || algorithm.getValue ( ) == "exactPatternMatch" || algorithm.getValue ( ) == "boyerMooreHorspool" || algorithm.getValue ( ) == "reversedBoyerMooreHorspool" + || algorithm.getValue ( ) == "quickSearch" || algorithm.getValue ( ) == "knuthMorrisPratt" || algorithm.getValue ( ) == "deadZoneUsingBadCharacterShiftAndBorderArray" || algorithm.getValue ( ) == "exactSubtreeAutomaton" @@ -118,6 +120,7 @@ int main ( int argc, char * argv[] ) { || algorithm.getValue ( ) == "exactPatternMatch" || algorithm.getValue ( ) == "boyerMooreHorspool" || algorithm.getValue ( ) == "reversedBoyerMooreHorspool" + || algorithm.getValue ( ) == "quickSearch" || algorithm.getValue ( ) == "knuthMorrisPratt" || algorithm.getValue ( ) == "deadZoneUsingBadCharacterShiftAndBorderArray" || algorithm.getValue ( ) == "exactSubtreeMatchingAutomaton" @@ -193,6 +196,8 @@ int main ( int argc, char * argv[] ) { cliCommand = "execute arbology::exact::BoyerMooreHorspool $subject $pattern > $output"; } else if ( algorithm.getValue ( ) == "reversedBoyerMooreHorspool" ) { cliCommand = "execute arbology::exact::ReversedBoyerMooreHorspool $subject $pattern > $output"; + } else if ( algorithm.getValue ( ) == "quickSearch" ) { + cliCommand = "execute arbology::exact::QuickSearch $subject $pattern > $output"; } else if ( 
algorithm.getValue ( ) == "knuthMorrisPratt" ) { cliCommand = "execute arbology::exact::KnuthMorrisPratt $subject $pattern > $output"; } else if ( algorithm.getValue ( ) == "deadZoneUsingBadCharacterShiftAndBorderArray" ) { diff --git a/alib2algo/src/arbology/exact/QuickSearch.cpp b/alib2algo/src/arbology/exact/QuickSearch.cpp new file mode 100644 index 0000000000000000000000000000000000000000..eec0497040697e9da0bbd18aa0ac437a669ed2ad --- /dev/null +++ b/alib2algo/src/arbology/exact/QuickSearch.cpp @@ -0,0 +1,21 @@ +/* + * QuickSearch.cpp + * + * Created on: 4. 3. 2018 + * Author: Michal Cvach + */ + +#include "QuickSearch.h" +#include <registration/AlgoRegistration.hpp> + +namespace arbology { + +namespace exact { + +auto QuickSearchPrefixRankedBarTreePrefixRankedBarTree = registration::AbstractRegister < QuickSearch, ext::set < unsigned >, const tree::PrefixRankedBarTree < > &, const tree::PrefixRankedBarTree < > & > ( QuickSearch::match ); +auto QuickSearchPrefixRankedBarTreePrefixRankedBarPattern = registration::AbstractRegister < QuickSearch, ext::set < unsigned >, const tree::PrefixRankedBarTree < > &, const tree::PrefixRankedBarPattern < > & > ( QuickSearch::match ); +auto QuickSearchPrefixRankedBarTreePrefixRankedBarNonlinearPattern = registration::AbstractRegister < QuickSearch, ext::set < unsigned >, const tree::PrefixRankedBarTree < > &, const tree::PrefixRankedBarNonlinearPattern < > & > ( QuickSearch::match ); + +} /* namespace exact */ + +} /* namespace arbology */ diff --git a/alib2algo/src/arbology/exact/QuickSearch.h b/alib2algo/src/arbology/exact/QuickSearch.h new file mode 100644 index 0000000000000000000000000000000000000000..3fde07a6308f4a59fb1c9722dec42e63714ae785 --- /dev/null +++ b/alib2algo/src/arbology/exact/QuickSearch.h @@ -0,0 +1,165 @@ +/* + * QuickSearch.h + * + * Created on: 4. 3. 
2018 + * Author: Michal Cvach + */ + +#ifndef _ARBOLOGY_QUICK_SEARCH_H_ +#define _ARBOLOGY_QUICK_SEARCH_H_ + +#include <alib/set> +#include <alib/map> + +#include <common/ranked_symbol.hpp> + +#include <tree/properties/QuickSearchBadCharacterShiftTable.h> +#include <tree/properties/SubtreeJumpTable.h> +#include <tree/properties/ExactSubtreeRepeatsNaive.h> + +#include <tree/ranked/PrefixRankedBarTree.h> +#include <tree/ranked/PrefixRankedBarPattern.h> +#include <tree/ranked/PrefixRankedBarNonlinearPattern.h> + +namespace arbology { + +namespace exact { + +/** +* Implementation of the Quick Search algorithm for tree pattern matching. +* This variant moves the pattern window over the subject from left to right, while comparing the symbols inside the window from right to left. +* The shift is read from a bad character shift table indexed by the subject symbol just behind the window; variables are matched against subtrees through a subtree jump table. +*/ +class QuickSearch { +public: + /** + * Search for occurrences of a tree pattern in a subject tree, both given in prefix ranked bar notation. + * @return set of subject indexes, each recorded as offset+1 once the whole pattern has been consumed (presumably the index where the occurrence starts -- confirm) + */ + template < class SymbolType, class RankType > + static ext::set < unsigned > match ( const tree::PrefixRankedBarTree < SymbolType, RankType > & subject, const tree::PrefixRankedBarTree < SymbolType, RankType > & pattern ); + template < class SymbolType, class RankType > + static ext::set < unsigned > match ( const tree::PrefixRankedBarTree < SymbolType, RankType > & subject, const tree::PrefixRankedBarPattern < SymbolType, RankType > & pattern ); + template < class SymbolType, class RankType > + static ext::set < unsigned > match ( const tree::PrefixRankedBarTree < SymbolType, RankType > & subject, const tree::PrefixRankedBarNonlinearPattern < SymbolType, RankType > & pattern ); + +}; + +template < class SymbolType, class RankType > +ext::set < unsigned > QuickSearch::match ( const tree::PrefixRankedBarTree < SymbolType, RankType > & subject, const tree::PrefixRankedBarTree < SymbolType, RankType > & pattern ) { + return match ( subject, tree::PrefixRankedBarPattern < SymbolType, RankType > ( pattern ) ); +} + +template < 
class SymbolType, class RankType > // overload for linear patterns; the PrefixRankedBarTree overload converts its pattern and delegates here +ext::set < unsigned > QuickSearch::match ( const tree::PrefixRankedBarTree < SymbolType, RankType > & subject, const tree::PrefixRankedBarPattern < SymbolType, RankType > & pattern ) { + ext::set < unsigned > occ; + ext::map < common::ranked_symbol < SymbolType, RankType >, size_t > bcs = tree::properties::QuickSearchBadCharacterShiftTable::bcs ( pattern ); // NOTE: the subject's alphabet must be a subset of (or equal to) the pattern's alphabet. NOTE(review): for a subject symbol missing from bcs the bcs[...] lookup at the end of the main loop presumably default-inserts 0 (if ext::map behaves like std::map), so i would never advance -- confirm and guard + ext::vector < int > subjectSubtreeJumpTable = tree::properties::SubtreeJumpTable::compute ( subject ); + + // index into the subject where the current pattern window starts (NOTE(review): int here, unsigned in the nonlinear overload -- unify) + int i = 0; + + // main loop of the algorithm over all window positions that still fit into the subject + while ( i + pattern.getContent ( ).size ( ) <= subject.getContent ( ).size ( ) ) { + + // index into the pattern, starting at its last symbol + int j = pattern.getContent ( ).size ( ) - 1; + + // index into the subject, starting at the last symbol of the window + int offset = i + j; + + while ( ( j >= 0 ) && ( offset >= 0 ) ) { + if ( subject.getContent ( )[offset] == pattern.getContent ( )[j] ) { + // symbols are equal: step one symbol to the left in both the subject and the pattern + offset = offset - 1; + j = j - 1; + } else if ( ( pattern.getContent ( )[j] == pattern.getVariablesBar ( ) ) /* && ( pattern.getBars ( ).count ( subject.getContent ( )[offset] ) ) */ ) { + // pattern is at a variable bar: follow the subtree jump table in the subject and skip the bar and its variable in the pattern (NOTE(review): the bar check on the subject symbol is commented out -- confirm it is really redundant) + offset = subjectSubtreeJumpTable[offset]; + j = j - 2; + } else { + break; + } + } + + // whole pattern consumed (j == -1): record offset+1 as an occurrence + if ( j == -1 ) occ.insert ( offset + 1); + + if ( i + pattern.getContent ( ).size ( ) == subject.getContent ( ).size ( ) ) { + break; /* the window already ends with the subject, there is no symbol behind it to look up */ + } + + // shift heuristics: advance by the bcs entry of the subject symbol right behind the current window + //std::cout << "At: " << i << ", shifting by: " << bcs[subject.getContent ( )[i + pattern.getContent ( ).size ( )]] << ", according to symb at: " << i + pattern.getContent ( ).size ( ) << "!" 
<< std::endl; + i += bcs[subject.getContent ( )[i + pattern.getContent ( ).size ( )]]; + } + + return occ; +} + +template < class SymbolType, class RankType > +ext::set < unsigned > QuickSearch::match ( const tree::PrefixRankedBarTree < SymbolType, RankType > & subject, const tree::PrefixRankedBarNonlinearPattern < SymbolType, RankType > & pattern ) { + ext::set < unsigned > occ; + ext::map < common::ranked_symbol < SymbolType, RankType >, size_t > bcs = tree::properties::QuickSearchBadCharacterShiftTable::bcs ( pattern ); //NOTE: the subject's alphabet must be a subset of (or equal to) the pattern's alphabet. NOTE(review): a subject symbol missing from bcs presumably yields a shift of 0 through bcs[...] and the main loop would never advance -- confirm and guard + ext::map < common::ranked_symbol < SymbolType, RankType >, unsigned > variablesSetting; + + ext::vector < int > subjectSubtreeJumpTable = tree::properties::SubtreeJumpTable::compute ( subject ); + tree::PrefixRankedBarTree < unsigned, RankType > repeats = tree::properties::ExactSubtreeRepeatsNaive::repeats ( subject ); + + // index into the subject where the current pattern window starts + unsigned i = 0; + + // main loop of the algorithm over all window positions that still fit into the subject + while ( i + pattern.getContent ( ).size ( ) <= subject.getContent ( ).size ( ) ) { + // forget the nonlinear variable bindings collected for the previous window + variablesSetting.clear(); + + // index into the pattern, starting at its last symbol + int j = pattern.getContent ( ).size ( ) - 1; + + // index into the subject, starting at the last symbol of the window + int offset = i + j; + + while ( ( j >= 0 ) && ( offset >= 0 ) ) { + if ( subject.getContent ( )[offset] == pattern.getContent ( )[j] ) { + // symbols are equal: step one symbol to the left in both the subject and the pattern + offset = offset - 1; + j = j - 1; + } else if ( ( pattern.getContent ( )[j] == pattern.getVariablesBar ( ) ) /* && ( pattern.getBars ( ).count ( subject.getContent ( )[offset] ) ) */ ) { + // pattern is at a variable bar: follow the subtree jump table in the subject and skip the bar and its variable in the pattern (NOTE(review): the bar check on the subject symbol is commented out -- confirm it is really redundant) + offset = subjectSubtreeJumpTable[offset]; + j = j - 2; + + // check nonlinear variable: after the j - 2 step pattern[ j+1 ] is the variable symbol; repeats[ offset+1 ] is presumably the repeat id of the subject subtree it was matched with -- confirm + if ( pattern.getNonlinearVariables ( ).count ( pattern.getContent ( )[ j + 1 ] ) ) { + auto setting = variablesSetting.find ( pattern.getContent ( )[ j + 1 ] ); + + if ( setting != 
variablesSetting.end ( ) && repeats.getContent ( )[ offset + 1 ].getSymbol ( ) != setting->second ) + break; + + variablesSetting.insert ( std::make_pair ( pattern.getContent ( )[ j + 1 ], repeats.getContent( )[ offset + 1 ].getSymbol ( ) ) ); + } + } else { + break; + } + } + + // whole pattern consumed (j == -1): record offset+1 as an occurrence + if ( j == -1 ) occ.insert ( offset + 1); + + if ( i + pattern.getContent ( ).size ( ) == subject.getContent ( ).size ( ) ) { + break; /* the window already ends with the subject, there is no symbol behind it to look up */ + } + + // shift heuristics: advance by the bcs entry of the subject symbol right behind the current window + i += bcs[subject.getContent ( )[i + pattern.getContent ( ).size ( )]]; + } + + return occ; +} + +} /* namespace exact */ + +} /* namespace arbology */ + +#endif /* _ARBOLOGY_QUICK_SEARCH_H_ */ diff --git a/alib2algo/src/tree/properties/QuickSearchBadCharacterShiftTable.cpp b/alib2algo/src/tree/properties/QuickSearchBadCharacterShiftTable.cpp new file mode 100644 index 0000000000000000000000000000000000000000..f0de682358d9725709763fa45c2339130dbbc125 --- /dev/null +++ b/alib2algo/src/tree/properties/QuickSearchBadCharacterShiftTable.cpp @@ -0,0 +1,20 @@ +/* + * QuickSearchBadCharacterShiftTable.cpp + * + * Created on: 6. 3. 
2018 + * Author: Michal Cvach + */ + +#include "QuickSearchBadCharacterShiftTable.h" +#include <registration/AlgoRegistration.hpp> + +namespace tree { + +namespace properties { + +auto QuickSearchBadCharacterShiftTablePrefixRankedBarPattern = registration::AbstractRegister < QuickSearchBadCharacterShiftTable, ext::map < common::ranked_symbol < >, size_t >, const tree::PrefixRankedBarPattern < > & > ( QuickSearchBadCharacterShiftTable::bcs ); +auto QuickSearchBadCharacterShiftTablePrefixRankedBarNonlinearPattern = registration::AbstractRegister < QuickSearchBadCharacterShiftTable, ext::map < common::ranked_symbol < >, size_t >, const tree::PrefixRankedBarNonlinearPattern < > & > ( QuickSearchBadCharacterShiftTable::bcs ); + +} /* namespace properties */ + +} /* namespace tree */ diff --git a/alib2algo/src/tree/properties/QuickSearchBadCharacterShiftTable.h b/alib2algo/src/tree/properties/QuickSearchBadCharacterShiftTable.h new file mode 100644 index 0000000000000000000000000000000000000000..d0cb8af17297f250213f93c932b07d7a892178c1 --- /dev/null +++ b/alib2algo/src/tree/properties/QuickSearchBadCharacterShiftTable.h @@ -0,0 +1,101 @@ +/* + * QuickSearchBadCharacterShiftTable.h + * + * Created on: 6. 3. 2018 + * Author: Michal Cvach + */ + +#ifndef _ARBOLOGY_QUICK_SEARCH_BAD_CHARACTER_SHIFT_TABLE_H_ +#define _ARBOLOGY_QUICK_SEARCH_BAD_CHARACTER_SHIFT_TABLE_H_ + +#include <alphabet/RankedSymbol.h> +#include <tree/ranked/PrefixRankedBarPattern.h> +#include <tree/ranked/PrefixRankedBarNonlinearPattern.h> + +#include <alib/set> +#include <alib/map> + +namespace tree { + +namespace properties { + +/** +* Bad character shift table for the Quick Search algorithm for tree pattern matching; maps every symbol of the pattern's alphabet except the subtree wildcard, the nonlinear variables and the variables bar to a shift. 
+*/ +class QuickSearchBadCharacterShiftTable { +public: + template < class SymbolType, class RankType > + static ext::map < common::ranked_symbol < SymbolType, RankType >, size_t > bcs ( const tree::PrefixRankedBarPattern < SymbolType, RankType > & pattern ); + template < class SymbolType, class RankType > + static ext::map < common::ranked_symbol < SymbolType, RankType >, size_t > bcs ( const tree::PrefixRankedBarNonlinearPattern < SymbolType, RankType > & pattern ); + +}; + +template < class SymbolType, class RankType > +ext::map < common::ranked_symbol < SymbolType, RankType >, size_t > QuickSearchBadCharacterShiftTable::bcs ( const tree::PrefixRankedBarPattern < SymbolType, RankType > & pattern ) { + return bcs ( tree::PrefixRankedBarNonlinearPattern < SymbolType, RankType > ( pattern ) ); +} + +template < class SymbolType, class RankType > +ext::map < common::ranked_symbol < SymbolType, RankType >, size_t > QuickSearchBadCharacterShiftTable::bcs ( const tree::PrefixRankedBarNonlinearPattern < SymbolType, RankType > & pattern ) { + const ext::set < common::ranked_symbol < SymbolType, RankType > > & alphabet = pattern.getAlphabet ( ); + + ext::map < common::ranked_symbol < SymbolType, RankType >, size_t > bcs; + + // initialisation of every bcs entry to the size of the pattern (NOTE(review): textbook Quick Search starts from size plus one, the code inserts size ( ) unchanged -- confirm which is intended) + for ( const common::ranked_symbol < SymbolType, RankType > & symbol : alphabet ) { + if ( symbol == pattern.getSubtreeWildcard ( ) || pattern.getNonlinearVariables ( ).count ( symbol ) || symbol == pattern.getVariablesBar ( ) ) + continue; + + bcs.insert ( std::make_pair ( symbol, pattern.getContent ( ).size ( ) ) ); + } + + // find the distance between the end of the pattern and the index + // of the last symbol representing a variable (stays at the pattern size when the pattern contains no variable) + unsigned lastSOffset = pattern.getContent ( ).size ( ); + + for ( unsigned i = 0; i < pattern.getContent ( ).size ( ); i++ ) + if ( pattern.getContent ( )[i] == pattern.getSubtreeWildcard ( ) || pattern.getNonlinearVariables ( ).count ( 
pattern.getContent ( )[i] ) ) + lastSOffset = pattern.getContent ( ).size ( ) - i; + + // limit the shift by occurrence of the last variable + + for ( const common::ranked_symbol < SymbolType, RankType > & symbol : alphabet ) { + if ( symbol == pattern.getSubtreeWildcard ( ) || pattern.getNonlinearVariables ( ).count ( symbol ) || symbol == pattern.getVariablesBar ( ) ) + continue; + + size_t tmp = lastSOffset; + + if ( ! pattern.getBars ( ).count ( symbol ) ) + // the size of the smallest subtree containing the given terminal depends + // on the arity of the terminal + tmp += ( size_t ) symbol.getRank ( ) * 2; + else if ( tmp >= 2 ) + // bar symbols match the variable bar which is one symbol after + // the last variable, conditioned because of the case S S| where + // the -1 would cause shift by 0 -- illegal + tmp -= 1; + + if ( bcs[symbol] > tmp ) + bcs[symbol] = tmp; + } + + // limit the shift by position of symbols within the pattern; the candidate size - i shrinks as i grows, so the rightmost occurrence of a symbol decides + for ( unsigned i = 0; i < pattern.getContent ( ).size ( ); i++ ) { // last symbol is concerned here + if ( pattern.getContent ( )[i] == pattern.getSubtreeWildcard ( ) || pattern.getNonlinearVariables ( ).count ( pattern.getContent ( )[i] ) || pattern.getContent ( )[i] == pattern.getVariablesBar ( ) ) + continue; + + size_t tmp = pattern.getContent ( ).size ( ) - i; + + if ( bcs[pattern.getContent ( )[i]] > tmp ) + bcs[pattern.getContent ( )[i]] = tmp; + } + + return bcs; +} + +} /* namespace properties */ + +} /* namespace tree */ + +#endif /* _ARBOLOGY_QUICK_SEARCH_BAD_CHARACTER_SHIFT_TABLE_H_ */ diff --git a/tests.aarbology.sh b/tests.aarbology.sh index e252e203c39068fc7c6f1312b46340c501b39408..e83415e51f09939c6cf1559840c7ad463a498195 100755 --- a/tests.aarbology.sh +++ b/tests.aarbology.sh @@ -446,6 +446,8 @@ runTestPattern "Exact Reversed Boyer Moore Horspool (Pattern PrefixRankedBar)" " runTestPattern "Exact Reversed Boyer Moore Horspool (Pattern PrefixRanked)" "./aarbology2 -a reversedBoyerMooreHorspool -s <( ./acast2 -t 
PrefixRankedTree -i \"\$SUBJECT_FILE\" ) -p <( ./acast2 -t PrefixRankedPattern -i <(./aaccess2 --tree alphabet -o add -i \"\$PATTERN_FILE\" -c <(./aaccess2 --tree alphabet -o get -i \"\$SUBJECT_FILE\"))) | ./astat2 -p size" runTestNonlinearPattern "Exact Reversed Boyer Moore Horspool (NonlinearPattern PrefixRankedBar)" "./aarbology2 -a reversedBoyerMooreHorspool -s <( ./acast2 -t PrefixRankedBarTree -i \"\$SUBJECT_FILE\" ) -p <( ./acast2 -t PrefixRankedBarNonlinearPattern -i <(./aaccess2 --tree alphabet -o add -i \"\$PATTERN_FILE\" -c <(./aaccess2 --tree alphabet -o get -i \"\$SUBJECT_FILE\"))) | ./astat2 -p size" runTestNonlinearPattern "Exact Reversed Boyer Moore Horspool (NonlinearPattern PrefixRanked)" "./aarbology2 -a reversedBoyerMooreHorspool -s <( ./acast2 -t PrefixRankedTree -i \"\$SUBJECT_FILE\" ) -p <( ./acast2 -t PrefixRankedNonlinearPattern -i <(./aaccess2 --tree alphabet -o add -i \"\$PATTERN_FILE\" -c <(./aaccess2 --tree alphabet -o get -i \"\$SUBJECT_FILE\"))) | ./astat2 -p size" +runTestPattern "Exact Quick Search (Pattern PrefixRankedBar)" "./aarbology2 -a quickSearch -s <( ./acast2 -t PrefixRankedBarTree -i \"\$SUBJECT_FILE\" ) -p <( ./acast2 -t PrefixRankedBarPattern -i <(./aaccess2 --tree alphabet -o add -i \"\$PATTERN_FILE\" -c <(./aaccess2 --tree alphabet -o get -i \"\$SUBJECT_FILE\"))) | ./astat2 -p size" +runTestNonlinearPattern "Exact Quick Search (NonlinearPattern PrefixRankedBar)" "./aarbology2 -a quickSearch -s <( ./acast2 -t PrefixRankedBarTree -i \"\$SUBJECT_FILE\" ) -p <( ./acast2 -t PrefixRankedBarNonlinearPattern -i <(./aaccess2 --tree alphabet -o add -i \"\$PATTERN_FILE\" -c <(./aaccess2 --tree alphabet -o get -i \"\$SUBJECT_FILE\"))) | ./astat2 -p size" runTestPattern "Exact Knuth Morris Pratt (Pattern PrefixRankedBar)" "./aarbology2 -a knuthMorrisPratt -s <( ./acast2 -t PrefixRankedBarTree -i \"\$SUBJECT_FILE\" ) -p <( ./acast2 -t PrefixRankedBarPattern -i \"\$PATTERN_FILE\" ) | ./astat2 -p size" runTestPattern "Exact Knuth 
Morris Pratt (Pattern PrefixRanked)" "./aarbology2 -a knuthMorrisPratt -s <( ./acast2 -t PrefixRankedTree -i \"\$SUBJECT_FILE\" ) -p <( ./acast2 -t PrefixRankedPattern -i \"\$PATTERN_FILE\" ) | ./astat2 -p size" runTestPattern "Exact Dead Zone Using Bad Character Shift And Border Array (Pattern PrefixRanked)" "./aarbology2 -a deadZoneUsingBadCharacterShiftAndBorderArray -s <( ./acast2 -t PrefixRankedTree -i \"\$SUBJECT_FILE\" ) -p <( ./acast2 -t PrefixRankedPattern -i <(./aaccess2 --tree alphabet -o add -i \"\$PATTERN_FILE\" -c <(./aaccess2 --tree alphabet -o get -i \"\$SUBJECT_FILE\"))) | ./astat2 -p size" @@ -453,3 +455,5 @@ runTestPattern "Exact Dead Zone Using Bad Character Shift And Border Array (Patt runTestPattern "Exact Pattern Matching Automaton (Pattern Tree)" "./arun2 -t occurrences -a <(./aarbology2 -a exactPatternMatchingAutomaton -p <(./aaccess2 --tree alphabet -o add -i \"\$PATTERN_FILE\" -c <(./aaccess2 --tree alphabet -o get -i \"\$SUBJECT_FILE\")) | ./adeterminize2) -i \"\$SUBJECT_FILE\" | ./astat2 -p size" runTestPattern "Exact Pattern Matching Automaton (PrefixRankedBar)" "./aarbology2 -a exactPatternMatchingAutomaton -p <(./acast2 -t PrefixRankedBarPattern -i <(./aaccess2 --tree alphabet -o add -i \"\$PATTERN_FILE\" -c <( ./aaccess2 --tree alphabet -o get -i \"\$SUBJECT_FILE\" ) ) ) | ./adeterminize2 | ./arun2 -t occurrences -a - -i <( ./acast2 -t PrefixRankedBarTree -i \"\$SUBJECT_FILE\" | ./acast2 -t LinearString ) | ./astat2 -p size" + +