diff --git a/aarbology2/src/aarbology.cpp b/aarbology2/src/aarbology.cpp index e4eef2b9590f4041baa1ea0eea686f94e27dc908..d114e0eb9e06290dea01b70d923721e64147f216 100644 --- a/aarbology2/src/aarbology.cpp +++ b/aarbology2/src/aarbology.cpp @@ -17,6 +17,7 @@ #include <arbology/exact/ExactSubtreeMatch.h> #include <arbology/exact/ExactPatternMatch.h> #include <arbology/exact/BoyerMooreHorspool.h> +#include <arbology/exact/KnuthMorrisPratt.h> #include <arbology/exact/ExactSubtreeMatchingAutomaton.h> #include <chrono> @@ -28,6 +29,7 @@ int main ( int argc, char * argv[] ) { allowed.push_back ( "exactSubtreeMatch" ); allowed.push_back ( "exactPatternMatch" ); allowed.push_back ( "boyerMooreHorspool" ); + allowed.push_back ( "knuthMorrisPratt" ); allowed.push_back ( "exactSubtreeMatchingAutomaton" ); TCLAP::ValuesConstraint < std::string > allowedVals ( allowed ); @@ -137,6 +139,19 @@ int main ( int argc, char * argv[] ) { std::chrono::measurements::end ( ); std::chrono::measurements::start ( "Output write", std::chrono::measurements::Type::AUXILARY ); + alib::XmlDataFactory::toStdout ( res ); + } else if ( algorithm.getValue ( ) == "knuthMorrisPratt" ) { + tree::Tree subject = alib::XmlDataFactory::fromTokens < tree::Tree > ( subjectTokens.front ( ) ); + tree::Tree pattern = alib::XmlDataFactory::fromTokens < tree::Tree > ( patternTokens.front ( ) ); + + std::chrono::measurements::end ( ); + std::chrono::measurements::start ( "Algorithm", std::chrono::measurements::Type::MAIN ); + + std::set < unsigned > res = arbology::exact::KnuthMorrisPratt::match ( subject, pattern ); + + std::chrono::measurements::end ( ); + std::chrono::measurements::start ( "Output write", std::chrono::measurements::Type::AUXILARY ); + alib::XmlDataFactory::toStdout ( res ); } else if ( algorithm.getValue ( ) == "exactSubtreeMatchingAutomaton" ) { tree::Tree pattern = alib::XmlDataFactory::fromTokens < tree::Tree > ( patternTokens.front ( ) ); diff --git 
a/alib2algo/src/arbology/exact/BorderArrayNaive.cpp b/alib2algo/src/arbology/exact/BorderArrayNaive.cpp new file mode 100644 index 0000000000000000000000000000000000000000..88f3374bbf5e4c866a94106e1582e2c44a733266 --- /dev/null +++ b/alib2algo/src/arbology/exact/BorderArrayNaive.cpp @@ -0,0 +1,67 @@ +/* + * BorderArrayNaive.cpp + * + * Created on: 5. 11. 2014 + * Author: Radomir Polach, Tomas Pecka, Jan Travnicek + */ + +#include "BorderArrayNaive.h" +#include "SubtreeJumpTable.h" + +#include <exception/AlibException.h> +#include <tree/ranked/PrefixRankedBarPattern.h> + +namespace arbology { + +namespace exact { + +std::vector < size_t > BorderArrayNaive::ba ( const tree::RankedTreeWrapper & pattern ) { /* entry point for wrapped patterns: hands pattern.getData ( ) to the single-dispatch instance, which selects the type-specific overload */ + return getInstance ( ).dispatch ( pattern.getData ( ) ); +} + +bool BorderArrayNaive::matches ( const tree::PrefixRankedBarPattern & pattern, const std::vector < int > & subtreeJumpTable, int stop, int offset ) { /* walks the pattern content from index i and from index offset in parallel until offset reaches stop or i reaches the end of the content; equal symbols advance both indexes by one, a subtree wildcard on either side moves both indexes through subtreeJumpTable, any other difference returns false */ + unsigned i = 1; /* NOTE(review): the walk starts at index 1, so the call made with offset 1 compares every position with itself and always succeeds, which makes every entry ba[i] for i >= 2 equal to i - 1; if a prefix-versus-suffix (border) comparison is intended this probably should start at 0 - confirm against the jump table layout before changing */ + + while ( offset < stop && i < pattern.getContent ( ).size ( ) ) + if ( pattern.getContent ( )[i] == pattern.getContent ( )[offset] ) { + i++; + offset++; + } else if ( ( pattern.getContent ( )[i] == pattern.getSubtreeWildcard ( ) ) || ( pattern.getContent ( )[offset] == pattern.getSubtreeWildcard ( ) ) ) { + i = subtreeJumpTable[i]; + offset = subtreeJumpTable[offset]; + } else { + return false; + } + + return true; +} + +std::vector < size_t > BorderArrayNaive::ba ( const tree::PrefixRankedBarPattern & pattern ) { /* returns a vector with pattern.getContent ( ).size ( ) plus one entries, one per prefix length of the pattern content */ + std::vector < int > patternSubtreeJumpTable = SubtreeJumpTable::compute ( pattern ); + std::vector < size_t > res; + + for ( unsigned i = 0; i <= pattern.getContent ( ).size ( ); i++ ) + res.push_back ( 0 ); + + res[0] = -1; /* res holds size_t, so -1 wraps to the largest representable value */ + + for ( unsigned i = 1; i <= pattern.getContent ( ).size ( ); i++ ) { + int min = i; /* becomes the smallest j in the range 1 .. i - 1 accepted by matches, stays i when none is */ + + for ( unsigned j = 1; j < i; j++ ) + if ( matches ( pattern, patternSubtreeJumpTable, i, j ) ) { + min = j; + break; + } + + res[i] = i - min; /* 0 when no j was accepted */ + } + + return res; +} + +auto BorderArrayPrefixRankedBarPattern = 
BorderArrayNaive::RegistratorWrapper < std::vector < size_t >, tree::PrefixRankedBarPattern > ( BorderArrayNaive::getInstance ( ), BorderArrayNaive::ba ); + +} /* namespace exact */ + +} /* namespace arbology */ diff --git a/alib2algo/src/arbology/exact/BorderArrayNaive.h b/alib2algo/src/arbology/exact/BorderArrayNaive.h new file mode 100644 index 0000000000000000000000000000000000000000..92925628f7530f646913685d4ec5bfdc0829063f --- /dev/null +++ b/alib2algo/src/arbology/exact/BorderArrayNaive.h @@ -0,0 +1,53 @@ +/* + * BorderArrayNaive.h + * + * Created on: 5. 11. 2014 + * Author: Jan Travnicek + */ + +#ifndef _ARBOLOGY_BORDER_ARRAY_NAIVE_H_ +#define _ARBOLOGY_BORDER_ARRAY_NAIVE_H_ + +#include <tree/RankedTreeWrapper.h> +#include <tree/TreeFeatures.h> +#include <common/multipleDispatch.hpp> + +#include <vector> + +namespace arbology { + +namespace exact { + +/** + * Naive computation of the border array of a tree pattern in prefix ranked bar notation. + * The array has one entry per prefix length of the pattern content; KnuthMorrisPratt derives its shifts from it. + */ +class BorderArrayNaive : public std::SingleDispatch < std::vector < size_t >, tree::RankedTreeBase > { + static bool matches ( const tree::PrefixRankedBarPattern & pattern, const std::vector < int > & subtreeJumpTable, int stop, int offset ); + +public: + /** + * Computes the border array of a wrapped ranked pattern by dispatching on its concrete type. + * @return border array produced by the type-specific overload + */ + static std::vector < size_t > ba ( const tree::RankedTreeWrapper & pattern ); + + /** + * Computes the border array of a pattern in prefix ranked bar notation. 
+ * @return vector indexed by prefix length of the pattern content (index 0 holds the wrapped value of -1) + */ + static std::vector < size_t > ba ( const tree::PrefixRankedBarPattern & pattern ); + + static BorderArrayNaive & getInstance ( ) { + static BorderArrayNaive res; + + return res; + } + +}; + +} /* namespace exact */ + +} /* namespace arbology */ + +#endif /* _ARBOLOGY_BORDER_ARRAY_NAIVE_H_ */ diff --git a/alib2algo/src/arbology/exact/KnuthMorrisPratt.cpp b/alib2algo/src/arbology/exact/KnuthMorrisPratt.cpp new file mode 100644 index 0000000000000000000000000000000000000000..3bcca32c4b97461b80df382c997f03547edd4b84 --- /dev/null +++ b/alib2algo/src/arbology/exact/KnuthMorrisPratt.cpp @@ -0,0 +1,79 @@ +/* + * KnuthMorrisPratt.cpp + * + * Created on: 5. 11. 2014 + * Author: Jan Travnicek + */ + +#include "KnuthMorrisPratt.h" +#include "BorderArrayNaive.h" +#include "SubtreeJumpTable.h" + +#include <exception/AlibException.h> +#include <tree/Tree.h> +#include <tree/ranked/PrefixRankedBarTree.h> +#include <tree/ranked/PrefixRankedBarPattern.h> +#include <alphabet/RankedSymbol.h> + +#include <map> + +namespace arbology { + +namespace exact { + +std::set < unsigned > KnuthMorrisPratt::match ( const tree::Tree & subject, const tree::Tree & pattern ) { /* entry point for wrapped trees: hands the data of both trees to the double-dispatch instance, which selects the overload by their concrete types */ + return getInstance ( ).dispatch ( subject.getData ( ), pattern.getData ( ) ); +} + +std::set < unsigned > KnuthMorrisPratt::match ( const tree::PrefixRankedBarTree & subject, const tree::PrefixRankedBarTree & pattern ) { /* a pattern given as a plain tree is converted to a PrefixRankedBarPattern and handled by the pattern overload */ + return match ( subject, tree::PrefixRankedBarPattern ( pattern ) ); +} + +auto KnuthMorrisPrattPrefixRankedBarTreePrefixRankedBarTree = KnuthMorrisPratt::RegistratorWrapper < std::set < unsigned >, tree::PrefixRankedBarTree, tree::PrefixRankedBarTree > ( KnuthMorrisPratt::getInstance ( ), KnuthMorrisPratt::match ); + +std::set < unsigned > KnuthMorrisPratt::match ( const tree::PrefixRankedBarTree & subject, const tree::PrefixRankedBarPattern & pattern ) { /* collects every index of the subject content at which the pattern matches; a subtree wildcard in the pattern stands for one whole subtree of the subject */ + std::set < unsigned > occ; + std::vector < size_t > ba = BorderArrayNaive::ba ( pattern ); + std::vector < int 
> subjectSubtreeJumpTable = SubtreeJumpTable::compute ( subject ); + + // index to the subject + unsigned i = 0; + + // main loop of the algorithm over all possible indexes where the pattern can start + while ( i + pattern.getContent ( ).size ( ) <= subject.getContent ( ).size ( ) ) { + + // index to the pattern + unsigned j = 0; + + // offset to the subject + unsigned offset = i; + + while ( ( j < pattern.getContent ( ).size ( ) ) && ( offset < subject.getContent ( ).size ( ) ) ) { + if ( subject.getContent ( )[offset] == pattern.getContent ( )[j] ) { + // match of symbol + offset++; + j++; + } else if ( ( pattern.getContent ( )[j] == pattern.getSubtreeWildcard ( ) ) && ( subject.getContent ( )[offset].getSymbol ( ) != pattern.getBarSymbol ( ) ) ) { + // subtree wildcard against the start of a subject subtree: jump past that subtree in the subject and skip two pattern symbols (presumably the wildcard and its bar) + offset = subjectSubtreeJumpTable[offset]; + j += 2; + } else { + break; + } + } + + // match was found + if ( j >= pattern.getContent ( ).size ( ) ) occ.insert ( i ); + + // shift heuristics: advance by the matched length minus its border array entry; ba[0] holds the wrapped value of -1, so the unsigned subtraction yields a shift of one when j is 0 + i += j - ba[j]; + } + + return occ; +} + +auto KnuthMorrisPrattPrefixRankedBarTreePrefixRankedBarPattern = KnuthMorrisPratt::RegistratorWrapper < std::set < unsigned >, tree::PrefixRankedBarTree, tree::PrefixRankedBarPattern > ( KnuthMorrisPratt::getInstance ( ), KnuthMorrisPratt::match ); + +} /* namespace exact */ + +} /* namespace arbology */ diff --git a/alib2algo/src/arbology/exact/KnuthMorrisPratt.h b/alib2algo/src/arbology/exact/KnuthMorrisPratt.h new file mode 100644 index 0000000000000000000000000000000000000000..36261e05256c1edf72283bfedd628c5302342cda --- /dev/null +++ b/alib2algo/src/arbology/exact/KnuthMorrisPratt.h @@ -0,0 +1,46 @@ +/* + * KnuthMorrisPratt.h + * + * Created on: 5. 11. 
2014 + * Author: Jan Travnicek + */ + +#ifndef _ARBOLOGY_KNUTH_MORRIS_PRATT_H_ +#define _ARBOLOGY_KNUTH_MORRIS_PRATT_H_ + +#include <set> +#include <common/multipleDispatch.hpp> +#include <tree/TreeFeatures.h> + +namespace arbology { + +namespace exact { + +/** + * Knuth-Morris-Pratt style search for a tree pattern in a subject tree. + * Shifts between attempts are taken from the border array computed by BorderArrayNaive. + */ +class KnuthMorrisPratt : public std::DoubleDispatch < std::set < unsigned >, tree::TreeBase, tree::TreeBase > { +public: + /** + * Search for the pattern in the subject tree. + * @return set of indexes into the subject content at which an occurrence of the pattern starts + */ + static std::set < unsigned > match ( const tree::Tree & subject, const tree::Tree & pattern ); + + static std::set < unsigned > match ( const tree::PrefixRankedBarTree & subject, const tree::PrefixRankedBarTree & pattern ); + static std::set < unsigned > match ( const tree::PrefixRankedBarTree & subject, const tree::PrefixRankedBarPattern & pattern ); + + static KnuthMorrisPratt & getInstance ( ) { + static KnuthMorrisPratt res; + + return res; + } + +}; + +} /* namespace exact */ + +} /* namespace arbology */ + +#endif /* _ARBOLOGY_KNUTH_MORRIS_PRATT_H_ */ diff --git a/alib2algo/src/arbology/exact/SubtreeJumpTable.cpp b/alib2algo/src/arbology/exact/SubtreeJumpTable.cpp index 39996c9abbfc770ef2101d15f79500bb58043fca..a779265238721d3ef89361acef29349b0e1051d2 100644 --- a/alib2algo/src/arbology/exact/SubtreeJumpTable.cpp +++ b/alib2algo/src/arbology/exact/SubtreeJumpTable.cpp @@ -9,7 +9,9 @@ #include <exception/AlibException.h> #include <tree/ranked/PrefixRankedTree.h> +#include <tree/ranked/PrefixRankedPattern.h> #include <tree/ranked/PrefixRankedBarTree.h> +#include <tree/ranked/PrefixRankedBarPattern.h> #include <alphabet/RankedSymbol.h> #include <map> @@ -25,35 +27,56 @@ std::vector < int > SubtreeJumpTable::compute ( const tree::RankedTreeWrapper & std::vector < int > SubtreeJumpTable::compute ( const tree::PrefixRankedBarTree & subject ) { std::vector < int > res; - 
buildDataPointers ( res, subject, 0 ); + buildDataPointersBar ( res, subject, 0 ); return res; } auto SubtreeSizesPrefixRankedBarTree = SubtreeJumpTable::RegistratorWrapper < std::vector < int >, tree::PrefixRankedBarTree > ( SubtreeJumpTable::getInstance ( ), SubtreeJumpTable::compute ); +std::vector < int > SubtreeJumpTable::compute ( const tree::PrefixRankedBarPattern & pattern ) { + std::vector < int > res; + + buildDataPointersBar ( res, pattern, 0 ); + + return res; +} + +auto SubtreeSizesPrefixRankedBarPattern = SubtreeJumpTable::RegistratorWrapper < std::vector < int >, tree::PrefixRankedBarPattern > ( SubtreeJumpTable::getInstance ( ), SubtreeJumpTable::compute ); + std::vector < int > SubtreeJumpTable::compute ( const tree::PrefixRankedTree & subject ) { std::vector < int > res; - buildDataPointers ( res, subject, 0 ); + buildDataPointersPrefixRanked ( res, subject, 0 ); return res; } auto SubtreeSizesPrefixRankedTree = SubtreeJumpTable::RegistratorWrapper < std::vector < int >, tree::PrefixRankedTree > ( SubtreeJumpTable::getInstance ( ), SubtreeJumpTable::compute ); +std::vector < int > SubtreeJumpTable::compute ( const tree::PrefixRankedPattern & pattern ) { + std::vector < int > res; + + buildDataPointersPrefixRanked ( res, pattern, 0 ); + + return res; +} + +auto SubtreeSizesPrefixRankedPattern = SubtreeJumpTable::RegistratorWrapper < std::vector < int >, tree::PrefixRankedPattern > ( SubtreeJumpTable::getInstance ( ), SubtreeJumpTable::compute ); + /** * used to compute subtree jump table. 
* @param begin - index of a root node of a complete subtree to process * @return index, increased by one, of the last node in the subtree starting at index begin */ -int SubtreeJumpTable::buildDataPointers ( std::vector < int > & res, const tree::PrefixRankedBarTree & subject, int begin ) { +template < class T > +int SubtreeJumpTable::buildDataPointersBar ( std::vector < int > & res, const T & subject, int begin ) { res.push_back ( 0 ); int index = begin + 1; if ( subject.getContent ( )[begin].getSymbol ( ) != subject.getBarSymbol ( ) ) for ( unsigned i = 0; i < subject.getContent ( )[begin].getRank ( ).getData ( ); i++ ) - index = buildDataPointers ( res, subject, index ); + index = buildDataPointersBar ( res, subject, index ); index++; res[begin] = index; @@ -66,18 +89,20 @@ int SubtreeJumpTable::buildDataPointers ( std::vector < int > & res, const tree: * @param begin - index of a root node of a complete subtree to process * @return index, increased by one, of the last node in the subtree starting at index begin */ -int SubtreeJumpTable::buildDataPointers ( std::vector < int > & res, const tree::PrefixRankedTree & subject, int begin ) { +template < class T > +int SubtreeJumpTable::buildDataPointersPrefixRanked ( std::vector < int > & res, const T & subject, int begin ) { for ( unsigned i = 0; i < subject.getContent ( ).size ( ); i++ ) res.push_back ( 0 ); - return buildDataPointersInternal ( res, subject, begin ); + return buildDataPointersPrefixRankedInternal ( res, subject, begin ); } -int SubtreeJumpTable::buildDataPointersInternal ( std::vector < int > & res, const tree::PrefixRankedTree & subject, int begin ) { +template < class T > +int SubtreeJumpTable::buildDataPointersPrefixRankedInternal ( std::vector < int > & res, const T & subject, int begin ) { int index = begin + 1; for ( unsigned i = 0; i < subject.getContent ( )[begin].getRank ( ).getData ( ); i++ ) - index = buildDataPointersInternal ( res, subject, index ); + index = 
buildDataPointersPrefixRankedInternal ( res, subject, index ); res[begin] = index; return index; diff --git a/alib2algo/src/arbology/exact/SubtreeJumpTable.h b/alib2algo/src/arbology/exact/SubtreeJumpTable.h index 526a22d71050a6be01ca17fe72cf6d5a20cab739..67907331a75df3738c24287f2ba46c528da7111e 100644 --- a/alib2algo/src/arbology/exact/SubtreeJumpTable.h +++ b/alib2algo/src/arbology/exact/SubtreeJumpTable.h @@ -20,15 +20,20 @@ namespace arbology { namespace exact { class SubtreeJumpTable : public std::SingleDispatch < std::vector < int >, tree::RankedTreeBase > { + template < class T > + static int buildDataPointersBar ( std::vector < int > & res, const T & subject, int begin ); + template < class T > + static int buildDataPointersPrefixRanked ( std::vector < int > & res, const T & subject, int begin ); + template < class T > + static int buildDataPointersPrefixRankedInternal ( std::vector < int > & res, const T & subject, int begin ); + public: static std::vector < int > compute ( const tree::RankedTreeWrapper & subject ); static std::vector < int > compute ( const tree::PrefixRankedBarTree & subject ); + static std::vector < int > compute ( const tree::PrefixRankedBarPattern & pattern ); static std::vector < int > compute ( const tree::PrefixRankedTree & subject ); - - static int buildDataPointers ( std::vector < int > & res, const tree::PrefixRankedBarTree & subject, int begin ); - static int buildDataPointers ( std::vector < int > & res, const tree::PrefixRankedTree & subject, int begin ); - static int buildDataPointersInternal ( std::vector < int > & res, const tree::PrefixRankedTree & subject, int begin ); + static std::vector < int > compute ( const tree::PrefixRankedPattern & pattern ); static SubtreeJumpTable & getInstance ( ) { static SubtreeJumpTable res; diff --git a/tests.aarbology.sh b/tests.aarbology.sh index 46ae1378a311d7c7d59d1120ab079c1b2aaa2c78..ed0d8881617acf4576bb0c368ee4bd1d67f13d02 100755 --- a/tests.aarbology.sh +++ b/tests.aarbology.sh @@ 
-213,3 +213,4 @@ runTestSubtree "Exact Boyer Moore Horspool" "./aarbology2 -a boyerMooreHorspool runTestSubtree "Exact Subtree Automaton" "./arun2 -t occurrences -a <(./aarbology2 -a exactSubtreeMatchingAutomaton -p \"\$PATTERN_FILE\" | ./adeterminize2) -i \"\$SUBJECT_FILE\" | ./astat2 -p quantity -s" runTestPattern "Exact Boyer Moore Horspool" "./aarbology2 -a boyerMooreHorspool -s <( ./acast2 -t PrefixRankedBarTree -i \"\$SUBJECT_FILE\" ) -p <( ./acast2 -t PrefixRankedBarPattern -i \"\$PATTERN_FILE\" ) | ./astat2 -p quantity -s" +runTestPattern "Exact Knuth Morris Pratt" "./aarbology2 -a knuthMorrisPratt -s <( ./acast2 -t PrefixRankedBarTree -i \"\$SUBJECT_FILE\" ) -p <( ./acast2 -t PrefixRankedBarPattern -i \"\$PATTERN_FILE\" ) | ./astat2 -p quantity -s"