diff --git a/aarbology2/src/aarbology.cpp b/aarbology2/src/aarbology.cpp index 6a7fef4fca30be83c2b99e7f38091d75be407aa1..64f74261fa7e82505b43efc5c49b79b06b54b2d9 100644 --- a/aarbology2/src/aarbology.cpp +++ b/aarbology2/src/aarbology.cpp @@ -20,6 +20,7 @@ #include <arbology/exact/BoyerMooreHorspool.h> #include <arbology/exact/ReversedBoyerMooreHorspool.h> #include <arbology/exact/KnuthMorrisPratt.h> +#include <arbology/exact/DeadZoneUsingBadCharacterShiftAndBorderArray.h> #include <arbology/exact/ExactSubtreeMatchingAutomaton.h> #include <arbology/exact/ExactPatternMatchingAutomaton.h> #include <arbology/exact/ExactSubtreeAutomaton.h> @@ -37,6 +38,7 @@ int main ( int argc, char * argv[] ) { allowed.push_back ( "boyerMooreHorspool" ); allowed.push_back ( "reversedBoyerMooreHorspool" ); allowed.push_back ( "knuthMorrisPratt" ); + allowed.push_back ( "deadZoneUsingBadCharacterShiftAndBorderArray" ); allowed.push_back ( "exactSubtreeMatchingAutomaton" ); allowed.push_back ( "exactPatternMatchingAutomaton" ); allowed.push_back ( "exactSubtreeAutomaton" ); @@ -75,6 +77,8 @@ int main ( int argc, char * argv[] ) { needPattern = needSubject = 1; } else if ( algorithm.getValue ( ) == "exactSubtreeMatchingAutomaton" ) { needPattern = 1; + } else if ( algorithm.getValue ( ) == "deadZoneUsingBadCharacterShiftAndBorderArray" ) { + needPattern = needSubject = 1; } else { } @@ -193,6 +197,21 @@ int main ( int argc, char * argv[] ) { std::chrono::measurements::end ( ); std::chrono::measurements::start ( "Output write", std::chrono::measurements::Type::AUXILARY ); + alib::XmlDataFactory::toStdout ( res ); + } else if ( algorithm.getValue ( ) == "deadZoneUsingBadCharacterShiftAndBorderArray" ) { + tree::Tree subject = alib::XmlDataFactory::fromTokens < tree::Tree > ( subjectTokens.front ( ) ); + tree::Tree pattern = alib::XmlDataFactory::fromTokens < tree::Tree > ( patternTokens.front ( ) ); + + std::chrono::measurements::end ( ); + std::chrono::measurements::start ( "Algorithm", 
std::chrono::measurements::Type::MAIN ); + + std::set < unsigned > res = arbology::exact::DeadZoneUsingBadCharacterShiftAndBorderArray::match ( subject, pattern ); + if( ends.isSet ( ) ) + res = arbology::transform::BeginToEndIndex::transform(subject, res); + + std::chrono::measurements::end ( ); + std::chrono::measurements::start ( "Output write", std::chrono::measurements::Type::AUXILARY ); + alib::XmlDataFactory::toStdout ( res ); } else if ( algorithm.getValue ( ) == "exactSubtreeMatchingAutomaton" ) { tree::Tree pattern = alib::XmlDataFactory::fromTokens < tree::Tree > ( patternTokens.front ( ) ); diff --git a/alib2algo/src/arbology/exact/DeadZoneUsingBadCharacterShiftAndBorderArray.cpp b/alib2algo/src/arbology/exact/DeadZoneUsingBadCharacterShiftAndBorderArray.cpp new file mode 100644 index 0000000000000000000000000000000000000000..6592f0b27da2b1cc81017903184c6bfb9875c0ad --- /dev/null +++ b/alib2algo/src/arbology/exact/DeadZoneUsingBadCharacterShiftAndBorderArray.cpp @@ -0,0 +1,81 @@ +/* + * DeadZoneUsingBadCharacterShiftAndBorderArray.cpp + * + * Created on: 5. 11. 
2014 + * Author: Jan Travnicek + */ + +#include "DeadZoneUsingBadCharacterShiftAndBorderArray.h" +#include "ReversedBadCharacterShiftTable.h" +#include "BorderArrayNaive.h" +#include "SubtreeJumpTable.h" + +#include <exception/AlibException.h> +#include <tree/Tree.h> +#include <tree/ranked/PrefixRankedBarTree.h> +#include <tree/ranked/PrefixRankedBarPattern.h> +#include <alphabet/RankedSymbol.h> + +#include <map> + +namespace arbology { + +namespace exact { + +std::set < unsigned > DeadZoneUsingBadCharacterShiftAndBorderArray::match ( const tree::Tree & subject, const tree::Tree & pattern ) { /* entry point: dispatches on the data held by the subject and pattern trees */ + return getInstance ( ).dispatch ( subject.getData ( ), pattern.getData ( ) ); +} + +std::set < unsigned > DeadZoneUsingBadCharacterShiftAndBorderArray::match ( const tree::PrefixRankedBarTree & subject, const tree::PrefixRankedBarTree & pattern ) { /* a bar tree given as pattern is wrapped into a PrefixRankedBarPattern and passed to the pattern overload */ + return match ( subject, tree::PrefixRankedBarPattern ( pattern ) ); +} + +auto DeadZoneUsingBadCharacterShiftAndBorderArrayPrefixRankedBarTreePrefixRankedBarTree = DeadZoneUsingBadCharacterShiftAndBorderArray::RegistratorWrapper < std::set < unsigned >, tree::PrefixRankedBarTree, tree::PrefixRankedBarTree > ( DeadZoneUsingBadCharacterShiftAndBorderArray::getInstance ( ), DeadZoneUsingBadCharacterShiftAndBorderArray::match ); /* presumably registers the tree and tree overload with the dispatcher instance; confirm against RegistratorWrapper */ + +std::set < unsigned > DeadZoneUsingBadCharacterShiftAndBorderArray::match ( const tree::PrefixRankedBarTree & subject, const tree::PrefixRankedBarPattern & pattern ) { /* builds the two shift tables and the subject jump table, then collects occurrences recursively */ + std::set < unsigned > occ; + std::map < alphabet::RankedSymbol, size_t > bbcs = ReversedBadCharacterShiftTable::bcs ( pattern ); // NOTE: the alphabet of the subject must be a subset of or equal to the alphabet of the pattern + std::vector < size_t > fba = BorderArrayNaive::ba ( pattern ); /* border array computed from the pattern */ + std::vector < int > subjectSubtreeJumpTable = SubtreeJumpTable::compute ( subject ); /* indexed by subject offset; presumably holds the offset just past the subtree starting there, TODO confirm against SubtreeJumpTable */ + + match_rec ( occ, subject, pattern, fba, bbcs, subjectSubtreeJumpTable, 0, subject.getContent ( ).size ( ) - pattern.getContent ( ).size ( ) + 1 ); /* candidate start positions form the half-open range from 0 up to the size difference plus one; NOTE(review): if both size ( ) calls return unsigned values the difference wraps when the pattern is longer than the subject, confirm the value converted to int is then not positive */ + return occ; +} + +void 
DeadZoneUsingBadCharacterShiftAndBorderArray::match_rec ( std::set < unsigned > & occ, const tree::PrefixRankedBarTree & subject, const tree::PrefixRankedBarPattern & pattern, std::vector < size_t > & fba, std::map < alphabet::RankedSymbol, size_t > & bbcs, std::vector < int > & subjectSubtreeJumpTable, int low, int high ) { /* probes the middle candidate of the half-open range from low to high, records it in occ when the pattern matches there, then recurses into the two remaining parts of the range */ + if ( low >= high ) return; /* no candidate start positions left in this range */ + + int i = ( low + high ) / 2; /* candidate start position probed in this call */ + + // index into the pattern + unsigned j = 0; + + // offset into the subject + unsigned offset = i; + + while ( ( j < pattern.getContent ( ).size ( ) ) && ( offset < subject.getContent ( ).size ( ) ) ) { + if ( subject.getContent ( )[offset] == pattern.getContent ( )[j] ) { + // equal symbols: advance in both the subject and the pattern + offset++; + j++; + } else if ( ( pattern.getContent ( )[j] == pattern.getSubtreeWildcard ( ) ) && ( subject.getContent ( )[offset].getSymbol ( ) != pattern.getBarSymbol ( ) ) ) { + // wildcard in the pattern against a subject symbol that is not the bar symbol: continue in the subject at the offset stored in the jump table and skip two pattern symbols (the wildcard and the symbol after it, presumably its bar) + offset = subjectSubtreeJumpTable[offset]; + j += 2; + } else { + break; + } + } + + // the whole pattern was consumed, so i is recorded as an occurrence + if ( j >= pattern.getContent ( ).size ( ) ) occ.insert ( i ); + + match_rec ( occ, subject, pattern, fba, bbcs, subjectSubtreeJumpTable, low, i - bbcs[subject.getContent ( )[i]] + 1 ); /* part of the range below i, with the upper bound lowered by the table entry of the subject symbol at i; NOTE(review): operator[] on the std::map inserts a zero entry when the symbol is missing from the table */ + match_rec ( occ, subject, pattern, fba, bbcs, subjectSubtreeJumpTable, i + j - fba[j], high ); /* part of the range above i, starting at i advanced by j minus the border array entry at j */ +} + +auto DeadZoneUsingBadCharacterShiftAndBorderArrayPrefixRankedBarTreePrefixRankedBarPattern = DeadZoneUsingBadCharacterShiftAndBorderArray::RegistratorWrapper < std::set < unsigned >, tree::PrefixRankedBarTree, tree::PrefixRankedBarPattern > ( DeadZoneUsingBadCharacterShiftAndBorderArray::getInstance ( ), DeadZoneUsingBadCharacterShiftAndBorderArray::match ); /* presumably registers the tree and pattern overload with the dispatcher instance; confirm against RegistratorWrapper */ + +} /* namespace exact */ + +} /* namespace arbology */ diff --git a/alib2algo/src/arbology/exact/DeadZoneUsingBadCharacterShiftAndBorderArray.h b/alib2algo/src/arbology/exact/DeadZoneUsingBadCharacterShiftAndBorderArray.h new file mode 100644 index 0000000000000000000000000000000000000000..50a4354da861d009141b7a427f37e9ed4e9ce58c --- 
/dev/null +++ b/alib2algo/src/arbology/exact/DeadZoneUsingBadCharacterShiftAndBorderArray.h @@ -0,0 +1,51 @@ +/* + * DeadZoneUsingBadCharacterShiftAndBorderArray.h + * + * Created on: 5. 11. 2014 + * Author: Jan Travnicek + */ + +#ifndef _DEAD_ZONE_USING_BAD_CHARACTER_SHIFT_AND_BORDER_ARRAY_H_ +#define _DEAD_ZONE_USING_BAD_CHARACTER_SHIFT_AND_BORDER_ARRAY_H_ + +#include <map> +#include <set> +#include <vector> +#include <common/multipleDispatch.hpp> +#include <tree/TreeFeatures.h> +#include <alphabet/RankedSymbol.h> + +namespace arbology { + +namespace exact { + +/** + * Implementation of DeadZone matching using bad character shift as shifting method in one direction and border array in the other + */ +class DeadZoneUsingBadCharacterShiftAndBorderArray : public std::DoubleDispatch < std::set < unsigned >, tree::TreeBase, tree::TreeBase > { +public: + /** + * Search for occurrences of a tree pattern in a subject tree. + * @param subject tree to search in + * @param pattern tree pattern to search for + * @return set of positions in the subject content at which an occurrence of the pattern starts + */ + static std::set < unsigned > match ( const tree::Tree & subject, const tree::Tree & pattern ); + + static std::set < unsigned > match ( const tree::PrefixRankedBarTree & subject, const tree::PrefixRankedBarTree & pattern ); /* overload for a pattern given as a prefix ranked bar tree */ + static std::set < unsigned > match ( const tree::PrefixRankedBarTree & subject, const tree::PrefixRankedBarPattern & pattern ); /* overload for a pattern given as a prefix ranked bar pattern */ + static void match_rec ( std::set < unsigned > & occ, const tree::PrefixRankedBarTree & subject, const tree::PrefixRankedBarPattern & pattern, std::vector < size_t > & fba, std::map < alphabet::RankedSymbol, size_t > & bbcs, std::vector < int > & subjectSubtreeJumpTable, int low, int high ); /* recursive helper: adds to occ the occurrences starting in the half-open range from low to high */ + + static DeadZoneUsingBadCharacterShiftAndBorderArray & getInstance ( ) { /* returns the single function-local static instance */ + static DeadZoneUsingBadCharacterShiftAndBorderArray res; + + return res; + } + +}; + +} /* namespace exact */ + +} /* namespace arbology */ + +#endif /* _DEAD_ZONE_USING_BAD_CHARACTER_SHIFT_AND_BORDER_ARRAY_H_ */ diff --git a/tests.aarbology.sh b/tests.aarbology.sh index 
38be0dae539920c9d0eecd195f2ac166a2e56850..3912b4f00763590473a1c00431de1a47a9a036c2 100755 --- a/tests.aarbology.sh +++ b/tests.aarbology.sh @@ -257,6 +257,8 @@ function runTestPatternEnds { outputResults } +runTestPattern "Exact Dead Zone Using Bad Character Shift And Border Array (Pattern PrefixRankedBar)" "./aarbology2 -a deadZoneUsingBadCharacterShiftAndBorderArray -s <( ./acast2 -t PrefixRankedBarTree -i \"\$SUBJECT_FILE\" ) -p <( ./acast2 -t PrefixRankedBarPattern -i <(./alphabetManip2 -o add -i \"\$PATTERN_FILE\" -a <(./alphabetManip2 -o get -i \"\$SUBJECT_FILE\"))) | ./astat2 -p size --set" + runTestPatternEnds "Exact Pattern Matching Automaton (PrefixRanked)" "./aarbology2 -a exactPatternMatchingAutomaton -p <(./acast2 -t PrefixRankedPattern -i <(./alphabetManip2 -o add -i \"\$PATTERN_FILE\" -a <( ./alphabetManip2 -o get -i \"\$SUBJECT_FILE\" ) ) ) | ./adeterminize2 | ./arun2 -t occurrences -a - -i <( ./acast2 -t PrefixRankedTree -i \"\$SUBJECT_FILE\" | ./acast2 -t LinearString ) | ./astat2 -p size --set" RAND_SIZE_SUBJECT=100