diff --git a/alib2algo/src/stringology/query/PositionHeapFactors.cpp b/alib2algo/src/stringology/query/PositionHeapFactors.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..83424e8a1112e1dc413422916d8b4f3774bea5e9
--- /dev/null
+++ b/alib2algo/src/stringology/query/PositionHeapFactors.cpp
@@ -0,0 +1,29 @@
+/*
+ * PositionHeapFactors.cpp
+ *
+ *  Created on: 2. 1. 2017
+ *      Author: Jan Travnicek
+ */
+
+#include "PositionHeapFactors.h"
+
+#include <string/LinearString.h>
+
+namespace stringology {
+
+namespace query {
+
+/**
+ * Entry point for a pattern wrapped in string::String: passes the position heap
+ * and the wrapped string data to dispatch ( ), for which the
+ * string::LinearString < > overload is registered below.
+ */
+std::set < unsigned > PositionHeapFactors::query ( const indexes::PositionHeap < DefaultSymbolType > & positionHeap, const string::String & string ) {
+	return dispatch ( positionHeap, string.getData ( ) );
+}
+
+auto PositionHeapFactorsLinearString = PositionHeapFactors::RegistratorWrapper < std::set < unsigned >, string::LinearString < > > ( PositionHeapFactors::query );
+
+} /* namespace query */
+
+} /* namespace stringology */
diff --git a/alib2algo/src/stringology/query/PositionHeapFactors.h b/alib2algo/src/stringology/query/PositionHeapFactors.h
new file mode 100644
index 0000000000000000000000000000000000000000..665d6d3a4268c68a928b7935ff45152defc7dfca
--- /dev/null
+++ b/alib2algo/src/stringology/query/PositionHeapFactors.h
@@ -0,0 +1,144 @@
+/*
+ * PositionHeapFactors.h
+ *
+ *  Created on: 2. 1. 2017
+ *      Author: Jan Travnicek
+ */
+
+#ifndef POSITION_HEAP_FACTORS_H_
+#define POSITION_HEAP_FACTORS_H_
+
+#include <indexes/PositionHeap.h>
+#include <string/String.h>
+#include <string/LinearString.h>
+#include <core/multipleDispatch.hpp>
+#include <global/GlobalData.h>
+
+namespace stringology {
+
+namespace query {
+
+/**
+ * Query position heap for given string.
+ *
+ * Source: Position heaps: A simple and dynamic text indexing data structure
+ * Andrzej Ehrenfeucht, Ross M. McConnell, Nissa Osheim, Sung-Whan Woo
+ *
+ */
+
+class PositionHeapFactors : public std::SingleDispatchFirstStaticParam < PositionHeapFactors, std::set < unsigned >, const indexes::PositionHeap < DefaultSymbolType > &, const string::StringBase & > {
+	/**
+	 * Inserts the position derived from every node of the given subtree into res.
+	 * A node's position is indexedStringSize minus the value stored in the node.
+	 */
+	template < class SymbolType >
+	static void accumulateResult ( const std::trie < SymbolType, unsigned > & trie, std::set < unsigned > & res, unsigned indexedStringSize ) {
+		res.insert ( indexedStringSize - trie.getData ( ) );
+
+		// const auto & refers to the stored element itself; spelling the element as
+		// std::pair < SymbolType, ... > would drop the const key of a map-like
+		// value_type and could force a temporary copy of each child subtree.
+		for ( const auto & child : trie.getChildren ( ) )
+			accumulateResult ( child.second, res, indexedStringSize );
+	}
+
+	/**
+	 * Tests whether needle occurs in haystack at haystackPosition.
+	 * Only symbols from index validSymbols on are compared; the caller
+	 * vouches for the first validSymbols symbols.
+	 * @return false when needle would run past the end of haystack or on the
+	 *         first mismatch, true otherwise
+	 */
+	template < class SymbolType >
+	static bool checkOcc ( const string::LinearString < SymbolType > & needle, unsigned validSymbols, const std::vector < SymbolType > & haystack, unsigned haystackPosition ) {
+		const auto & content = needle.getContent ( );
+
+		if ( haystackPosition + content.size ( ) > haystack.size ( ) )
+			return false;
+
+		for ( unsigned i = validSymbols; i < content.size ( ); ++i )
+			if ( content [ i ] != haystack [ haystackPosition + i ] )
+				return false;
+
+		return true;
+	}
+
+	/**
+	 * Handles one node on the search path: logs it when verbose output is on and
+	 * records its position in res when needle really occurs there. The first
+	 * depth symbols of needle are not compared again.
+	 */
+	template < class SymbolType >
+	static void checkPathNode ( const std::trie < SymbolType, unsigned > & node, const string::LinearString < SymbolType > & needle, unsigned depth, const std::vector < SymbolType > & haystack, std::set < unsigned > & res ) {
+		const unsigned indexedStringSize = haystack.size ( );
+		const unsigned position = indexedStringSize - node.getData ( );
+
+		// every entry ends with a newline so that consecutive entries do not run together
+		if ( common::GlobalData::verbose )
+			std::clog << "on path possible occ (raw, string index): (" << node.getData ( ) << ", " << position << ")\n";
+
+		if ( checkOcc ( needle, depth, haystack, position ) )
+			res.insert ( position );
+	}
+
+public:
+	/**
+	 * Query a position heap for a pattern held in a string::String wrapper.
+	 * @param positionHeap position heap to query
+	 * @param string pattern to query by
+	 * @return occurrences of the pattern, as indexes into the indexed string
+	 */
+	static std::set < unsigned > query ( const indexes::PositionHeap < DefaultSymbolType > & positionHeap, const string::String & string );
+
+	/**
+	 * Query a position heap for a linear string pattern.
+	 * @param positionHeap position heap to query
+	 * @param string pattern to query by
+	 * @return occurrences of the pattern, as indexes into the indexed string
+	 */
+	template < class SymbolType >
+	static std::set < unsigned > query ( const indexes::PositionHeap < SymbolType > & positionHeap, const string::LinearString < SymbolType > & string );
+
+};
+
+/**
+ * Descends from the root along the symbols of the pattern. Every node met on
+ * the way is only a candidate and is verified against the indexed string; once
+ * the whole pattern has been consumed, every node below the reached one is
+ * reported without further checks. A missing child ends the search with the
+ * occurrences collected so far.
+ */
+template < class SymbolType >
+std::set < unsigned > PositionHeapFactors::query ( const indexes::PositionHeap < SymbolType > & positionHeap, const string::LinearString < SymbolType > & string ) {
+	std::set < unsigned > res;
+
+	const std::vector < SymbolType > & haystack = positionHeap.getString ( );
+	const std::trie < SymbolType, unsigned > * node = & positionHeap.getRoot ( );
+	unsigned depth = 0;
+
+	// the root is a candidate too; nothing of the pattern is matched yet
+	checkPathNode ( * node, string, depth, haystack, res );
+
+	for ( const SymbolType & symbol : string.getContent ( ) ) {
+		auto iter = node->getChildren ( ).find ( symbol );
+		if ( iter == node->getChildren ( ).end ( ) )
+			return res;
+
+		++depth;
+		node = & iter->second;
+
+		checkPathNode ( * node, string, depth, haystack, res );
+	}
+
+	const unsigned indexedStringSize = haystack.size ( );
+	for ( const auto & child : node->getChildren ( ) )
+		accumulateResult ( child.second, res, indexedStringSize );
+
+	return res;
+}
+
+} /* namespace query */
+
+} /* namespace stringology */
+
+#endif /* POSITION_HEAP_FACTORS_H_ */
diff --git a/aquery2/src/aquery.cpp b/aquery2/src/aquery.cpp index 93322a3f62c016daf08e91e8c5561fbb4a51a0be..7b1f9b34aeed917165c18504c0d6b12d9ec9fc17 100644 --- a/aquery2/src/aquery.cpp +++ b/aquery2/src/aquery.cpp @@ -17,6 +17,7 @@ #include <stringology/query/SuffixTrieFactors.h> #include <stringology/query/SuffixArrayFactors.h> +#include <stringology/query/PositionHeapFactors.h> int main ( int argc, char * argv[] ) { try { @@ -28,6 +29,7 @@ int main ( int argc, char * argv[] ) { std::vector < std::string > allowed; allowed.push_back ( "suffixTrieFactors" ); allowed.push_back ( "suffixArrayFactors" ); + allowed.push_back ( "positionHeapFactors" ); TCLAP::ValuesConstraint < std::string > allowedVals ( allowed ); TCLAP::ValueArg < std::string > query ( "q", "query", 
"Query index", false, "exactFactorMatch", & allowedVals ); @@ -80,6 +82,19 @@ int main ( int argc, char * argv[] ) { measurements::end ( ); measurements::start ( "Output write", measurements::Type::AUXILIARY ); + alib::XmlDataFactory::toStdout ( res ); + } else if ( query.getValue ( ) == "positionHeapFactors" ) { + indexes::PositionHeap < > positionHeap = alib::XmlDataFactory::fromTokens < indexes::PositionHeap < > > ( sax::FromXMLParserHelper::parseInput ( indexInput ) ); + string::String pattern = alib::XmlDataFactory::fromTokens < string::String > ( std::move ( sax::FromXMLParserHelper::parseInput(true, patternInput).front ( ) ) ); + + measurements::end ( ); + measurements::start ( "Algorithm", measurements::Type::MAIN ); + + std::set < unsigned > res = stringology::query::PositionHeapFactors::query ( positionHeap, pattern ); + + measurements::end ( ); + measurements::start ( "Output write", measurements::Type::AUXILIARY ); + alib::XmlDataFactory::toStdout ( res ); } else { throw exception::CommonException ( "Invalid algorithm" ); diff --git a/tests.astringology.sh b/tests.astringology.sh index d0ee088d7ccf4e1b916bca958ff93f0b7230598d..251e2a12d35c74b10893227dda0fd7b4d81dfd7c 100755 --- a/tests.astringology.sh +++ b/tests.astringology.sh @@ -212,6 +212,7 @@ function runTest { clearResults } +runTest "Position Heap Factors" "./astringology2 -a positionHeap -s \"\$SUBJECT_FILE\" | ./aquery2 -q positionHeapFactors -p \"\$PATTERN_FILE\" | ./astat2 -p size" runTest "Suffix Array Factors" "./astringology2 -a suffixArray -s \"\$SUBJECT_FILE\" | ./aquery2 -q suffixArrayFactors -p \"\$PATTERN_FILE\" | ./astat2 -p size" runTest "Suffix Trie Factors" "./astringology2 -a suffixTrie -s \"\$SUBJECT_FILE\" | ./aquery2 -q suffixTrieFactors -p \"\$PATTERN_FILE\" | ./astat2 -p size" runTest "Exact Boyer Moore Horspool" "./astringology2 -a boyerMooreHorspool -s \"\$SUBJECT_FILE\" -p <(./aaccess2 --string alphabet -o add -i \"\$PATTERN_FILE\" -a <(./aaccess2 --string alphabet -o 
get -i \"\$SUBJECT_FILE\")) | ./astat2 -p size"