Skip to content
Snippets Groups Projects
Commit c6344c4a authored by Jan Trávníček's avatar Jan Trávníček
Browse files

querying position heap

parent af0a2a57
No related branches found
No related tags found
No related merge requests found
/*
* PositionHeapFactors.cpp
*
* Created on: 2. 1. 2017
* Author: Jan Travnicek
*/
#include "PositionHeapFactors.h"
#include <string/LinearString.h>
namespace stringology {
namespace query {
/**
 * Queries a position heap with a string whose concrete type is resolved at run time.
 *
 * The call is forwarded to dispatch together with the data object held by the
 * string wrapper.
 *
 * @param positionHeap position heap index to query
 * @param string pattern to look for
 * @return set of values produced by the dispatched query overload
 */
std::set < unsigned > PositionHeapFactors::query ( const indexes::PositionHeap < DefaultSymbolType > & positionHeap, const string::String & string ) {
	return dispatch ( positionHeap, string.getData ( ) );
}
// Namespace-scope registrator object for the string::LinearString < > overload of
// PositionHeapFactors::query with result type std::set < unsigned >.
// NOTE(review): presumably this is what lets dispatch ( ) route LinearString inputs
// to the templated query - confirm against core/multipleDispatch.hpp.
auto PositionHeapFactorsLinearString = PositionHeapFactors::RegistratorWrapper < std::set < unsigned >, string::LinearString < > > ( PositionHeapFactors::query );
} /* namespace query */
} /* namespace stringology */
/*
* PositionHeapFactors.h
*
* Created on: 2. 1. 2017
* Author: Jan Travnicek
*/
#ifndef POSITION_HEAP_FACTORS_H_
#define POSITION_HEAP_FACTORS_H_
#include <indexes/PositionHeap.h>
#include <string/String.h>
#include <string/LinearString.h>
#include <core/multipleDispatch.hpp>
#include <global/GlobalData.h>
namespace stringology {
namespace query {
/**
* Query position heap for given string.
*
* Source: Position heaps: A simple and dynamic text indexing data structure
* Andrzej Ehrenfeucht, Ross M. McConnell, Nissa Osheim, Sung-Whan Woo
*
*/
class PositionHeapFactors : public std::SingleDispatchFirstStaticParam < PositionHeapFactors, std::set < unsigned >, const indexes::PositionHeap < DefaultSymbolType > &, const string::StringBase & > {
	/**
	 * Recursively adds the string index of the given node and of all its descendants to the result.
	 *
	 * The string index of a node is computed as indexedStringSize minus the value stored in the node.
	 *
	 * @param trie node (subtree root) to collect from
	 * @param res set the indexes are inserted into
	 * @param indexedStringSize size of the indexed string
	 */
	template < class SymbolType >
	static void accumulateResult ( const std::trie < SymbolType, unsigned > & trie, std::set < unsigned > & res, unsigned indexedStringSize ) {
		res.insert ( indexedStringSize - trie.getData ( ) );

		// Bind with const auto & so the reference refers to the stored element itself.
		// Spelling the element type as std::pair < SymbolType, ... > does not match a map
		// element (whose key is const) and would materialise a temporary copy of the
		// whole child subtree on every iteration.
		for ( const auto & child : trie.getChildren ( ) )
			accumulateResult ( child.second, res, indexedStringSize );
	}

	/**
	 * Checks whether the needle occurs in the haystack at the given position.
	 *
	 * Symbols at needle indexes lower than validSymbols are not compared; the caller
	 * passes the number of leading symbols it has already matched.
	 *
	 * @param needle pattern being searched for
	 * @param validSymbols number of leading needle symbols to skip in the comparison
	 * @param haystack content of the indexed string
	 * @param haystackPosition candidate start position in the haystack
	 * @return false if the needle would run past the end of the haystack or a compared symbol differs, true otherwise
	 */
	template < class SymbolType >
	static bool checkOcc ( const string::LinearString < SymbolType > & needle, unsigned validSymbols, const std::vector < SymbolType > & haystack, unsigned haystackPosition ) {
		const auto & pattern = needle.getContent ( );

		if ( haystackPosition + pattern.size ( ) > haystack.size ( ) )
			return false;

		for ( unsigned i = validSymbols; i < pattern.size ( ); i++ ) {
			if ( pattern [ i ] != haystack [ haystackPosition + i ] )
				return false;
		}

		return true;
	}

public:
	/**
	 * Query a position heap
	 * @param positionHeap position heap to query
	 * @param string string to query by
	 * @return occurrences of factors
	 */
	static std::set < unsigned > query ( const indexes::PositionHeap < DefaultSymbolType > & positionHeap, const string::String & string );

	/**
	 * Query a position heap by a linear string
	 * @param positionHeap position heap to query
	 * @param string linear string to query by
	 * @return occurrences of factors
	 */
	template < class SymbolType >
	static std::set < unsigned > query ( const indexes::PositionHeap < SymbolType > & positionHeap, const string::LinearString < SymbolType > & string );
};
/**
 * Queries a position heap by a linear string.
 *
 * The heap is descended from the root along the symbols of the pattern. Every node
 * visited on that path (root included) is a candidate: its string index
 * (indexedStringSize minus the node's stored value) is verified with checkOcc, which
 * skips the first depth symbols of the pattern. If the path ends before the pattern
 * is consumed, only the verified candidates are returned. If the whole pattern is
 * consumed, the string indexes of all descendants of the last node are added without
 * further verification.
 *
 * @param positionHeap position heap to query
 * @param string linear string to query by
 * @return set of string indexes collected as described above
 */
template < class SymbolType >
std::set < unsigned > PositionHeapFactors::query ( const indexes::PositionHeap < SymbolType > & positionHeap, const string::LinearString < SymbolType > & string ) {
	std::set < unsigned > res;

	const auto & indexedString = positionHeap.getString ( );
	const unsigned indexedStringSize = indexedString.size ( );

	const std::trie < SymbolType, unsigned > * node = & positionHeap.getRoot ( );
	unsigned depth = 0;

	// NOTE(review): the verbose message carries no trailing newline, so consecutive
	// candidates end up on a single line - confirm this is intended.
	if ( common::GlobalData::verbose )
		std::clog << "on path possible occ (raw, string index): (" << node->getData ( ) << ", " << indexedStringSize - node->getData ( ) << ")";

	if ( checkOcc ( string, depth, indexedString, indexedStringSize - node->getData ( ) ) )
		res.insert ( indexedStringSize - node->getData ( ) );

	for ( const SymbolType & symbol : string.getContent ( ) ) {
		auto iter = node->getChildren ( ).find ( symbol );

		// No child for this symbol: only the candidates found on the path can match.
		if ( iter == node->getChildren ( ).end ( ) )
			return res;

		++depth;
		node = & iter->second;

		if ( common::GlobalData::verbose )
			std::clog << "on path possible occ (raw, string index): (" << node->getData ( ) << ", " << indexedStringSize - node->getData ( ) << ")";

		if ( checkOcc ( string, depth, indexedString, indexedStringSize - node->getData ( ) ) )
			res.insert ( indexedStringSize - node->getData ( ) );
	}

	// Bind with const auto & so each child is referenced in place; naming the element
	// type as std::pair < SymbolType, ... > does not match a map element (const key)
	// and would copy every child subtree into a temporary.
	for ( const auto & child : node->getChildren ( ) )
		accumulateResult ( child.second, res, indexedStringSize );

	return res;
}
} /* namespace query */
} /* namespace stringology */
#endif /* POSITION_HEAP_FACTORS_H_ */
......@@ -17,6 +17,7 @@
 
#include <stringology/query/SuffixTrieFactors.h>
#include <stringology/query/SuffixArrayFactors.h>
#include <stringology/query/PositionHeapFactors.h>
 
int main ( int argc, char * argv[] ) {
try {
......@@ -28,6 +29,7 @@ int main ( int argc, char * argv[] ) {
std::vector < std::string > allowed;
allowed.push_back ( "suffixTrieFactors" );
allowed.push_back ( "suffixArrayFactors" );
allowed.push_back ( "positionHeapFactors" );
TCLAP::ValuesConstraint < std::string > allowedVals ( allowed );
 
TCLAP::ValueArg < std::string > query ( "q", "query", "Query index", false, "exactFactorMatch", & allowedVals );
......@@ -80,6 +82,19 @@ int main ( int argc, char * argv[] ) {
measurements::end ( );
measurements::start ( "Output write", measurements::Type::AUXILIARY );
 
alib::XmlDataFactory::toStdout ( res );
} else if ( query.getValue ( ) == "positionHeapFactors" ) {
indexes::PositionHeap < > positionHeap = alib::XmlDataFactory::fromTokens < indexes::PositionHeap < > > ( sax::FromXMLParserHelper::parseInput ( indexInput ) );
string::String pattern = alib::XmlDataFactory::fromTokens < string::String > ( std::move ( sax::FromXMLParserHelper::parseInput(true, patternInput).front ( ) ) );
measurements::end ( );
measurements::start ( "Algorithm", measurements::Type::MAIN );
std::set < unsigned > res = stringology::query::PositionHeapFactors::query ( positionHeap, pattern );
measurements::end ( );
measurements::start ( "Output write", measurements::Type::AUXILIARY );
alib::XmlDataFactory::toStdout ( res );
} else {
throw exception::CommonException ( "Invalid algorithm" );
......
......@@ -212,6 +212,7 @@ function runTest {
clearResults
}
 
# Stringology comparison runs. The index-based tests (position heap, suffix array,
# suffix trie) build an index from the subject with astringology2 and query it by the
# pattern with aquery2; the Boyer Moore Horspool test passes subject and pattern
# straight to astringology2. Every pipeline ends in "astat2 -p size".
runTest "Position Heap Factors" "./astringology2 -a positionHeap -s \"\$SUBJECT_FILE\" | ./aquery2 -q positionHeapFactors -p \"\$PATTERN_FILE\" | ./astat2 -p size"
runTest "Suffix Array Factors" "./astringology2 -a suffixArray -s \"\$SUBJECT_FILE\" | ./aquery2 -q suffixArrayFactors -p \"\$PATTERN_FILE\" | ./astat2 -p size"
runTest "Suffix Trie Factors" "./astringology2 -a suffixTrie -s \"\$SUBJECT_FILE\" | ./aquery2 -q suffixTrieFactors -p \"\$PATTERN_FILE\" | ./astat2 -p size"
runTest "Exact Boyer Moore Horspool" "./astringology2 -a boyerMooreHorspool -s \"\$SUBJECT_FILE\" -p <(./aaccess2 --string alphabet -o add -i \"\$PATTERN_FILE\" -a <(./aaccess2 --string alphabet -o get -i \"\$SUBJECT_FILE\")) | ./astat2 -p size"
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment