Skip to content
Snippets Groups Projects
Commit be25da0d authored by Jan Trávníček's avatar Jan Trávníček
Browse files

add boyer moore horspool

parent c20d03fb
No related branches found
No related tags found
No related merge requests found
......@@ -15,127 +15,147 @@
#include <container/ObjectsSet.h>
 
#include <arbology/exact/ExactSubtreeMatch.h>
#include <arbology/exact/BoyerMooreHorspool.h>
#include <arbology/exact/ExactSubtreeMatchingAutomaton.h>
#include <chrono>
 
int main(int argc, char* argv[]) {
int main ( int argc, char * argv[] ) {
try {
TCLAP::CmdLine cmd("Arbology algorithm access binary", ' ', "0.01");
TCLAP::CmdLine cmd ( "Arbology algorithm access binary", ' ', "0.01" );
 
std::vector<std::string> allowed;
allowed.push_back("exactSubtreeMatch");
allowed.push_back("exactSubtreeMatchingAutomaton");
TCLAP::ValuesConstraint<std::string> allowedVals( allowed );
std::vector < std::string > allowed;
allowed.push_back ( "exactSubtreeMatch" );
allowed.push_back ( "boyerMooreHorspool" );
allowed.push_back ( "exactSubtreeMatchingAutomaton" );
TCLAP::ValuesConstraint < std::string > allowedVals ( allowed );
 
TCLAP::ValueArg<std::string> algorithm( "a", "algorithm", "Execute algorithm", false, "exactSubtreeMatch", &allowedVals);
cmd.add(algorithm);
TCLAP::ValueArg < std::string > algorithm ( "a", "algorithm", "Execute algorithm", false, "exactSubtreeMatch", & allowedVals );
cmd.add ( algorithm );
 
TCLAP::MultiArg<std::string> subject( "s", "subject", "Subject tree from file", false, "file");
cmd.add( subject );
TCLAP::MultiArg < std::string > subject ( "s", "subject", "Subject tree from file", false, "file" );
cmd.add ( subject );
 
TCLAP::MultiArg<std::string> pattern( "p", "pattern", "Pattern tree from file", false, "file");
cmd.add( pattern );
TCLAP::MultiArg < std::string > pattern ( "p", "pattern", "Pattern tree from file", false, "file" );
cmd.add ( pattern );
 
TCLAP::SwitchArg measure( "m", "measure", "Measure times", false);
cmd.add( measure );
TCLAP::SwitchArg measure ( "m", "measure", "Measure times", false );
cmd.add ( measure );
 
TCLAP::SwitchArg verbose( "v", "verbose", "Be verbose", false);
cmd.add( verbose );
TCLAP::SwitchArg verbose ( "v", "verbose", "Be verbose", false );
cmd.add ( verbose );
 
cmd.parse(argc,argv);
cmd.parse ( argc, argv );
 
int needPattern = 0;
int needSubject = 0;
if( algorithm.getValue() == "exactSubtreeMatch") {
if ( algorithm.getValue ( ) == "exactSubtreeMatch" ) {
needPattern = needSubject = 1;
} else if( algorithm.getValue() == "exactSubtreeMatchingAutomaton") {
} else if ( algorithm.getValue ( ) == "exactSubtreeMatchingAutomaton" ) {
needPattern = 1;
} else {
}
 
std::chrono::measurements::start("Overal", std::chrono::measurements::Type::OVERALL);
std::chrono::measurements::start("Input read", std::chrono::measurements::Type::AUXILARY);
std::deque<std::deque<sax::Token>> subjectTokens;
if(subject.isSet()) {
for(const std::string& fileName : subject.getValue()) {
std::deque<sax::Token> tmp;
if(fileName == "-") {
sax::SaxParseInterface::parseStdin(tmp);
} else {
sax::SaxParseInterface::parseFile(fileName, tmp);
}
subjectTokens.emplace_back(std::move(tmp));
std::chrono::measurements::start ( "Overal", std::chrono::measurements::Type::OVERALL );
std::chrono::measurements::start ( "Input read", std::chrono::measurements::Type::AUXILARY );
std::deque < std::deque < sax::Token > > subjectTokens;
if ( subject.isSet ( ) ) {
for ( const std::string & fileName : subject.getValue ( ) ) {
std::deque < sax::Token > tmp;
if ( fileName == "-" )
sax::SaxParseInterface::parseStdin ( tmp );
else
sax::SaxParseInterface::parseFile ( fileName, tmp );
subjectTokens.emplace_back ( std::move ( tmp ) );
}
} else if(needSubject) {
std::deque<sax::Token> tmp;
sax::SaxParseInterface::parseStdin(tmp);
subjectTokens.emplace_back(std::move(tmp));
} else if ( needSubject ) {
std::deque < sax::Token > tmp;
sax::SaxParseInterface::parseStdin ( tmp );
subjectTokens.emplace_back ( std::move ( tmp ) );
}
 
std::deque<std::deque<sax::Token>> patternTokens;
if(pattern.isSet()) {
for(const std::string& fileName : pattern.getValue()) {
std::deque<sax::Token> tmp;
if(fileName == "-") {
sax::SaxParseInterface::parseStdin(tmp);
} else {
sax::SaxParseInterface::parseFile(fileName, tmp);
}
patternTokens.emplace_back(std::move(tmp));
std::deque < std::deque < sax::Token > > patternTokens;
if ( pattern.isSet ( ) ) {
for ( const std::string & fileName : pattern.getValue ( ) ) {
std::deque < sax::Token > tmp;
if ( fileName == "-" )
sax::SaxParseInterface::parseStdin ( tmp );
else
sax::SaxParseInterface::parseFile ( fileName, tmp );
patternTokens.emplace_back ( std::move ( tmp ) );
}
} else if(needPattern) {
std::deque<sax::Token> tmp;
sax::SaxParseInterface::parseStdin(tmp);
patternTokens.emplace_back(std::move(tmp));
} else if ( needPattern ) {
std::deque < sax::Token > tmp;
sax::SaxParseInterface::parseStdin ( tmp );
patternTokens.emplace_back ( std::move ( tmp ) );
}
 
if( algorithm.getValue() == "exactSubtreeMatch") {
tree::Tree subject = alib::XmlDataFactory::fromTokens<tree::Tree>(subjectTokens.front());
tree::Tree pattern = alib::XmlDataFactory::fromTokens<tree::Tree>(patternTokens.front());
if ( algorithm.getValue ( ) == "exactSubtreeMatch" ) {
tree::Tree subject = alib::XmlDataFactory::fromTokens < tree::Tree > ( subjectTokens.front ( ) );
tree::Tree pattern = alib::XmlDataFactory::fromTokens < tree::Tree > ( patternTokens.front ( ) );
std::chrono::measurements::end ( );
std::chrono::measurements::start ( "Algorithm", std::chrono::measurements::Type::MAIN );
std::set < unsigned > res = arbology::exact::ExactSubtreeMatch::match ( subject, pattern );
std::chrono::measurements::end ( );
std::chrono::measurements::start ( "Output write", std::chrono::measurements::Type::AUXILARY );
alib::XmlDataFactory::toStdout ( res );
} else if ( algorithm.getValue ( ) == "boyerMooreHorspool" ) {
tree::Tree subject = alib::XmlDataFactory::fromTokens < tree::Tree > ( subjectTokens.front ( ) );
tree::Tree pattern = alib::XmlDataFactory::fromTokens < tree::Tree > ( patternTokens.front ( ) );
 
std::chrono::measurements::end();
std::chrono::measurements::start("Algorithm", std::chrono::measurements::Type::MAIN);
std::chrono::measurements::end ( );
std::chrono::measurements::start ( "Algorithm", std::chrono::measurements::Type::MAIN );
 
std::set<unsigned> res = arbology::exact::ExactSubtreeMatch::match(subject, pattern);
std::set < unsigned > res = arbology::exact::BoyerMooreHorspool::match ( subject, pattern );
 
std::chrono::measurements::end();
std::chrono::measurements::start("Output write", std::chrono::measurements::Type::AUXILARY);
std::chrono::measurements::end ( );
std::chrono::measurements::start ( "Output write", std::chrono::measurements::Type::AUXILARY );
 
alib::XmlDataFactory::toStdout(res);
} else if( algorithm.getValue() == "exactSubtreeMatchingAutomaton") {
tree::Tree pattern = alib::XmlDataFactory::fromTokens<tree::Tree>(patternTokens.front());
alib::XmlDataFactory::toStdout ( res );
} else if ( algorithm.getValue ( ) == "exactSubtreeMatchingAutomaton" ) {
tree::Tree pattern = alib::XmlDataFactory::fromTokens < tree::Tree > ( patternTokens.front ( ) );
 
std::chrono::measurements::end();
std::chrono::measurements::start("Algorithm", std::chrono::measurements::Type::MAIN);
std::chrono::measurements::end ( );
std::chrono::measurements::start ( "Algorithm", std::chrono::measurements::Type::MAIN );
 
automaton::Automaton res = arbology::exact::ExactSubtreeMatchingAutomaton::construct(pattern);
automaton::Automaton res = arbology::exact::ExactSubtreeMatchingAutomaton::construct ( pattern );
 
std::chrono::measurements::end();
std::chrono::measurements::start("Output write", std::chrono::measurements::Type::AUXILARY);
std::chrono::measurements::end ( );
std::chrono::measurements::start ( "Output write", std::chrono::measurements::Type::AUXILARY );
 
alib::XmlDataFactory::toStdout(res);
alib::XmlDataFactory::toStdout ( res );
} else {
throw exception::AlibException( "Invalid algorithm" );
throw exception::AlibException ( "Invalid algorithm" );
return 1;
}
 
std::chrono::measurements::end();
std::chrono::measurements::end();
std::chrono::measurements::end ( );
std::chrono::measurements::end ( );
 
if(measure.getValue()) std::clog << std::chrono::measurements::results() << std::endl;
if ( measure.getValue ( ) ) std::clog << std::chrono::measurements::results ( ) << std::endl;
 
return 0;
} catch( const exception::AlibException & exception ) {
alib::XmlDataFactory::toStdout( exception );
} catch ( const exception::AlibException & exception ) {
alib::XmlDataFactory::toStdout ( exception );
return 1;
} catch(const TCLAP::ArgException& exception) {
std::cout << exception.error() << std::endl;
} catch ( const TCLAP::ArgException & exception ) {
std::cout << exception.error ( ) << std::endl;
return 2;
} catch (const std::exception& exception) {
std::cerr << "Exception caught: " << exception.what() << std::endl;
} catch ( const std::exception & exception ) {
std::cerr << "Exception caught: " << exception.what ( ) << std::endl;
return 3;
} catch(...) {
} catch ( ... ) {
std::cerr << "Unknown exception caught." << std::endl;
return 127;
}
......
/*
* BadCharacterShiftTable.cpp
*
* Created on: 5. 11. 2014
* Author: Radomir Polach, Tomas Pecka, Jan Travnicek
*/
#include "BadCharacterShiftTable.h"
#include <exception/AlibException.h>
#include <tree/ranked/PrefixRankedBarPattern.h>
namespace arbology {
namespace exact {
/**
 * Entry point for a wrapped ranked tree: forwards the alphabet and the wrapped
 * tree data to the dispatcher singleton.
 *
 * @param alphabet symbols the shift table is computed for
 * @param pattern wrapped pattern tree
 * @return map from symbol to shift as produced by the dispatched overload
 */
std::map < alphabet::RankedSymbol, size_t > BadCharacterShiftTable::bcs ( const std::set < alphabet::RankedSymbol > & alphabet, const tree::RankedTreeWrapper & pattern ) {
	BadCharacterShiftTable & dispatcher = getInstance ( );

	return dispatcher.dispatch ( alphabet, pattern.getData ( ) );
}
/**
 * Convenience overload: computes the shift table over the pattern's own alphabet.
 *
 * @param pattern wrapped pattern tree
 * @return map from symbol to shift
 */
std::map < alphabet::RankedSymbol, size_t > BadCharacterShiftTable::bcs ( const tree::RankedTreeWrapper & pattern ) {
	const auto & patternAlphabet = pattern.getAlphabet ( );

	return bcs ( patternAlphabet, pattern );
}
/**
 * Computes the bad character shift table for a pattern in prefix ranked bar notation.
 *
 * Every symbol of the given alphabet, except the subtree wildcard and the
 * variables bar, receives a shift that is the minimum of
 *  - the size of the pattern,
 *  - a limit derived from the position of the last subtree wildcard, and
 *  - the distance of the symbol's occurrences (last pattern symbol excluded)
 *    from the end of the pattern.
 *
 * @param alphabet symbols to compute shifts for
 * @param pattern pattern in prefix ranked bar notation
 * @return map from symbol to shift
 */
std::map < alphabet::RankedSymbol, size_t > BadCharacterShiftTable::bcs ( const std::set < alphabet::RankedSymbol > & alphabet, const tree::PrefixRankedBarPattern & pattern ) {
	const auto & content = pattern.getContent ( );
	const size_t patternSize = content.size ( );

	std::map < alphabet::RankedSymbol, size_t > shifts;

	// find the distance between the end of the pattern and the index
	// of the last symbol representing the variable; stays at the size
	// of the pattern when the pattern contains no variable
	size_t lastWildcardOffset = patternSize;

	for ( size_t i = 0; i < patternSize; ++i )
		if ( content[i] == pattern.getSubtreeWildcard ( ) )
			lastWildcardOffset = patternSize - i - 1;

	// initialise each entry to the size of the pattern, limited by the
	// occurrence of the last variable
	for ( const alphabet::RankedSymbol & symbol : alphabet ) {
		if ( ( symbol == pattern.getSubtreeWildcard ( ) ) || ( symbol.getSymbol ( ) == pattern.getVariablesBarSymbol ( ) ) ) continue;

		size_t limit = lastWildcardOffset;

		if ( symbol.getSymbol ( ) != pattern.getBarSymbol ( ) )
			// size of the smallest subtree containing given terminal depends
			// on the arity of the terminal
			limit += symbol.getRank ( ).getData ( ) * 2;
		else if ( limit >= 2 )
			// bar symbols match the variable bar which is one symbol after
			// the last variable, conditioned because of the case S S| where
			// the -1 would cause shift by 0 -- illegal
			limit -= 1;

		shifts.insert ( std::make_pair ( symbol, limit < patternSize ? limit : patternSize ) );
	}

	// limit the shift by position of symbols within the pattern; the last
	// symbol is not concerned. Written as i + 1 < patternSize so that an
	// empty content does not wrap around to a huge bound.
	for ( size_t i = 0; i + 1 < patternSize; ++i ) {
		if ( ( content[i] != pattern.getSubtreeWildcard ( ) ) && ( content[i].getSymbol ( ) != pattern.getVariablesBarSymbol ( ) ) ) {
			const size_t distance = patternSize - i - 1;

			// operator[] is kept on purpose: a pattern symbol that is not part
			// of the given alphabet still ends up with a default (0) entry,
			// exactly as before
			size_t & shift = shifts[content[i]];

			if ( shift > distance )
				shift = distance;
		}
	}

	return shifts;
}
// Namespace-scope registration object: constructs a RegistratorWrapper for
// tree::PrefixRankedBarPattern from the dispatcher singleton and the bcs
// overload set. Presumably this is what lets dispatch ( ) reach the
// PrefixRankedBarPattern implementation -- confirm in common/multipleDispatch.hpp.
auto BadCharacterShiftTablePrefixRankedBarPattern = BadCharacterShiftTable::RegistratorWrapper < std::map < alphabet::RankedSymbol, size_t >, tree::PrefixRankedBarPattern > ( BadCharacterShiftTable::getInstance ( ), BadCharacterShiftTable::bcs );
/**
 * Convenience overload: computes the shift table over the pattern's own alphabet.
 *
 * @param pattern pattern in prefix ranked bar notation
 * @return map from symbol to shift
 */
std::map < alphabet::RankedSymbol, size_t > BadCharacterShiftTable::bcs ( const tree::PrefixRankedBarPattern & pattern ) {
	const auto & patternAlphabet = pattern.getAlphabet ( );

	return bcs ( patternAlphabet, pattern );
}
} /* namespace exact */
} /* namespace arbology */
/*
* BadCharacterShiftTable.h
*
* Created on: 5. 11. 2014
* Author: Jan Travnicek
*/
#ifndef _ARBOLOGY_BAD_CHARACTER_SHIFT_TABLE_H_
#define _ARBOLOGY_BAD_CHARACTER_SHIFT_TABLE_H_
#include <tree/RankedTreeWrapper.h>
#include <tree/TreeFeatures.h>
#include <common/multipleDispatch.hpp>
#include <alphabet/RankedSymbol.h>
#include <set>
#include <map>
namespace arbology {
namespace exact {
/**
* Computation of the bad character shift (BCS) table used by the
* Boyer-Moore-Horspool matcher, from the MI(E+\eps)-EVY course 2014.
* To avoid zero shifts, the last symbol of the pattern is not taken into
* account when shifts are limited by symbol positions.
*/
class BadCharacterShiftTable : public std::SingleDispatchFirstStaticParam < std::map < alphabet::RankedSymbol, size_t >, const std::set < alphabet::RankedSymbol > &, tree::RankedTreeBase > {
public:
/**
* Computes the shift table of a wrapped pattern over the pattern's own alphabet.
* @param pattern wrapped pattern tree
* @return map from symbol to shift
*/
static std::map < alphabet::RankedSymbol, size_t > bcs ( const tree::RankedTreeWrapper & pattern );
/**
* Computes the shift table of a wrapped pattern over the given alphabet;
* the call is forwarded through the dispatcher instance.
* @param alphabet symbols to compute shifts for
* @param pattern wrapped pattern tree
* @return map from symbol to shift
*/
static std::map < alphabet::RankedSymbol, size_t > bcs ( const std::set < alphabet::RankedSymbol > & alphabet, const tree::RankedTreeWrapper & pattern );
/**
* Computes the shift table of a prefix ranked bar pattern over the pattern's own alphabet.
* @param pattern pattern in prefix ranked bar notation
* @return map from symbol to shift
*/
static std::map < alphabet::RankedSymbol, size_t > bcs ( const tree::PrefixRankedBarPattern & pattern );
/**
* Computes the shift table of a prefix ranked bar pattern over the given alphabet.
* The subtree wildcard and the variables bar get no shift of their own.
* @param alphabet symbols to compute shifts for
* @param pattern pattern in prefix ranked bar notation
* @return map from symbol to shift
*/
static std::map < alphabet::RankedSymbol, size_t > bcs ( const std::set < alphabet::RankedSymbol > & alphabet, const tree::PrefixRankedBarPattern & pattern );
/**
* @return the single dispatcher instance, held in a function-local static
*/
static BadCharacterShiftTable & getInstance ( ) {
static BadCharacterShiftTable res;
return res;
}
};
} /* namespace exact */
} /* namespace arbology */
#endif /* _ARBOLOGY_BAD_CHARACTER_SHIFT_TABLE_H_ */
/*
* BoyerMooreHorspool.cpp
*
* Created on: 5. 11. 2014
* Author: Jan Travnicek
*/
#include "BoyerMooreHorspool.h"
#include "BadCharacterShiftTable.h"
#include "SubtreeJumpTable.h"
#include <exception/AlibException.h>
#include <tree/Tree.h>
#include <tree/ranked/PrefixRankedBarTree.h>
#include <tree/ranked/PrefixRankedBarPattern.h>
#include <alphabet/RankedSymbol.h>
#include <map>
namespace arbology {
namespace exact {
/**
 * Entry point for wrapped trees: forwards the wrapped data of both trees to
 * the dispatcher singleton.
 *
 * @param subject wrapped tree that is searched
 * @param pattern wrapped tree pattern
 * @return set of occurrences as produced by the dispatched overload
 */
std::set < unsigned > BoyerMooreHorspool::match ( const tree::Tree & subject, const tree::Tree & pattern ) {
	BoyerMooreHorspool & dispatcher = getInstance ( );

	return dispatcher.dispatch ( subject.getData ( ), pattern.getData ( ) );
}
/**
 * Boyer-Moore-Horspool matching of a tree pattern against a subject tree,
 * both in prefix ranked bar notation.
 *
 * The pattern is aligned with the subject at index i and compared from its
 * last symbol towards its first one. A variables bar in the pattern may be
 * matched with a bar of the subject, in which case the whole subject subtree
 * is skipped using the subtree jump table. After each attempt the alignment
 * is moved by the bad character shift of the subject symbol under the last
 * pattern position.
 *
 * @param subject tree in prefix ranked bar notation that is searched
 * @param pattern pattern in prefix ranked bar notation
 * @return set of subject indexes at which an occurrence of the pattern starts
 */
std::set < unsigned > BoyerMooreHorspool::match ( const tree::PrefixRankedBarTree & subject, const tree::PrefixRankedBarPattern & pattern ) {
	std::set < unsigned > occ;

	const auto & subjectContent = subject.getContent ( );
	const auto & patternContent = pattern.getContent ( );

	// an empty pattern cannot be aligned anywhere; return before size ( ) - 1
	// would wrap around in the shift lookup below
	if ( patternContent.size ( ) == 0 ) return occ;

	std::map < alphabet::RankedSymbol, size_t > bcs = BadCharacterShiftTable::bcs ( subject.getAlphabet ( ), pattern );
	std::vector < int > subjectSubtreeJumpTable = SubtreeJumpTable::compute ( subject );

	// index to the subject
	unsigned i = 0;

	// main loop of the algorithm over all possible indexes where the pattern can start
	while ( i + patternContent.size ( ) <= subjectContent.size ( ) ) {
		// index to the pattern; signed so that it can step below zero once
		// the first pattern symbol has been consumed
		int j = static_cast < int > ( patternContent.size ( ) ) - 1;

		// offset to the subject; signed because the jump table yields -1 for
		// a bar that closes a subtree rooted at index 0
		int offset = static_cast < int > ( i ) + j;

		// the comparison has to include index 0 of the pattern, otherwise the
		// first pattern symbol is never checked against the subject
		while ( ( j >= 0 ) && ( offset >= 0 ) ) {
			if ( subjectContent[offset] == patternContent[j] ) {
				// match of symbol
				offset = offset - 1;
				j = j - 1;
			} else if ( ( patternContent[j].getSymbol ( ) == pattern.getVariablesBarSymbol ( ) ) && ( subjectContent[offset].getSymbol ( ) == pattern.getBarSymbol ( ) ) ) {
				// match of variable with subtree: skip the subject subtree and
				// both the variable and its bar in the pattern
				offset = subjectSubtreeJumpTable[offset];
				j = j - 2;
			} else {
				break;
			}
		}

		// match was found: the whole pattern was consumed and offset points
		// just before the first matched subject symbol
		if ( j == -1 ) occ.insert ( static_cast < unsigned > ( offset + 1 ) );

		// shift heuristics; a missing or zero entry falls back to a shift by
		// one, which never skips an occurrence and keeps the loop advancing
		const auto shift = bcs.find ( subjectContent[i + patternContent.size ( ) - 1] );
		i += ( ( shift != bcs.end ( ) ) && ( shift->second > 0 ) ) ? shift->second : 1;
	}

	return occ;
}
// Namespace-scope registration object: constructs a RegistratorWrapper for the
// ( tree::PrefixRankedBarTree, tree::PrefixRankedBarPattern ) pair from the
// dispatcher singleton and the match overload set. Presumably this is what
// lets dispatch ( ) reach the implementation above -- confirm in
// common/multipleDispatch.hpp.
auto BoyerMooreHorpoolPrefixRankedBarTreePrefixRankedBarPattern = BoyerMooreHorspool::RegistratorWrapper < std::set < unsigned >, tree::PrefixRankedBarTree, tree::PrefixRankedBarPattern > ( BoyerMooreHorspool::getInstance ( ), BoyerMooreHorspool::match );
} /* namespace exact */
} /* namespace arbology */
/*
* BoyerMooreHorspool.h
*
* Created on: 5. 11. 2014
* Author: Jan Travnicek
*/
#ifndef _ARBOLOGY_BOYER_MOORE_HORSPOOL_H_
#define _ARBOLOGY_BOYER_MOORE_HORSPOOL_H_
#include <set>
#include <common/multipleDispatch.hpp>
#include <tree/TreeFeatures.h>
namespace arbology {
namespace exact {
/**
* Implementation of Boyer-Moore-Horspool (BMH) matching for trees, from the
* MI(E+\eps)-EVY course 2014.
* The matcher relies on a bad character shift table; see BadCharacterShiftTable
* for how zero shifts are avoided.
*/
class BoyerMooreHorspool : public std::DoubleDispatch < std::set < unsigned >, tree::TreeBase, tree::TreeBase > {
public:
/**
* Searches for the pattern in the subject; the call is forwarded through the
* dispatcher instance based on the wrapped data of both trees.
* @param subject wrapped tree that is searched
* @param pattern wrapped tree pattern
* @return set of occurrences
*/
static std::set < unsigned > match ( const tree::Tree & subject, const tree::Tree & pattern );
/**
* Searches for a prefix ranked bar pattern in a prefix ranked bar tree.
* @param subject tree in prefix ranked bar notation that is searched
* @param pattern pattern in prefix ranked bar notation
* @return set of indexes into the subject's content
*/
static std::set < unsigned > match ( const tree::PrefixRankedBarTree & subject, const tree::PrefixRankedBarPattern & pattern );
/**
* @return the single dispatcher instance, held in a function-local static
*/
static BoyerMooreHorspool & getInstance ( ) {
static BoyerMooreHorspool res;
return res;
}
};
} /* namespace exact */
} /* namespace arbology */
#endif /* _ARBOLOGY_BOYER_MOORE_HORSPOOL_H_ */
/*
* SubtreeJumpTable.cpp
*
* Created on: 5. 11. 2014
* Author: Jan Travnicek
*/
#include "SubtreeJumpTable.h"
#include <exception/AlibException.h>
#include <tree/ranked/PrefixRankedBarTree.h>
#include <tree/ranked/PrefixRankedBarPattern.h>
#include <alphabet/RankedSymbol.h>
#include <map>
namespace arbology {
namespace exact {
/**
 * Entry point for a wrapped ranked tree: forwards the wrapped tree data to
 * the dispatcher singleton.
 *
 * @param subject wrapped ranked tree
 * @return jump table as produced by the dispatched overload
 */
std::vector < int > SubtreeJumpTable::compute ( const tree::RankedTreeWrapper & subject ) {
	SubtreeJumpTable & dispatcher = getInstance ( );

	return dispatcher.dispatch ( subject.getData ( ) );
}
/**
 * Computes the subtree jump table of a tree in prefix ranked bar notation.
 *
 * The table is filled by buildDataPointers starting at index 0: the entry of
 * a node holds the index one past the end of its subtree, the entry of a
 * closing bar holds the index just before the root of its subtree.
 *
 * @param subject tree in prefix ranked bar notation
 * @return jump table; empty when the subject has no content
 */
std::vector < int > SubtreeJumpTable::compute ( const tree::PrefixRankedBarTree & subject ) {
	std::vector < int > res;

	const auto contentSize = subject.getContent ( ).size ( );

	// buildDataPointers reads the symbol at index 0, so there is nothing to
	// process (and nothing safe to read) for an empty content
	if ( contentSize == 0 ) return res;

	// buildDataPointers appends entries one by one; allocate once up front
	res.reserve ( contentSize );

	buildDataPointers ( res, subject, 0 );
	return res;
}
/**
 * Recursive helper that fills the subtree jump table for one complete subtree.
 *
 * The entry of the node at begin is set to the index one past the end of its
 * subtree; one more entry holding begin - 1 is appended after the entries of
 * all children (in well-formed input this is the slot of the closing bar).
 *
 * @param res table being built; the function appends to it and then writes
 *        res[begin], so it expects res to hold exactly begin entries on entry
 * @param subject tree in prefix ranked bar notation
 * @param begin index of a root node of a complete subtree to process
 * @return index one past the end of the subtree starting at index begin
 */
int SubtreeJumpTable::buildDataPointers ( std::vector < int > & res, const tree::PrefixRankedBarTree & subject, int begin ) {
	const auto & content = subject.getContent ( );

	// placeholder for the node itself, patched once the subtree end is known
	res.push_back ( 0 );

	int cursor = begin + 1;

	if ( content[begin].getSymbol ( ) != subject.getBarSymbol ( ) ) {
		const auto arity = content[begin].getRank ( ).getData ( );

		// each child is a complete subtree starting where the previous one ended
		for ( unsigned child = 0; child < arity; ++child )
			cursor = buildDataPointers ( res, subject, cursor );
	}

	// entry appended after the children: jump to just before the subtree root
	res.push_back ( begin - 1 );

	// step over the closing position and record it as the end of the subtree
	const int subtreeEnd = cursor + 1;
	res[begin] = subtreeEnd;

	return subtreeEnd;
}
// Namespace-scope registration object: constructs a RegistratorWrapper for
// tree::PrefixRankedBarTree from the dispatcher singleton and the compute
// overload set. Presumably this is what lets dispatch ( ) reach the
// PrefixRankedBarTree implementation -- confirm in common/multipleDispatch.hpp.
auto SubtreeSizesPrefixRankedBarTree = SubtreeJumpTable::RegistratorWrapper < std::vector < int >, tree::PrefixRankedBarTree > ( SubtreeJumpTable::getInstance ( ), SubtreeJumpTable::compute );
} /* namespace exact */
} /* namespace arbology */
/*
* SubtreeJumpTable.h
*
* Created on: 5. 11. 2014
* Author: Jan Travnicek
*/
#ifndef _SUBTREE_JUMP_TABLE_H_
#define _SUBTREE_JUMP_TABLE_H_
#include <tree/RankedTreeWrapper.h>
#include <tree/TreeFeatures.h>
#include <common/multipleDispatch.hpp>
#include <alphabet/RankedSymbol.h>
#include <vector>
namespace arbology {
namespace exact {
/**
* Computation of a subtree jump table for trees in prefix ranked bar notation.
* The table has one entry per symbol of the tree content: the entry of a node
* holds the index one past the end of its subtree, the entry of a closing bar
* holds the index just before the root of its subtree.
*/
class SubtreeJumpTable : public std::SingleDispatch < std::vector < int >, tree::RankedTreeBase > {
public:
/**
* Computes the jump table of a wrapped ranked tree; the call is forwarded
* through the dispatcher instance.
* @param subject wrapped ranked tree
* @return jump table
*/
static std::vector < int > compute ( const tree::RankedTreeWrapper & subject );
/**
* Computes the jump table of a tree in prefix ranked bar notation.
* @param subject tree in prefix ranked bar notation
* @return jump table
*/
static std::vector < int > compute ( const tree::PrefixRankedBarTree & subject );
/**
* Recursive helper of compute: appends the entries of the complete subtree
* rooted at begin to res.
* @param res table being built
* @param subject tree in prefix ranked bar notation
* @param begin index of the root node of the subtree to process
* @return index one past the end of the subtree starting at begin
*/
static int buildDataPointers ( std::vector < int > & res, const tree::PrefixRankedBarTree & subject, int begin );
/**
* @return the single dispatcher instance, held in a function-local static
*/
static SubtreeJumpTable & getInstance ( ) {
static SubtreeJumpTable res;
return res;
}
};
} /* namespace exact */
} /* namespace arbology */
#endif /* _SUBTREE_JUMP_TABLE_H_ */
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment