From b3695643e1336644d1d89b2d5ff07e79df267aad Mon Sep 17 00:00:00 2001 From: Jan Travnicek <Jan.Travnicek@fit.cvut.cz> Date: Mon, 6 Feb 2017 10:20:43 +0100 Subject: [PATCH] basic position heap data struct and construct algo --- .../indexing/PositionHeapNaive.cpp | 24 ++ .../stringology/indexing/PositionHeapNaive.h | 64 +++++ alib2data/src/indexes/PositionHeap.cpp | 14 ++ alib2data/src/indexes/PositionHeap.h | 236 ++++++++++++++++++ astringology2/src/astringology.cpp | 14 ++ 5 files changed, 352 insertions(+) create mode 100644 alib2algo/src/stringology/indexing/PositionHeapNaive.cpp create mode 100644 alib2algo/src/stringology/indexing/PositionHeapNaive.h create mode 100644 alib2data/src/indexes/PositionHeap.cpp create mode 100644 alib2data/src/indexes/PositionHeap.h diff --git a/alib2algo/src/stringology/indexing/PositionHeapNaive.cpp b/alib2algo/src/stringology/indexing/PositionHeapNaive.cpp new file mode 100644 index 0000000000..2a714f6ef9 --- /dev/null +++ b/alib2algo/src/stringology/indexing/PositionHeapNaive.cpp @@ -0,0 +1,24 @@ +/* + * PositionHeapNaive.cpp + * + * Created on: 6. 2. 2017 + * Author: Jan Travnicek + */ + +#include "PositionHeapNaive.h" + +#include <string/LinearString.h> + +namespace stringology { + +namespace indexing { + +indexes::PositionHeap < DefaultSymbolType > PositionHeapNaive::construct ( const string::String & string ) { + return dispatch ( string.getData ( ) ); +} + +auto PositionHeapNaiveLinearString = PositionHeapNaive::RegistratorWrapper < indexes::PositionHeap < DefaultSymbolType >, string::LinearString < > > ( PositionHeapNaive::construct ); + +} /* namespace indexing */ + +} /* namespace stringology */ diff --git a/alib2algo/src/stringology/indexing/PositionHeapNaive.h b/alib2algo/src/stringology/indexing/PositionHeapNaive.h new file mode 100644 index 0000000000..71d40b0e53 --- /dev/null +++ b/alib2algo/src/stringology/indexing/PositionHeapNaive.h @@ -0,0 +1,64 @@ +/* + * PositionHeapNaive.h + * + * Created on: 6. 2. 2017 + * Author: Jan Travnicek + */ + +#ifndef POSITION_HEAP_NAIVE_H_ +#define POSITION_HEAP_NAIVE_H_ + +#include <indexes/PositionHeap.h> +#include <string/String.h> +#include <string/LinearString.h> +#include <core/multipleDispatch.hpp> +#include <exception/CommonException.h> + +namespace stringology { + +namespace indexing { + +/** + * Constructs a position heap for given string. + */ + +class PositionHeapNaive : public std::SingleDispatch < PositionHeapNaive, indexes::PositionHeap < DefaultSymbolType >, const string::StringBase & > { +public: + /** + * Creates suffix trie + * @param string string to construct suffix trie for + * @return automaton + */ + static indexes::PositionHeap < DefaultSymbolType > construct ( const string::String & string ); + + template < class SymbolType > + static indexes::PositionHeap < SymbolType > construct ( const string::LinearString < SymbolType > & string ); + +}; + +template < class SymbolType > +indexes::PositionHeap < SymbolType > PositionHeapNaive::construct ( const string::LinearString < SymbolType > & w ) { + if ( w.getContent ( ).size ( ) == 0 ) + throw exception::CommonException ( "Position heap can't index empty string" ); + + std::trie < SymbolType, unsigned > trie ( 1 ); + + for ( unsigned i = w.getContent ( ).size ( ) - 1; i > 0; i-- ) { + unsigned k = i - 1; + std::trie < SymbolType, unsigned > * n = & trie; + + while ( k < w.getContent ( ).size ( ) && n->getChildren ( ).count ( w.getContent ( )[k] ) ) + n = & n->getChildren ( ).find ( w.getContent ( )[k++] )->second; + + unsigned node = w.getContent ( ).size ( ) - i + 1; + n = & n->getChildren ( ).insert ( std::make_pair ( w.getContent ( )[k], std::trie < SymbolType, unsigned > ( node ) ) ).first->second; + } + + return indexes::PositionHeap < SymbolType > ( w.getAlphabet ( ), trie ); +} + +} /* namespace indexing */ + +} /* namespace stringology */ + +#endif /* POSITION_HEAP_NAIVE_H_ */ diff --git a/alib2data/src/indexes/PositionHeap.cpp b/alib2data/src/indexes/PositionHeap.cpp new file mode 100644 index 0000000000..9b47d21c39 --- /dev/null +++ b/alib2data/src/indexes/PositionHeap.cpp @@ -0,0 +1,14 @@ +/* + * PositionHeap.cpp + * + * Created on: Nov 23, 2013 + * Author: Jan Travnicek + */ + +#include "PositionHeap.h" + +namespace alib { + +auto positionHeapParserRegister = xmlApi < alib::Object >::ParserRegister < indexes::PositionHeap < > > ( ); + +} /* namespace alib */ diff --git a/alib2data/src/indexes/PositionHeap.h b/alib2data/src/indexes/PositionHeap.h new file mode 100644 index 0000000000..b1468051c7 --- /dev/null +++ b/alib2data/src/indexes/PositionHeap.h @@ -0,0 +1,236 @@ +/* + * PositionHeap.h + * + * Created on: Nov 23, 2013 + * Author: Jan Travnicek + */ + +#ifndef POSITION_HEAP_H_ +#define POSITION_HEAP_H_ + +#include <string> +#include <set> +#include <trie> +#include <iostream> +#include <algorithm> +#include <sstream> + +#include <common/DefaultSymbolType.h> + +#include <core/components.hpp> +#include <exception/CommonException.h> + +#include <object/Object.h> +#include <object/UniqueObject.h> +#include <object/ObjectBase.h> + +#include <sax/FromXMLParserHelper.h> +#include <core/xmlApi.hpp> + +#include <container/ObjectsSet.h> +#include <container/ObjectsTrie.h> + +#include <primitive/Unsigned.h> + +namespace indexes { + +class GeneralAlphabet; + +/** + * Represents regular expression parsed from the XML. Regular expression is stored + * as a tree of RegExpElement. + */ +template < class SymbolType = DefaultSymbolType > +class PositionHeap : public alib::ObjectBase, public std::Components < PositionHeap < SymbolType >, SymbolType, std::tuple < GeneralAlphabet >, std::tuple < > > { +protected: + std::trie < SymbolType, unsigned > m_trie; + +public: + /** + * @copydoc PositionHeap::clone() const + */ + virtual ObjectBase * clone ( ) const; + + /** + * @copydoc PositionHeap::plunder() const + */ + virtual ObjectBase * plunder ( ) &&; + + explicit PositionHeap ( std::set < SymbolType > edgeAlphabet, std::trie < SymbolType, unsigned > trie ); + explicit PositionHeap ( std::trie < SymbolType, unsigned > trie ); + + void checkTrie ( const std::trie < SymbolType, unsigned > & trie ); + + /** + * @return Root node of the trie + */ + const std::trie < SymbolType, unsigned > & getRoot ( ) const; + + const std::set < SymbolType > & getAlphabet ( ) const { + return this->template accessComponent < GeneralAlphabet > ( ).get ( ); + } + + /** + * Sets the root node of the regular expression tree + * @param tree root node to set + */ + void setTree ( std::trie < SymbolType, unsigned > tree ); + + /** + * Removes symbol from the alphabet of symbol available in the regular expression + * @param symbol removed symbol from the alphabet + */ + bool removeSymbolFromEdgeAlphabet ( const SymbolType & symbol ) { + return this->template accessComponent < GeneralAlphabet > ( ).remove ( symbol ); + } + + /** + * Prints XML representation of the tree to the output stream. + * @param out output stream to which print the tree + * @param tree tree to print + */ + virtual void operator >>( std::ostream & out ) const; + + virtual int compare ( const ObjectBase & other ) const { + if ( std::type_index ( typeid ( * this ) ) == std::type_index ( typeid ( other ) ) ) return this->compare ( ( decltype ( * this ) )other ); + + return std::type_index ( typeid ( * this ) ) - std::type_index ( typeid ( other ) ); + } + + virtual int compare ( const PositionHeap & other ) const; + + virtual explicit operator std::string ( ) const; + + static const std::string & getXmlTagName() { + static std::string xmlTagName = "PositionHeap"; + + return xmlTagName; + } + + static PositionHeap parse ( std::deque < sax::Token >::iterator & input ); + + void compose ( std::deque < sax::Token > & out ) const; + + virtual alib::ObjectBase * inc ( ) &&; +}; + +} /* namespace indexes */ + +namespace indexes { + +template < class SymbolType > +PositionHeap < SymbolType >::PositionHeap ( std::set < SymbolType > edgeAlphabet, std::trie < SymbolType, unsigned > trie ) : std::Components < PositionHeap, SymbolType, std::tuple < GeneralAlphabet >, std::tuple < > > ( std::make_tuple ( std::move ( edgeAlphabet ) ), std::tuple < > ( ) ), m_trie ( std::move ( trie ) ) { + checkTrie ( this->m_trie ); +} + +template < class SymbolType > +PositionHeap < SymbolType >::PositionHeap ( std::trie < SymbolType, unsigned > trie ) : PositionHeap ( computeMinimalEdgeAlphabet ( trie ), trie ) { +} + +template < class SymbolType > +alib::ObjectBase * PositionHeap < SymbolType >::clone ( ) const { + return new PositionHeap ( * this ); +} + +template < class SymbolType > +alib::ObjectBase * PositionHeap < SymbolType >::plunder ( ) && { + return new PositionHeap ( std::move ( * this ) ); +} + +template < class SymbolType > +void PositionHeap < SymbolType >::checkTrie ( const std::trie < SymbolType, unsigned > & trie ) { + for ( const std::pair < const SymbolType, std::trie < SymbolType, unsigned > > & child : trie.getChildren ( ) ) { + if ( ! getAlphabet ( ).count ( child.first ) ) + throw exception::CommonException ( "Symbol " + std::to_string ( child.first ) + "not in the alphabet." ); + checkTrie ( child.second ); + } +} + +template < class SymbolType > +const std::trie < SymbolType, unsigned > & PositionHeap < SymbolType >::getRoot ( ) const { + return m_trie; +} + +template < class SymbolType > +void PositionHeap < SymbolType >::setTree ( std::trie < SymbolType, unsigned > trie ) { + checkTrie ( trie ); + this->m_trie = std::move ( trie ).plunder ( ); +} + +template < class SymbolType > +void PositionHeap < SymbolType >::operator >>( std::ostream & out ) const { + out << "(PositionHeap " << this->m_trie << ")"; +} + +template < class SymbolType > +int PositionHeap < SymbolType >::compare ( const PositionHeap & other ) const { + auto first = std::tie ( getRoot ( ), getAlphabet ( ) ); + auto second = std::tie ( other.getRoot ( ), other.getAlphabet ( ) ); + + static std::compare < decltype ( first ) > comp; + + return comp ( first, second ); +} + +template < class SymbolType > +PositionHeap < SymbolType >::operator std::string ( ) const { + std::stringstream ss; + ss << * this; + return ss.str ( ); +} + +template < class SymbolType > +PositionHeap < SymbolType > PositionHeap < SymbolType >::parse ( std::deque < sax::Token >::iterator & input ) { + sax::FromXMLParserHelper::popToken ( input, sax::Token::TokenType::START_ELEMENT, PositionHeap::getXmlTagName() ); + std::set < SymbolType > edgeAlphabet = alib::xmlApi < std::set < SymbolType > >::parse ( input ); + std::trie < SymbolType, unsigned > root = alib::xmlApi < std::trie < SymbolType, unsigned > >::parse ( input ); + PositionHeap < SymbolType > trie ( std::move ( edgeAlphabet ), std::move ( root ) ); + + sax::FromXMLParserHelper::popToken ( input, sax::Token::TokenType::END_ELEMENT, PositionHeap::getXmlTagName() ); + return trie; +} + +template < class SymbolType > +void PositionHeap < SymbolType >::compose ( std::deque < sax::Token > & out ) const { + out.emplace_back ( PositionHeap::getXmlTagName(), sax::Token::TokenType::START_ELEMENT ); + alib::xmlApi < std::set < SymbolType > >::compose ( out, getAlphabet ( ) ); + alib::xmlApi < std::trie < SymbolType, unsigned > >::compose ( out, getRoot ( ) ); + out.emplace_back ( PositionHeap::getXmlTagName(), sax::Token::TokenType::END_ELEMENT ); +} + +template < class SymbolType > +alib::ObjectBase* PositionHeap < SymbolType >::inc() && { + return new alib::UniqueObject(alib::Object(std::move(*this)), primitive::Integer(0)); +} + +} /* namespace indexes */ + +namespace std { + +template < class SymbolType > +class ComponentConstraint < indexes::PositionHeap < SymbolType >, SymbolType, indexes::GeneralAlphabet > { + + static bool used ( const std::trie < SymbolType, unsigned > & trie, const SymbolType & symbol ) { + for ( const std::pair < const SymbolType, std::trie < SymbolType, unsigned > > & child : trie.getChildren ( ) ) { + if ( symbol == child.first || checkTrie ( trie, child.second ) ) + return true; + } + return false; + } + +public: + static bool used ( const indexes::PositionHeap < SymbolType > & index, const SymbolType & symbol ) { + return used ( index.getRoot ( ), symbol ); + } + + static bool available ( const indexes::PositionHeap < SymbolType > &, const SymbolType & ) { + return true; + } + + static void valid ( const indexes::PositionHeap < SymbolType > &, const SymbolType & ) { + } +}; + +} /* namespace std */ + +#endif /* POSITION_HEAP_H_ */ diff --git a/astringology2/src/astringology.cpp b/astringology2/src/astringology.cpp index 923f019a55..777f125f79 100644 --- a/astringology2/src/astringology.cpp +++ b/astringology2/src/astringology.cpp @@ -35,6 +35,7 @@ #include <stringology/exact/SuffixAutomaton.h> #include <string/properties/BorderArray.h> #include <stringology/indexing/SuffixTrieNaive.h> +#include <stringology/indexing/PositionHeapNaive.h> #include <stringology/indexing/SuffixArrayNaive.h> int main ( int argc, char * argv[] ) { @@ -61,6 +62,7 @@ int main ( int argc, char * argv[] ) { allowed.push_back ( "backwardDAWGMatching" ); allowed.push_back ( "borderArray" ); allowed.push_back ( "suffixTrie" ); + allowed.push_back ( "positionHeap" ); allowed.push_back ( "suffixArray" ); TCLAP::ValuesConstraint < std::string > allowedVals ( allowed ); @@ -291,6 +293,18 @@ int main ( int argc, char * argv[] ) { measurements::start ( "Output write", measurements::Type::AUXILIARY ); alib::XmlDataFactory::toStdout ( suffixTrie ); + } else if ( algorithm.getValue ( ) == "positionHeap" ) { + string::String subject = alib::XmlDataFactory::fromTokens < string::String > ( std::move ( sax::FromXMLParserHelper::parseInput(true, subjectInput).front ( ) ) ); + + measurements::end ( ); + measurements::start ( "Algorithm", measurements::Type::MAIN ); + + indexes::PositionHeap < DefaultSymbolType > positionHeap = stringology::indexing::PositionHeapNaive::construct ( subject ); + + measurements::end ( ); + measurements::start ( "Output write", measurements::Type::AUXILIARY ); + + alib::XmlDataFactory::toStdout ( positionHeap ); } else if ( algorithm.getValue ( ) == "suffixArray" ) { string::String subject = alib::XmlDataFactory::fromTokens < string::String > ( std::move ( sax::FromXMLParserHelper::parseInput(true, subjectInput).front ( ) ) ); -- GitLab