From 0b9faa977b4b4105f1a7bba69fb0c1c684ef922b Mon Sep 17 00:00:00 2001 From: Jan Travnicek <Jan.Travnicek@fit.cvut.cz> Date: Fri, 4 May 2018 13:19:03 +0200 Subject: [PATCH] update string indexes --- .../matching/BNDMMatcherConstruction.cpp | 2 +- .../matching/BNDMMatcherConstruction.h | 8 +- .../src/stringology/query/BNDMOccurrences.cpp | 2 +- .../src/stringology/query/BNDMOccurrences.h | 10 +- ...ckwardNondeterministicDAWGMatchingTest.cpp | 4 +- .../src/indexes/stringology/BNDMMatcher.h | 278 --------------- .../indexes/stringology/BitParallelIndex.h | 138 ++++++-- .../{BNDMMatcher.cpp => BitSetIndex.cpp} | 12 +- .../src/indexes/stringology/BitSetIndex.h | 330 ++++++++++++++++++ .../stringology/CompressedBitParallelIndex.h | 8 +- .../stringology/FactorOracleAutomaton.h | 108 +++++- 11 files changed, 559 insertions(+), 341 deletions(-) delete mode 100644 alib2data/src/indexes/stringology/BNDMMatcher.h rename alib2data/src/indexes/stringology/{BNDMMatcher.cpp => BitSetIndex.cpp} (67%) create mode 100644 alib2data/src/indexes/stringology/BitSetIndex.h diff --git a/alib2algo/src/stringology/matching/BNDMMatcherConstruction.cpp b/alib2algo/src/stringology/matching/BNDMMatcherConstruction.cpp index 43ff31c519..0076830005 100644 --- a/alib2algo/src/stringology/matching/BNDMMatcherConstruction.cpp +++ b/alib2algo/src/stringology/matching/BNDMMatcherConstruction.cpp @@ -12,7 +12,7 @@ namespace stringology { namespace matching { -auto BNDMIndexConstructionLinearString = registration::AbstractRegister < BNDMMatcherConstruction, indexes::stringology::BNDMMatcher < >, const string::LinearString < > & > ( BNDMMatcherConstruction::construct ); +auto BNDMIndexConstructionLinearString = registration::AbstractRegister < BNDMMatcherConstruction, indexes::stringology::BitSetIndex < >, const string::LinearString < > & > ( BNDMMatcherConstruction::construct ); } /* namespace matching */ diff --git a/alib2algo/src/stringology/matching/BNDMMatcherConstruction.h 
b/alib2algo/src/stringology/matching/BNDMMatcherConstruction.h index 7169903389..a1016c1f86 100644 --- a/alib2algo/src/stringology/matching/BNDMMatcherConstruction.h +++ b/alib2algo/src/stringology/matching/BNDMMatcherConstruction.h @@ -8,7 +8,7 @@ #ifndef BNDM_MATCHER_CONSTRUCTION_H_ #define BNDM_MATCHER_CONSTRUCTION_H_ -#include <indexes/stringology/BNDMMatcher.h> +#include <indexes/stringology/BitSetIndex.h> #include <string/LinearString.h> #include <exception/CommonException.h> @@ -29,12 +29,12 @@ public: * @return automaton */ template < class SymbolType, size_t BitmaskBitCount = 64 > - static indexes::stringology::BNDMMatcher < SymbolType, BitmaskBitCount > construct ( const string::LinearString < SymbolType > & string ); + static indexes::stringology::BitSetIndex < SymbolType, BitmaskBitCount > construct ( const string::LinearString < SymbolType > & string ); }; template < class SymbolType, size_t BitmaskBitCount > -indexes::stringology::BNDMMatcher < SymbolType, BitmaskBitCount > BNDMMatcherConstruction::construct ( const string::LinearString < SymbolType > & w ) { +indexes::stringology::BitSetIndex < SymbolType, BitmaskBitCount > BNDMMatcherConstruction::construct ( const string::LinearString < SymbolType > & w ) { size_t bitmaskLength = std::min ( w.getContent ( ).size ( ), BitmaskBitCount ); ext::map < SymbolType, ext::bitset < BitmaskBitCount > > res; @@ -44,7 +44,7 @@ indexes::stringology::BNDMMatcher < SymbolType, BitmaskBitCount > BNDMMatcherCon for ( unsigned i = 0; i < bitmaskLength; ++i ) res [ w.getContent ( ) [ i ] ] [ bitmaskLength - i - 1 ] = true; - return indexes::stringology::BNDMMatcher < SymbolType, BitmaskBitCount > ( w.getAlphabet ( ), res, w.getContent ( ) ); + return indexes::stringology::BitSetIndex < SymbolType, BitmaskBitCount > ( res, string::LinearString < SymbolType > ( w ) ); } } /* namespace matching */ diff --git a/alib2algo/src/stringology/query/BNDMOccurrences.cpp b/alib2algo/src/stringology/query/BNDMOccurrences.cpp index 
ac498517cc..af5853363f 100644 --- a/alib2algo/src/stringology/query/BNDMOccurrences.cpp +++ b/alib2algo/src/stringology/query/BNDMOccurrences.cpp @@ -12,7 +12,7 @@ namespace stringology { namespace query { -auto bndmOccurrencesLinearString = registration::AbstractRegister < BNDMOccurrences, ext::set < unsigned >, const indexes::stringology::BNDMMatcher < > &, const string::LinearString < > & > ( BNDMOccurrences::query ); +auto bndmOccurrencesLinearString = registration::AbstractRegister < BNDMOccurrences, ext::set < unsigned >, const indexes::stringology::BitSetIndex < > &, const string::LinearString < > & > ( BNDMOccurrences::query ); } /* namespace query */ diff --git a/alib2algo/src/stringology/query/BNDMOccurrences.h b/alib2algo/src/stringology/query/BNDMOccurrences.h index 1547034d2e..5c8b0134b6 100644 --- a/alib2algo/src/stringology/query/BNDMOccurrences.h +++ b/alib2algo/src/stringology/query/BNDMOccurrences.h @@ -8,7 +8,7 @@ #ifndef BNDM_OCCURRENCES_H_ #define BNDM_OCCURRENCES_H_ -#include <indexes/stringology/BNDMMatcher.h> +#include <indexes/stringology/BitSetIndex.h> #include <string/LinearString.h> #include <global/GlobalData.h> @@ -33,16 +33,16 @@ public: * @return occurences of factors */ template < class SymbolType, size_t BitmaskBitCount > - static ext::set < unsigned > query ( const indexes::stringology::BNDMMatcher < SymbolType, BitmaskBitCount > & pattern, const string::LinearString < SymbolType > & subject ); + static ext::set < unsigned > query ( const indexes::stringology::BitSetIndex < SymbolType, BitmaskBitCount > & pattern, const string::LinearString < SymbolType > & subject ); }; template < class SymbolType, size_t BitmaskBitCount > -ext::set < unsigned > BNDMOccurrences::query ( const indexes::stringology::BNDMMatcher < SymbolType, BitmaskBitCount > & pattern, const string::LinearString < SymbolType > & subject ) { +ext::set < unsigned > BNDMOccurrences::query ( const indexes::stringology::BitSetIndex < SymbolType, BitmaskBitCount > & 
pattern, const string::LinearString < SymbolType > & subject ) { ext::set < unsigned > occ; - size_t patternLength = pattern.getString ( ).size ( ); + size_t patternLength = pattern.getString ( ).getContent ( ).size ( ); size_t subjectLength = subject.getContent ( ).size ( ); size_t posInSubject = 0; size_t bitmaskLength = std::min ( BitmaskBitCount, patternLength ); @@ -73,7 +73,7 @@ ext::set < unsigned > BNDMOccurrences::query ( const indexes::stringology::BNDMM size_t k = bitmaskLength; // out of bitset fallback to naive checking of occurrence here - while ( k < patternLength && pattern.getString ( ).at ( k ) == subject.getContent ( ).at ( posInSubject + k ) ) k++; + while ( k < patternLength && pattern.getString ( ).getContent ( ).at ( k ) == subject.getContent ( ).at ( posInSubject + k ) ) k++; if ( k == patternLength ) // Yay, there is match!!! diff --git a/alib2algo/test-src/stringology/matching/BackwardNondeterministicDAWGMatchingTest.cpp b/alib2algo/test-src/stringology/matching/BackwardNondeterministicDAWGMatchingTest.cpp index ddf8ea7709..864916e0f8 100644 --- a/alib2algo/test-src/stringology/matching/BackwardNondeterministicDAWGMatchingTest.cpp +++ b/alib2algo/test-src/stringology/matching/BackwardNondeterministicDAWGMatchingTest.cpp @@ -40,7 +40,7 @@ void BackwardNondeterministicDAWGMatchingTest::testBNDM ( ) { for(size_t i = 0; i < subjects.size(); ++i) { indexes::stringology::BitParallelIndex < char > bndmPattern1 = stringology::matching::WideBNDMMatcherConstruction::construct ( string::LinearString < char > ( patterns[i] ) ); - indexes::stringology::BNDMMatcher < char > bndmPattern2 = stringology::matching::BNDMMatcherConstruction::construct ( string::LinearString < char > ( patterns[i] ) ); + indexes::stringology::BitSetIndex < char > bndmPattern2 = stringology::matching::BNDMMatcherConstruction::construct ( string::LinearString < char > ( patterns[i] ) ); ext::set < unsigned > res1 = stringology::query::WideBNDMOccurrences::query ( bndmPattern1, 
string::LinearString < char > ( subjects[i] ) ); ext::set < unsigned > res2 = stringology::query::BNDMOccurrences::query ( bndmPattern2, string::LinearString < char > ( subjects[i] ) ); CPPUNIT_ASSERT ( res1 == expectedOccs[i] ); @@ -50,7 +50,7 @@ void BackwardNondeterministicDAWGMatchingTest::testBNDM ( ) { auto longSubject = string::generate::RandomStringFactory::generateLinearString (64 * 64 * 64, 512, false, true); auto longPattern = string::generate::RandomSubstringFactory::generateSubstring(64 * 32 * 32, longSubject ); - indexes::stringology::BNDMMatcher < > pattern = stringology::matching::BNDMMatcherConstruction::construct ( longPattern ); + indexes::stringology::BitSetIndex < > pattern = stringology::matching::BNDMMatcherConstruction::construct ( longPattern ); ext::set < unsigned > res = stringology::query::BNDMOccurrences::query ( pattern, longSubject ); ext::set < unsigned > ref = stringology::exact::ExactFactorMatch::match ( longSubject, longPattern ); std::cout << "long: " << res << std::endl; diff --git a/alib2data/src/indexes/stringology/BNDMMatcher.h b/alib2data/src/indexes/stringology/BNDMMatcher.h deleted file mode 100644 index ce579acb8d..0000000000 --- a/alib2data/src/indexes/stringology/BNDMMatcher.h +++ /dev/null @@ -1,278 +0,0 @@ -/* - * BNDMMatcher.h - * - * Created on: Jan 8, 2017 - * Author: Jan Travnicek - */ - -#ifndef BNDM_MATCHER_H_ -#define BNDM_MATCHER_H_ - -#include <alib/set> -#include <alib/string> -#include <alib/iostream> -#include <sstream> - -#include <common/DefaultSymbolType.h> - -#include <core/components.hpp> -#include <exception/CommonException.h> - -#include <object/UniqueObject.h> -#include <object/ObjectBase.h> - -#include <sax/FromXMLParserHelper.h> -#include <core/xmlApi.hpp> - -#include <container/ObjectsSet.h> -#include <container/ObjectsMap.h> -#include <container/ObjectsVector.h> - -#include <container/xml/ObjectsSet.h> -#include <container/xml/ObjectsMap.h> -#include <container/xml/ObjectsVector.h> -#include 
<container/xml/ObjectsBitset.h> - -#include <primitive/Bool.h> -#include <primitive/xml/Bool.h> -#include <alib/bitset> - -#include <alphabet/common/SymbolNormalize.h> - -namespace indexes { - -namespace stringology { - -class GeneralAlphabet; - -/** - * Represents regular expression parsed from the XML. Regular expression is stored - * as a tree of RegExpElement. - */ -template < class SymbolType = DefaultSymbolType, size_t BitmaskBitCount = 64 > -class BNDMMatcher final : public object::ObjectBase, public core::Components < BNDMMatcher < SymbolType >, ext::set < SymbolType >, component::Set, GeneralAlphabet > { -protected: - ext::map < SymbolType, ext::bitset < BitmaskBitCount > > m_vectors; - ext::vector < SymbolType > m_string; - -public: - /** - * @copydoc SuffixTrieNode::clone ( ) const & - */ - virtual ObjectBase * clone ( ) const &; - - /** - * @copydoc SuffixTrieNode::clone ( ) const & - */ - virtual ObjectBase * clone ( ) &&; - - explicit BNDMMatcher ( ext::set < SymbolType > alphabet, ext::map < SymbolType, ext::bitset < BitmaskBitCount > > vectors, ext::vector < SymbolType > string ); - - /** - * @return Root node of the trie - */ - const ext::map < SymbolType, ext::bitset < BitmaskBitCount > > & getData ( ) const &; - - ext::map < SymbolType, ext::bitset < BitmaskBitCount > > && getData ( ) &&; - - const ext::vector < SymbolType > & getString ( ) const &; - - ext::vector < SymbolType > && getString ( ) &&; - - const ext::set < SymbolType > & getAlphabet ( ) const & { - return this->template accessComponent < GeneralAlphabet > ( ).get ( ); - } - - ext::set < SymbolType > && getAlphabet ( ) && { - return std::move ( this->template accessComponent < GeneralAlphabet > ( ).get ( ) ); - } - - /** - * Sets the bit vector for given symbol - * @param tree root node to set - */ - void setBitVectorForSymbol ( SymbolType symbol, ext::bitset < BitmaskBitCount > data ); - - /** - * Removes symbol from the alphabet of symbol available in the regular expression - * 
@param symbol removed symbol from the alphabet - */ - bool removeSymbolFromAlphabet ( const SymbolType & symbol ) { - return this->template accessComponent < GeneralAlphabet > ( ).remove ( symbol ); - } - - /** - * Prints XML representation of the tree to the output stream. - * @param out output stream to which print the tree - * @param tree tree to print - */ - virtual void operator >>( std::ostream & out ) const; - - virtual int compare ( const ObjectBase & other ) const { - if ( ext::type_index ( typeid ( * this ) ) == ext::type_index ( typeid ( other ) ) ) return this->compare ( ( decltype ( * this ) )other ); - - return ext::type_index ( typeid ( * this ) ) - ext::type_index ( typeid ( other ) ); - } - - virtual int compare ( const BNDMMatcher & other ) const; - - virtual explicit operator std::string ( ) const; - - virtual object::ObjectBase * inc ( ) &&; -}; - -} /* namespace stringology */ - -} /* namespace indexes */ - -namespace indexes { - -namespace stringology { - -template < class SymbolType, size_t BitmaskBitCount > -BNDMMatcher < SymbolType, BitmaskBitCount >::BNDMMatcher ( ext::set < SymbolType > alphabet, ext::map < SymbolType, ext::bitset < BitmaskBitCount > > vectors, ext::vector < SymbolType > string ) : core::Components < BNDMMatcher, ext::set < SymbolType >, component::Set, GeneralAlphabet > ( std::move ( alphabet ) ), m_vectors ( std::move ( vectors ) ), m_string ( std::move ( string ) ) { -} - -template < class SymbolType, size_t BitmaskBitCount > -object::ObjectBase * BNDMMatcher < SymbolType, BitmaskBitCount >::clone ( ) const & { - return new BNDMMatcher ( * this ); -} - -template < class SymbolType, size_t BitmaskBitCount > -object::ObjectBase * BNDMMatcher < SymbolType, BitmaskBitCount >::clone ( ) && { - return new BNDMMatcher ( std::move ( * this ) ); -} - -template < class SymbolType, size_t BitmaskBitCount > -const ext::map < SymbolType, ext::bitset < BitmaskBitCount > > & BNDMMatcher < SymbolType, BitmaskBitCount >::getData ( ) 
const & { - return m_vectors; -} - -template < class SymbolType, size_t BitmaskBitCount > -ext::map < SymbolType, ext::bitset < BitmaskBitCount > > && BNDMMatcher < SymbolType, BitmaskBitCount >::getData ( ) && { - return std::move ( m_vectors ); -} - -template < class SymbolType, size_t BitmaskBitCount > -const ext::vector < SymbolType > & BNDMMatcher < SymbolType, BitmaskBitCount >::getString ( ) const & { - return m_string; -} - -template < class SymbolType, size_t BitmaskBitCount > -ext::vector < SymbolType > && BNDMMatcher < SymbolType, BitmaskBitCount >::getString ( ) && { - return std::move ( m_string ); -} - -template < class SymbolType, size_t BitmaskBitCount > -void BNDMMatcher < SymbolType, BitmaskBitCount >::setBitVectorForSymbol ( SymbolType symbol, ext::bitset < BitmaskBitCount > data ) { - this->m_vectors [ symbol ] = std::move ( data ); -} - -template < class SymbolType, size_t BitmaskBitCount > -void BNDMMatcher < SymbolType, BitmaskBitCount >::operator >>( std::ostream & out ) const { - out << "(BNDMMatcher " << this->m_vectors << ")"; -} - -template < class SymbolType, size_t BitmaskBitCount > -int BNDMMatcher < SymbolType, BitmaskBitCount >::compare ( const BNDMMatcher & other ) const { - auto first = ext::tie ( getData ( ), getAlphabet ( ) ); - auto second = ext::tie ( other.getData ( ), other.getAlphabet ( ) ); - - static ext::compare < decltype ( first ) > comp; - - return comp ( first, second ); -} - -template < class SymbolType, size_t BitmaskBitCount > -BNDMMatcher < SymbolType, BitmaskBitCount >::operator std::string ( ) const { - std::stringstream ss; - ss << * this; - return ss.str ( ); -} - -template < class SymbolType, size_t BitmaskBitCount > -object::ObjectBase* BNDMMatcher < SymbolType, BitmaskBitCount >::inc() && { - return new object::UniqueObject(object::Object(std::move(*this)), primitive::Integer(0)); -} - -} /* namespace stringology */ - -} /* namespace indexes */ - -namespace core { - -template < class SymbolType, size_t 
BitmaskBitCount > -class SetConstraint < indexes::stringology::BNDMMatcher < SymbolType, BitmaskBitCount >, SymbolType, indexes::stringology::GeneralAlphabet > { -public: - static bool used ( const indexes::stringology::BNDMMatcher < SymbolType, BitmaskBitCount > & index, const SymbolType & symbol ) { - const ext::map < SymbolType, ext::bitset < BitmaskBitCount > > & content = index.getData ( ); - return content.find( symbol ) != content.end(); - } - - static bool available ( const indexes::stringology::BNDMMatcher < SymbolType, BitmaskBitCount > &, const SymbolType & ) { - return true; - } - - static void valid ( const indexes::stringology::BNDMMatcher < SymbolType, BitmaskBitCount > &, const SymbolType & ) { - } -}; - -template < class SymbolType, size_t BitmaskBitCount > -struct normalize < indexes::stringology::BNDMMatcher < SymbolType, BitmaskBitCount > > { - static indexes::stringology::BNDMMatcher < DefaultSymbolType, BitmaskBitCount > eval ( indexes::stringology::BNDMMatcher < SymbolType, BitmaskBitCount > && value ) { - ext::set < DefaultSymbolType > alphabet = alphabet::SymbolNormalize::normalizeAlphabet ( std::move ( value ).getAlphabet ( ) ); - - ext::map < DefaultSymbolType, ext::bitset < BitmaskBitCount > > vectors; - for ( std::pair < SymbolType, ext::bitset < BitmaskBitCount > > && vector : ext::make_moveable_map ( std::move ( value ).getData ( ) ) ) - vectors.insert ( std::make_pair ( alphabet::SymbolNormalize::normalizeSymbol ( std::move ( vector.first ) ), std::move ( vector.second ) ) ); - - ext::vector < DefaultSymbolType > string = alphabet::SymbolNormalize::normalizeSymbols ( std::move ( value ).getString ( ) ); - - return indexes::stringology::BNDMMatcher < DefaultSymbolType, BitmaskBitCount > ( std::move ( alphabet ), std::move ( vectors ), std::move ( string ) ); - } -}; - -template < class SymbolType, size_t BitmaskBitCount > -struct xmlApi < indexes::stringology::BNDMMatcher < SymbolType, BitmaskBitCount > > { - static 
indexes::stringology::BNDMMatcher < SymbolType, BitmaskBitCount > parse ( ext::deque < sax::Token >::iterator & input ); - static bool first ( const ext::deque < sax::Token >::const_iterator & input ); - static const std::string & xmlTagName ( ); - static void compose ( ext::deque < sax::Token > & output, const indexes::stringology::BNDMMatcher < SymbolType, BitmaskBitCount > & data ); -}; - -template < class SymbolType, size_t BitmaskBitCount > -indexes::stringology::BNDMMatcher < SymbolType, BitmaskBitCount > xmlApi < indexes::stringology::BNDMMatcher < SymbolType, BitmaskBitCount > >::parse ( ext::deque < sax::Token >::iterator & input ) { - sax::FromXMLParserHelper::popToken ( input, sax::Token::TokenType::START_ELEMENT, xmlTagName ( ) ); - ext::set < SymbolType > alphabet = core::xmlApi < ext::set < SymbolType > >::parse ( input ); - ext::map < SymbolType, ext::bitset < BitmaskBitCount > > data = core::xmlApi < ext::map < SymbolType, ext::bitset < BitmaskBitCount > > >::parse ( input ); - ext::vector < SymbolType > string = core::xmlApi < ext::vector < SymbolType > >::parse ( input ); - indexes::stringology::BNDMMatcher < SymbolType, BitmaskBitCount > res ( std::move ( alphabet ), std::move ( data ), std::move ( string ) ); - - sax::FromXMLParserHelper::popToken ( input, sax::Token::TokenType::END_ELEMENT, xmlTagName ( ) ); - return res; -} - -template < class SymbolType, size_t BitmaskBitCount > -bool xmlApi < indexes::stringology::BNDMMatcher < SymbolType, BitmaskBitCount > >::first ( const ext::deque < sax::Token >::const_iterator & input ) { - return sax::FromXMLParserHelper::isToken ( input, sax::Token::TokenType::START_ELEMENT, xmlTagName ( ) ); -} - -template < class SymbolType, size_t BitmaskBitCount > -const std::string & xmlApi < indexes::stringology::BNDMMatcher < SymbolType, BitmaskBitCount > >::xmlTagName ( ) { - static std::string xmlTagName = "BNDMMatcher"; - - return xmlTagName; -} - -template < class SymbolType, size_t BitmaskBitCount > -void 
xmlApi < indexes::stringology::BNDMMatcher < SymbolType, BitmaskBitCount > >::compose ( ext::deque < sax::Token > & output, const indexes::stringology::BNDMMatcher < SymbolType, BitmaskBitCount > & index ) { - output.emplace_back ( xmlTagName ( ), sax::Token::TokenType::START_ELEMENT ); - core::xmlApi < ext::set < SymbolType > >::compose ( output, index.getAlphabet ( ) ); - core::xmlApi < ext::map < SymbolType, ext::bitset < BitmaskBitCount > > >::compose ( output, index.getData ( ) ); - core::xmlApi < ext::vector < SymbolType > >::compose ( output, index.getString ( ) ); - output.emplace_back ( xmlTagName ( ), sax::Token::TokenType::END_ELEMENT ); -} - -} /* namespace core */ - -#endif /* BNDM_MATCHER_H_ */ diff --git a/alib2data/src/indexes/stringology/BitParallelIndex.h b/alib2data/src/indexes/stringology/BitParallelIndex.h index 013f7738b6..d5999f1fe3 100644 --- a/alib2data/src/indexes/stringology/BitParallelIndex.h +++ b/alib2data/src/indexes/stringology/BitParallelIndex.h @@ -1,6 +1,22 @@ /* * BitParallelIndex.h * + * This file is part of Algorithms library toolkit. + * Copyright (C) 2017 Jan Travnicek (jan.travnicek@fit.cvut.cz) + + * Algorithms library toolkit is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + + * Algorithms library toolkit is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + + * You should have received a copy of the GNU General Public License + * along with Algorithms library toolkit. If not, see <http://www.gnu.org/licenses/>. 
+ * * Created on: Jan 8, 2017 * Author: Jan Travnicek */ @@ -44,76 +60,124 @@ namespace stringology { class GeneralAlphabet; /** - * Represents regular expression parsed from the XML. Regular expression is stored - * as a tree of RegExpElement. + * \brief Bit parallel string index. Stores a bit vector for each symbol of the alphabet. The bit vector of symbol a contains true on index i if symbol a is on i-th position in the indexed string. The class does not check whether the bit vectors actually represent valid index. + * + * \tparam SymbolType type of symbols of indexed string */ template < class SymbolType = DefaultSymbolType > class BitParallelIndex final : public object::ObjectBase, public core::Components < BitParallelIndex < SymbolType >, ext::set < SymbolType >, component::Set, GeneralAlphabet > { -protected: + /** + * Representation of bit vectors for each symbol of the alphabet. + */ ext::map < SymbolType, ext::vector < bool > > m_vectors; public: /** - * @copydoc SuffixTrieNode::clone ( ) const & + * @copydoc ObjectBase::clone ( ) const & */ - virtual ObjectBase * clone ( ) const &; + virtual ObjectBase * clone ( ) const & override; /** - * @copydoc SuffixTrieNode::clone ( ) const & + * @copydoc ObjectBase::clone ( ) && */ - virtual ObjectBase * clone ( ) &&; + virtual ObjectBase * clone ( ) && override; + /** + * Creates a new instance of the index with concrete alphabet and bit vectors. + * + * \param alphabet the alphabet of indexed string + * \param vectors the bit vectors + */ explicit BitParallelIndex ( ext::set < SymbolType > alphabet, ext::map < SymbolType, ext::vector < bool > > vectors ); /** - * @return Root node of the trie + * Getter of the bit vectors. + * + * @return bit vectors */ const ext::map < SymbolType, ext::vector < bool > > & getData ( ) const &; + /** + * Getter of the bit vectors. 
+ * + * @return bit vectors + */ ext::map < SymbolType, ext::vector < bool > > && getData ( ) &&; + /** + * Reconstructs the indexed string from bit vectors. + * + * @return the original indexed string + */ ext::vector < SymbolType > getString ( ) const; + /** + * Getter of the alphabet of the indexed string. + * + * \returns the alphabet of the indexed string + */ const ext::set < SymbolType > & getAlphabet ( ) const & { return this->template accessComponent < GeneralAlphabet > ( ).get ( ); } + /** + * Getter of the alphabet of the indexed string. + * + * \returns the alphabet of the indexed string + */ ext::set < SymbolType > && getAlphabet ( ) && { return std::move ( this->template accessComponent < GeneralAlphabet > ( ).get ( ) ); } /** - * Sets the bit vector for given symbol - * @param tree root node to set + * Changes the bit vector for concrete symbol. + * + * \param symbol the changed symbol + * \param data the new bit vector */ void setBitVectorForSymbol ( SymbolType symbol, ext::vector < bool > data ); /** - * Removes symbol from the alphabet of symbol available in the regular expression - * @param symbol removed symbol from the alphabet + * Remover of a symbol from the alphabet. The symbol can be removed if it is not used in any of bit vector keys. + * + * \param symbol a symbol to remove. */ bool removeSymbolFromAlphabet ( const SymbolType & symbol ) { return this->template accessComponent < GeneralAlphabet > ( ).remove ( symbol ); } /** - * Prints XML representation of the tree to the output stream. 
- * @param out output stream to which print the tree - * @param tree tree to print + * @copydoc alib::CommonBase<ObjectBase>::compare ( const ObjectBase & ) */ - virtual void operator >>( std::ostream & out ) const; - - virtual int compare ( const ObjectBase & other ) const { + virtual int compare ( const ObjectBase & other ) const override { if ( ext::type_index ( typeid ( * this ) ) == ext::type_index ( typeid ( other ) ) ) return this->compare ( ( decltype ( * this ) )other ); return ext::type_index ( typeid ( * this ) ) - ext::type_index ( typeid ( other ) ); } - virtual int compare ( const BitParallelIndex & other ) const; + /** + * The actual compare method. + * + * \param other the other instance + * + * \returns the actual relation between two by type same index instances + */ + int compare ( const BitParallelIndex & other ) const; - virtual explicit operator std::string ( ) const; + /** + * @copydoc alib::CommonBase<ObjectBase>::operator >> ( std::ostream & ) + */ + virtual void operator >>( std::ostream & out ) const override; + + /** + * @copydoc alib::CommonBase<ObjectBase>::operator std::string ( ) + */ + virtual explicit operator std::string ( ) const override; - virtual object::ObjectBase * inc ( ) &&; + /** + * @copydoc alib::ObjectBase::inc() + */ + virtual object::ObjectBase * inc ( ) && override; }; } /* namespace stringology */ @@ -203,22 +267,54 @@ object::ObjectBase* BitParallelIndex < SymbolType >::inc() && { namespace core { +/** + * Helper class specifying constraints for the internal alphabet component of the index. + * + * \tparam SymbolType type of symbols of indexed string + */ template < class SymbolType > class SetConstraint < indexes::stringology::BitParallelIndex < SymbolType >, SymbolType, indexes::stringology::GeneralAlphabet > { public: + /** + * Returns true if the symbol is still used as key in mapping symbol to bit vector. 
+ * + * \param index the tested index + * \param symbol the tested symbol + * + * \returns true if the symbol is used, false otherwise + */ static bool used ( const indexes::stringology::BitParallelIndex < SymbolType > & index, const SymbolType & symbol ) { const ext::map < SymbolType, ext::vector < bool > > & content = index.getData ( ); return content.find( symbol ) != content.end(); } + /** + * Returns true as all symbols are possibly available to be elements of the alphabet. + * + * \param index the tested index + * \param symbol the tested symbol + * + * \returns true + */ static bool available ( const indexes::stringology::BitParallelIndex < SymbolType > &, const SymbolType & ) { return true; } + /** + * All symbols are valid as symbols of the alphabet. + * + * \param index the tested index + * \param symbol the tested symbol + */ static void valid ( const indexes::stringology::BitParallelIndex < SymbolType > &, const SymbolType & ) { } }; +/** + * Helper for normalisation of types specified by templates used as internal datatypes of symbols. 
+ * + * \returns new instance of the index with default template parameters or unmodified instance if the template parameters were already the default ones + */ template < class SymbolType > struct normalize < indexes::stringology::BitParallelIndex < SymbolType > > { static indexes::stringology::BitParallelIndex < > eval ( indexes::stringology::BitParallelIndex < SymbolType > && value ) { diff --git a/alib2data/src/indexes/stringology/BNDMMatcher.cpp b/alib2data/src/indexes/stringology/BitSetIndex.cpp similarity index 67% rename from alib2data/src/indexes/stringology/BNDMMatcher.cpp rename to alib2data/src/indexes/stringology/BitSetIndex.cpp index 4cee2f4aaa..0e9bae6d1b 100644 --- a/alib2data/src/indexes/stringology/BNDMMatcher.cpp +++ b/alib2data/src/indexes/stringology/BitSetIndex.cpp @@ -1,22 +1,22 @@ /* - * BNDMMatcher.cpp + * BitSetIndex.cpp * * Created on: Jan 8, 2017 * Author: Jan Travnicek */ -#include "BNDMMatcher.h" +#include "BitSetIndex.h" #include <registration/ValuePrinterRegistration.hpp> #include <registration/XmlRegistration.hpp> namespace { -static auto valuePrinter = registration::ValuePrinterRegister < indexes::stringology::BNDMMatcher < > > ( ); +static auto valuePrinter = registration::ValuePrinterRegister < indexes::stringology::BitSetIndex < > > ( ); -static auto xmlWrite = registration::XmlWriterRegister < indexes::stringology::BNDMMatcher < > > ( ); -static auto xmlRead = registration::XmlReaderRegister < indexes::stringology::BNDMMatcher < > > ( ); +static auto xmlWrite = registration::XmlWriterRegister < indexes::stringology::BitSetIndex < > > ( ); +static auto xmlRead = registration::XmlReaderRegister < indexes::stringology::BitSetIndex < > > ( ); -static auto xmlGroup = registration::XmlRegisterTypeInGroup < object::Object, indexes::stringology::BNDMMatcher < > > ( ); +static auto xmlGroup = registration::XmlRegisterTypeInGroup < object::Object, indexes::stringology::BitSetIndex < > > ( ); } /* namespace */ diff --git 
a/alib2data/src/indexes/stringology/BitSetIndex.h b/alib2data/src/indexes/stringology/BitSetIndex.h new file mode 100644 index 0000000000..b083d7dbb2 --- /dev/null +++ b/alib2data/src/indexes/stringology/BitSetIndex.h @@ -0,0 +1,330 @@ +/* + * BitSetIndex.h + * + * This file is part of Algorithms library toolkit. + * Copyright (C) 2017 Jan Travnicek (jan.travnicek@fit.cvut.cz) + + * Algorithms library toolkit is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + + * Algorithms library toolkit is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + + * You should have received a copy of the GNU General Public License + * along with Algorithms library toolkit. If not, see <http://www.gnu.org/licenses/>. + * + * Created on: Jan 8, 2017 + * Author: Jan Travnicek + */ + +#ifndef BIT_SET_INDEX_H_ +#define BIT_SET_INDEX_H_ + +#include <alib/set> +#include <alib/string> +#include <alib/iostream> +#include <sstream> + +#include <common/DefaultSymbolType.h> + +#include <core/components.hpp> +#include <exception/CommonException.h> + +#include <object/UniqueObject.h> +#include <object/ObjectBase.h> + +#include <sax/FromXMLParserHelper.h> +#include <core/xmlApi.hpp> + +#include <container/ObjectsMap.h> + +#include <container/xml/ObjectsMap.h> +#include <container/xml/ObjectsBitset.h> + +#include <primitive/Bool.h> +#include <primitive/xml/Bool.h> +#include <alib/bitset> + +#include <alphabet/common/SymbolNormalize.h> + +#include <string/LinearString.h> +#include <string/xml/LinearString.h> + +namespace indexes { + +namespace stringology { + +class GeneralAlphabet; + +/** + * \brief Bit set string index. 
Stores a bit set for each symbol of the alphabet. The bit set of symbol a contains true on index i if symbol a is on i-th position in the indexed string. The class does not check whether the bit sets actually represent a valid index. Each bit set holds BitmaskBitCount bits. + * + * \tparam SymbolType type of symbols of indexed string + */ +template < class SymbolType = DefaultSymbolType, size_t BitmaskBitCount = 64 > +class BitSetIndex final : public object::ObjectBase, public core::Components < BitSetIndex < SymbolType, BitmaskBitCount >, ext::set < SymbolType >, component::Set, GeneralAlphabet > { + /** + * Representation of bit sets for each symbol of the alphabet. + */ + ext::map < SymbolType, ext::bitset < BitmaskBitCount > > m_vectors; + + /** + * The original indexed string. + */ + string::LinearString < SymbolType > m_string; + +public: + /** + * @copydoc ObjectBase::clone ( ) const & + */ + virtual ObjectBase * clone ( ) const & override; + + /** + * @copydoc ObjectBase::clone ( ) && + */ + virtual ObjectBase * clone ( ) && override; + + /** + * Creates a new instance of the index with concrete bit sets and original indexed string. + * + * \param vectors the bit sets + * \param string the original indexed string + */ + explicit BitSetIndex ( ext::map < SymbolType, ext::bitset < BitmaskBitCount > > vectors, string::LinearString < SymbolType > string ); + + /** + * Getter of the bit sets. + * + * @return bit sets + */ + const ext::map < SymbolType, ext::bitset < BitmaskBitCount > > & getData ( ) const &; + + /** + * Getter of the bit sets. + * + * @return bit sets + */ + ext::map < SymbolType, ext::bitset < BitmaskBitCount > > && getData ( ) &&; + + /** + * Getter of the original indexed string. + * + * @return the original indexed string + */ + const string::LinearString < SymbolType > & getString ( ) const &; + + /** + * Getter of the original indexed string.
+ * + * @return the original indexed string + */ + string::LinearString < SymbolType > && getString ( ) &&; + + /** + * Getter of the alphabet of the indexed string. + * + * \returns the alphabet of the indexed string + */ + const ext::set < SymbolType > & getAlphabet ( ) const & { + return m_string.getAlphabet ( ); + } + + /** + * Getter of the alphabet of the indexed string for an expiring index; the alphabet is taken from the moved indexed string. + * + * \returns rvalue reference to the alphabet of the indexed string + */ + ext::set < SymbolType > && getAlphabet ( ) && { + return std::move ( m_string ).getAlphabet ( ); + } + + /** + * Changes the bit vector for concrete symbol. + * + * \param symbol the changed symbol + * \param data the new bit vector + */ + void setBitVectorForSymbol ( SymbolType symbol, ext::bitset < BitmaskBitCount > data ); + + /** + * Remover of a symbol from the alphabet of the indexed string. The symbol can be removed if it is not used in any of bit vector keys. + * + * \param symbol a symbol to remove. + */ + bool removeSymbolFromAlphabet ( const SymbolType & symbol ) { + return m_string.removeSymbol ( symbol ); + } + + /** + * @copydoc alib::CommonBase<ObjectBase>::compare ( const ObjectBase & ) + */ + virtual int compare ( const ObjectBase & other ) const override { + if ( ext::type_index ( typeid ( * this ) ) == ext::type_index ( typeid ( other ) ) ) return this->compare ( ( decltype ( * this ) )other ); + + return ext::type_index ( typeid ( * this ) ) - ext::type_index ( typeid ( other ) ); + } + + /** + * The actual compare method.
+ * + * \param other the other instance + * + * \returns the actual relation between two index instances of the same type + */ + int compare ( const BitSetIndex & other ) const; + + /** + * @copydoc alib::CommonBase<ObjectBase>::operator >> ( std::ostream & ) + */ + virtual void operator >>( std::ostream & out ) const override; + + /** + * @copydoc alib::CommonBase<ObjectBase>::operator std::string ( ) + */ + virtual explicit operator std::string ( ) const override; + + /** + * @copydoc alib::ObjectBase::inc() + */ + virtual object::ObjectBase * inc ( ) && override; +}; + +} /* namespace stringology */ + +} /* namespace indexes */ + +namespace indexes { + +namespace stringology { + +template < class SymbolType, size_t BitmaskBitCount > +BitSetIndex < SymbolType, BitmaskBitCount >::BitSetIndex ( ext::map < SymbolType, ext::bitset < BitmaskBitCount > > vectors, string::LinearString < SymbolType > string ) : m_vectors ( std::move ( vectors ) ), m_string ( std::move ( string ) ) { +} + +template < class SymbolType, size_t BitmaskBitCount > +object::ObjectBase * BitSetIndex < SymbolType, BitmaskBitCount >::clone ( ) const & { + return new BitSetIndex ( * this ); +} + +template < class SymbolType, size_t BitmaskBitCount > +object::ObjectBase * BitSetIndex < SymbolType, BitmaskBitCount >::clone ( ) && { + return new BitSetIndex ( std::move ( * this ) ); +} + +template < class SymbolType, size_t BitmaskBitCount > +const ext::map < SymbolType, ext::bitset < BitmaskBitCount > > & BitSetIndex < SymbolType, BitmaskBitCount >::getData ( ) const & { + return m_vectors; +} + +template < class SymbolType, size_t BitmaskBitCount > +ext::map < SymbolType, ext::bitset < BitmaskBitCount > > && BitSetIndex < SymbolType, BitmaskBitCount >::getData ( ) && { + return std::move ( m_vectors ); +} + +template < class SymbolType, size_t BitmaskBitCount > +const string::LinearString < SymbolType > & BitSetIndex < SymbolType, BitmaskBitCount >::getString ( ) const & { + return m_string; +} +
+template < class SymbolType, size_t BitmaskBitCount > +string::LinearString < SymbolType > && BitSetIndex < SymbolType, BitmaskBitCount >::getString ( ) && { + return std::move ( m_string ); +} + +template < class SymbolType, size_t BitmaskBitCount > +void BitSetIndex < SymbolType, BitmaskBitCount >::setBitVectorForSymbol ( SymbolType symbol, ext::bitset < BitmaskBitCount > data ) { + this->m_vectors [ symbol ] = std::move ( data ); +} + +template < class SymbolType, size_t BitmaskBitCount > +void BitSetIndex < SymbolType, BitmaskBitCount >::operator >>( std::ostream & out ) const { + out << "(BitSetIndex " << this->m_vectors << ")"; +} + +template < class SymbolType, size_t BitmaskBitCount > +int BitSetIndex < SymbolType, BitmaskBitCount >::compare ( const BitSetIndex & other ) const { + auto first = ext::tie ( getData ( ), getAlphabet ( ) ); + auto second = ext::tie ( other.getData ( ), other.getAlphabet ( ) ); + + static ext::compare < decltype ( first ) > comp; + + return comp ( first, second ); +} + +template < class SymbolType, size_t BitmaskBitCount > +BitSetIndex < SymbolType, BitmaskBitCount >::operator std::string ( ) const { + std::stringstream ss; + ss << * this; + return ss.str ( ); +} + +template < class SymbolType, size_t BitmaskBitCount > +object::ObjectBase* BitSetIndex < SymbolType, BitmaskBitCount >::inc() && { + return new object::UniqueObject(object::Object(std::move(*this)), primitive::Integer(0)); +} + +} /* namespace stringology */ + +} /* namespace indexes */ + +namespace core { + +template < class SymbolType, size_t BitmaskBitCount > +struct normalize < indexes::stringology::BitSetIndex < SymbolType, BitmaskBitCount > > { + static indexes::stringology::BitSetIndex < DefaultSymbolType, BitmaskBitCount > eval ( indexes::stringology::BitSetIndex < SymbolType, BitmaskBitCount > && value ) { + ext::map < DefaultSymbolType, ext::bitset < BitmaskBitCount > > vectors; + for ( std::pair < SymbolType, ext::bitset < BitmaskBitCount > > && vector : 
ext::make_moveable_map ( std::move ( value ).getData ( ) ) ) + vectors.insert ( std::make_pair ( alphabet::SymbolNormalize::normalizeSymbol ( std::move ( vector.first ) ), std::move ( vector.second ) ) ); + + string::LinearString < DefaultSymbolType > string = normalize < string::LinearString < SymbolType > >::eval ( std::move ( value ).getString ( ) ); + + return indexes::stringology::BitSetIndex < DefaultSymbolType, BitmaskBitCount > ( std::move ( vectors ), std::move ( string ) ); + } +}; + +template < class SymbolType, size_t BitmaskBitCount > +struct xmlApi < indexes::stringology::BitSetIndex < SymbolType, BitmaskBitCount > > { + static indexes::stringology::BitSetIndex < SymbolType, BitmaskBitCount > parse ( ext::deque < sax::Token >::iterator & input ); + static bool first ( const ext::deque < sax::Token >::const_iterator & input ); + static const std::string & xmlTagName ( ); + static void compose ( ext::deque < sax::Token > & output, const indexes::stringology::BitSetIndex < SymbolType, BitmaskBitCount > & data ); +}; + +template < class SymbolType, size_t BitmaskBitCount > +indexes::stringology::BitSetIndex < SymbolType, BitmaskBitCount > xmlApi < indexes::stringology::BitSetIndex < SymbolType, BitmaskBitCount > >::parse ( ext::deque < sax::Token >::iterator & input ) { + sax::FromXMLParserHelper::popToken ( input, sax::Token::TokenType::START_ELEMENT, xmlTagName ( ) ); + ext::map < SymbolType, ext::bitset < BitmaskBitCount > > data = core::xmlApi < ext::map < SymbolType, ext::bitset < BitmaskBitCount > > >::parse ( input ); + string::LinearString < SymbolType > string = core::xmlApi < string::LinearString < SymbolType > >::parse ( input ); + indexes::stringology::BitSetIndex < SymbolType, BitmaskBitCount > res ( std::move ( data ), std::move ( string ) ); + + sax::FromXMLParserHelper::popToken ( input, sax::Token::TokenType::END_ELEMENT, xmlTagName ( ) ); + return res; +} + +template < class SymbolType, size_t BitmaskBitCount > +bool xmlApi < 
indexes::stringology::BitSetIndex < SymbolType, BitmaskBitCount > >::first ( const ext::deque < sax::Token >::const_iterator & input ) { + return sax::FromXMLParserHelper::isToken ( input, sax::Token::TokenType::START_ELEMENT, xmlTagName ( ) ); +} + +template < class SymbolType, size_t BitmaskBitCount > +const std::string & xmlApi < indexes::stringology::BitSetIndex < SymbolType, BitmaskBitCount > >::xmlTagName ( ) { + static std::string xmlTagName = "BitSetIndex"; + + return xmlTagName; +} + +template < class SymbolType, size_t BitmaskBitCount > +void xmlApi < indexes::stringology::BitSetIndex < SymbolType, BitmaskBitCount > >::compose ( ext::deque < sax::Token > & output, const indexes::stringology::BitSetIndex < SymbolType, BitmaskBitCount > & index ) { + output.emplace_back ( xmlTagName ( ), sax::Token::TokenType::START_ELEMENT ); + core::xmlApi < ext::map < SymbolType, ext::bitset < BitmaskBitCount > > >::compose ( output, index.getData ( ) ); + core::xmlApi < string::LinearString < SymbolType > >::compose ( output, index.getString ( ) ); + output.emplace_back ( xmlTagName ( ), sax::Token::TokenType::END_ELEMENT ); +} + +} /* namespace core */ + +#endif /* BIT_SET_INDEX_H_ */ diff --git a/alib2data/src/indexes/stringology/CompressedBitParallelIndex.h b/alib2data/src/indexes/stringology/CompressedBitParallelIndex.h index ab4df18925..4e28c753c4 100644 --- a/alib2data/src/indexes/stringology/CompressedBitParallelIndex.h +++ b/alib2data/src/indexes/stringology/CompressedBitParallelIndex.h @@ -60,20 +60,20 @@ namespace stringology { class GeneralAlphabet; /** - * \brief Compressed bit parallel string index. Stores a bit vector for each symbol of the alphabet. The bit vector of symbol a contains true on index i if symbol a is on i-th position in the indexed string. The class does not check whether the bit vectors actually represent valid index. 
The bit vectors are compressed with run length encoding packing runs of false + * \brief Compressed bit parallel string index. Stores a bit vector for each symbol of the alphabet. The bit vector of symbol a contains true on index i if symbol a is on i-th position in the indexed string. The class does not check whether the bit vectors actually represent valid index. The bit vectors are compressed with run length encoding packing runs of false values. * * \tparam SymbolType type of symbols of indexed string */ template < class SymbolType = DefaultSymbolType > class CompressedBitParallelIndex final : public object::ObjectBase, public core::Components < CompressedBitParallelIndex < SymbolType >, ext::set < SymbolType >, component::Set, GeneralAlphabet > { /** - * Representation of compressed bit vectors. + * Representation of compressed bit vectors for each symbol of the alphabet. */ ext::map < SymbolType, common::SparseBoolVector > m_vectors; public: /** - * @copydoc SuffixTrieNode::clone ( ) const & + * @copydoc ObjectBase::clone ( ) const & */ virtual ObjectBase * clone ( ) const & override; @@ -138,7 +138,7 @@ public: void setCompressedBitVectorForSymbol ( SymbolType symbol, common::SparseBoolVector data ); /** - * Remover of a symbol from the alphabet of indexed string. + * Remover of a symbol from the alphabet. The symbol can be removed if it is not used in any of bit vector keys. * * \param symbol a symbol to remove. */ diff --git a/alib2data/src/indexes/stringology/FactorOracleAutomaton.h b/alib2data/src/indexes/stringology/FactorOracleAutomaton.h index c6ecbd8740..f55a910a34 100644 --- a/alib2data/src/indexes/stringology/FactorOracleAutomaton.h +++ b/alib2data/src/indexes/stringology/FactorOracleAutomaton.h @@ -1,6 +1,22 @@ /* * FactorOracleAutomaton.h * + * This file is part of Algorithms library toolkit. 
+ * Copyright (C) 2017 Jan Travnicek (jan.travnicek@fit.cvut.cz) + + * Algorithms library toolkit is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + + * Algorithms library toolkit is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + + * You should have received a copy of the GNU General Public License + * along with Algorithms library toolkit. If not, see <http://www.gnu.org/licenses/>. + * * Created on: Jan 8, 2017 * Author: Jan Travnicek */ @@ -34,74 +50,123 @@ namespace stringology { class GeneralAlphabet; /** - * Represents regular expression parsed from the XML. Regular expression is stored - * as a tree of RegExpElement. + * \brief Factor oracle automaton string index. Stores a deterministic finite automaton. The size of the automaton is exactly linear in the length of the indexed string. The automaton represents at least all factors of the indexed string. The class does not check whether the automaton represents a valid index. + * + * \tparam SymbolType type of symbols of indexed string + */ template < class SymbolType = DefaultSymbolType > class FactorOracleAutomaton final : public object::ObjectBase { -protected: + /** + * Representation of underlying automaton.
+ */ automaton::DFA < SymbolType, unsigned > m_automaton; public: /** - * @copydoc OracleTrieNode::clone ( ) const & + * @copydoc ObjectBase::clone ( ) const & */ - virtual ObjectBase * clone ( ) const &; + virtual ObjectBase * clone ( ) const & override; /** - * @copydoc OracleTrieNode::clone ( ) const & + * @copydoc ObjectBase::clone ( ) && */ - virtual ObjectBase * clone ( ) &&; + virtual ObjectBase * clone ( ) && override; + /** + * Creates a new instance of the index based on the raw factor oracle automaton. + * + * \param automaton the factor oracle automaton + */ explicit FactorOracleAutomaton ( automaton::DFA < SymbolType, unsigned > automaton ); /** - * @return Root node of the trie + * Getter of the raw factor oracle automaton + * + * @return the raw factor oracle automaton */ const automaton::DFA < SymbolType, unsigned > & getAutomaton ( ) const &; + /** + * Getter of the raw factor oracle automaton + * + * @return the raw factor oracle automaton + */ automaton::DFA < SymbolType, unsigned > && getAutomaton ( ) &&; + /** + * Getter of the alphabet of the indexed string. + * + * \returns the alphabet of the indexed string + */ const ext::set < SymbolType > & getAlphabet ( ) const & { return m_automaton.getInputAlphabet ( ); } + /** + * Getter of the alphabet of the indexed string. + * + * \returns the alphabet of the indexed string + */ ext::set < SymbolType > && getAlphabet ( ) && { return std::move ( m_automaton ).getInputAlphabet ( ); } /** - * Removes symbol from the alphabet of symbol available in the regular expression - * @param symbol removed symbol from the alphabet + * Remover of a symbol from the alphabet. The removal is delegated to the input alphabet of the underlying automaton. + * + * \param symbol a symbol to remove. */ bool removeSymbolFromAlphabet ( const SymbolType & symbol ) { return m_automaton.removeInputSymbol ( symbol ); } + /** + * Getter of the backbone length of the automaton.
The length is equal to the longest path through the automaton. + * \return the length of the backbone + */ unsigned getBackboneLength ( ) const { return m_automaton.getStates ( ).size ( ) - 1; } /** - * Prints XML representation of the tree to the output stream. - * @param out output stream to which print the tree - * @param tree tree to print + * @copydoc alib::CommonBase<ObjectBase>::compare ( const ObjectBase & ) */ - virtual void operator >>( std::ostream & out ) const; - - virtual int compare ( const ObjectBase & other ) const { + virtual int compare ( const ObjectBase & other ) const override { if ( ext::type_index ( typeid ( * this ) ) == ext::type_index ( typeid ( other ) ) ) return this->compare ( ( decltype ( * this ) )other ); return ext::type_index ( typeid ( * this ) ) - ext::type_index ( typeid ( other ) ); } - virtual int compare ( const FactorOracleAutomaton & other ) const; + /** + * The actual compare method. + * + * \param other the other instance + * + * \returns the actual relation between two index instances of the same type + */ + int compare ( const FactorOracleAutomaton & other ) const; + + /** + * @copydoc alib::CommonBase<ObjectBase>::operator >> ( std::ostream & ) + */ + virtual void operator >>( std::ostream & out ) const override; - virtual explicit operator std::string ( ) const; + /** + * @copydoc alib::CommonBase<ObjectBase>::operator std::string ( ) + */ + virtual explicit operator std::string ( ) const override; + /** + * Cast operator to the underlying automaton.
+ * + * \return the raw automaton + */ explicit operator automaton::DFA < SymbolType, unsigned > ( ) const; - virtual object::ObjectBase * inc ( ) &&; + /** + * @copydoc alib::ObjectBase::inc() + */ + virtual object::ObjectBase * inc ( ) && override; }; } /* namespace stringology */ @@ -174,6 +239,11 @@ object::ObjectBase* FactorOracleAutomaton < SymbolType >::inc() && { namespace core { +/** + * Helper for normalisation of types specified by templates used as internal datatypes of symbols. + * + * \returns new instance of the index with default template parameters or unmodified instance if the template parameters were already the default ones + */ template < class SymbolType > struct normalize < indexes::stringology::FactorOracleAutomaton < SymbolType > > { static indexes::stringology::FactorOracleAutomaton < > eval ( indexes::stringology::FactorOracleAutomaton < SymbolType > && value ) { -- GitLab