From a3972834707d39f9c25e2172c7af70f5e2703e24 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tom=C3=A1=C5=A1=20=C4=8Capek?= <tomas@capek.io> Date: Fri, 13 Apr 2018 18:57:54 +0200 Subject: [PATCH] Implement WildcardLinearString into HammingMatchingAutomaton. --- .../matching/HammingMatchingAutomaton.h | 89 ++++++++++++++++++- .../matching/HammingMatchingAutomatonTest.cpp | 47 +++++++++- .../matching/HammingMatchingAutomatonTest.h | 6 +- 3 files changed, 138 insertions(+), 4 deletions(-) diff --git a/alib2algo/src/stringology/matching/HammingMatchingAutomaton.h b/alib2algo/src/stringology/matching/HammingMatchingAutomaton.h index 495856e8e1..8ea6eabbdc 100644 --- a/alib2algo/src/stringology/matching/HammingMatchingAutomaton.h +++ b/alib2algo/src/stringology/matching/HammingMatchingAutomaton.h @@ -3,7 +3,7 @@ * * Created on: 12. 3. 2018 * Author: Tomas Capek - */ +*/ #ifndef _HAMMING_MATCHING_AUTOMATON_H__ #define _HAMMING_MATCHING_AUTOMATON_H__ @@ -11,6 +11,7 @@ #include <automaton/FSM/NFA.h> #include <automaton/simplify/UnreachableStatesRemover.h> #include <string/LinearString.h> +#include <string/WildcardLinearString.h> namespace stringology { @@ -25,6 +26,22 @@ public: */ template < class SymbolType > static automaton::NFA < SymbolType, ext::pair<unsigned int, unsigned int> > construct(const string::LinearString < SymbolType > & pattern, unsigned int allowed_errors); + + /** + * Creates a Hamming matching automaton. + * + * @return automaton for approximate string matching using Hamming algorithm + */ + template < class SymbolType > + static automaton::NFA < SymbolType, ext::pair<unsigned int, unsigned int> > construct(const string::WildcardLinearString < SymbolType > & pattern, unsigned int allowed_errors); + + /** + * Creates a Hamming matching automaton and won't remove useless states. + * + * @return automaton for approximate string matching using Hamming algorithm. This automaton won't have useless states removed. 
+ */ + template < class SymbolType > + static automaton::NFA < SymbolType, ext::pair<unsigned int, unsigned int> > construct_unclean(const string::WildcardLinearString < SymbolType > & pattern, unsigned int allowed_errors); }; template < class SymbolType > @@ -76,6 +93,76 @@ automaton::NFA < SymbolType, ext::pair<unsigned int, unsigned int> > HammingMatc } +template < class SymbolType > +automaton::NFA < SymbolType, ext::pair<unsigned int, unsigned int> > HammingMatchingAutomaton::construct_unclean(const string::WildcardLinearString < SymbolType > & pattern, unsigned int allowed_errors) { + automaton::NFA < SymbolType, ext::pair<unsigned int, unsigned int > > result( ext::make_pair(0, 0) ); + result.setInputAlphabet(pattern.getAlphabet()); + + SymbolType wildcard = pattern.getWildcardSymbol(); + ext::set<SymbolType> alphabet_without_wildcard = pattern.getAlphabet(); + alphabet_without_wildcard.erase(wildcard); + + // add k+1 parallel automata (sfoeco type = exact matching) (where k is allowed_errors) + for (unsigned int i = 0; i<pattern.getContent().size() + 1; i++) { + for (unsigned int j = 0; j<allowed_errors + 1; j++) { + result.addState(ext::make_pair(i, j)); + if (i == pattern.getContent().size()) { + result.addFinalState(ext::make_pair(i, j)); + } + } + } + + for (unsigned int i = 0; i<allowed_errors + 1; i++) { + for (const SymbolType& symbol : alphabet_without_wildcard) { + auto initial_state = ext::make_pair(0, i); + + result.addTransition(initial_state, symbol, initial_state); + } + } + + for (unsigned int i = 0; i<pattern.getContent().size(); i++) { + for (unsigned int j = 0; j < allowed_errors + 1; j++) { + auto from = ext::make_pair(i, j); + auto to = ext::make_pair(i+1, j); + if (pattern.getContent()[i] == pattern.getWildcardSymbol()) { + for (const SymbolType& symbol : alphabet_without_wildcard) { + result.addTransition(from, symbol, to); + } + } else { + result.addTransition(from, pattern.getContent()[i], to); + } + } + } + + // add diagonal 
addTransition + for (unsigned int i = 0; i<pattern.getContent().size(); i++) { + for (unsigned int j = 0; j<allowed_errors; j++) { + auto from = ext::make_pair(i, j); + auto to = ext::make_pair(i + 1, j + 1); + + if (pattern.getContent()[i] == wildcard) { + continue; + } + + for ( const SymbolType & symbol : alphabet_without_wildcard ) { + if (symbol != pattern.getContent()[i]) { + result.addTransition(from, symbol, to); + } + } + } + } + + return result; +} + +template < class SymbolType > +automaton::NFA < SymbolType, ext::pair<unsigned int, unsigned int> > HammingMatchingAutomaton::construct(const string::WildcardLinearString < SymbolType > & pattern, unsigned int allowed_errors) { + auto result = HammingMatchingAutomaton::construct_unclean(pattern, allowed_errors); + + // remove all unreachable states from the automaton + return automaton::simplify::UnreachableStatesRemover::remove(result); +} + } /* namespace matching */ } /* namespace stringology */ diff --git a/alib2algo/test-src/stringology/matching/HammingMatchingAutomatonTest.cpp b/alib2algo/test-src/stringology/matching/HammingMatchingAutomatonTest.cpp index fa005f1846..0b25ab87d4 100644 --- a/alib2algo/test-src/stringology/matching/HammingMatchingAutomatonTest.cpp +++ b/alib2algo/test-src/stringology/matching/HammingMatchingAutomatonTest.cpp @@ -1,11 +1,11 @@ #include <stringology/matching/HammingMatchingAutomaton.h> #include <automaton/FSM/NFA.h> #include <string/LinearString.h> +#include <string/WildcardLinearString.h> #include "HammingMatchingAutomatonTest.h" - CPPUNIT_TEST_SUITE_NAMED_REGISTRATION ( HammingMatchingAutomatonTest, "stringology" ); CPPUNIT_TEST_SUITE_REGISTRATION ( HammingMatchingAutomatonTest ); @@ -67,6 +67,51 @@ void HammingMatchingAutomatonTest::testSimpleConstruction() { CPPUNIT_ASSERT(resulting_automata == res); } + +void HammingMatchingAutomatonTest::testSimpleWildcardConstruction() { + ext::set<char> alphabet{'a', 'b', '@'}; + string::WildcardLinearString <char> 
input_string(alphabet, 
ext::vector<char>{'a', '@', 'b'}, '@'); + auto resulting_automata = stringology::matching::HammingMatchingAutomaton::construct(input_string, 2); + + typedef ext::pair<unsigned int, unsigned int> State; + + automaton::NFA < char, State > res(ext::make_pair(0,0)); + res.setInputAlphabet(alphabet); + + State q0 = ext::make_pair(0,0); + State q1 = ext::make_pair(1,0); + State q2 = ext::make_pair(2,0); + State q3 = ext::make_pair(3,0); + State q4 = ext::make_pair(1,1); + State q5 = ext::make_pair(2,1); + State q6 = ext::make_pair(3,1); + State q7 = ext::make_pair(3,2); + + res.setStates(ext::set<State> {q0, q1, q2, q3, q4, q5, q6, q7}); + res.setFinalStates(ext::set<State> {q3, q6, q7}); + + res.addTransition(q0, 'a', q0); // initial loops + res.addTransition(q0, 'b', q0); + + res.addTransition(q0, 'a', q1); // 3 simple matching automata (third is not connected) + + res.addTransition(q1, 'a', q2); + res.addTransition(q1, 'b', q2); + res.addTransition(q4, 'a', q5); + res.addTransition(q4, 'b', q5); + + res.addTransition(q2, 'b', q3); + res.addTransition(q5, 'b', q6); + + res.addTransition(q0, 'b', q4); // error transitions + + res.addTransition(q2, 'a', q6); + res.addTransition(q5, 'a', q7); + + CPPUNIT_ASSERT(resulting_automata == res); +} + + void HammingMatchingAutomatonTest::setUp() { } void HammingMatchingAutomatonTest::tearDown() { } diff --git a/alib2algo/test-src/stringology/matching/HammingMatchingAutomatonTest.h b/alib2algo/test-src/stringology/matching/HammingMatchingAutomatonTest.h index 6471d078bb..caff8e0d7d 100644 --- a/alib2algo/test-src/stringology/matching/HammingMatchingAutomatonTest.h +++ b/alib2algo/test-src/stringology/matching/HammingMatchingAutomatonTest.h @@ -5,13 +5,15 @@ class HammingMatchingAutomatonTest : public CppUnit::TestFixture { CPPUNIT_TEST_SUITE(HammingMatchingAutomatonTest); - CPPUNIT_TEST(testSimpleConstruction); - CPPUNIT_TEST_SUITE_END(); + CPPUNIT_TEST(testSimpleConstruction); + CPPUNIT_TEST(testSimpleWildcardConstruction); + 
CPPUNIT_TEST_SUITE_END(); public: void setUp ( ); void tearDown ( ); void testSimpleConstruction(); + void testSimpleWildcardConstruction(); }; #endif //HAMMING_MATCHING_AUTOMATA_TEST_H_ -- GitLab