From 4c2f457da61b4b017d8bda1571deddf99e2a286f Mon Sep 17 00:00:00 2001 From: Tomas Capek <tomas@capek.io> Date: Wed, 28 Mar 2018 19:14:36 +0200 Subject: [PATCH] Refactor LevenshteinMatchingAutomaton. It now uses ext::pair<unsigned, unsigned> as a state type. --- .../matching/LevenshteinMatchingAutomaton.cpp | 2 +- .../matching/LevenshteinMatchingAutomaton.h | 72 ++++------ .../LevenshteinMatchingAutomatonTest.cpp | 125 ++++++++++-------- 3 files changed, 95 insertions(+), 104 deletions(-) diff --git a/alib2algo/src/stringology/matching/LevenshteinMatchingAutomaton.cpp b/alib2algo/src/stringology/matching/LevenshteinMatchingAutomaton.cpp index 1f53ed3a59..f2ac367f9d 100644 --- a/alib2algo/src/stringology/matching/LevenshteinMatchingAutomaton.cpp +++ b/alib2algo/src/stringology/matching/LevenshteinMatchingAutomaton.cpp @@ -12,7 +12,7 @@ namespace stringology { namespace matching { -auto LevenshteinMatchingAutomatonLinearString = registration::AbstractRegister <LevenshteinMatchingAutomaton, automaton::EpsilonNFA < DefaultSymbolType, unsigned, unsigned>, const string::LinearString < > &, unsigned > ( LevenshteinMatchingAutomaton::construct ); +auto LevenshteinMatchingAutomatonLinearString = registration::AbstractRegister <LevenshteinMatchingAutomaton, automaton::EpsilonNFA < DefaultSymbolType, void, ext::pair<unsigned int, unsigned int>>, const string::LinearString < > &, unsigned > ( LevenshteinMatchingAutomaton::construct ); } /* namespace matching */ diff --git a/alib2algo/src/stringology/matching/LevenshteinMatchingAutomaton.h b/alib2algo/src/stringology/matching/LevenshteinMatchingAutomaton.h index 5cde7d4e19..c6dd1424f0 100644 --- a/alib2algo/src/stringology/matching/LevenshteinMatchingAutomaton.h +++ b/alib2algo/src/stringology/matching/LevenshteinMatchingAutomaton.h @@ -5,12 +5,13 @@ * Author: Tomas Capek */ -#ifndef _EXACT_MATCHING_AUTOMATON_H__ -#define _EXACT_MATCHING_AUTOMATON_H__ +#ifndef _LEVENSHTEIN_MATCHING_AUTOMATON_H__ +#define _LEVENSHTEIN_MATCHING_AUTOMATON_H__ #include <automaton/FSM/EpsilonNFA.h> -#include <automaton/simplify/UnreachableStatesRemover.h> #include <string/LinearString.h> +#include <stringology/matching/HammingMatchingAutomaton.h> + namespace stringology { @@ -24,63 +25,44 @@ public: * @return automata for aproximate string matching using Hamming algorithm */ template < class SymbolType > - static automaton::EpsilonNFA < SymbolType, unsigned, unsigned > construct(const string::LinearString < SymbolType > & pattern, unsigned int allowed_errors); + static automaton::EpsilonNFA < SymbolType, void, ext::pair<unsigned int, unsigned int> > construct(const string::LinearString < SymbolType > & pattern, unsigned int allowed_errors); }; template < class SymbolType > - automaton::EpsilonNFA < SymbolType, unsigned, unsigned > LevenshteinMatchingAutomaton::construct(const string::LinearString < SymbolType > & pattern, unsigned int allowed_errors) { - automaton::EpsilonNFA < SymbolType, unsigned, unsigned > res( 0 ); - res.setInputAlphabet(pattern.getAlphabet()); - - // add k+1 paralel automatas (sfoeco type = exact matching) (where k is allowed_errors) - unsigned current_state = 0; - for (unsigned i = 0; i <= allowed_errors; i++) { - if ( current_state > 0 ) { - ++current_state; - res.addState(current_state); - } + automaton::EpsilonNFA < SymbolType, void, ext::pair<unsigned int, unsigned int> > LevenshteinMatchingAutomaton::construct(const string::LinearString < SymbolType > & pattern, unsigned int allowed_errors) { + auto hamming_matching_automaton = stringology::matching::HammingMatchingAutomaton::construct(pattern, allowed_errors); - for(const SymbolType& symbol : pattern.getAlphabet()) { - res.addTransition( current_state, symbol, current_state); - } + automaton::EpsilonNFA < SymbolType, void, ext::pair<unsigned int, unsigned int> > result (hamming_matching_automaton); - for(const SymbolType& symbol : pattern.getContent()) { - ++current_state; - res.addState( current_state ); - res.addTransition( current_state - 1, symbol, current_state ); - } - res.addFinalState( current_state ); - } + for (unsigned int i=0; i<pattern.getContent().size(); i++) { + for (unsigned int j=0; j<allowed_errors; j++) { + if (i < j) { + continue; + } - for (unsigned int i = 0; i < pattern.getContent().size(); i++) { - for (unsigned int j = 0; j < allowed_errors; j++) { - unsigned int states_count = pattern.getContent().size() + 1; + auto from = ext::make_pair(i, j); + auto to = ext::make_pair(i + 1, j + 1); - unsigned int from_state = j*states_count + i; - unsigned int to_state = (j+1)*states_count + i + 1; + // add diagonal transition representing deletion + result.addTransition(from, to); - for ( const SymbolType& symbol : pattern.getAlphabet()) { - if (symbol != pattern.getContent()[i]) { - // add diagonal transition on mistake - res.addTransition(from_state, symbol, to_state); + if (i == j) { + continue; + } - if ( i > j && i - 1 < pattern.getContent().size() ) { - // condition limits following to upper triangle && non-final states + to = ext::make_pair(i, j + 1); - // add vertical transition representing insertion - res.addTransition(from_state, symbol, (j + 1) * states_count + i); - } + for (const SymbolType& symbol : pattern.getAlphabet()) { + if (symbol != pattern.getContent()[i]) { + // add horizontal transition representing insertion + result.addTransition(from, symbol, to); } } - - // add epsilon transition representing deletion - res.addTransition(from_state, to_state); } } - // remove all inaccessible states from starting state - return automaton::simplify::UnreachableStatesRemover::remove(res); + return result; } @@ -88,4 +70,4 @@ template < class SymbolType > } /* namespace stringology */ -#endif /* _HAMMING_MATCHING_AUTOMATON_H__ */ +#endif /* _LEVENSHTEIN_MATCHING_AUTOMATON_H__ */ diff --git a/alib2algo/test-src/stringology/matching/LevenshteinMatchingAutomatonTest.cpp b/alib2algo/test-src/stringology/matching/LevenshteinMatchingAutomatonTest.cpp index ce4c0fea04..4f9ab10ac9 100644 --- a/alib2algo/test-src/stringology/matching/LevenshteinMatchingAutomatonTest.cpp +++ b/alib2algo/test-src/stringology/matching/LevenshteinMatchingAutomatonTest.cpp @@ -14,66 +14,75 @@ void LevenshteinMatchingAutomatonTest::testSimpleConstruction() { string::LinearString <char> input_string(alphabet, ext::vector<char>{'a', 'b', 'c'}); auto resulting_automata = stringology::matching::LevenshteinMatchingAutomaton::construct(input_string, 2); - automaton::EpsilonNFA < char, unsigned, unsigned > res(0); - res.setInputAlphabet(ext::set<char>{'a', 'b', 'c', 'd'}); - res.setStates(ext::set<unsigned> {0, 1, 2, 3, 5, 6, 7, 10, 11}); - res.setFinalStates(ext::set<unsigned> {3, 7, 11}); - - res.addTransition(0, 'a', 1); // paralel exact matching automatas - - res.addTransition(1, 'b', 2); - res.addTransition(5, 'b', 6); - - res.addTransition(2, 'c', 3); - res.addTransition(6, 'c', 7); - res.addTransition(10, 'c', 11); - - res.addTransition(0, 'a', 0); // initial state's loops - res.addTransition(0, 'b', 0); - res.addTransition(0, 'c', 0); - res.addTransition(0, 'd', 0); - - res.addTransition(0, 'b', 5); // first mistake - res.addTransition(0, 'c', 5); - res.addTransition(0, 'd', 5); - - res.addTransition(0, 5); // deletion - - res.addTransition(1, 'a', 5); // insertion - res.addTransition(1, 'c', 5); - res.addTransition(1, 'd', 5); - - res.addTransition(1, 'a', 6); - res.addTransition(1, 'c', 6); - res.addTransition(1, 'd', 6); + typedef ext::pair<unsigned int, unsigned int> State; - res.addTransition(1, 6); // deletion - - res.addTransition(2, 'a', 6); // insertion - res.addTransition(2, 'b', 6); - res.addTransition(2, 'd', 6); - - res.addTransition(6, 'a', 10); // insertion - res.addTransition(6, 'b', 10); - res.addTransition(6, 'd', 10); - - res.addTransition(5, 'a', 10); - res.addTransition(5, 'c', 10); - res.addTransition(5, 'd', 10); - - res.addTransition(5, 10); // deletion - - res.addTransition(2, 'a', 7); - res.addTransition(2, 'b', 7); - res.addTransition(2, 'd', 7); - - res.addTransition(2, 7); // deletion - - res.addTransition(6, 'a', 11); - res.addTransition(6, 'b', 11); - res.addTransition(6, 'd', 11); + automaton::EpsilonNFA < char, void, State > res(ext::make_pair(0,0)); + res.setInputAlphabet(ext::set<char>{'a', 'b', 'c', 'd'}); - res.addTransition(6, 11); + State q0 = ext::make_pair(0,0); + State q1 = ext::make_pair(1,0); + State q2 = ext::make_pair(2,0); + State q3 = ext::make_pair(3,0); + State q4 = ext::make_pair(1,1); + State q5 = ext::make_pair(2,1); + State q6 = ext::make_pair(3,1); + State q7 = ext::make_pair(2,2); + State q8 = ext::make_pair(3,2); + + res.setStates(ext::set<State> {q0, q1, q2, q3, q4, q5, q6, q7, q8}); + res.setFinalStates(ext::set<State> {q3, q6, q8}); + + res.addTransition(q0, 'a', q1); // vertical transitions (exact matching automata) + + res.addTransition(q1, 'b', q2); + res.addTransition(q4, 'b', q5); + + res.addTransition(q2, 'c', q3); + res.addTransition(q5, 'c', q6); + res.addTransition(q7, 'c', q8); + + res.addTransition(q0, 'a', q0); // loops in initial state + res.addTransition(q0, 'b', q0); + res.addTransition(q0, 'c', q0); + res.addTransition(q0, 'd', q0); + + res.addTransition(q0, 'b', q4); // diagonal transitions representing replace + res.addTransition(q0, 'c', q4); + res.addTransition(q0, 'd', q4); + + res.addTransition(q0, q4); // deletion + + res.addTransition(q1, 'a', q5); + res.addTransition(q1, 'c', q5); + res.addTransition(q1, 'd', q5); + res.addTransition(q4, 'a', q7); + res.addTransition(q4, 'c', q7); + res.addTransition(q4, 'd', q7); + + res.addTransition(q1, q5); // deletion + res.addTransition(q4, q7); + + res.addTransition(q2, 'a', q6); + res.addTransition(q2, 'b', q6); + res.addTransition(q2, 'd', q6); + res.addTransition(q5, 'a', q8); + res.addTransition(q5, 'b', q8); + res.addTransition(q5, 'd', q8); + + res.addTransition(q2, q6); // deletion + res.addTransition(q5, q8); + + res.addTransition(q1, 'a', q4); // insertions + res.addTransition(q1, 'c', q4); + res.addTransition(q1, 'd', q4); + + res.addTransition(q2, 'a', q5); + res.addTransition(q2, 'b', q5); + res.addTransition(q2, 'd', q5); + + res.addTransition(q5, 'a', q7); + res.addTransition(q5, 'b', q7); + res.addTransition(q5, 'd', q7); CPPUNIT_ASSERT(resulting_automata == res); } -- GitLab