From 6d88134cc5cae4d03765f510705ef0f024c19f95 Mon Sep 17 00:00:00 2001
From: Tomas Capek <tomas@capek.io>
Date: Thu, 29 Mar 2018 22:48:13 +0200
Subject: [PATCH] Implement GeneralizedLevenshteinSequenceMatchingAutomaton.

---
 ...edLevenshteinSequenceMatchingAutomaton.cpp |  20 +++
 ...izedLevenshteinSequenceMatchingAutomaton.h |  78 ++++++++++
 ...venshteinSequenceMatchingAutomatonTest.cpp | 136 ++++++++++++++++++
 ...LevenshteinSequenceMatchingAutomatonTest.h |  17 +++
 4 files changed, 251 insertions(+)
 create mode 100644 alib2algo/src/stringology/matching/GeneralizedLevenshteinSequenceMatchingAutomaton.cpp
 create mode 100644 alib2algo/src/stringology/matching/GeneralizedLevenshteinSequenceMatchingAutomaton.h
 create mode 100644 alib2algo/test-src/stringology/matching/GeneralizedLevenshteinSequenceMatchingAutomatonTest.cpp
 create mode 100644 alib2algo/test-src/stringology/matching/GeneralizedLevenshteinSequenceMatchingAutomatonTest.h

diff --git a/alib2algo/src/stringology/matching/GeneralizedLevenshteinSequenceMatchingAutomaton.cpp b/alib2algo/src/stringology/matching/GeneralizedLevenshteinSequenceMatchingAutomaton.cpp
new file mode 100644
index 0000000000..43ef43175f
--- /dev/null
+++ b/alib2algo/src/stringology/matching/GeneralizedLevenshteinSequenceMatchingAutomaton.cpp
@@ -0,0 +1,20 @@
+/*
+ * GeneralizedLevenshteinSequenceMatchingAutomaton.cpp
+ *
+ *  Created on: 29. 3. 2018
+ *      Author: Tomas Capek
+ */
+
+#include "GeneralizedLevenshteinSequenceMatchingAutomaton.h"
+#include <registration/AlgoRegistration.hpp>
+
+namespace stringology {
+
+namespace matching {
+
+// Registers GeneralizedLevenshteinSequenceMatchingAutomaton::construct for string::LinearString < > patterns via registration::AbstractRegister.
+auto GeneralizedLevenshteinSequenceMatchingAutomatonLinearString = registration::AbstractRegister <GeneralizedLevenshteinSequenceMatchingAutomaton, automaton::EpsilonNFA < DefaultSymbolType, void, ext::pair<unsigned int, unsigned int> >, const string::LinearString < > &, unsigned > ( GeneralizedLevenshteinSequenceMatchingAutomaton::construct );
+
+} /* namespace matching */
+
+} /* namespace stringology */
diff --git a/alib2algo/src/stringology/matching/GeneralizedLevenshteinSequenceMatchingAutomaton.h b/alib2algo/src/stringology/matching/GeneralizedLevenshteinSequenceMatchingAutomaton.h
new file mode 100644
index 0000000000..88abe21632
--- /dev/null
+++ b/alib2algo/src/stringology/matching/GeneralizedLevenshteinSequenceMatchingAutomaton.h
@@ -0,0 +1,78 @@
+/*
+ * GeneralizedLevenshteinSequenceMatchingAutomaton.h
+ *
+ *  Created on: 29. 3. 2018
+ *      Author: Tomas Capek
+ */
+
+#ifndef GENERALIZED_LEVENSHTEIN_SEQUENCE_MATCHING_AUTOMATON_H_
+#define GENERALIZED_LEVENSHTEIN_SEQUENCE_MATCHING_AUTOMATON_H_
+
+#include <automaton/FSM/EpsilonNFA.h>
+#include <stringology/matching/GeneralizedLevenshteinMatchingAutomaton.h>
+#include <string/LinearString.h>
+
+
+namespace stringology {
+
+namespace matching {
+
+class GeneralizedLevenshteinSequenceMatchingAutomaton {
+public:
+	/**
+	 * Creates a generalized Levenshtein matching automaton for approximate sequence matching.
+	 *
+	 * Takes the automaton built by GeneralizedLevenshteinMatchingAutomaton::construct and adds self
+	 * loops to it: each state ( i, j ) with j <= allowed_errors and j <= i < |pattern|, and each
+	 * transposition state ( |pattern| + 1 + i, j ) with j < allowed_errors and j <= i < |pattern| - 1,
+	 * gets a loop for every alphabet symbol that differs from the i-th (0-based) pattern symbol.
+	 *
+	 * @param pattern pattern to match; its alphabet supplies the symbols of the added loops
+	 * @param allowed_errors error bound forwarded to the underlying automaton construction
+	 *
+	 * @return automaton for approximate sequence matching using the generalized Levenshtein method.
+	 */
+	template < class SymbolType >
+	static automaton::EpsilonNFA < SymbolType, void, ext::pair<unsigned int, unsigned int> > construct(const string::LinearString < SymbolType > & pattern, unsigned int allowed_errors);
+};
+
+template < class SymbolType >
+automaton::EpsilonNFA < SymbolType, void, ext::pair<unsigned int, unsigned int> > GeneralizedLevenshteinSequenceMatchingAutomaton::construct(const string::LinearString < SymbolType > & pattern, unsigned int allowed_errors) {
+	auto result = stringology::matching::GeneralizedLevenshteinMatchingAutomaton::construct(pattern, allowed_errors);
+
+	const auto & content = pattern.getContent();
+	const auto & alphabet = pattern.getAlphabet();
+	const auto pattern_length = content.size();
+
+	// Adds a self loop on the given state for every alphabet symbol except the pattern symbol at the given position.
+	auto add_loops = [&](const ext::pair<unsigned int, unsigned int> & state, unsigned int position) {
+		for (const SymbolType & symbol : alphabet) {
+			if (symbol != content[position]) {
+				result.addTransition(state, symbol, state);
+			}
+		}
+	};
+
+	// Loops in the states ( i, j ).
+	for (unsigned int j = 0; j < allowed_errors + 1; j++) {
+		for (unsigned int i = j; i < pattern_length; i++) {
+			add_loops(ext::make_pair(i, j), i);
+		}
+	}
+
+	// Loops in the transposition states; the index is cast explicitly because state components are unsigned int.
+	for (unsigned int j = 0; j < allowed_errors; j++) {
+		for (unsigned int i = j; i + 1 < pattern_length; i++) {
+			add_loops(ext::make_pair(static_cast<unsigned int>(pattern_length + 1 + i), j), i);
+		}
+	}
+
+	return result;
+}
+
+
+} /* namespace matching */
+
+} /* namespace stringology */
+
+#endif /* GENERALIZED_LEVENSHTEIN_SEQUENCE_MATCHING_AUTOMATON_H_ */
diff --git a/alib2algo/test-src/stringology/matching/GeneralizedLevenshteinSequenceMatchingAutomatonTest.cpp b/alib2algo/test-src/stringology/matching/GeneralizedLevenshteinSequenceMatchingAutomatonTest.cpp
new file mode 100644
index 0000000000..d4d786c582
--- /dev/null
+++ b/alib2algo/test-src/stringology/matching/GeneralizedLevenshteinSequenceMatchingAutomatonTest.cpp
@@ -0,0 +1,136 @@
+#include <stringology/matching/GeneralizedLevenshteinSequenceMatchingAutomaton.h>
+#include <automaton/FSM/NFA.h>
+#include <string/LinearString.h>
+
+#include "GeneralizedLevenshteinSequenceMatchingAutomatonTest.h"
+
+
+CPPUNIT_TEST_SUITE_NAMED_REGISTRATION ( GeneralizedLevenshteinSequenceMatchingAutomatonTest, "stringology" );
+CPPUNIT_TEST_SUITE_REGISTRATION ( GeneralizedLevenshteinSequenceMatchingAutomatonTest );
+
+void GeneralizedLevenshteinSequenceMatchingAutomatonTest::testSimpleConstruction() {
+	ext::set<char> alphabet{'a', 'b', 'c', 'd'};
+	string::LinearString <char> input_string(alphabet, ext::vector<char>{'a', 'b', 'c'});
+	auto resulting_automata = stringology::matching::GeneralizedLevenshteinSequenceMatchingAutomaton::construct(input_string, 2);
+
+	using State = ext::pair<unsigned int, unsigned int>;
+
+	automaton::EpsilonNFA < char, void, State > test(ext::make_pair(0,0));
+	test.setInputAlphabet(ext::set<char>{'a', 'b', 'c', 'd'});
+
+	State q0 = ext::make_pair(0,0);
+	State q1 = ext::make_pair(1,0);
+	State q2 = ext::make_pair(2,0);
+	State q3 = ext::make_pair(3,0);
+	State q4 = ext::make_pair(1,1);
+	State q5 = ext::make_pair(2,1);
+	State q6 = ext::make_pair(3,1);
+	State q7 = ext::make_pair(2,2);
+	State q8 = ext::make_pair(3,2);
+
+	State r3 = ext::make_pair(5,1);
+	State r2 = ext::make_pair(5,0);
+	State r1 = ext::make_pair(4,0);
+
+	test.setStates(ext::set<State> {q0, q1, q2, q3, q4, q5, q6, q7, q8, r1, r2, r3});
+	test.setFinalStates(ext::set<State> {q3, q6, q8});
+
+	test.addTransition(q0, 'a', q1); // vertical transitions (exact matching automata)
+
+	test.addTransition(q1, 'b', q2);
+	test.addTransition(q4, 'b', q5);
+
+	test.addTransition(q2, 'c', q3);
+	test.addTransition(q5, 'c', q6);
+	test.addTransition(q7, 'c', q8);
+
+	test.addTransition(q0, 'a', q0); // loops in initial state
+	test.addTransition(q0, 'b', q0);
+	test.addTransition(q0, 'c', q0);
+	test.addTransition(q0, 'd', q0);
+
+	test.addTransition(q0, 'b', q4); // diagonal transitions representing replace
+	test.addTransition(q0, 'c', q4);
+	test.addTransition(q0, 'd', q4);
+
+	test.addTransition(q0, q4); // deletion
+
+	test.addTransition(q1, 'a', q5);
+	test.addTransition(q1, 'c', q5);
+	test.addTransition(q1, 'd', q5);
+	test.addTransition(q4, 'a', q7);
+	test.addTransition(q4, 'c', q7);
+	test.addTransition(q4, 'd', q7);
+
+	test.addTransition(q1, q5); // deletion
+	test.addTransition(q4, q7);
+
+	test.addTransition(q2, 'a', q6);
+	test.addTransition(q2, 'b', q6);
+	test.addTransition(q2, 'd', q6);
+	test.addTransition(q5, 'a', q8);
+	test.addTransition(q5, 'b', q8);
+	test.addTransition(q5, 'd', q8);
+
+	test.addTransition(q2, q6); // deletion
+	test.addTransition(q5, q8);
+
+	test.addTransition(q1, 'a', q4); // insertions
+	test.addTransition(q1, 'c', q4);
+	test.addTransition(q1, 'd', q4);
+
+	test.addTransition(q2, 'a', q5);
+	test.addTransition(q2, 'b', q5);
+	test.addTransition(q2, 'd', q5);
+
+	test.addTransition(q5, 'a', q7);
+	test.addTransition(q5, 'b', q7);
+	test.addTransition(q5, 'd', q7);
+
+	test.addTransition(q1, 'a', q1); // loops for sequence matching
+	test.addTransition(q1, 'c', q1);
+	test.addTransition(q1, 'd', q1);
+
+	test.addTransition(q2, 'a', q2);
+	test.addTransition(q2, 'b', q2);
+	test.addTransition(q2, 'd', q2);
+
+	test.addTransition(q4, 'a', q4);
+	test.addTransition(q4, 'c', q4);
+	test.addTransition(q4, 'd', q4);
+
+	test.addTransition(q5, 'a', q5);
+	test.addTransition(q5, 'b', q5);
+	test.addTransition(q5, 'd', q5);
+
+	test.addTransition(q7, 'a', q7);
+	test.addTransition(q7, 'b', q7);
+	test.addTransition(q7, 'd', q7);
+
+	test.addTransition(q0, 'b', r1); // transposition
+	test.addTransition(r1, 'a', q5);
+
+	test.addTransition(q1, 'c', r2);
+	test.addTransition(r2, 'b', q6);
+
+	test.addTransition(q4, 'c', r3);
+	test.addTransition(r3, 'b', q8);
+
+	test.addTransition(r1, 'b', r1); // loops in transposition states
+	test.addTransition(r1, 'c', r1);
+	test.addTransition(r1, 'd', r1);
+
+	test.addTransition(r2, 'a', r2);
+	test.addTransition(r2, 'c', r2);
+	test.addTransition(r2, 'd', r2);
+
+	test.addTransition(r3, 'a', r3);
+	test.addTransition(r3, 'c', r3);
+	test.addTransition(r3, 'd', r3);
+
+	CPPUNIT_ASSERT(resulting_automata == test);
+}
+
+void GeneralizedLevenshteinSequenceMatchingAutomatonTest::setUp() { }
+
+void GeneralizedLevenshteinSequenceMatchingAutomatonTest::tearDown() { }
diff --git a/alib2algo/test-src/stringology/matching/GeneralizedLevenshteinSequenceMatchingAutomatonTest.h b/alib2algo/test-src/stringology/matching/GeneralizedLevenshteinSequenceMatchingAutomatonTest.h
new file mode 100644
index 0000000000..5e02ace748
--- /dev/null
+++ b/alib2algo/test-src/stringology/matching/GeneralizedLevenshteinSequenceMatchingAutomatonTest.h
@@ -0,0 +1,17 @@
+#ifndef GENERALIZED_LEVENSHTEIN_SEQUENCE_MATCHING_AUTOMATA_TEST_H_
+#define GENERALIZED_LEVENSHTEIN_SEQUENCE_MATCHING_AUTOMATA_TEST_H_
+
+#include <cppunit/extensions/HelperMacros.h>
+
+class GeneralizedLevenshteinSequenceMatchingAutomatonTest : public CppUnit::TestFixture {
+	CPPUNIT_TEST_SUITE(GeneralizedLevenshteinSequenceMatchingAutomatonTest);
+	CPPUNIT_TEST(testSimpleConstruction);
+	CPPUNIT_TEST_SUITE_END();
+
+public:
+	void setUp ( );
+	void tearDown ( );
+
+	void testSimpleConstruction();
+};
+#endif //GENERALIZED_LEVENSHTEIN_SEQUENCE_MATCHING_AUTOMATA_TEST_H_
-- 
GitLab