From 6d88134cc5cae4d03765f510705ef0f024c19f95 Mon Sep 17 00:00:00 2001
From: Tomas Capek <tomas@capek.io>
Date: Thu, 29 Mar 2018 22:48:13 +0200
Subject: [PATCH] Implement GeneralizedLevenshteinSequenceMatchingAutomaton.

---
 ...edLevenshteinSequenceMatchingAutomaton.cpp |  19 +++
 ...izedLevenshteinSequenceMatchingAutomaton.h |  69 +++++++++
 ...venshteinSequenceMatchingAutomatonTest.cpp | 136 ++++++++++++++++++
 ...LevenshteinSequenceMatchingAutomatonTest.h |  17 +++
 4 files changed, 241 insertions(+)
 create mode 100644 alib2algo/src/stringology/matching/GeneralizedLevenshteinSequenceMatchingAutomaton.cpp
 create mode 100644 alib2algo/src/stringology/matching/GeneralizedLevenshteinSequenceMatchingAutomaton.h
 create mode 100644 alib2algo/test-src/stringology/matching/GeneralizedLevenshteinSequenceMatchingAutomatonTest.cpp
 create mode 100644 alib2algo/test-src/stringology/matching/GeneralizedLevenshteinSequenceMatchingAutomatonTest.h

diff --git a/alib2algo/src/stringology/matching/GeneralizedLevenshteinSequenceMatchingAutomaton.cpp b/alib2algo/src/stringology/matching/GeneralizedLevenshteinSequenceMatchingAutomaton.cpp
new file mode 100644
index 0000000000..43ef43175f
--- /dev/null
+++ b/alib2algo/src/stringology/matching/GeneralizedLevenshteinSequenceMatchingAutomaton.cpp
@@ -0,0 +1,19 @@
+/*
+ * GeneralizedLevenshteinSequenceMatchingAutomaton.cpp
+ *
+ *  Created on: 29. 3. 2018
+ *      Author: Tomas Capek
+ */
+
+#include "GeneralizedLevenshteinSequenceMatchingAutomaton.h"
+#include <registration/AlgoRegistration.hpp>
+
+namespace stringology {
+
+namespace matching {
+// Registration object for construct: result type EpsilonNFA with pair-of-unsigned states, parameters ( const string::LinearString < > &, unsigned ).
+auto GeneralizedLevenshteinSequenceMatchingAutomatonLinearString = registration::AbstractRegister <GeneralizedLevenshteinSequenceMatchingAutomaton, automaton::EpsilonNFA < DefaultSymbolType, void, ext::pair<unsigned int, unsigned int> >, const string::LinearString < > &, unsigned > ( GeneralizedLevenshteinSequenceMatchingAutomaton::construct );
+
+} /* namespace matching */
+
+} /* namespace stringology */
diff --git a/alib2algo/src/stringology/matching/GeneralizedLevenshteinSequenceMatchingAutomaton.h b/alib2algo/src/stringology/matching/GeneralizedLevenshteinSequenceMatchingAutomaton.h
new file mode 100644
index 0000000000..88abe21632
--- /dev/null
+++ b/alib2algo/src/stringology/matching/GeneralizedLevenshteinSequenceMatchingAutomaton.h
@@ -0,0 +1,69 @@
+/*
+ * GeneralizedLevenshteinSequenceMatchingAutomaton.h
+ *
+ *  Created on: 29. 3. 2018
+ *      Author: Tomas Capek
+ */
+
+#ifndef _GENERALIZED_LEVENSHTEIN_SEQUENCE_MATCHING_AUTOMATON_H__
+#define _GENERALIZED_LEVENSHTEIN_SEQUENCE_MATCHING_AUTOMATON_H__
+
+#include <automaton/FSM/EpsilonNFA.h>
+#include <stringology/matching/GeneralizedLevenshteinMatchingAutomaton.h>
+#include <string/LinearString.h>
+
+
+namespace stringology {
+
+namespace matching {
+
+class GeneralizedLevenshteinSequenceMatchingAutomaton {
+public:
+	/** Creates the generalized Levenshtein automaton for approximate sequence matching of a pattern.
+	 * @param pattern linear string whose content and alphabet drive the construction
+	 * @param allowed_errors number of allowed errors, forwarded to GeneralizedLevenshteinMatchingAutomaton::construct
+	 * @return automaton for approximate sequence matching using the generalized Levenshtein method
+	 */
+	template < class SymbolType >
+	static automaton::EpsilonNFA < SymbolType, void, ext::pair<unsigned int, unsigned int> > construct(const string::LinearString < SymbolType > & pattern, unsigned int allowed_errors);
+};
+
+template < class SymbolType >
+automaton::EpsilonNFA < SymbolType, void, ext::pair<unsigned int, unsigned int> > GeneralizedLevenshteinSequenceMatchingAutomaton::construct(const string::LinearString < SymbolType > & pattern, unsigned int allowed_errors) {
+	// Start from the generalized Levenshtein matching automaton; only self-loops are added to it below.
+	auto sequence_automaton = stringology::matching::GeneralizedLevenshteinMatchingAutomaton::construct(pattern, allowed_errors);
+	const auto & pattern_content = pattern.getContent();
+	const auto & symbols = pattern.getAlphabet();
+
+	// State (depth, errors): self-loop on every alphabet symbol except the pattern symbol at index depth.
+	for (unsigned int errors = 0; errors < allowed_errors + 1; ++errors) {
+		for (unsigned int depth = errors; depth < pattern_content.size(); ++depth) {
+			auto state = ext::make_pair(depth, errors);
+			for (const SymbolType & letter : symbols) {
+				if (letter != pattern_content[depth]) {
+					sequence_automaton.addTransition(state, letter, state);
+				}
+			}
+		}
+	}
+
+	// Transposition state (size + 1 + depth, errors): the same self-loops, only for depths followed by another pattern symbol.
+	for (unsigned int errors = 0; errors < allowed_errors; ++errors) {
+		for (unsigned int depth = errors; depth + 1 < pattern_content.size(); ++depth) {
+			auto transposition = ext::make_pair(pattern_content.size() + 1 + depth, errors);
+			for (const SymbolType & letter : symbols) {
+				if (letter != pattern_content[depth]) {
+					sequence_automaton.addTransition(transposition, letter, transposition);
+				}
+			}
+		}
+	}
+	return sequence_automaton;
+}
+
+
+} /* namespace matching */
+
+} /* namespace stringology */
+
+#endif /* _GENERALIZED_LEVENSHTEIN_SEQUENCE_MATCHING_AUTOMATON_H__ */
diff --git a/alib2algo/test-src/stringology/matching/GeneralizedLevenshteinSequenceMatchingAutomatonTest.cpp b/alib2algo/test-src/stringology/matching/GeneralizedLevenshteinSequenceMatchingAutomatonTest.cpp
new file mode 100644
index 0000000000..d4d786c582
--- /dev/null
+++ b/alib2algo/test-src/stringology/matching/GeneralizedLevenshteinSequenceMatchingAutomatonTest.cpp
@@ -0,0 +1,136 @@
+#include <stringology/matching/GeneralizedLevenshteinSequenceMatchingAutomaton.h>
+#include <automaton/FSM/NFA.h>
+#include <string/LinearString.h>
+
+#include "GeneralizedLevenshteinSequenceMatchingAutomatonTest.h"
+
+
+CPPUNIT_TEST_SUITE_NAMED_REGISTRATION ( GeneralizedLevenshteinSequenceMatchingAutomatonTest, "stringology" ); // adds the suite to the registry named "stringology"
+CPPUNIT_TEST_SUITE_REGISTRATION ( GeneralizedLevenshteinSequenceMatchingAutomatonTest ); // adds the suite to the default (unnamed) registry
+
+void GeneralizedLevenshteinSequenceMatchingAutomatonTest::testSimpleConstruction() { // pattern "abc" over alphabet {a, b, c, d}, 2 allowed errors
+  ext::set<char> alphabet{'a', 'b', 'c', 'd'};
+  string::LinearString <char> input_string(alphabet, ext::vector<char>{'a', 'b', 'c'});
+  auto resulting_automata = stringology::matching::GeneralizedLevenshteinSequenceMatchingAutomaton::construct(input_string, 2);
+
+  typedef ext::pair<unsigned int, unsigned int> State;
+  // Expected automaton: built by hand below and compared with the constructed one at the end.
+  automaton::EpsilonNFA < char, void, State > test(ext::make_pair(0,0));
+  test.setInputAlphabet(ext::set<char>{'a', 'b', 'c', 'd'});
+
+  State q0 = ext::make_pair(0,0);
+  State q1 = ext::make_pair(1,0);
+  State q2 = ext::make_pair(2,0);
+  State q3 = ext::make_pair(3,0);
+  State q4 = ext::make_pair(1,1);
+  State q5 = ext::make_pair(2,1);
+  State q6 = ext::make_pair(3,1);
+  State q7 = ext::make_pair(2,2);
+  State q8 = ext::make_pair(3,2);
+  // r1..r3 are the intermediate states used by the transposition transitions below.
+  State r3 = ext::make_pair(5,1);
+  State r2 = ext::make_pair(5,0);
+  State r1 = ext::make_pair(4,0);
+
+  test.setStates(ext::set<State> {q0, q1, q2, q3, q4, q5, q6, q7, q8, r1, r2, r3});
+  test.setFinalStates(ext::set<State> {q3, q6, q8});
+
+  test.addTransition(q0, 'a', q1); // transitions reading the next pattern symbol (exact matching automaton)
+
+  test.addTransition(q1, 'b', q2);
+  test.addTransition(q4, 'b', q5);
+
+  test.addTransition(q2, 'c', q3);
+  test.addTransition(q5, 'c', q6);
+  test.addTransition(q7, 'c', q8);
+
+  test.addTransition(q0, 'a', q0); // loops in the initial state
+  test.addTransition(q0, 'b', q0);
+  test.addTransition(q0, 'c', q0);
+  test.addTransition(q0, 'd', q0);
+
+  test.addTransition(q0, 'b', q4); // diagonal transitions representing replacement
+  test.addTransition(q0, 'c', q4);
+  test.addTransition(q0, 'd', q4);
+
+  test.addTransition(q0, q4); // deletion (transition without an input symbol)
+
+  test.addTransition(q1, 'a', q5);
+  test.addTransition(q1, 'c', q5);
+  test.addTransition(q1, 'd', q5);
+  test.addTransition(q4, 'a', q7);
+  test.addTransition(q4, 'c', q7);
+  test.addTransition(q4, 'd', q7);
+
+  test.addTransition(q1, q5); // deletion
+  test.addTransition(q4, q7);
+
+  test.addTransition(q2, 'a', q6);
+  test.addTransition(q2, 'b', q6);
+  test.addTransition(q2, 'd', q6);
+  test.addTransition(q5, 'a', q8);
+  test.addTransition(q5, 'b', q8);
+  test.addTransition(q5, 'd', q8);
+
+  test.addTransition(q2, q6); // deletion
+  test.addTransition(q5, q8);
+
+  test.addTransition(q1, 'a', q4); // insertions
+  test.addTransition(q1, 'c', q4);
+  test.addTransition(q1, 'd', q4);
+
+  test.addTransition(q2, 'a', q5);
+  test.addTransition(q2, 'b', q5);
+  test.addTransition(q2, 'd', q5);
+
+  test.addTransition(q5, 'a', q7);
+  test.addTransition(q5, 'b', q7);
+  test.addTransition(q5, 'd', q7);
+
+  test.addTransition(q1, 'a', q1); // loops for sequence matching: every symbol except the next pattern symbol
+  test.addTransition(q1, 'c', q1);
+  test.addTransition(q1, 'd', q1);
+
+  test.addTransition(q2, 'a', q2);
+  test.addTransition(q2, 'b', q2);
+  test.addTransition(q2, 'd', q2);
+
+  test.addTransition(q4, 'a', q4);
+  test.addTransition(q4, 'c', q4);
+  test.addTransition(q4, 'd', q4);
+
+  test.addTransition(q5, 'a', q5);
+  test.addTransition(q5, 'b', q5);
+  test.addTransition(q5, 'd', q5);
+
+  test.addTransition(q7, 'a', q7);
+  test.addTransition(q7, 'b', q7);
+  test.addTransition(q7, 'd', q7);
+
+  test.addTransition(q0, 'b', r1); // transposition: two adjacent pattern symbols read in swapped order
+  test.addTransition(r1, 'a', q5);
+
+  test.addTransition(q1, 'c', r2);
+  test.addTransition(r2, 'b', q6);
+
+  test.addTransition(q4, 'c', r3);
+  test.addTransition(r3, 'b', q8);
+
+  test.addTransition(r1, 'b', r1); // loops in transposition states
+  test.addTransition(r1, 'c', r1);
+  test.addTransition(r1, 'd', r1);
+
+  test.addTransition(r2, 'a', r2);
+  test.addTransition(r2, 'c', r2);
+  test.addTransition(r2, 'd', r2);
+
+  test.addTransition(r3, 'a', r3);
+  test.addTransition(r3, 'c', r3);
+  test.addTransition(r3, 'd', r3);
+
+  CPPUNIT_ASSERT(resulting_automata == test); // the constructed automaton must equal the hand-built one
+}
+
+void GeneralizedLevenshteinSequenceMatchingAutomatonTest::setUp() { } // intentionally empty: the fixture declares no state to prepare
+
+void GeneralizedLevenshteinSequenceMatchingAutomatonTest::tearDown() { } // intentionally empty: the fixture declares no state to release
diff --git a/alib2algo/test-src/stringology/matching/GeneralizedLevenshteinSequenceMatchingAutomatonTest.h b/alib2algo/test-src/stringology/matching/GeneralizedLevenshteinSequenceMatchingAutomatonTest.h
new file mode 100644
index 0000000000..5e02ace748
--- /dev/null
+++ b/alib2algo/test-src/stringology/matching/GeneralizedLevenshteinSequenceMatchingAutomatonTest.h
@@ -0,0 +1,17 @@
+#ifndef GENERALIZED_LEVENSHTEIN_SEQUENCE_MATCHING_AUTOMATA_TEST_H_
+#define GENERALIZED_LEVENSHTEIN_SEQUENCE_MATCHING_AUTOMATA_TEST_H_
+
+#include <cppunit/extensions/HelperMacros.h>
+
+class GeneralizedLevenshteinSequenceMatchingAutomatonTest : public CppUnit::TestFixture {
+	CPPUNIT_TEST_SUITE(GeneralizedLevenshteinSequenceMatchingAutomatonTest);
+	CPPUNIT_TEST(testSimpleConstruction);
+	CPPUNIT_TEST_SUITE_END();
+
+public:
+	void setUp ( );
+	void tearDown ( );
+	// The only test case of this suite, listed in the CPPUNIT_TEST entry above.
+	void testSimpleConstruction();
+};
+#endif //GENERALIZED_LEVENSHTEIN_SEQUENCE_MATCHING_AUTOMATA_TEST_H_
-- 
GitLab