From 700f84166172412387e7273a29c924017ccfd78a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tom=C3=A1=C5=A1=20=C4=8Capek?= <tomas@capek.io> Date: Sun, 15 Apr 2018 10:48:20 +0200 Subject: [PATCH] Implement WildcardLinearString into LevenshteinMatchingAutomaton --- .../matching/LevenshteinMatchingAutomaton.h | 47 +++++++++++++- .../LevenshteinMatchingAutomatonTest.cpp | 61 +++++++++++++++++++ .../LevenshteinMatchingAutomatonTest.h | 6 +- 3 files changed, 111 insertions(+), 3 deletions(-) diff --git a/alib2algo/src/stringology/matching/LevenshteinMatchingAutomaton.h b/alib2algo/src/stringology/matching/LevenshteinMatchingAutomaton.h index 38f3add4a8..add20ca732 100644 --- a/alib2algo/src/stringology/matching/LevenshteinMatchingAutomaton.h +++ b/alib2algo/src/stringology/matching/LevenshteinMatchingAutomaton.h @@ -9,7 +9,9 @@ #define _LEVENSHTEIN_MATCHING_AUTOMATON_H__ #include <automaton/FSM/EpsilonNFA.h> +#include <automaton/simplify/UnreachableStatesRemover.h> #include <string/LinearString.h> +#include <string/WildcardLinearString.h> #include <stringology/matching/HammingMatchingAutomaton.h> @@ -20,12 +22,20 @@ namespace matching { class LevenshteinMatchingAutomaton { public: /** - * Creates Levenshtein matching automata. + * Creates Levenshtein matching automata from LinearString. 
* * @return automata for aproximate string matching using Hamming algorithm */ template < class SymbolType > static automaton::EpsilonNFA < SymbolType, void, ext::pair<unsigned int, unsigned int> > construct(const string::LinearString < SymbolType > & pattern, unsigned int allowed_errors); + + /** + * Creates a Levenshtein matching automaton from a WildcardLinearString pattern, built on top of the Hamming matching automaton for the same pattern. + * + * @return automaton for approximate string matching using Levenshtein distance, with unreachable states removed + */ + template < class SymbolType > + static automaton::EpsilonNFA < SymbolType, void, ext::pair<unsigned int, unsigned int> > construct(const string::WildcardLinearString < SymbolType > & pattern, unsigned int allowed_errors); }; @@ -62,6 +72,41 @@ automaton::EpsilonNFA < SymbolType, void, ext::pair<unsigned int, unsigned int> } +template < class SymbolType > +automaton::EpsilonNFA < SymbolType, void, ext::pair<unsigned int, unsigned int> > LevenshteinMatchingAutomaton::construct(const string::WildcardLinearString < SymbolType > & pattern, unsigned int allowed_errors) { + auto hamming_matching_automaton = stringology::matching::HammingMatchingAutomaton::construct_unclean(pattern, allowed_errors); + + automaton::EpsilonNFA < SymbolType, void, ext::pair<unsigned int, unsigned int> > result (hamming_matching_automaton); + + ext::set<SymbolType> alphabet_without_wildcard = pattern.getAlphabet(); + alphabet_without_wildcard.erase(pattern.getWildcardSymbol()); + + for (unsigned int j = 0; j<allowed_errors; j++) { + for (unsigned int i = j; i<pattern.getContent().size(); i++) { + auto from = ext::make_pair(i, j); + auto to = ext::make_pair(i + 1, j + 1); + + // add diagonal transition (i, j) -> (i + 1, j + 1) without an input symbol, representing deletion + result.addTransition(from, to); + + if (i == j) { /* NOTE(review): presumably state (i, j + 1) is not part of the automaton when i == j - confirm */ + continue; + } + + to = ext::make_pair(i, j + 1); + + for (const SymbolType& symbol : alphabet_without_wildcard) { + // add horizontal transition (i, j) -> (i, j + 1) representing insertion; the wildcard symbol is never used as a label + result.addTransition(from, symbol, to); + } + } + } + + return 
automaton::simplify::UnreachableStatesRemover::remove(result); +} + + + } /* namespace matching */ } /* namespace stringology */ diff --git a/alib2algo/test-src/stringology/matching/LevenshteinMatchingAutomatonTest.cpp b/alib2algo/test-src/stringology/matching/LevenshteinMatchingAutomatonTest.cpp index 4f9ab10ac9..301b53d142 100644 --- a/alib2algo/test-src/stringology/matching/LevenshteinMatchingAutomatonTest.cpp +++ b/alib2algo/test-src/stringology/matching/LevenshteinMatchingAutomatonTest.cpp @@ -87,6 +87,67 @@ void LevenshteinMatchingAutomatonTest::testSimpleConstruction() { CPPUNIT_ASSERT(resulting_automata == res); } +void LevenshteinMatchingAutomatonTest::testSimpleWildcardConstruction() { + ext::set<char> alphabet{'a', 'b', '@'}; + string::WildcardLinearString <char> input_string(alphabet, ext::vector<char>{'a', '@', 'b'}, '@'); + auto resulting_automata = stringology::matching::LevenshteinMatchingAutomaton::construct(input_string, 2); + + typedef ext::pair<unsigned int, unsigned int> State; + + automaton::EpsilonNFA < char, void, State > res(ext::make_pair(0,0)); + res.setInputAlphabet(alphabet); + + State q0 = ext::make_pair(0,0); + State q1 = ext::make_pair(1,0); + State q2 = ext::make_pair(2,0); + State q3 = ext::make_pair(3,0); + State q4 = ext::make_pair(1,1); + State q5 = ext::make_pair(2,1); + State q6 = ext::make_pair(3,1); + State q7 = ext::make_pair(2,2); + State q8 = ext::make_pair(3,2); + + res.setStates(ext::set<State> {q0, q1, q2, q3, q4, q5, q6, q7, q8}); + res.setFinalStates(ext::set<State> {q3, q6, q8}); + + res.addTransition(q0, 'a', q0); // initial loops + res.addTransition(q0, 'b', q0); + + res.addTransition(q0, 'a', q1); // 3 simple matching automata (the wildcard position is matched by both 'a' and 'b') + + res.addTransition(q1, 'a', q2); + res.addTransition(q1, 'b', q2); + res.addTransition(q4, 'a', q5); + res.addTransition(q4, 'b', q5); + + res.addTransition(q2, 'b', q3); + res.addTransition(q5, 'b', q6); + res.addTransition(q7, 'b', q8); + + res.addTransition(q0, 'b', q4); // error 
transitions for replace + + res.addTransition(q2, 'a', q6); + res.addTransition(q5, 'a', q8); + + res.addTransition(q0, q4); // delete transitions (no input symbol) + res.addTransition(q1, q5); + res.addTransition(q2, q6); + + res.addTransition(q4, q7); + res.addTransition(q5, q8); + + res.addTransition(q1, 'a', q4); // insert transitions + res.addTransition(q1, 'b', q4); + + res.addTransition(q2, 'a', q5); + res.addTransition(q2, 'b', q5); + + res.addTransition(q5, 'a', q7); + res.addTransition(q5, 'b', q7); + + CPPUNIT_ASSERT(resulting_automata == res); +} + void LevenshteinMatchingAutomatonTest::setUp() { } void LevenshteinMatchingAutomatonTest::tearDown() { } diff --git a/alib2algo/test-src/stringology/matching/LevenshteinMatchingAutomatonTest.h b/alib2algo/test-src/stringology/matching/LevenshteinMatchingAutomatonTest.h index ab650ce568..6ec3013fa8 100644 --- a/alib2algo/test-src/stringology/matching/LevenshteinMatchingAutomatonTest.h +++ b/alib2algo/test-src/stringology/matching/LevenshteinMatchingAutomatonTest.h @@ -5,13 +5,15 @@ class LevenshteinMatchingAutomatonTest : public CppUnit::TestFixture { CPPUNIT_TEST_SUITE(LevenshteinMatchingAutomatonTest); - CPPUNIT_TEST(testSimpleConstruction); - CPPUNIT_TEST_SUITE_END(); + CPPUNIT_TEST(testSimpleConstruction); + CPPUNIT_TEST(testSimpleWildcardConstruction); + CPPUNIT_TEST_SUITE_END(); public: void setUp ( ); void tearDown ( ); void testSimpleConstruction(); + void testSimpleWildcardConstruction(); }; #endif //HAMMING_MATCHING_AUTOMATA_TEST_H_ -- GitLab