From b13ebefe3dd80dd9ff0b44a1bad9b657ca809a82 Mon Sep 17 00:00:00 2001 From: Tomas Capek <tomas@capek.io> Date: Thu, 3 May 2018 16:50:25 +0200 Subject: [PATCH] Implement simulation for approximate string matching using dynamic programming and General Levenshtein distance. --- ...GeneralizedLevenshteinDynamicProgramming.h | 89 +++++++++++++++++++ ...lizedLevenshteinDynamicProgrammingTest.cpp | 19 +++- ...ralizedLevenshteinDynamicProgrammingTest.h | 2 + 3 files changed, 108 insertions(+), 2 deletions(-) create mode 100644 alib2algo/src/stringology/simulations/GeneralizedLevenshteinDynamicProgramming.h diff --git a/alib2algo/src/stringology/simulations/GeneralizedLevenshteinDynamicProgramming.h b/alib2algo/src/stringology/simulations/GeneralizedLevenshteinDynamicProgramming.h new file mode 100644 index 0000000000..e4867a2ee0 --- /dev/null +++ b/alib2algo/src/stringology/simulations/GeneralizedLevenshteinDynamicProgramming.h @@ -0,0 +1,89 @@ +/* + * GeneralizedLevenshteinDynamicProgramming.h + * + * Created on: 1.5.2018 + * Author: Tomas Capek + */ + +#ifndef _GENERALIZED_LEVENSHTEIN_DYNAMIC_PROGRAMMING_H__ +#define _GENERALIZED_LEVENSHTEIN_DYNAMIC_PROGRAMMING_H__ + +#include <algorithm> +#include <limits.h> + +#include <string/LinearString.h> + +namespace stringology { + +namespace simulations { + +class GeneralizedLevenshteinDynamicProgramming { +public: + template <class SymbolType> + static ext::vector<ext::vector<unsigned int>> compute_table(const string::LinearString<SymbolType> & text, const string::LinearString<SymbolType> & pattern); + + template <class SymbolType> + static ext::set<unsigned int> search(const string::LinearString<SymbolType> & text, const string::LinearString<SymbolType> & pattern, unsigned int errors); +}; + +template <class SymbolType> +ext::vector<ext::vector<unsigned int>> GeneralizedLevenshteinDynamicProgramming::compute_table(const string::LinearString<SymbolType> & text, const string::LinearString<SymbolType> & pattern) { + ext::vector< ext::vector 
<unsigned int> > table = + ext::vector<ext::vector<unsigned int> > ( + pattern.getContent().size() + 1, + ext::vector<unsigned int>(text.getContent().size() + 1, 0) + ); + + for(unsigned int j = 0; j <= pattern.getContent().size(); j++) { + table[j][0] = j; + } + + for(unsigned int i = 1; i<=text.getContent().size(); i++) { + for(unsigned int j = 1; j<=pattern.getContent().size(); j++) { + unsigned int value_a; + if(pattern.getContent()[j-1] == text.getContent()[i-1]) { + value_a = table[j-1][i-1]; + } else { + value_a = table[j-1][i-1] + 1; + } + + unsigned int value_b = UINT_MAX; + if(j < pattern.getContent().size()) { + value_b = table[j][i-1] + 1; + } + + value_b = std::min(table[j-1][i] + 1, value_b); + + unsigned int value_c = UINT_MAX; + if(j>1 && i>1 && pattern.getContent()[j-2] == text.getContent()[i-1] && pattern.getContent()[j-1] == text.getContent()[i-2]) { + value_c = table[j-2][i-2] + 1; + } + + table[j][i] = std::min({value_a, value_b, value_c}); + } + } + + return table; +} + +template <class SymbolType> +ext::set<unsigned int> GeneralizedLevenshteinDynamicProgramming::search(const string::LinearString<SymbolType> & text, const string::LinearString<SymbolType> & pattern, unsigned int errors) { + auto table = GeneralizedLevenshteinDynamicProgramming::compute_table(text, pattern); + + ext::set<unsigned int> result; + + for(unsigned int i = 1; i<= text.getContent().size(); i++) { // skip column 0: it has no end position and i-1 would wrap to UINT_MAX + if(table[pattern.getContent().size()][i] <= errors) { + result.insert(i-1); + } + } + + return result; +} + + +} // namespace simulations + +} // namespace stringology + +#endif /* _GENERALIZED_LEVENSHTEIN_DYNAMIC_PROGRAMMING_H__ */ diff --git a/alib2algo/test-src/stringology/simulations/GeneralizedLevenshteinDynamicProgrammingTest.cpp b/alib2algo/test-src/stringology/simulations/GeneralizedLevenshteinDynamicProgrammingTest.cpp index 229f053d3c..142e32686f 100644 --- a/alib2algo/test-src/stringology/simulations/GeneralizedLevenshteinDynamicProgrammingTest.cpp +++ 
b/alib2algo/test-src/stringology/simulations/GeneralizedLevenshteinDynamicProgrammingTest.cpp @@ -11,12 +11,27 @@ void GeneralizedLevenshteinDynamicProgrammingTest::testTableConstruction() { auto pattern = string::LinearString<>("adbbca"); ext::vector<ext::vector<unsigned int>> expected_result = { - ext::vector<unsigned int>({}), + ext::vector<unsigned int>({0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}), + ext::vector<unsigned int>({1, 0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 0}), + ext::vector<unsigned int>({2, 1, 0, 1, 2, 2, 1, 1, 1, 1, 0, 1, 2, 2, 1}), + ext::vector<unsigned int>({3, 2, 1, 0, 1, 2, 2, 2, 1, 2, 1, 0, 1, 2, 2}), + ext::vector<unsigned int>({4, 3, 2, 1, 1, 1, 2, 3, 2, 2, 2, 1, 0, 1, 2}), + ext::vector<unsigned int>({5, 4, 3, 2, 1, 1, 2, 3, 3, 3, 3, 2, 1, 0, 1}), + ext::vector<unsigned int>({6, 5, 4, 3, 2, 2, 1, 2, 4, 3, 4, 3, 2, 1, 0}), }; - CPPUNIT_ASSERT(expected_result == stringology::simulations::GeneralizedLevenshteinDynamicProgramming::compute_table(text, pattern, 3)); + CPPUNIT_ASSERT(expected_result == stringology::simulations::GeneralizedLevenshteinDynamicProgramming::compute_table(text, pattern)); } +void GeneralizedLevenshteinDynamicProgrammingTest::testSearch() { + auto text = string::LinearString<>("adbcbaabadbbca"); + auto pattern = string::LinearString<>("adbbca"); + + ext::set<unsigned int> expected_result = {2, 3, 4, 5, 6, 8, 10, 11, 12, 13}; + auto result = stringology::simulations::GeneralizedLevenshteinDynamicProgramming::search(text, pattern, 3); + + CPPUNIT_ASSERT(expected_result == result); +} void GeneralizedLevenshteinDynamicProgrammingTest::setUp() { } diff --git a/alib2algo/test-src/stringology/simulations/GeneralizedLevenshteinDynamicProgrammingTest.h b/alib2algo/test-src/stringology/simulations/GeneralizedLevenshteinDynamicProgrammingTest.h index 04d54222db..3c044661d4 100644 --- a/alib2algo/test-src/stringology/simulations/GeneralizedLevenshteinDynamicProgrammingTest.h +++ 
b/alib2algo/test-src/stringology/simulations/GeneralizedLevenshteinDynamicProgrammingTest.h @@ -6,6 +6,7 @@ class GeneralizedLevenshteinDynamicProgrammingTest : public CppUnit::TestFixture { CPPUNIT_TEST_SUITE(GeneralizedLevenshteinDynamicProgrammingTest); CPPUNIT_TEST(testTableConstruction); + CPPUNIT_TEST(testSearch); CPPUNIT_TEST_SUITE_END(); public: @@ -13,5 +14,6 @@ public: void tearDown ( ); void testTableConstruction(); + void testSearch(); }; #endif // LEVENSHTEIN_DYNAMIC_PROGRAMMING_TEST_H_ -- GitLab