From 6efe67335d5e6434d5f3cdfbbd619da8db2b5139 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tom=C3=A1=C5=A1=20=C4=8Capek?= <tomas@capek.io>
Date: Mon, 30 Apr 2018 18:21:21 +0200
Subject: [PATCH] Implement simulation for approximate string matching using bit
 parallelism and Levenshtein distance.

---
Note: LevenshteinBitParalelism.h differs from the originally posted version
(exclusive loop bound, empty-pattern guard, comments), so the blob id on its
"index" line is stale; regenerate the patch with git format-patch if needed.

 .../simulations/LevenshteinBitParalelism.h    | 114 ++++++++++++++++++
 .../LevenshteinBitParalelismTest.cpp          |  20 +++
 .../LevenshteinBitParalelismTest.h            |  17 +++
 3 files changed, 151 insertions(+)
 create mode 100644 alib2algo/src/stringology/simulations/LevenshteinBitParalelism.h
 create mode 100644 alib2algo/test-src/stringology/simulations/LevenshteinBitParalelismTest.cpp
 create mode 100644 alib2algo/test-src/stringology/simulations/LevenshteinBitParalelismTest.h

diff --git a/alib2algo/src/stringology/simulations/LevenshteinBitParalelism.h b/alib2algo/src/stringology/simulations/LevenshteinBitParalelism.h
new file mode 100644
index 0000000000..58e7f3b600
--- /dev/null
+++ b/alib2algo/src/stringology/simulations/LevenshteinBitParalelism.h
@@ -0,0 +1,114 @@
+/*
+ * LevenshteinBitParalelism.h
+ *
+ * Created on: 30.4.2018
+ * Author: Tomas Capek
+ */
+
+#ifndef _LEVENSHTEIN_BIT_PARALELISM_H__
+#define _LEVENSHTEIN_BIT_PARALELISM_H__
+
+#include <exception>
+#include <string/LinearString.h>
+
+#include "BitParalelism.h"
+
+namespace stringology {
+
+namespace simulations {
+
+/**
+ * Simulation of approximate string matching under the Levenshtein distance
+ * using bit parallelism: one bit vector is kept per allowed number of errors.
+ */
+class LevenshteinBitParalelism {
+public:
+	/**
+	 * Searches the text for occurrences of the pattern with at most the given number of errors.
+	 *
+	 * \param text the string to search in
+	 * \param pattern the string to search for
+	 * \param errors the maximal number of errors allowed in an occurrence
+	 *
+	 * \return one entry per text index i after which a match is detected: i - |pattern| + 1 when
+	 *         i >= |pattern|; matches detected at smaller indexes are reported together as a single
+	 *         leading 0. The result is empty for an empty pattern.
+	 */
+	template <class SymbolType>
+	static ext::vector<unsigned int> search(const string::LinearString<SymbolType> & text, const string::LinearString<SymbolType> & pattern, unsigned int errors);
+};
+
+
+template <class SymbolType>
+ext::vector<unsigned int> LevenshteinBitParalelism::search(const string::LinearString<SymbolType> & text, const string::LinearString<SymbolType> & pattern, unsigned int errors) {
+	ext::vector<unsigned int> result;
+
+	const auto pattern_size = pattern.getContent().size();
+
+	// All bit vectors are indexed by pattern position, so an empty pattern has no last
+	// position to test; report nothing instead of indexing at pattern_size - 1.
+	if(pattern_size == 0) {
+		return result;
+	}
+
+	// preparation stage
+	ext::set<SymbolType> common_alphabet = text.getAlphabet();
+	common_alphabet.insert(pattern.getAlphabet().begin(), pattern.getAlphabet().end());
+
+	ext::map<SymbolType, ext::vector<bool> > D_vectors = BitParalelism::constructDVectors(common_alphabet, pattern);
+
+	// mask with only the last pattern position set
+	auto V_vector = ext::vector<bool>(pattern_size, 0);
+	V_vector[pattern_size - 1] = 1;
+
+	// computation part
+	// B_vectors[l] is the vector for l errors; a match is signalled by a cleared last bit.
+	ext::vector<ext::vector<bool> > B_vectors;
+	for(unsigned int i=0; i<=errors; i++) {
+		B_vectors.push_back(ext::vector<bool>(pattern_size, 0));
+	}
+
+	// Initially B_vectors[l] keeps its first l positions cleared and has all the others set.
+	// The bound is exclusive because pattern_size itself is not a valid index of the vectors.
+	for(unsigned int l = 0; l <= errors; l++) {
+		for(unsigned int j = l; j < pattern_size; j++) {
+			B_vectors[l][j] = 1;
+		}
+	}
+
+	for(unsigned int i=0; i<text.getContent().size(); i++) {
+		// level j is updated from both the previous and the already updated value of level j-1
+		ext::vector< ext::vector<bool> > previous_B_vectors = B_vectors;
+
+		B_vectors[0] = (B_vectors[0] << 1) | D_vectors[text.getContent()[i]];
+
+		for(unsigned int j=1; j<=errors; j++) {
+			B_vectors[j] = ((previous_B_vectors[j] << 1) | D_vectors[text.getContent()[i]]) &
+				( (previous_B_vectors[j-1] & B_vectors[j-1]) << 1) &
+				( previous_B_vectors[j-1] | V_vector );
+		}
+
+		// one hit per text index is enough, hence the break after the first matching vector
+		for (const auto & data : B_vectors) {
+			if(data[pattern_size-1] == false) {
+				if (i < pattern_size) {
+					// hits this early are all represented by a single position 0
+					if (result.size() == 0) {
+						result.push_back(0);
+					}
+				} else {
+					result.push_back(i - pattern_size + 1);
+				}
+				break;
+			}
+		}
+	}
+
+	return result;
+}
+
+} // namespace simulations
+
+} // namespace stringology
+
+#endif /* _LEVENSHTEIN_BIT_PARALELISM_H__ */
diff --git a/alib2algo/test-src/stringology/simulations/LevenshteinBitParalelismTest.cpp b/alib2algo/test-src/stringology/simulations/LevenshteinBitParalelismTest.cpp
new file mode 100644
index 0000000000..c8618a3b83
--- /dev/null
+++ b/alib2algo/test-src/stringology/simulations/LevenshteinBitParalelismTest.cpp
@@ -0,0 +1,20 @@
+#include "LevenshteinBitParalelismTest.h"
+
+#include <string/LinearString.h>
+#include <stringology/simulations/LevenshteinBitParalelism.h>
+
+CPPUNIT_TEST_SUITE_NAMED_REGISTRATION ( LevenshteinBitParalelismTest, "bit paralelism" );
+CPPUNIT_TEST_SUITE_REGISTRATION ( LevenshteinBitParalelismTest );
+
+void LevenshteinBitParalelismTest::testSimple() {
+	auto text = string::LinearString<>("adcabcaabadbbca");
+	auto pattern = string::LinearString<>("adbbca");
+
+	ext::vector<unsigned int> expected_result = {0, 1, 2, 4, 6, 7, 8, 9};
+	auto result = stringology::simulations::LevenshteinBitParalelism::search(text, pattern, 3);
+	CPPUNIT_ASSERT(expected_result == result);
+}
+
+void LevenshteinBitParalelismTest::setUp() { }
+
+void LevenshteinBitParalelismTest::tearDown() { }
diff --git a/alib2algo/test-src/stringology/simulations/LevenshteinBitParalelismTest.h b/alib2algo/test-src/stringology/simulations/LevenshteinBitParalelismTest.h
new file mode 100644
index 0000000000..89ad3fde39
--- /dev/null
+++ b/alib2algo/test-src/stringology/simulations/LevenshteinBitParalelismTest.h
@@ -0,0 +1,17 @@
+#ifndef LEVENSHTEIN_BIT_PARALELISM_TEST_H_
+#define LEVENSHTEIN_BIT_PARALELISM_TEST_H_
+
+#include <cppunit/extensions/HelperMacros.h>
+
+class LevenshteinBitParalelismTest : public CppUnit::TestFixture {
+	CPPUNIT_TEST_SUITE(LevenshteinBitParalelismTest);
+	CPPUNIT_TEST(testSimple);
+	CPPUNIT_TEST_SUITE_END();
+
+public:
+	void setUp ( );
+	void tearDown ( );
+
+	void testSimple();
+};
+#endif // LEVENSHTEIN_BIT_PARALELISM_TEST_H_
-- 
GitLab