From e41c9c0c9c3108a7ca7807c6c50fb9d0c82ce166 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Radovan=20=C4=8Cerven=C3=BD?= <radovan.cerveny@gmail.com>
Date: Sat, 2 Apr 2016 15:30:54 +0200
Subject: [PATCH] suffix automaton construction fix, backward dawg matching
 implemented

---
 .../exact/BackwardDAWGMatching.cpp            | 79 +++++++++++++++++++
 .../exact/BackwardDAWGMatching.hpp            | 43 ++++++++++
 .../src/stringology/exact/SuffixAutomaton.cpp |  9 ++-
 .../exact/FactorOracleAutomatonTest.cpp       |  4 +-
 .../stringology/exact/SuffixAutomatonTest.cpp | 35 ++++++++
 .../stringology/exact/SuffixAutomatonTest.h   |  3 +
 6 files changed, 169 insertions(+), 4 deletions(-)
 create mode 100644 alib2algo/src/stringology/exact/BackwardDAWGMatching.cpp
 create mode 100644 alib2algo/src/stringology/exact/BackwardDAWGMatching.hpp

diff --git a/alib2algo/src/stringology/exact/BackwardDAWGMatching.cpp b/alib2algo/src/stringology/exact/BackwardDAWGMatching.cpp
new file mode 100644
index 0000000000..1bfbaa363c
--- /dev/null
+++ b/alib2algo/src/stringology/exact/BackwardDAWGMatching.cpp
@@ -0,0 +1,79 @@
+/*
+ * Author: Radovan Cerveny
+ */
+
+#include "BackwardDAWGMatching.hpp"
+#include "SuffixAutomaton.hpp"
+
+#include <exception/AlibException.h>
+#include <string/LinearString.h>
+#include <alphabet/Symbol.h>
+
+#include <algorithm>
+#include <map>
+#include <bitset>
+#include <measure>
+
+namespace stringology {
+
+namespace exact {
+
+std::set < unsigned > BackwardDAWGMatching::match ( const string::String & subject, const string::String & pattern ) {
+    return getInstance ( ).dispatch ( subject.getData ( ), pattern.getData ( ) ); // forward the underlying data of both operands to the dispatcher singleton
+}
+
+/**
+ * Backward DAWG matching: slides a pattern-sized window over subject and reads it right to left
+ * through the suffix automaton of the reversed pattern; a missing transition ends the scan early.
+ * @return start positions of all windows that were read completely, i.e. occurrences of pattern
+ */
+std::set < unsigned > BackwardDAWGMatching::match ( const string::LinearString & subject, const string::LinearString & pattern ) {
+    std::set < unsigned > occ;
+
+    measurements::start ( "Preprocess", measurements::Type::PREPROCESS );
+
+    auto patternData = pattern.getContent ( );
+    std::reverse ( patternData.begin ( ), patternData.end ( ) );
+    const string::LinearString reversedPattern ( std::move ( patternData ) );
+
+    automaton::DFA suffixAutomaton = SuffixAutomaton::construct ( reversedPattern );
+
+    measurements::end ( );
+
+    measurements::start ( "Algorithm", measurements::Type::ALGORITHM );
+
+    // State ( -1 ) marks "no transition"; the inner loop stops before it could ever be looked up.
+    const automaton::State failState = automaton::State ( -1 );
+    const auto & text = subject.getContent ( );
+    const auto & transitions = suffixAutomaton.getTransitions ( );
+    const size_t patternSize = pattern.getContent ( ).size ( );
+    size_t posInSubject = 0;
+
+    // Written as an addition on purpose: text.size ( ) - patternSize is unsigned and would wrap around
+    // whenever the pattern is longer than the subject, sending the scan past the end of the text.
+    while ( posInSubject + patternSize <= text.size ( ) ) {
+        automaton::State currentState = suffixAutomaton.getInitialState ( );
+        size_t posInPattern = patternSize;
+        while ( posInPattern > 0 && currentState != failState ) {
+            const auto transition = transitions.find ( { currentState, text.at ( posInSubject + posInPattern - 1 ) } );
+            currentState = ( transition == transitions.end ( ) ) ? failState : transition->second;
+            posInPattern--;
+        }
+
+        if ( currentState != failState )
+            occ.insert ( posInSubject ); // whole window read: a factor as long as the pattern is the pattern itself
+
+        // Continue right behind the symbol that broke the scan (shift by one after a full match).
+        posInSubject += posInPattern + 1;
+    }
+
+    measurements::end ( );
+
+    return occ;
+}
+
+auto BackwardDAWGMatchingLinearStringLinearString = BackwardDAWGMatching::RegistratorWrapper < std::set < unsigned >, string::LinearString, string::LinearString > ( BackwardDAWGMatching::getInstance ( ), BackwardDAWGMatching::match ); // ties match ( LinearString, LinearString ) to the dispatcher singleton (presumably registered by the RegistratorWrapper constructor - confirm)
+
+} /* namespace exact */
+
+} /* namespace stringology */
diff --git a/alib2algo/src/stringology/exact/BackwardDAWGMatching.hpp b/alib2algo/src/stringology/exact/BackwardDAWGMatching.hpp
new file mode 100644
index 0000000000..382fa52c9f
--- /dev/null
+++ b/alib2algo/src/stringology/exact/BackwardDAWGMatching.hpp
@@ -0,0 +1,43 @@
+/*
+ * Author: Radovan Cerveny
+ */
+
+#ifndef STRINGOLOGY_BACKWARD_DAWG_MATCHING_HPP_
+#define STRINGOLOGY_BACKWARD_DAWG_MATCHING_HPP_
+
+#include <string/String.h>
+#include <string/StringFeatures.h>
+#include <core/multipleDispatch.hpp>
+
+#include <set>
+
+namespace stringology {
+
+namespace exact {
+
+/**
+ * Backward DAWG Matching; match ( ) on generic strings goes through double dispatch on both operands.
+ */
+class BackwardDAWGMatching : public std::DoubleDispatch < std::set < unsigned >, string::StringBase, string::StringBase > {
+public:
+    /**
+     * Search for all occurrences of pattern in subject.
+     * @param subject string to search in
+     * @param pattern string to search for
+     * @return set of positions in subject where an occurrence of pattern starts
+     */
+    static std::set < unsigned > match ( const string::String & subject, const string::String & pattern );
+    static std::set < unsigned > match ( const string::LinearString & subject, const string::LinearString & pattern );
+
+    static BackwardDAWGMatching & getInstance ( ) {
+        static BackwardDAWGMatching res;
+
+        return res;
+    }
+};
+
+} /* namespace exact */
+
+} /* namespace stringology */
+
+#endif /* STRINGOLOGY_BACKWARD_DAWG_MATCHING_HPP_ */
diff --git a/alib2algo/src/stringology/exact/SuffixAutomaton.cpp b/alib2algo/src/stringology/exact/SuffixAutomaton.cpp
index 2ea1125669..190e23d41a 100644
--- a/alib2algo/src/stringology/exact/SuffixAutomaton.cpp
+++ b/alib2algo/src/stringology/exact/SuffixAutomaton.cpp
@@ -108,13 +108,18 @@ void SuffixAutomaton::suffixAutomatonAddSymbol ( automaton::DFA & suffixAutomato
             for ( const auto & transition : suffixAutomaton.getTransitionsFromState ( qState ) )
                 suffixAutomaton.addTransition ( cloneState, transition.first.second, transition.second );
 
-            while ( kState != automaton::State ( -1 ) && suffixAutomaton.getTransitions ( ).find ( { kState, symbol } ) != suffixAutomaton.getTransitions ( ).end ( ) && suffixAutomaton.getTransitions ( ).find ( { kState, symbol } )->second == qState ) {
+            while ( kState != automaton::State ( -1 )
+                && suffixAutomaton.getTransitions ( ).find ( { kState, symbol } ) != suffixAutomaton.getTransitions ( ).end ( )
+                && suffixAutomaton.getTransitions ( ).find ( { kState, symbol } )->second == qState ) {
+                suffixAutomaton.removeTransition ( kState, symbol, qState );
                 suffixAutomaton.addTransition ( kState, symbol, cloneState );
                 kState = suffixLinks.find ( kState )->second.first;
             }
+
+            suffixLinks.find ( qState )->second.first = cloneState;
+            suffixLinks.find ( newState )->second.first = cloneState;
         }
     }
-
     lastState = newState;
 }
 
diff --git a/alib2algo/test-src/stringology/exact/FactorOracleAutomatonTest.cpp b/alib2algo/test-src/stringology/exact/FactorOracleAutomatonTest.cpp
index eeac98a506..fa23d8bf8e 100644
--- a/alib2algo/test-src/stringology/exact/FactorOracleAutomatonTest.cpp
+++ b/alib2algo/test-src/stringology/exact/FactorOracleAutomatonTest.cpp
@@ -72,8 +72,8 @@ void FactorOracleAutomatonTest::testBackwardOracleMatching ( ) {
         CPPUNIT_ASSERT ( res == expectedOccs[i] );
     }
 
-    auto longSubject = string::generate::RandomStringFactory::generateLinearString (64 * 64 * 64, 512, false, true);
-    auto longPattern = string::generate::RandomSubstringFactory::generateSubstring(64 * 32 * 32, longSubject );
+    auto longSubject = string::generate::RandomStringFactory::generateLinearString (64 * 64, 512, false, true);
+    auto longPattern = string::generate::RandomSubstringFactory::generateSubstring(64 * 32, longSubject );
     std::set < unsigned > res = stringology::exact::BackwardOracleMatching::match ( longSubject, longPattern );
     std::cout << "long: " << res << std::endl;
     CPPUNIT_ASSERT ( res.size() > 0 );
diff --git a/alib2algo/test-src/stringology/exact/SuffixAutomatonTest.cpp b/alib2algo/test-src/stringology/exact/SuffixAutomatonTest.cpp
index be2bbe4416..bee259d485 100644
--- a/alib2algo/test-src/stringology/exact/SuffixAutomatonTest.cpp
+++ b/alib2algo/test-src/stringology/exact/SuffixAutomatonTest.cpp
@@ -2,6 +2,7 @@
 
 #include "string/LinearString.h"
 #include "stringology/exact/SuffixAutomaton.hpp"
+#include "stringology/exact/BackwardDAWGMatching.hpp"
 
 #include "string/generate/RandomStringFactory.h"
 #include "string/generate/RandomSubstringFactory.h"
@@ -47,3 +48,37 @@ void SuffixAutomatonTest::testSuffixAutomatonConstruction ( ) {
 
     CPPUNIT_ASSERT ( suffixAutomaton == refSuffixAutomaton );
 }
+
+void SuffixAutomatonTest::testBackwardDAWGMatching ( ) {
+    std::vector<std::string> subjects;
+    std::vector<std::string> patterns;
+    std::vector<std::set<unsigned>> expectedOccs;
+    // Registers one scenario: text to search, pattern to look for, positions expected to be reported.
+    auto addCase = [&] ( const std::string & text, const std::string & needle, const std::set<unsigned> & occs ) {
+        subjects.push_back ( text ); patterns.push_back ( needle ); expectedOccs.push_back ( occs );
+    };
+    addCase ( "a", "a", {0} );
+    addCase ( "a", "b", {} );
+    addCase ( "alfalfalfa", "alfalfalfa", {0} );
+    addCase ( "alfalfalfa", "blfalfalfa", {} );
+    addCase ( "alfalfalfa", "alfalfalfb", {} );
+    addCase ( "alfalfalfaalfalfalfaalfalfalfaalfalfalfaalfalfalfaalfalfalfaalfa", "alfalfalfaalfalfalfaalfalfalfaalfalfalfaalfalfalfaalfalfalfaalfa", {0} );
+    addCase ( "alfalfalfaalfalfalfaabfalfalfaalfalfalfaalfalfalfaalfalfalfaalfa", "alfalfalfaalfalfalfaalfalfalfaalfalfalfaalfalfalfaalfalfalfaalfa", {} );
+    addCase ( "alfalfalfaalfalfalfaalfalfalfaalfalfalfaalfalfalfaalfalfalfaalfaa", "alfalfalfaalfalfalfaalfalfalfaalfalfalfaalfalfalfaalfalfalfaalfaa", {0} );
+    addCase ( "atggccttgcc", "gcc", {3,8} );
+    addCase ( "aaaaaaaaaa", "a", {0,1,2,3,4,5,6,7,8,9} );
+
+    for ( size_t i = 0; i < subjects.size ( ); ++i ) {
+        std::set < unsigned > res = stringology::exact::BackwardDAWGMatching::match ( string::stringFrom ( subjects[i] ), string::stringFrom ( patterns[i] ) );
+        std::cout << subjects[i] << ' ' << patterns[i] << ' ' << res << std::endl;
+        CPPUNIT_ASSERT ( res == expectedOccs[i] );
+    }
+
+    // Randomised run: the pattern comes from RandomSubstringFactory applied to the subject, so at least one hit is expected.
+    auto randomSubject = string::generate::RandomStringFactory::generateLinearString ( 64 * 64, 512, false, true );
+    auto randomPattern = string::generate::RandomSubstringFactory::generateSubstring ( 64 * 32, randomSubject );
+    std::set < unsigned > res = stringology::exact::BackwardDAWGMatching::match ( randomSubject, randomPattern );
+    std::cout << "long: " << res << std::endl;
+    CPPUNIT_ASSERT ( res.size() > 0 );
+}
+
diff --git a/alib2algo/test-src/stringology/exact/SuffixAutomatonTest.h b/alib2algo/test-src/stringology/exact/SuffixAutomatonTest.h
index edfc02045c..2154792b1d 100644
--- a/alib2algo/test-src/stringology/exact/SuffixAutomatonTest.h
+++ b/alib2algo/test-src/stringology/exact/SuffixAutomatonTest.h
@@ -6,6 +6,7 @@
 class SuffixAutomatonTest : public CppUnit::TestFixture {
     CPPUNIT_TEST_SUITE ( SuffixAutomatonTest );
     CPPUNIT_TEST ( testSuffixAutomatonConstruction );
+    CPPUNIT_TEST ( testBackwardDAWGMatching );
     CPPUNIT_TEST_SUITE_END ( );
 
 public:
@@ -13,6 +14,8 @@ public:
     void tearDown ( );
 
     void testSuffixAutomatonConstruction ( );
+    void testBackwardDAWGMatching ( );
+
 };
 
 #endif // SUFFIX_AUTOMATON_TEST_HPP_
-- 
GitLab