From 212d3576fc23afb0b8cae652d4b2e475a55397b5 Mon Sep 17 00:00:00 2001
From: Tomas Pecka <peckato1@fit.cvut.cz>
Date: Sun, 22 May 2022 16:28:46 +0200
Subject: [PATCH] algo: naive construction of tree factor oracle

---
 .../ExactFactorOracleAutomatonNaive.cpp       |   8 +
 .../ExactFactorOracleAutomatonNaive.h         | 175 ++++++++++++++++++
 factor.aql                                    |  16 ++
 3 files changed, 199 insertions(+)
 create mode 100644 alib2algo_experimental/src/arbology/indexing/ExactFactorOracleAutomatonNaive.cpp
 create mode 100644 alib2algo_experimental/src/arbology/indexing/ExactFactorOracleAutomatonNaive.h
 create mode 100644 factor.aql

diff --git a/alib2algo_experimental/src/arbology/indexing/ExactFactorOracleAutomatonNaive.cpp b/alib2algo_experimental/src/arbology/indexing/ExactFactorOracleAutomatonNaive.cpp
new file mode 100644
index 0000000000..e02a3fba18
--- /dev/null
+++ b/alib2algo_experimental/src/arbology/indexing/ExactFactorOracleAutomatonNaive.cpp
@@ -0,0 +1,8 @@
+#include "ExactFactorOracleAutomatonNaive.h"
+#include <registration/AlgoRegistration.hpp>
+
+namespace {
+// Registers construct for PrefixRankedTree < DefaultSymbolType > plus a ranked wildcard symbol, returning a DPDA; presumably this is what exposes the algorithm to aql (see factor.aql) -- confirm against AlgoRegistration.hpp.
+auto FactorOracleAutomatonNaivePrefixRankedTree = registration::AbstractRegister < arbology::indexing::ExactFactorOracleAutomatonNaive, automaton::DPDA < common::ranked_symbol < DefaultSymbolType >, char, ext::set < unsigned > >, const tree::PrefixRankedTree < DefaultSymbolType > &, const common::ranked_symbol < > & > ( arbology::indexing::ExactFactorOracleAutomatonNaive::construct );
+
+} /* namespace */
diff --git a/alib2algo_experimental/src/arbology/indexing/ExactFactorOracleAutomatonNaive.h b/alib2algo_experimental/src/arbology/indexing/ExactFactorOracleAutomatonNaive.h
new file mode 100644
index 0000000000..eedd529afc
--- /dev/null
+++ b/alib2algo_experimental/src/arbology/indexing/ExactFactorOracleAutomatonNaive.h
@@ -0,0 +1,175 @@
+#pragma once
+
+#include <algorithm>
+#include "alphabet/Initial.h"
+#include "arbology/exact/ExactSubtreeAutomaton.h"
+#include "arbology/exact/ExactTreePatternAutomaton.h"
+#include "automaton/PDA/InputDrivenDPDA.h"
+#include "automaton/determinize/Determinize.h"
+#include "global/GlobalData.h"
+#include "tree/ranked/PrefixRankedTree.h"
+
+namespace {
+
+/**
+ * Groups the states by their smallest element (such states are to be merged by factor oracle) and labels each group
+ * with the singleton set holding that element; an empty state is its own label. Groups are logged in verbose mode.
+ * Returns mapping of original state to the label of its group
+ */
+ext::map<ext::set<unsigned>, ext::set<unsigned>> mergeCommonStates(const ext::set<ext::set<unsigned>>& states)
+{
+    using State = ext::set<unsigned>;
+
+    ext::map<State, ext::set<State>> groups;
+    for (const State& member : states) {
+        if (!member.empty()) {
+            groups[{*member.begin()}].insert(member);
+            continue;
+        }
+        groups[member].insert(member);
+    }
+
+    if (common::GlobalData::verbose) {
+        for (const auto& group : groups) {
+            common::Streams::log << group.first << " -> " << group.second << std::endl;
+        }
+    }
+
+    ext::map<State, State> labelOf;
+    for (const auto& group : groups) {
+        for (const State& member : group.second) {
+            labelOf[member] = group.first;
+        }
+    }
+    return labelOf;
+}
+
+
+/**
+ * Determinization according to the subtree PDA oracle paper: a subset construction that expands a state only while
+ * cpds (the pushdown store sizes recorded for that state) holds a positive size. The initial state starts with {1}.
+ * @param automaton input-driven NPDA to determinize
+ * @return input-driven DPDA over the reached subsets, reusing the pushdown store operations of the input
+ */
+template <class SymbolType>
+automaton::InputDrivenDPDA<common::ranked_symbol<SymbolType>, char, ext::set<unsigned>> trimDeterminize(const automaton::InputDrivenNPDA<common::ranked_symbol<SymbolType>, char, unsigned>& automaton)
+{
+    using State = ext::set<unsigned>;
+    struct StateCompare { // the priority queue then yields the state with the smallest first element
+        bool operator()(const State& a, const State& b) const { return *a.begin() > *b.begin(); }
+    };
+
+    State initial({automaton.getInitialState()});
+    std::map<State, ext::set<unsigned>> cpds;
+    std::priority_queue<State, std::vector<State>, StateCompare> queue;
+    queue.push(initial);
+    cpds.emplace(initial, ext::set<unsigned>({1})); // TODO: We don't need the whole set. Its maximum value is sufficient because we just need to check that there is a value > 0
+
+    automaton::InputDrivenDPDA<common::ranked_symbol<SymbolType>, char, ext::set<unsigned>> res({initial}, automaton.getInputAlphabet(), automaton.getPushdownStoreAlphabet(), initial, automaton.getInitialSymbol(), {});
+    res.setPushdownStoreOperations(automaton.getPushdownStoreOperations());
+
+    while (!queue.empty()) {
+        const State q = queue.top(); // top() returns a const reference, so wrapping it in std::move still copied
+        queue.pop();
+        // expand q only if some recorded pushdown store size is positive (any_of is false for an empty set)
+        if (const auto& sizes = cpds.find(q)->second; std::any_of(sizes.begin(), sizes.end(), [](const unsigned& n) { return n > 0; })) {
+            for (const auto& input : automaton.getInputAlphabet()) {
+                // collect target transitions for (q, symbol)
+                State target;
+                for (const auto& state : q) {
+                    for (const auto& tr : automaton.getTransitions().equal_range(ext::make_pair(state, input))) {
+                        target.emplace(tr.second);
+                    }
+                }
+
+                // empty target state is not interesting
+                if (target.empty()) {
+                    continue;
+                }
+
+                if (!res.getStates().contains(target)) {
+                    queue.push(target);
+                    res.addState(target);
+                    cpds.emplace(target, ext::set<unsigned>());
+                }
+                res.addTransition(q, input, target);
+
+                // update cpds; a size shorter than the pop cannot take this move and would wrap the unsigned subtraction
+                const auto& [pop, push] = automaton.getPushdownStoreOperations().find(input)->second;
+                for (const unsigned& pds : sizes) {
+                    if (pds >= pop.size()) {
+                        cpds[target].insert(pds - pop.size() + push.size());
+                    }
+                }
+            }
+        }
+    }
+
+    return res;
+}
+
+}
+
+namespace arbology::indexing {
+
+class ExactFactorOracleAutomatonNaive {
+public:
+    /**
+     * Constructs factor oracle automaton for given linear prefix ranked tree.
+     *
+     * Steps: build the tree pattern NPDA of the tree, determinize it with trimDeterminize, relabel every state
+     * through mergeCommonStates and copy the transitions over. When two transitions collide on the same
+     * (state, symbol, pop) after relabelling, the one whose target has the smaller first element is kept.
+     *
+     * @param tree tree to build the oracle for
+     * @param wildcard subtree wildcard symbol, passed on to ExactTreePatternAutomaton::construct
+     * @return factor oracle automaton for given pattern
+     */
+    template <class SymbolType>
+    static automaton::DPDA<common::ranked_symbol<SymbolType>, char, ext::set<unsigned>> construct(const tree::PrefixRankedTree<SymbolType>& tree, const common::ranked_symbol<SymbolType>& wildcard)
+    {
+        using State = ext::set<unsigned>;
+
+        automaton::InputDrivenNPDA<common::ranked_symbol<SymbolType>, char, unsigned> subtreeNPDA = arbology::exact::ExactTreePatternAutomaton::construct(tree, wildcard);
+        automaton::InputDrivenDPDA<common::ranked_symbol<SymbolType>, char, ext::set<unsigned>> subtreePDA = trimDeterminize(subtreeNPDA);
+
+        // original state -> label of the merged state it belongs to
+        const auto representatives = mergeCommonStates(subtreePDA.getStates());
+
+        ext::set<State> newStates;
+        std::transform(representatives.begin(), representatives.end(), std::inserter(newStates, newStates.begin()),
+            [](const auto& kv) { return kv.second; });
+
+        auto oracle = automaton::DPDA<common::ranked_symbol<SymbolType>, char, ext::set<unsigned>>(
+            newStates, subtreePDA.getInputAlphabet(), subtreePDA.getPushdownStoreAlphabet(),
+            representatives.at(subtreePDA.getInitialState()), subtreePDA.getInitialSymbol(), {});
+
+        for (const auto& [fromSymbol, target] : subtreePDA.getTransitions()) {
+            const auto& [from, symbol] = fromSymbol;
+            const auto& [pop, push] = subtreePDA.getPushdownStoreOperations().find(symbol)->second;
+
+            // every state of subtreePDA has a label, so resolve both endpoints once instead of on every use
+            const State& mergedFrom = representatives.at(from);
+            const State& mergedTarget = representatives.at(target);
+
+            // if already exists, remove the "longer" transition (Melichar, Stringology, p.103 3.28)
+            if (auto it = oracle.getTransitions().find(ext::make_tuple(mergedFrom, symbol, pop)); it != oracle.getTransitions().end()) {
+                if (*it->second.first.begin() <= *mergedTarget.begin()) {
+                    // this one leads "longer", i.e., do not add
+                    continue;
+                }
+
+                // copy the entry first so removeTransition is not handed references into the element it erases
+                const auto stale = *it;
+                oracle.removeTransition(std::get<0>(stale.first), std::get<1>(stale.first), std::get<2>(stale.first),
+                    stale.second.first, stale.second.second);
+            }
+
+            oracle.addTransition(mergedFrom, symbol, pop, mergedTarget, push);
+        }
+
+        return oracle;
+    }
+};
+
+} /* namespace arbology::indexing */
diff --git a/factor.aql b/factor.aql
new file mode 100644
index 0000000000..3420295dac
--- /dev/null
+++ b/factor.aql
@@ -0,0 +1,16 @@
+execute tree::generate::RandomRankedPatternFactory 5 10 2 true 3 > $randomPattern
+execute component::SubtreeWildcardSymbol::get $randomPattern > $S
+// Subject trees for manual experiments; $subject is assigned twice below, so presumably only the last tree is used.
+// execute string::Parse @Tree "RANKED_TREE a 2 a 1 a 1 a 0 a 0 " > $subject
+// execute string::Parse @Tree "RANKED_TREE b 2 b 0 a 2 a 0 a 2 a 0 a 0" > $subject
+execute string::Parse @Tree "RANKED_TREE a 4 a 4 a 4 a 4 a 0 b 0 a 0 a 0 a 0 b 0 a 0 a 0 a 0 b 0 b 0 a 0 a 0 " > $subject
+execute string::Parse @Tree "RANKED_TREE b 2 a 1 a 1 a 0 b 2 a 1 a 0 c 0" > $subject
+// String counterparts of the two trees above; $subject2 is only referenced by the commented-out print at the end.
+execute string::Parse @String "\"a4 a4 a4 a4 a0 b0 a0 a0 a0 b0 a0 a0 a0 b0 b0 a0 a0\""  > $subject2
+execute string::Parse @String "\" b2 a1 a1 a0 b2 a1 a0 c0 \""  > $subject2
+// Build the oracle for $subject, using the wildcard symbol $S obtained from the random pattern at the top.
+execute ExactFactorOracleAutomatonNaive $subject $S > $oracle
+// Pipe the resulting automaton through DotConverter and Dot.
+print $oracle | DotConverter - | Dot -
+/* print ExactTreePatternAutomaton (PrefixRankedTree) $subject $S | DotConverter - | Dot - */
+/* print ExactFactorOracleAutomaton $subject2 | DotConverter - | Dot - */
-- 
GitLab