From 212d3576fc23afb0b8cae652d4b2e475a55397b5 Mon Sep 17 00:00:00 2001
From: Tomas Pecka <peckato1@fit.cvut.cz>
Date: Sun, 22 May 2022 16:28:46 +0200
Subject: [PATCH] algo: naive construction of tree factor oracle

---
 .../ExactFactorOracleAutomatonNaive.cpp       |   8 +
 .../ExactFactorOracleAutomatonNaive.h         | 214 ++++++++++++++++++
 factor.aql                                    |  16 ++
 3 files changed, 238 insertions(+)
 create mode 100644 alib2algo_experimental/src/arbology/indexing/ExactFactorOracleAutomatonNaive.cpp
 create mode 100644 alib2algo_experimental/src/arbology/indexing/ExactFactorOracleAutomatonNaive.h
 create mode 100644 factor.aql

diff --git a/alib2algo_experimental/src/arbology/indexing/ExactFactorOracleAutomatonNaive.cpp b/alib2algo_experimental/src/arbology/indexing/ExactFactorOracleAutomatonNaive.cpp
new file mode 100644
index 0000000000..e02a3fba18
--- /dev/null
+++ b/alib2algo_experimental/src/arbology/indexing/ExactFactorOracleAutomatonNaive.cpp
@@ -0,0 +1,8 @@
+#include "ExactFactorOracleAutomatonNaive.h"
+#include <registration/AlgoRegistration.hpp>
+
+namespace {
+
+auto FactorOracleAutomatonNaivePrefixRankedTree = registration::AbstractRegister < arbology::indexing::ExactFactorOracleAutomatonNaive, automaton::DPDA < common::ranked_symbol < DefaultSymbolType >, char, ext::set < unsigned > >, const tree::PrefixRankedTree < DefaultSymbolType > &, const common::ranked_symbol < > & > ( arbology::indexing::ExactFactorOracleAutomatonNaive::construct );
+
+} /* namespace */
diff --git a/alib2algo_experimental/src/arbology/indexing/ExactFactorOracleAutomatonNaive.h b/alib2algo_experimental/src/arbology/indexing/ExactFactorOracleAutomatonNaive.h
new file mode 100644
index 0000000000..eedd529afc
--- /dev/null
+++ b/alib2algo_experimental/src/arbology/indexing/ExactFactorOracleAutomatonNaive.h
@@ -0,0 +1,214 @@
+#pragma once
+
+#include <algorithm>
+#include <iterator>
+#include <map>
+#include <queue>
+#include <vector>
+#include "alphabet/Initial.h"
+#include "arbology/exact/ExactSubtreeAutomaton.h"
+#include "arbology/exact/ExactTreePatternAutomaton.h"
+#include "automaton/PDA/InputDrivenDPDA.h"
+#include "automaton/determinize/Determinize.h"
+#include "global/GlobalData.h"
+#include "tree/ranked/PrefixRankedTree.h"
+
+namespace {
+
+/**
+ * Groups the states that the factor oracle merges and picks one representative per group.
+ *
+ * Non-empty states are grouped by their smallest element and the representative of a group is the
+ * singleton set holding that element; it does not have to be one of the given states. An empty state
+ * is its own representative.
+ *
+ * @param states states to be grouped
+ * @return mapping of every given state to the representative of its group
+ */
+ext::map<ext::set<unsigned>, ext::set<unsigned>> mergeCommonStates(const ext::set<ext::set<unsigned>>& states)
+{
+    using State = ext::set<unsigned>;
+
+    // representative -> all states it stands for
+    ext::map<State, ext::set<State>> statesByDSubsets;
+
+    for (const auto& state : states) {
+        if (state.empty()) {
+            statesByDSubsets[state].insert(state);
+        } else {
+            statesByDSubsets[{*state.begin()}].insert(state);
+        }
+    }
+
+    if (common::GlobalData::verbose) {
+        for (const auto& [k, v] : statesByDSubsets) {
+            common::Streams::log << k << " -> " << v << std::endl;
+        }
+    }
+
+    // invert the grouping: state -> its representative
+    ext::map<State, State> representatives;
+    for (const auto& [representative, statesSet] : statesByDSubsets) {
+        for (const auto& state : statesSet) {
+            representatives[state] = representative;
+        }
+    }
+    return representatives;
+}
+
+
+/**
+ * Determinization according to the subtree PDA oracle paper.
+ *
+ * Subset construction that records, for every constructed state, the pushdown store heights (cpds)
+ * it is reached with. States are expanded in the order of their smallest element and only when they
+ * are reached with a non-empty pushdown store.
+ *
+ * @param automaton nondeterministic input-driven PDA to be determinized
+ * @return deterministic input-driven PDA whose states are sets of states of the given automaton
+ */
+template <class SymbolType>
+automaton::InputDrivenDPDA<common::ranked_symbol<SymbolType>, char, ext::set<unsigned>> trimDeterminize(const automaton::InputDrivenNPDA<common::ranked_symbol<SymbolType>, char, unsigned>& automaton)
+{
+    using State = ext::set<unsigned>;
+
+    // puts the state with the smallest first element on top of the queue; only non-empty states are queued
+    struct StateCompare {
+        bool operator()(const State& a, const State& b) const
+        {
+            return *a.begin() > *b.begin();
+        }
+    };
+
+    State initial({automaton.getInitialState()});
+
+    std::map<State, ext::set<unsigned>> cpds;
+    std::priority_queue<State, std::vector<State>, StateCompare> queue;
+
+    queue.push(initial);
+    cpds.emplace(initial, ext::set<unsigned>({1})); // TODO: We don't need the whole set. Its maximum value is sufficient because we just need to check that there is a value > 0
+
+    automaton::InputDrivenDPDA<common::ranked_symbol<SymbolType>, char, ext::set<unsigned>> res({initial}, automaton.getInputAlphabet(), automaton.getPushdownStoreAlphabet(), initial, automaton.getInitialSymbol(), {});
+    res.setPushdownStoreOperations(automaton.getPushdownStoreOperations());
+
+    while (!queue.empty()) {
+        // top() returns a const reference, so the state can only be copied out of the queue
+        State q = queue.top();
+        queue.pop();
+
+        // std::map keeps references to its elements valid across the insertions done below
+        const ext::set<unsigned>& heights = cpds.at(q);
+
+        // a state reached only with an empty pushdown store cannot read another symbol
+        if (std::none_of(heights.begin(), heights.end(), [](unsigned n) { return n > 0; })) {
+            continue;
+        }
+
+        for (const auto& input : automaton.getInputAlphabet()) {
+            State target;
+
+            // collect target transitions for (q, symbol)
+            for (const auto& state : q) {
+                for (const auto& tr : automaton.getTransitions().equal_range(ext::make_pair(state, input))) {
+                    target.emplace(tr.second);
+                }
+            }
+
+            // empty target state is not interesting
+            if (target.empty()) {
+                continue;
+            }
+
+            if (!res.getStates().contains(target)) {
+                queue.push(target);
+                res.addState(target);
+                cpds.emplace(target, ext::set<unsigned>());
+            }
+
+            res.addTransition(q, input, target);
+
+            // update cpds; the new heights are computed first so that the set being read is never modified (target may be q)
+            const auto& [pop, push] = automaton.getPushdownStoreOperations().find(input)->second;
+            std::vector<unsigned> reached;
+            for (const unsigned pds : heights) {
+                // the transition needs pop.size() symbols on the store; skipping the rest also keeps the unsigned subtraction from wrapping
+                if (pds >= pop.size()) {
+                    reached.push_back(static_cast<unsigned>(pds - pop.size() + push.size()));
+                }
+            }
+            for (const unsigned height : reached) {
+                cpds[target].insert(height);
+            }
+        }
+    }
+
+    return res;
+}
+
+}
+
+namespace arbology::indexing {
+
+class ExactFactorOracleAutomatonNaive {
+public:
+    /**
+     * Constructs factor oracle automaton for given linear prefix ranked tree.
+     *
+     * @param tree the tree to build the oracle for
+     * @param wildcard subtree wildcard symbol handed to the tree pattern automaton construction
+     * @return factor oracle automaton for given pattern
+     */
+    template <class SymbolType>
+    static automaton::DPDA<common::ranked_symbol<SymbolType>, char, ext::set<unsigned>> construct(const tree::PrefixRankedTree<SymbolType>& tree, const common::ranked_symbol<SymbolType>& wildcard)
+    {
+#if 1
+        automaton::InputDrivenNPDA<common::ranked_symbol<SymbolType>, char, unsigned> subtreeNPDA = arbology::exact::ExactTreePatternAutomaton::construct(tree, wildcard);
+#else
+        (void)wildcard;
+        automaton::InputDrivenNPDA<common::ranked_symbol<SymbolType>, char, unsigned> subtreeNPDA = arbology::exact::ExactSubtreeAutomaton::construct(tree);
+#endif
+        automaton::InputDrivenDPDA<common::ranked_symbol<SymbolType>, char, ext::set<unsigned>> subtreePDA = trimDeterminize(subtreeNPDA);
+
+#if 0
+        // debugging aid: return the determinized automaton without merging any states
+        auto res = automaton::DPDA<common::ranked_symbol<SymbolType>, char, ext::set<unsigned>>(subtreePDA.getStates(), subtreePDA.getInputAlphabet(), subtreePDA.getPushdownStoreAlphabet(), subtreePDA.getInitialState(), subtreePDA.getInitialSymbol(), {});
+        for (const auto& [kvFromInput, target] : subtreePDA.getTransitions()) {
+            const auto& [from, input] = kvFromInput;
+            const auto& [pop, push] = subtreePDA.getPushdownStoreOperations().find(input)->second;
+            res.addTransition(from, input, pop, target, push);
+        }
+        return res;
+#endif
+
+        const auto representatives = mergeCommonStates(subtreePDA.getStates());
+
+        ext::set<ext::set<unsigned>> newStates;
+        std::transform(representatives.begin(), representatives.end(), std::inserter(newStates, newStates.begin()), [](const auto& kv) { return kv.second; });
+
+        auto oracle = automaton::DPDA<common::ranked_symbol<SymbolType>, char, ext::set<unsigned>>(newStates, subtreePDA.getInputAlphabet(), subtreePDA.getPushdownStoreAlphabet(), representatives.at(subtreePDA.getInitialState()), subtreePDA.getInitialSymbol(), {});
+
+        for (const auto& [fromSymbol, target] : subtreePDA.getTransitions()) {
+            const auto& [from, symbol] = fromSymbol;
+            const auto& [pop, push] = subtreePDA.getPushdownStoreOperations().find(symbol)->second;
+
+            // at() instead of operator[]: a state without a representative is an error, not a reason to insert an empty one
+            const auto& mergedFrom = representatives.at(from);
+            const auto& mergedTarget = representatives.at(target);
+
+            // if already exists, keep only the "shorter" transition (Melichar, Stringology, p.103 3.28)
+            if (auto it = oracle.getTransitions().find(ext::make_tuple(mergedFrom, symbol, pop)); it != oracle.getTransitions().end()) {
+                if (*it->second.first.begin() <= *mergedTarget.begin()) {
+                    // this one leads "longer", i.e., do not add
+                    continue;
+                }
+                oracle.removeTransition(std::get<0>(it->first), std::get<1>(it->first), std::get<2>(it->first), it->second.first, it->second.second);
+            }
+
+            oracle.addTransition(mergedFrom, symbol, pop, mergedTarget, push);
+        }
+
+        return oracle;
+    }
+};
+
+} /* namespace arbology::indexing */
diff --git a/factor.aql b/factor.aql
new file mode 100644
index 0000000000..3420295dac
--- /dev/null
+++ b/factor.aql
@@ -0,0 +1,16 @@
+execute tree::generate::RandomRankedPatternFactory 5 10 2 true 3 > $randomPattern
+execute component::SubtreeWildcardSymbol::get $randomPattern > $S
+
+// execute string::Parse @Tree "RANKED_TREE a 2 a 1 a 1 a 0 a 0 " > $subject
+// execute string::Parse @Tree "RANKED_TREE b 2 b 0 a 2 a 0 a 2 a 0 a 0" > $subject
+execute string::Parse @Tree "RANKED_TREE a 4 a 4 a 4 a 4 a 0 b 0 a 0 a 0 a 0 b 0 a 0 a 0 a 0 b 0 b 0 a 0 a 0 " > $subject
+execute string::Parse @Tree "RANKED_TREE b 2 a 1 a 1 a 0 b 2 a 1 a 0 c 0" > $subject
+
+execute string::Parse @String "\"a4 a4 a4 a4 a0 b0 a0 a0 a0 b0 a0 a0 a0 b0 b0 a0 a0\"" > $subject2
+execute string::Parse @String "\" b2 a1 a1 a0 b2 a1 a0 c0 \"" > $subject2
+
+execute ExactFactorOracleAutomatonNaive $subject $S > $oracle
+
+print $oracle | DotConverter - | Dot -
+/* print ExactTreePatternAutomaton (PrefixRankedTree) $subject $S | DotConverter - | Dot - */
+/* print ExactFactorOracleAutomaton $subject2 | DotConverter - | Dot - */
-- 
GitLab