Skip to content
Snippets Groups Projects
ExperimentalCompactSuffixAutomatonConstruct.h 9.73 KiB
Newer Older
  • Learn to ignore specific revisions
  • #include <indexes/stringology/CompactSuffixAutomatonTerminatingSymbol.h>
    #include <string/LinearStringTerminatingSymbol.h>
    #include <alphabet/EndSymbol.h>
    #include <string/LinearString.h>
    
    namespace stringology {
    
    namespace indexing {
    
    class ExperimentalCompactSuffixAutomatonConstruct {
    
    	template <class SymbolType>
    	class CompactSuffixAutomatonConstructInt {
    		int nil;
    		int newVertexNumber;//hodnota použitá při vytvoření nového vrcholu
    		int e;
    		int T;//T == ┴
    		int source;
    		int sink;
    		int m; //velikost abecedy
    		ext::map<int,SymbolType> w; //slovo, kde na indexech -m až -1 je abeceda
    		int wLen;//délka slova w
    		SymbolType endChar;//znak, kterým končí řetězec - unikátní v celém řetězci. V článku $
    
    		//         vrchol  popis-hrany koncový-vrchol
    		//         v            v	  v
    		ext::map<int,ext::map<ext::pair<int,int>,int>> edges;
    		ext::map<int,int> suf;
    		ext::map<int,int> length;
    
    		int createNode() {
    			edges.insert({newVertexNumber,ext::map<ext::pair<int,int>,int>()});
    			return newVertexNumber++;
    		}
    
    		void createEdge(int fromVertex, int first, int second, int toVertex) {
    			edges.at(fromVertex).insert({{first, second},toVertex});
    		}
    
    		//hledání w[k]-edge. Použito na několika místech v algoritmu.
    		//char c je w[k]
    
    		void wkEdge(int s, const SymbolType & c, int & sc, int & kc, int & pc) {
    
    			for ( const std::pair < const ext::pair < int, int >, int > edge : edges.at ( s ) ) {
    				if ( w.at ( edge.first.first ) == c) {
    					sc = edge.second;
    					kc = edge.first.first;
    					pc = edge.first.second;
    					return;
    				}
    
    			}
    			throw "neniHrana";
    		}
    
    		//pomocná funkce, která reprezentuje řádek 3 ve funkci split_edge()
    		void replaceTheEdgeBy(int s, int k, int p, int sc, int kc, int pc, int r) {
    			edges.at(s).erase({kc,pc});
    
    			createEdge(s,kc,kc+p-k,r);
    			createEdge(r,kc+p-k+1,pc,sc);
    		}
    
    		//pomocná funkce ve funkci redirect_edge řádek 2
    		void replaceTheEdgeByEdge(int s, int k, int p, int kc, int pc, int r) {
    			edges.at(s).erase({kc,pc});
    
    			createEdge(s,kc,kc+p-k,r);
    		}
    
    		//pomocná funkce ve funkci separate_node řádek 9
    		void replaceThewkEdge(int s, int k, int p, int rc) {
    			int kcc,scc,pcc;
    			wkEdge(s,w.at(k),scc,kcc,pcc);
    
    			edges.at(s).erase({kcc,pcc});
    			createEdge(s,k,p,rc);
    		}
    
    		//pomocná funkce, která reprezentuje řádek 4 ve funkci check_end_point
    
    		bool thereIsACedgeFromS(int s, const SymbolType & c) {
    
    			//procházím hrany vrcholu s a hledám, jestli se první znak hrany rovná c
    			for ( const std::pair < const ext::pair < int, int >, int > & edge : edges.at ( s ) )
    				if ( w.at ( edge.first.first ) == c )
    					return true;
    
    			return false;
    		}
    
    		//tvoří kopii vrcholu. (Překopíruje hrany)
    		int duplicationOf(int sc) {
    			int rc = createNode();
    
    			for ( const std::pair < const ext::pair < int, int >, int > & edge : edges.at ( sc ) )
    				createEdge ( rc, edge.first.first, edge.first.second, edge.second );
    
    			return rc;
    		}
    
    		//konec pomocných funkcí. Dále je co nejpřesnější přepis pseudokodu
    
    
    		bool check_end_point(int s,int k,int p, const SymbolType & c) {
    
    			if(k<= p) {
    				int sc,kc,pc;//s' k' p'
    				wkEdge(s,w.at(k),sc,kc,pc);
    				return (c == w.at(kc+p-k+1));
    			}
    			return thereIsACedgeFromS(s,c);
    		}
    
    		int split_edge(int s,int k,int p) {
    			int sc,kc,pc;//s' k' p'
    			wkEdge(s,w.at(k),sc,kc,pc);
    
    			int r = createNode();
    			replaceTheEdgeBy(s,k,p,sc,kc,pc,r);
    
    			int len = length.at(s);
    			if(len == INT_MAX)
    				len = e;
    
    			length[r] = len + (p-k+1);
    			return r;
    		}
    
    		ext::pair<int,int> canonize(int s,int k,int p) { //p se může rovnat e
    			if(k>p)
    				return ext::make_pair ( s, k );
    
    			int sc,kc,pc;
    			wkEdge(s,w.at(k),sc,kc,pc);
    
    			if(pc == INT_MAX)
    				pc = e;
    
    			while(pc-kc <= p-k) {
    				k = k+pc-kc+1;
    				s = sc;
    				if(k<=p) {
    					wkEdge(s,w.at(k),sc,kc,pc);
    					if(pc == INT_MAX)
    						pc = e;
    				}
    			}
    			return ext::make_pair ( s, k );
    		}
    
    		void redirect_edge(int s, int k, int p, int r) {
    			int sc,kc,pc;
    			wkEdge(s,w.at(k),sc,kc,pc);
    
    			replaceTheEdgeByEdge(s,k,p,kc,pc,r);
    		}
    
    		int extension(int s, int k, int p) {
    			if(k > p)
    				return s;
    
    			int kc,pc,sc;
    			wkEdge(s,w.at(k),sc,kc,pc);
    			return sc;
    		}
    
    		ext::pair<int,int> separate_node(int s,int k,int p) { //p==e
    			int sc, kc;
    			std::tie ( sc, kc ) = canonize(s,k,p);
    			if ( kc <= p)
    				return ext::make_pair ( sc, kc );
    
    			int len = length.at(s);
    			if(len == INT_MAX)
    				len = e;
    
    			int lensc = length.at(sc);
    			if(lensc == INT_MAX)
    				lensc = e;
    
    			if(lensc == len+(p-k+1))
    				return ext::make_pair ( sc, kc );
    
    			int rc = duplicationOf(sc);
    			suf[rc] = suf.at(sc); suf[sc] = rc;
    			length[rc] = len +(p-k+1);
    			do {
    				replaceThewkEdge(s,k,p,rc);
    
    				std::tie ( s, k ) = canonize(suf.at(s),k,p-1);
    			} while ( ext::make_pair ( sc, kc ) == canonize(s,k,p));
    
    			return ext::make_pair ( rc, p+1 );
    		}
    
    		ext::pair<int,int> update(int s,int k,int p) {
    
    			const SymbolType & c = w.at(p);
    
    			int oldr = nil;
    
    			int sc = nil; // v článku není s' inicializované, ale je potřeba to na něco inicializovat, protože níže dochází k porovnávní
    
    			int r = -1; // not initialized in the paper. The first comparison of sc must fail and therefore variable r will neverbe read with its initial value
    
    			while(!check_end_point(s,k,p-1,c)) {
    				if(k<= p-1) {
    					if(sc == extension(s,k,p)) {
    						redirect_edge(s,k,p-1,r);
    						std::tie ( s, k ) = canonize(suf.at(s),k,p-1);
    						continue;
    					} else {
    						sc = extension(s,k,p);
    						r = split_edge(s,k,p-1);
    					}
    				} else
    					r = s;
    
    				createEdge(r,p,INT_MAX,sink); //p == e vždy. p je nahrazeno INT_MAX
    
    				if(oldr != nil)
    					suf[oldr] = r;
    				oldr = r;
    
    				std::tie ( s, k ) = canonize(suf.at(s),k,p-1);
    			}
    
    			if(oldr != nil)
    				suf[oldr] = s;
    			return separate_node(s,k,p);
    		}
    
    	public:
    		CompactSuffixAutomatonConstructInt ( const /*string::LinearString*/ext::vector < SymbolType > & subject ) : wLen ( subject.size ( ) ), endChar ( subject.back ( ) ) {
    			nil = INT_MIN;
    			newVertexNumber = -1;
    			ext::set < SymbolType > alphabet ( subject.begin ( ), subject.end ( ) );
    
    			for ( int i = 0; i < wLen; ++ i )
    				w.insert ( std::make_pair ( i + 1, subject [ i ] ) );
    
    			m = alphabet.size();
    
    			int i = -1;
    			for ( const SymbolType & symbol : alphabet )//abecedu je potřeba vyskládat na záporné indexy od -1 do -m
    				w.insert ( std::make_pair ( i --, symbol ) );
    		}
    
    		void startConstruction() {
    			T = createNode();
    			source = createNode();
    			sink = createNode();
    			for(int j = 1;j<=m;j++)
    				createEdge(T,-j,-j,source);
    			suf[source] = T;
    			length[source] = 0; length[T] = -1;
    			e = 0; length[sink] = INT_MAX;
    
    			int s = source;
    			int k = 1;
    			int i = 0;
    			do {
    				i = i+1; e = i;
    				std::tie ( s, k ) = update ( s, k, i );
    				//print ( i );
    			} while (w.at(i) != endChar);
    		}
    
    		void print ( int i ) const {
    			std::cout << "Krok " << i << std::endl;
    			for(int pi = 0;pi<newVertexNumber;pi++) {
    				std::cout << "Vrchol " << pi << std::endl;
    				for(ext::map<ext::pair<int,int>,int>::const_iterator it = edges.at(pi).begin(); it != edges.at(pi).end(); ++it) {
    					std::cout << it->first.first << " " << it->first.second;
    					std::cout << "	   -- ";
    					for(int j = it->first.first;j<=it->first.second;j++) {
    						if(j>i) break;
    						std::cout << w.find(j)->second;
    					}
    					std::cout << " --> " << it->second << std::endl;
    				}
    			}
    
    			std::cout << "-------------" << std::endl;
    
    			std::cout << "Delky" << std::endl;
    			for(int pi = 0;pi<newVertexNumber;pi++)
    				std::cout << "Vrchol " << pi << " " << length.find(pi)->second << std::endl;
    
    			std::cout << "Suffix links" << std::endl;
    			for(int pi = 0;pi<newVertexNumber;pi++)
    				std::cout << "Vrchol " << pi << " " << suf.find(pi)->second << std::endl;
    		}
    
    		void changeIntMaxToE() {
    			for(auto it = edges.begin();it!=edges.end();++it)
    				for(auto it2 = it->second.begin();it2!=it->second.end();++it2)
    					if(it2->first.second == INT_MAX) {
    						int k = it2->first.first;
    						int r = it2->second;
    
    						it->second.erase(it2);
    						it->second.insert({{k,e},r});
    						it2 = it->second.begin();
    					}
    		}
    
    		const ext::map<int,ext::map<ext::pair<int,int>,int>> & getEdges ( ) const {
    			return edges;
    		}
    	};
    
    public:
    	static indexes::stringology::CompactSuffixAutomatonTerminatingSymbol < DefaultSymbolType > construct ( const string::LinearStringTerminatingSymbol & subject ) {
    		CompactSuffixAutomatonConstructInt < DefaultSymbolType > algo (subject.getContent ( ) );
    		algo.startConstruction();
    		algo.changeIntMaxToE();
    		//algo.print ( subject.getContent().size ( ) );
    
    		indexes::stringology::CompactSuffixAutomatonTerminatingSymbol < DefaultSymbolType > res;
    		res.setString(subject.getContent ( ) );
    		res.setNumberOfVertices(algo.getEdges().size()-1);
    
    		for(auto it = algo.getEdges().begin();it!=algo.getEdges().end();++it)
    			if(it->first != -1)
    				res.insertVertex(it->first,it->second);
    
    		return res;
    	}
    
    
    	template < class SymbolType >
    	static indexes::stringology::CompactSuffixAutomatonTerminatingSymbol < SymbolType > construct ( const string::LinearString < SymbolType > & subject ) {
    		SymbolType endSymbol = common::createUnique ( alphabet::EndSymbol::instance < SymbolType > ( ), subject.getAlphabet ( ) );
    		ext::vector < SymbolType > content = subject.getContent ( );
    
    		content.push_back ( endSymbol );
    
    
    		CompactSuffixAutomatonConstructInt < SymbolType > algo ( content );
    		algo.startConstruction();
    		algo.changeIntMaxToE();
    		//algo.print ( subject.getContent().size ( ) );
    
    		indexes::stringology::CompactSuffixAutomatonTerminatingSymbol < SymbolType > res;
    		res.setString ( content );
    		res.setNumberOfVertices(algo.getEdges().size()-1);
    
    		for(auto it = algo.getEdges().begin();it!=algo.getEdges().end();++it)
    			if(it->first != -1)
    				res.insertVertex(it->first,it->second);
    
    		return res;
    
    	}
    };
    
    } /* namespace indexing */
    
    } /* namespace stringology */