From 556fbf723505dcf119f694cc2d58b27372fd4a4f Mon Sep 17 00:00:00 2001
From: Jan Travnicek <Jan.Travnicek@fit.cvut.cz>
Date: Wed, 28 Nov 2018 11:30:38 +0100
Subject: [PATCH] make arithmetic frequency model templated

---
 .../compression/ArithmeticCompression.h       | 84 +++++++++++++++++++
 .../compression/ArithmeticDecompression.h     | 79 +++++++++++++++++
 .../stringology/compression/ArithmeticModel.h | 70 ++++++++++++++++
 .../compression/ArithmeticCompressionTest.cpp | 24 ++++++
 .../compression/ArithmeticCompressionTest.h   | 19 +++++
 5 files changed, 276 insertions(+)
 create mode 100644 alib2algo/src/stringology/compression/ArithmeticCompression.h
 create mode 100644 alib2algo/src/stringology/compression/ArithmeticDecompression.h
 create mode 100644 alib2algo/src/stringology/compression/ArithmeticModel.h
 create mode 100644 alib2algo/test-src/stringology/compression/ArithmeticCompressionTest.cpp
 create mode 100644 alib2algo/test-src/stringology/compression/ArithmeticCompressionTest.h

diff --git a/alib2algo/src/stringology/compression/ArithmeticCompression.h b/alib2algo/src/stringology/compression/ArithmeticCompression.h
new file mode 100644
index 0000000000..d42d7c563f
--- /dev/null
+++ b/alib2algo/src/stringology/compression/ArithmeticCompression.h
@@ -0,0 +1,84 @@
+#ifndef _COMPRESSION_ADAPTIVE_ARITHMETIC_COMPRESSION_INTEGER_H_
+#define _COMPRESSION_ADAPTIVE_ARITHMETIC_COMPRESSION_INTEGER_H_
+
+/**
+ * Implementation based on https://marknelson.us/posts/2014/10/19/data-compression-with-arithmetic-coding.html
+ */
+
+#include "ArithmeticModel.h"
+
+#include <alib/vector>
+#include <alib/string>
+
+namespace stringology {
+
+namespace compression {
+
+/** Adaptive arithmetic coder over code values spanning the whole range of unsigned; symbol frequencies come from ArithmeticModel < char >. */
+class AdaptiveIntegerArithmeticCompression {
+	/** Appends bit to result, then pending_bits copies of its complement; pending_bits is zero afterwards. */
+	static void put_bit_plus_pending ( ext::vector < bool > & result, bool bit, unsigned & pending_bits ) {
+		result.push_back ( bit );
+		for ( ; pending_bits > 0; -- pending_bits )
+			result.push_back ( ! bit );
+	}
+public:
+	/**
+	 * Encodes every character of source followed by the end-of-file symbol of the model and returns the produced bits; source is not modified.
+	 */
+	static ext::vector < bool > compress ( const ext::string & source ) {
+		ext::set < char > alphabet;
+		for ( int i = 0; i < 256; ++ i )
+			alphabet.insert ( static_cast < char > ( i ) );
+		ArithmeticModel < char > model ( alphabet );
+
+		ext::vector < bool > result;
+		unsigned pending_bits = 0;
+
+		const unsigned max_code = ~0u;
+		const unsigned one_half = ( max_code >> 1 ) + 1;
+		const unsigned one_fourth = ( max_code >> 2 ) + 1;
+		const unsigned three_fourths = one_half + one_fourth;
+
+		unsigned low = 0;
+		unsigned high = max_code;
+
+		// one extra iteration past the last character encodes the end-of-file symbol
+		for ( size_t index = 0; index <= source.size ( ); ++ index ) {
+			unsigned prob_low, prob_high, prob_count;
+			if ( index == source.size ( ) )
+				std::tie ( prob_low, prob_high, prob_count ) = model.getProbabilityEof ( );
+			else {
+				std::tie ( prob_low, prob_high, prob_count ) = model.getProbability ( source [ index ] );
+				model.update ( source [ index ] ); // the model adapts only after the probabilities of the character were read
+			}
+
+			// narrow [ low, high ] to the part owned by the symbol; range is unsigned long long because high - low + 1 may not fit into unsigned
+			const unsigned long long range = static_cast < unsigned long long > ( high - low ) + 1;
+			high = low + static_cast < unsigned > ( range * prob_high / prob_count - 1 );
+			low = low + static_cast < unsigned > ( range * prob_low / prob_count );
+			for ( ; ; ) {
+				if ( high < one_half || low >= one_half ) // low and high lie in the same half, emit the bit naming that half
+					put_bit_plus_pending ( result, low >= one_half, pending_bits );
+				else if ( low >= one_fourth && high < three_fourths ) { // interval sits in the middle two quarters, postpone the bit
+					++ pending_bits;
+					low -= one_fourth;
+					high -= one_fourth;
+				} else
+					break;
+				high = ( high << 1 ) + 1; // shift a one into high
+				low <<= 1; // shift a zero into low
+			}
+		}
+		// terminate the code: the bit chosen by comparing low with one_fourth, followed by at least one complementary bit
+		++ pending_bits;
+		put_bit_plus_pending ( result, low >= one_fourth, pending_bits );
+		return result;
+	}
+};
+
+} /* namespace compression */
+
+} /* namespace stringology */
+
+#endif /* _COMPRESSION_ADAPTIVE_ARITHMETIC_COMPRESSION_INTEGER_H_ */
diff --git a/alib2algo/src/stringology/compression/ArithmeticDecompression.h b/alib2algo/src/stringology/compression/ArithmeticDecompression.h
new file mode 100644
index 0000000000..4deb12ce1e
--- /dev/null
+++ b/alib2algo/src/stringology/compression/ArithmeticDecompression.h
@@ -0,0 +1,79 @@
+#ifndef _COMPRESSION_ADAPTIVE_ARITHMETIC_DECOMPRESSION_INTEGER_H_
+#define _COMPRESSION_ADAPTIVE_ARITHMETIC_DECOMPRESSION_INTEGER_H_
+
+/**
+ * Implementation based on https://marknelson.us/posts/2014/10/19/data-compression-with-arithmetic-coding.html
+ */
+
+#include "ArithmeticModel.h"
+
+namespace stringology {
+
+namespace compression {
+
+/** Decoder counterpart of AdaptiveIntegerArithmeticCompression; it maintains its own adaptive ArithmeticModel < char > while decoding. */
+class AdaptiveIntegerArithmeticDecompression {
+public:
+	/**
+	 * Decodes characters from the bits of source until the end-of-file symbol of the model is decoded and returns them.
+	 * Bits requested past the end of source are read as zeros; source is not modified.
+	 */
+	static ext::string decompress ( const ext::vector < bool > & source ) {
+		ext::set < char > alphabet;
+		for ( int i = 0; i < 256; ++ i )
+			alphabet.insert ( static_cast < char > ( i ) );
+		ArithmeticModel < char > model ( alphabet );
+
+		ext::string result;
+
+		// code values occupy the lower half of the bits of unsigned long long, anything shifted above is masked away
+		const unsigned valid_bits = sizeof ( unsigned long long ) * 8 / 2;
+		const unsigned long long max_code = ~0ull >> valid_bits;
+		const unsigned long long one_half = ( max_code >> 1 ) + 1;
+		const unsigned long long one_fourth = ( max_code >> 2 ) + 1;
+		const unsigned long long three_fourths = one_half + one_fourth;
+
+		// with high == low == 0 the first run of the inner loop keeps shifting until high == max_code, which loads the first bits of source into value
+		unsigned long long high = 0;
+		unsigned long long low = 0;
+		unsigned long long value = 0;
+
+		size_t index = 0;
+		for ( ; ; ) {
+			for ( ; ; ) {
+				if ( high < one_half || low >= one_half ) {
+					// low and high lie in the same half, nothing to subtract before shifting
+				} else if ( low >= one_fourth && high < three_fourths ) {
+					value -= one_fourth;
+					low -= one_fourth;
+					high -= one_fourth;
+				} else
+					break;
+				// shift a zero into low, a one into high and the next bit of source ( zero once source is exhausted ) into value
+				const bool bit = index < source.size ( ) && source [ index ++ ];
+				low = ( low << 1 ) & max_code;
+				high = ( ( high << 1 ) + 1 ) & max_code;
+				value = ( ( value << 1 ) + ( bit ? 1 : 0 ) ) & max_code;
+			}
+			const unsigned long long range = high - low + 1;
+			const unsigned scaled_value = ( ( value - low + 1 ) * model.getCount ( ) - 1 ) / range;
+			if ( model.isEof ( scaled_value ) )
+				break;
+
+			unsigned prob_low, prob_high, prob_count;
+			char c;
+			std::tie ( prob_low, prob_high, prob_count, c ) = model.getChar ( scaled_value );
+			model.update ( c ); // the model adapts only after the probabilities of the character were read
+			result += c;
+			high = low + ( range * prob_high ) / prob_count - 1;
+			low = low + ( range * prob_low ) / prob_count;
+		}
+		return result;
+	}
+};
+
+} /* namespace compression */
+
+} /* namespace stringology */
+
+#endif /* _COMPRESSION_ADAPTIVE_ARITHMETIC_DECOMPRESSION_INTEGER_H_ */
diff --git a/alib2algo/src/stringology/compression/ArithmeticModel.h b/alib2algo/src/stringology/compression/ArithmeticModel.h
new file mode 100644
index 0000000000..036e23a40e
--- /dev/null
+++ b/alib2algo/src/stringology/compression/ArithmeticModel.h
@@ -0,0 +1,70 @@
+#ifndef ARITHMETIC_MODEL_H_
+#define ARITHMETIC_MODEL_H_
+
+#include <stdexcept>
+#include <alib/map>
+#include <alib/variant>
+#include <alib/set>
+
+/** Adaptive frequency model for arithmetic coding: the alphabet symbols plus one end-of-file entry ( the void alternative of the variant ).
+ *  The map keeps the low bound of each entry's cumulative-frequency interval; its high bound is the next entry's low bound, or m_global_high for the last entry. */
+template < class SymbolType >
+class ArithmeticModel {
+	ext::map < ext::variant < void, SymbolType >, unsigned > m_low_cumulative_frequency; // entry -> low bound of its interval
+	unsigned m_global_high; // sum of all frequencies, i.e. the high bound of the last interval
+public:
+	/** Creates the model with frequency one for the end-of-file entry and for every symbol of alphabet. */
+	ArithmeticModel ( const ext::set < SymbolType > & alphabet ) {
+		m_low_cumulative_frequency.insert ( std::make_pair ( ext::variant < void, SymbolType >::template from < void > ( ), 0 ) );
+		for ( const SymbolType & symbol : alphabet )
+			m_low_cumulative_frequency.insert ( std::make_pair ( symbol, 0 ) );
+		unsigned frequency = 0;
+		for ( std::pair < const ext::variant < void, SymbolType >, unsigned > & entry : m_low_cumulative_frequency )
+			entry.second = frequency ++;
+		m_global_high = frequency;
+	}
+
+	/** Raises the frequency of symbol by one ( shifts the low bounds of all later entries ). Throws std::invalid_argument for a symbol outside the model. */
+	void update ( const ext::variant < void, SymbolType > & symbol ) {
+		auto i = m_low_cumulative_frequency.find ( symbol );
+		if ( i == m_low_cumulative_frequency.end ( ) )
+			throw std::invalid_argument ( "symbol is not part of the arithmetic model" );
+		for ( ++ i; i != m_low_cumulative_frequency.end ( ); ++ i )
+			i->second += 1;
+		m_global_high += 1;
+	}
+
+	/** Returns ( low, high, total ) describing the interval of c. Throws std::invalid_argument for a symbol outside the model. */
+	std::tuple < unsigned, unsigned, unsigned > getProbability ( const ext::variant < void, SymbolType > & c ) const {
+		auto i = m_low_cumulative_frequency.find ( c );
+		if ( i == m_low_cumulative_frequency.end ( ) )
+			throw std::invalid_argument ( "symbol is not part of the arithmetic model" );
+		unsigned high_prob = std::next ( i ) != m_low_cumulative_frequency.end ( ) ? std::next ( i )->second : m_global_high;
+		return std::make_tuple ( i->second, high_prob, m_global_high );
+	}
+
+	/** Returns ( low, high, total ) describing the interval of the end-of-file entry. */
+	std::tuple < unsigned, unsigned, unsigned > getProbabilityEof ( ) const { return getProbability ( ext::variant < void, SymbolType >::template from < void > ( ) ); }
+
+	/** Returns ( low, high, total, symbol ) of the first entry, the last one included, whose high bound exceeds scaled_value; the entry must hold a SymbolType ( rule out end-of-file with isEof first ). Throws std::logic_error when scaled_value is not below the total. */
+	std::tuple < unsigned, unsigned, unsigned, SymbolType > getChar ( unsigned scaled_value ) const {
+		for ( auto i = m_low_cumulative_frequency.begin ( ); i != m_low_cumulative_frequency.end ( ); ++ i ) {
+			unsigned high_prob = std::next ( i ) != m_low_cumulative_frequency.end ( ) ? std::next ( i )->second : m_global_high;
+			if ( scaled_value < high_prob )
+				return std::make_tuple ( i->second, high_prob, m_global_high, i->first.template get < SymbolType > ( ) );
+		}
+		throw std::logic_error("error");
+	}
+
+	/** Tells whether scaled_value lies inside the interval of the end-of-file entry. */
+	bool isEof ( unsigned scaled_value ) const {
+		unsigned prob_low, prob_high, prob_count;
+		std::tie ( prob_low, prob_high, prob_count ) = getProbabilityEof ( );
+		return scaled_value >= prob_low && scaled_value < prob_high;
+	}
+
+	/** Returns the sum of the frequencies of all entries. */
+	unsigned getCount ( ) const { return m_global_high; }
+};
+
+#endif //#ifndef ARITHMETIC_MODEL_H_
diff --git a/alib2algo/test-src/stringology/compression/ArithmeticCompressionTest.cpp b/alib2algo/test-src/stringology/compression/ArithmeticCompressionTest.cpp
new file mode 100644
index 0000000000..869eb81332
--- /dev/null
+++ b/alib2algo/test-src/stringology/compression/ArithmeticCompressionTest.cpp
@@ -0,0 +1,24 @@
+#include "ArithmeticCompressionTest.h"
+
+#include <stringology/compression/ArithmeticCompression.h>
+#include <stringology/compression/ArithmeticDecompression.h>
+
+CPPUNIT_TEST_SUITE_NAMED_REGISTRATION( ArithmeticCompressionTest, "compression" ); // adds the suite to the registry named "compression"
+CPPUNIT_TEST_SUITE_REGISTRATION( ArithmeticCompressionTest ); // adds the suite to the default registry as well
+
+void ArithmeticCompressionTest::setUp() { // empty: the fixture class declares no data members to prepare
+}
+
+void ArithmeticCompressionTest::tearDown() { // empty: the fixture class declares no data members to release
+}
+
+void ArithmeticCompressionTest::basics() {
+	using namespace stringology::compression; // round trip: the text decoded from the compressed bits must equal the text that was encoded
+	ext::string input ( "abbabbabaae2378 8723 babababb  ab bapobababbbabaaabbafjfjdjlvldsuiueqwpomvdhgataewpvdihviasubababbba 5475 baaabba" );
+	ext::vector < bool > bits = AdaptiveIntegerArithmeticCompression::compress ( input );
+	std::cout << "compressed = " << bits << std::endl;
+	ext::string output = AdaptiveIntegerArithmeticDecompression::decompress ( bits );
+	std::cout << "original= " << input << " decompressed = " << output << std::endl;
+	CPPUNIT_ASSERT ( input == output );
+}
+
diff --git a/alib2algo/test-src/stringology/compression/ArithmeticCompressionTest.h b/alib2algo/test-src/stringology/compression/ArithmeticCompressionTest.h
new file mode 100644
index 0000000000..13eecedb83
--- /dev/null
+++ b/alib2algo/test-src/stringology/compression/ArithmeticCompressionTest.h
@@ -0,0 +1,19 @@
+#ifndef _ARITHMETIC_COMPRESSION_TEST_H_
+#define _ARITHMETIC_COMPRESSION_TEST_H_
+
+#include <cppunit/extensions/HelperMacros.h>
+
+class ArithmeticCompressionTest : public CppUnit::TestFixture // CppUnit fixture with the tests of the arithmetic compression and decompression
+{
+  CPPUNIT_TEST_SUITE( ArithmeticCompressionTest );
+  CPPUNIT_TEST( basics ); // basics() is the only test method added to the suite
+  CPPUNIT_TEST_SUITE_END();
+
+public:
+  void setUp() override; // hooks inherited from CppUnit::TestFixture
+  void tearDown() override;
+
+  void basics(); // test method listed in the suite above
+};
+
+#endif  // _ARITHMETIC_COMPRESSION_TEST_H_
-- 
GitLab