From 556fbf723505dcf119f694cc2d58b27372fd4a4f Mon Sep 17 00:00:00 2001
From: Jan Travnicek <Jan.Travnicek@fit.cvut.cz>
Date: Wed, 28 Nov 2018 11:30:38 +0100
Subject: [PATCH] make arithmetic frequency model templated

---
 .../compression/ArithmeticCompression.h       | 107 ++++++++++++++++
 .../compression/ArithmeticDecompression.h     |  99 +++++++++++++++
 .../stringology/compression/ArithmeticModel.h | 117 ++++++++++++++++++
 .../compression/ArithmeticCompressionTest.cpp |  39 ++++++
 .../compression/ArithmeticCompressionTest.h   |  22 ++++
 5 files changed, 384 insertions(+)
 create mode 100644 alib2algo/src/stringology/compression/ArithmeticCompression.h
 create mode 100644 alib2algo/src/stringology/compression/ArithmeticDecompression.h
 create mode 100644 alib2algo/src/stringology/compression/ArithmeticModel.h
 create mode 100644 alib2algo/test-src/stringology/compression/ArithmeticCompressionTest.cpp
 create mode 100644 alib2algo/test-src/stringology/compression/ArithmeticCompressionTest.h

diff --git a/alib2algo/src/stringology/compression/ArithmeticCompression.h b/alib2algo/src/stringology/compression/ArithmeticCompression.h
new file mode 100644
index 0000000000..d42d7c563f
--- /dev/null
+++ b/alib2algo/src/stringology/compression/ArithmeticCompression.h
@@ -0,0 +1,107 @@
+#ifndef _COMPRESSION_ADAPTIVE_ARITHMETIC_COMPRESSION_INTEGER_H_
+#define _COMPRESSION_ADAPTIVE_ARITHMETIC_COMPRESSION_INTEGER_H_
+
+/**
+ * Implementation based on https://marknelson.us/posts/2014/10/19/data-compression-with-arithmetic-coding.html
+ */
+
+#include "ArithmeticModel.h"
+
+#include <cstddef>
+#include <tuple>
+
+#include <alib/set>
+#include <alib/vector>
+#include <alib/string>
+
+namespace stringology {
+
+namespace compression {
+
+/**
+ * Adaptive arithmetic coder working with integer ranges; the model alphabet is filled with the values 0 to 255 converted to char.
+ */
+class AdaptiveIntegerArithmeticCompression {
+	/**
+	 * Appends the bit followed by all pending bits, which get the opposite value, and leaves the pending counter at zero.
+	 */
+	inline static void put_bit_plus_pending ( ext::vector < bool > & result, bool bit, unsigned & pending_bits) {
+		result.push_back ( bit );
+		while ( pending_bits > 0 ) {
+			result.push_back ( ! bit );
+			-- pending_bits;
+		}
+	}
+public:
+
+	/**
+	 * Compresses the string into a sequence of bits. An end-of-data marker is encoded after the last character.
+	 *
+	 * \param source the string to compress
+	 *
+	 * \return the bits of the arithmetic code
+	 */
+	static ext::vector < bool > compress ( const ext::string & source ) {
+		ext::set < char > alphabet;
+		for ( int i = 0; i < 256; ++ i )
+			alphabet.insert ( i );
+		ArithmeticModel < char > model ( alphabet );
+
+		ext::vector < bool > result;
+		unsigned pending_bits = 0;
+
+		unsigned max_code = ~0u;
+		unsigned one_half = ( max_code >> 1 ) + 1;
+		unsigned one_fourth = ( max_code >> 2 ) + 1;
+		unsigned three_fourths = one_half + one_fourth;
+
+		unsigned low = 0;
+		unsigned high = max_code;
+
+		// one extra iteration past the end of the source encodes the end-of-data marker
+		for ( size_t index = 0; index < source.size ( ) + 1; ++ index ) {
+
+			unsigned prob_low, prob_high, prob_count;
+
+			if ( index >= source.size ( ) )
+				std::tie ( prob_low, prob_high, prob_count ) = model.getProbabilityEof ( );
+			else {
+				std::tie ( prob_low, prob_high, prob_count ) = model.getProbability ( source [ index ] );
+				model.update ( source [ index ] );
+			}
+
+			// narrow [ low, high ] to the part that belongs to the encoded symbol
+			unsigned long long range = ( unsigned long long ) ( high - low ) + 1;
+			high = low + ( unsigned ) ( range * prob_high / prob_count - 1 );
+			low = low + ( unsigned ) ( range * prob_low / prob_count );
+			for ( ; ; ) {
+				if ( high < one_half || low >= one_half )
+					// both bounds share the top bit, it can be written out
+					put_bit_plus_pending(result, low >= one_half, pending_bits);
+				else if ( low >= one_fourth && high < three_fourths ) {
+					// the bounds straddle the middle, remember a pending bit instead of writing one
+					pending_bits++;
+					low -= one_fourth;
+					high -= one_fourth;
+				} else
+					break;
+				// shift a one into high and a zero into low; the top bit falls off the unsigned value
+				high <<= 1;
+				high++;
+				low <<= 1;
+			}
+		}
+		// final flush: one more pending bit, then the bit chosen by the quarter in which low lies
+		pending_bits++;
+		put_bit_plus_pending(result, low >= one_fourth, pending_bits);
+
+		return result;
+	}
+
+};
+
+} /* namespace compression */
+
+} /* namespace stringology */
+
+#endif /* _COMPRESSION_ADAPTIVE_ARITHMETIC_COMPRESSION_INTEGER_H_ */
diff --git a/alib2algo/src/stringology/compression/ArithmeticDecompression.h b/alib2algo/src/stringology/compression/ArithmeticDecompression.h
new file mode 100644
index 0000000000..4deb12ce1e
--- /dev/null
+++ b/alib2algo/src/stringology/compression/ArithmeticDecompression.h
@@ -0,0 +1,99 @@
+#ifndef _COMPRESSION_ADAPTIVE_ARITHMETIC_DECOMPRESSION_INTEGER_H_
+#define _COMPRESSION_ADAPTIVE_ARITHMETIC_DECOMPRESSION_INTEGER_H_
+
+/**
+ * Implementation based on https://marknelson.us/posts/2014/10/19/data-compression-with-arithmetic-coding.html
+ */
+
+#include "ArithmeticModel.h"
+
+#include <cstddef>
+#include <tuple>
+
+#include <alib/set>
+#include <alib/vector>
+#include <alib/string>
+
+namespace stringology {
+
+namespace compression {
+
+/**
+ * Decoder matching AdaptiveIntegerArithmeticCompression; it builds the same initial model and updates it after every decoded symbol.
+ */
+class AdaptiveIntegerArithmeticDecompression {
+public:
+	/**
+	 * Decompresses a sequence of bits produced by the arithmetic coder. Decoding stops at the end-of-data marker.
+	 *
+	 * \param source the bits of the arithmetic code; positions past the end are read as zeros
+	 *
+	 * \return the decoded string
+	 */
+	static ext::string decompress ( const ext::vector < bool > &source ) {
+		ext::set < char > alphabet;
+		for ( int i = 0; i < 256; ++ i )
+			alphabet.insert ( i );
+		ArithmeticModel < char > model ( alphabet );
+
+		ext::string result;
+
+		// code values occupy the lower half of the bits of unsigned long long
+		unsigned valid_bits = sizeof ( unsigned long long ) * 8 / 2;
+
+		unsigned long long max_code = ~0ull >> valid_bits;
+		unsigned long long one_half = ( max_code >> 1 ) + 1;
+		unsigned long long one_fourth = ( max_code >> 2 ) + 1;
+		unsigned long long three_fourths = one_half + one_fourth;
+
+		// all three start at zero, so the inner loop first shifts ones into high and source bits into value until high reaches the upper half
+		unsigned long long high = 0;
+		unsigned long long low = 0;
+		unsigned long long value = 0;
+
+		size_t index = 0;
+		for ( ; ; ) {
+			for( ; ; ) {
+				if ( high < one_half || low >= one_half ) {
+					//do nothing, both bits are a zero or both bits are one
+				} else if ( low >= one_fourth && high < three_fourths ) {
+					value -= one_fourth;
+					low -= one_fourth;
+					high -= one_fourth;
+				} else
+					break;
+				low <<= 1;
+				high <<= 1;
+				value <<= 1;
+
+				high++;
+				value += ( index >= source.size ( ) ) ? 0 : source [ index ++ ] ? 1 : 0;
+
+				// drop the bit that was shifted above the code width
+				low &= max_code;
+				high &= max_code;
+				value &= max_code;
+			}
+			unsigned long long range = high - low + 1;
+			unsigned scaled_value = ( ( value - low + 1 ) * model.getCount ( ) - 1 ) / range;
+			if ( model.isEof ( scaled_value ) )
+				break;
+
+			char c;
+			unsigned prob_low, prob_high, prob_count;
+			std::tie ( prob_low, prob_high, prob_count, c ) = model.getChar ( scaled_value );
+			model.update ( c );
+
+			result += c;
+			high = low + ( range * prob_high ) / prob_count - 1;
+			low = low + ( range * prob_low ) / prob_count;
+		}
+		return result;
+	}
+};
+
+} /* namespace compression */
+
+} /* namespace stringology */
+
+#endif /* _COMPRESSION_ADAPTIVE_ARITHMETIC_DECOMPRESSION_INTEGER_H_ */
diff --git a/alib2algo/src/stringology/compression/ArithmeticModel.h b/alib2algo/src/stringology/compression/ArithmeticModel.h
new file mode 100644
index 0000000000..036e23a40e
--- /dev/null
+++ b/alib2algo/src/stringology/compression/ArithmeticModel.h
@@ -0,0 +1,117 @@
+#ifndef ARITHMETIC_MODEL_H_
+#define ARITHMETIC_MODEL_H_
+
+#include <iterator>
+#include <stdexcept>
+#include <tuple>
+#include <utility>
+
+#include <alib/map>
+#include <alib/variant>
+#include <alib/set>
+
+/**
+ * Adaptive frequency model shared by the arithmetic coder and decoder.
+ *
+ * Each alphabet symbol and one end-of-data marker (the void alternative of the variant key) owns a
+ * half-open range of cumulative frequencies. Only the lower bound of every range is stored; the upper
+ * bound is the lower bound of the following map entry, or the overall total for the last entry.
+ */
+template < class SymbolType >
+class ArithmeticModel {
+	ext::map < ext::variant < void, SymbolType >, unsigned > m_low_cumulative_frequency;
+	unsigned m_global_high;
+
+public:
+	/**
+	 * Creates the model with every symbol of the alphabet and the end-of-data marker having frequency one.
+	 */
+	ArithmeticModel ( const ext::set < SymbolType > & alphabet ) {
+		m_low_cumulative_frequency.insert ( std::make_pair ( ext::variant < void, SymbolType >::template from < void > ( ), 0 ) );
+		for ( const SymbolType & symbol : alphabet )
+			m_low_cumulative_frequency.insert ( std::make_pair ( symbol, 0 ) );
+
+		// number the entries in map order so that each one starts with a range of width one
+		unsigned frequency = 0;
+		for ( std::pair < const ext::variant < void, SymbolType >, unsigned > & entry : m_low_cumulative_frequency )
+			entry.second = frequency ++;
+
+		m_global_high = frequency;
+	}
+
+	/**
+	 * Increases the frequency of the symbol by one by shifting the lower bounds of all entries that follow it.
+	 *
+	 * \throws std::invalid_argument if the symbol is not known to the model
+	 */
+	void update ( const ext::variant < void, SymbolType > & symbol ) {
+		auto found = m_low_cumulative_frequency.find ( symbol );
+		if ( found == m_low_cumulative_frequency.end ( ) )
+			throw std::invalid_argument ( "symbol is not part of the arithmetic model" );
+
+		for ( auto i = std::next ( found ); i != m_low_cumulative_frequency.end ( ); ++ i )
+			i->second += 1;
+		m_global_high += 1;
+	}
+
+	/**
+	 * \return tuple ( lower bound, upper bound, total ) describing the range of the symbol
+	 *
+	 * \throws std::invalid_argument if the symbol is not known to the model
+	 */
+	std::tuple < unsigned, unsigned, unsigned > getProbability ( const ext::variant < void, SymbolType > & c ) const {
+		auto i = m_low_cumulative_frequency.find ( c );
+		if ( i == m_low_cumulative_frequency.end ( ) )
+			throw std::invalid_argument ( "symbol is not part of the arithmetic model" );
+
+		auto next = std::next ( i );
+		unsigned high_prob = next != m_low_cumulative_frequency.end ( ) ? next->second : m_global_high;
+		return std::make_tuple ( i->second, high_prob, m_global_high );
+	}
+
+	/**
+	 * \return tuple ( lower bound, upper bound, total ) describing the range of the end-of-data marker
+	 */
+	std::tuple < unsigned, unsigned, unsigned > getProbabilityEof ( ) const {
+		return getProbability ( ext::variant < void, SymbolType >::template from < void > ( ) );
+	}
+
+	/**
+	 * Finds the symbol whose range contains the scaled value.
+	 *
+	 * The last map entry is examined as well; its upper bound is the overall total. The value must not
+	 * belong to the end-of-data marker, callers are expected to test that with isEof first.
+	 *
+	 * \return tuple ( lower bound, upper bound, total, symbol )
+	 *
+	 * \throws std::logic_error if the scaled value is not below the overall total
+	 */
+	std::tuple < unsigned, unsigned, unsigned, SymbolType > getChar ( unsigned scaled_value ) const {
+		for ( auto i = m_low_cumulative_frequency.begin ( ); i != m_low_cumulative_frequency.end ( ); ++ i ) {
+			auto next = std::next ( i );
+			unsigned high_prob = next != m_low_cumulative_frequency.end ( ) ? next->second : m_global_high;
+			if ( scaled_value < high_prob )
+				return std::make_tuple ( i->second, high_prob, m_global_high, i->first.template get < SymbolType > ( ) );
+		}
+		throw std::logic_error("error");
+	}
+
+	/**
+	 * \return true if the scaled value lies in the range of the end-of-data marker
+	 */
+	bool isEof ( unsigned scaled_value ) const {
+		unsigned prob_low, prob_high, prob_count;
+		std::tie ( prob_low, prob_high, prob_count ) = getProbabilityEof ( );
+		return scaled_value >= prob_low && scaled_value < prob_high;
+	}
+
+	/**
+	 * \return sum of the frequencies of all entries
+	 */
+	unsigned getCount ( ) const {
+		return m_global_high;
+	}
+
+};
+
+#endif //#ifndef ARITHMETIC_MODEL_H_
diff --git a/alib2algo/test-src/stringology/compression/ArithmeticCompressionTest.cpp b/alib2algo/test-src/stringology/compression/ArithmeticCompressionTest.cpp
new file mode 100644
index 0000000000..869eb81332
--- /dev/null
+++ b/alib2algo/test-src/stringology/compression/ArithmeticCompressionTest.cpp
@@ -0,0 +1,39 @@
+#include "ArithmeticCompressionTest.h"
+
+#include <iostream>
+
+#include <stringology/compression/ArithmeticCompression.h>
+#include <stringology/compression/ArithmeticDecompression.h>
+
+CPPUNIT_TEST_SUITE_NAMED_REGISTRATION( ArithmeticCompressionTest, "compression" );
+CPPUNIT_TEST_SUITE_REGISTRATION( ArithmeticCompressionTest );
+
+void ArithmeticCompressionTest::setUp() {
+}
+
+void ArithmeticCompressionTest::tearDown() {
+}
+
+// round trip of a text sample
+void ArithmeticCompressionTest::basics() {
+	ext::string input ( "abbabbabaae2378 8723 babababb ab bapobababbbabaaabbafjfjdjlvldsuiueqwpomvdhgataewpvdihviasubababbba 5475 baaabba" );
+	ext::vector < bool > compressed = stringology::compression::AdaptiveIntegerArithmeticCompression::compress ( input );
+	std::cout << "compressed = " << compressed << std::endl;
+	ext::string output = stringology::compression::AdaptiveIntegerArithmeticDecompression::decompress ( compressed );
+
+	std::cout << "original= " << input << " decompressed = " << output << std::endl;
+	CPPUNIT_ASSERT ( input == output );
+}
+
+// round trip of all 256 char values, which includes the symbol stored last in the model
+void ArithmeticCompressionTest::allSymbols() {
+	ext::string input;
+	for ( int i = 0; i < 256; ++ i )
+		input += static_cast < char > ( i );
+
+	ext::vector < bool > compressed = stringology::compression::AdaptiveIntegerArithmeticCompression::compress ( input );
+	ext::string output = stringology::compression::AdaptiveIntegerArithmeticDecompression::decompress ( compressed );
+
+	CPPUNIT_ASSERT ( input == output );
+}
+
diff --git a/alib2algo/test-src/stringology/compression/ArithmeticCompressionTest.h b/alib2algo/test-src/stringology/compression/ArithmeticCompressionTest.h
new file mode 100644
index 0000000000..13eecedb83
--- /dev/null
+++ b/alib2algo/test-src/stringology/compression/ArithmeticCompressionTest.h
@@ -0,0 +1,22 @@
+#ifndef _ARITHMETIC_COMPRESSION_TEST_H_
+#define _ARITHMETIC_COMPRESSION_TEST_H_
+
+#include <cppunit/extensions/HelperMacros.h>
+
+// Round-trip tests of the adaptive arithmetic coder and decoder.
+class ArithmeticCompressionTest : public CppUnit::TestFixture
+{
+	CPPUNIT_TEST_SUITE( ArithmeticCompressionTest );
+	CPPUNIT_TEST( basics );
+	CPPUNIT_TEST( allSymbols );
+	CPPUNIT_TEST_SUITE_END();
+
+public:
+	void setUp();
+	void tearDown();
+
+	void basics();
+	void allSymbols();
+};
+
+#endif // _ARITHMETIC_COMPRESSION_TEST_H_
-- 
GitLab