// Review summary of the patch below (preamble only; the diff text itself is unchanged).
//
// ArithmeticCompression.h
//   - compress ( ) becomes a member template over SymbolType and takes a
//     const string::LinearString < SymbolType > & instead of an ext::string &.
//     The model is constructed from source.getAlphabet ( ) instead of a locally
//     built set of the values 0..255, and symbols are read via source.getContent ( ).
//   - The coder state (max_code, one_half, one_fourth, three_fourths, low, high)
//     changes from unsigned to unsigned long long. max_code is now
//     ~0ull >> valid_bits with valid_bits = sizeof ( unsigned long long ) * 8 / 2,
//     and low / high are and-ed with max_code after the shifts inside the
//     for ( ; ; ) loop.
//   - prob_low / prob_high are filled through out-parameters of the model calls
//     instead of std::tie on returned tuples; the casts around the range
//     arithmetic are dropped because all operands are already unsigned long long
//     or promoted to it.
//
// ArithmeticDecompression.h
//   - decompress ( ) becomes a member template, takes the source as a const
//     reference plus the alphabet as a second argument, collects symbols into an
//     ext::vector < SymbolType > and returns
//     string::LinearString < SymbolType > ( alphabet, result ).
//
// ArithmeticModel.h
//   - update ( ) takes const SymbolType & instead of an ext::variant; the
//     <alib/variant> include is removed.
//   - getProbability ( ), getProbabilityEof ( ) and getChar ( ) write low_prob and
//     high_prob through unsigned & parameters; getChar ( ) now returns only the
//     symbol and still throws std::logic_error when no entry exceeds scaled_value.
//
// ArithmeticCompressionTest.cpp
//   - The round trip is done on string::LinearString < char >; decompress ( ) is
//     given input.getAlphabet ( ). The test additionally prints the compressed
//     size and input.getContent ( ).size ( ) * 8.
//
// NOTE(review): getProbability ( ) reads i->second immediately after find ( c )
//   without comparing i against end ( ), both before and after this patch.
//   Presumably callers only pass symbols contained in the model's alphabet - confirm.
// NOTE(review): decompress ( ) presumably has to be given the same alphabet the
//   data was compressed with (the test passes input.getAlphabet ( )) - confirm and
//   consider stating it on the API.
diff --git a/alib2algo/src/stringology/compression/ArithmeticCompression.h b/alib2algo/src/stringology/compression/ArithmeticCompression.h index 41e277631f1a984bff68da32241fba7bed4bf7c5..04c98a80623482b673d16d558d1a4d6abb1093ca 100644 --- a/alib2algo/src/stringology/compression/ArithmeticCompression.h +++ b/alib2algo/src/stringology/compression/ArithmeticCompression.h @@ -8,7 +8,8 @@ #include "ArithmeticModel.h" #include <alib/vector> -#include <alib/string> + +#include <string/LinearString.h> namespace stringology { @@ -23,39 +24,37 @@ class AdaptiveIntegerArithmeticCompression { } } public: - - static ext::vector < bool > compress ( ext::string & source ) { - ext::set < char > alphabet; - for ( int i = 0; i < 256; ++ i ) - alphabet.insert ( i ); - ArithmeticModel < char > model ( alphabet ); + template < class SymbolType > + static ext::vector < bool > compress ( const string::LinearString < SymbolType > & source ) { + ArithmeticModel < SymbolType > model ( source.getAlphabet ( ) ); ext::vector < bool > result; unsigned pending_bits = 0; + unsigned valid_bits = sizeof ( unsigned long long ) * 8 / 2; - unsigned max_code = ~0u; - unsigned one_half = ( max_code >> 1 ) + 1; - unsigned one_fourth = ( max_code >> 2 ) + 1; - unsigned three_fourths = one_half + one_fourth; + unsigned long long max_code = ~0ull >> valid_bits; + unsigned long long one_half = ( max_code >> 1 ) + 1; + unsigned long long one_fourth = ( max_code >> 2 ) + 1; + unsigned long long three_fourths = one_half + one_fourth; - unsigned low = 0; - unsigned high = max_code; + unsigned long long low = 0; + unsigned long long high = max_code; - for ( size_t index = 0; index < source.size ( ) + 1; ++ index ) { + for ( size_t index = 0; index < source.getContent ( ).size ( ) + 1; ++ index ) { unsigned prob_low, prob_high; unsigned prob_count = model.getCount ( ); - if ( index >= source.size ( ) ) - std::tie ( prob_low, prob_high ) = model.getProbabilityEof ( ); + if ( index >= source.getContent ( ).size ( ) 
) + model.getProbabilityEof ( prob_low, prob_high ); else { - std::tie ( prob_low, prob_high ) = model.getProbability ( source [ index ] ); - model.update ( source [ index ] ); + model.getProbability ( source.getContent ( ) [ index ], prob_low, prob_high ); + model.update ( source.getContent ( ) [ index ] ); } - unsigned long long range = ( unsigned long long ) ( high - low ) + 1; - high = low + ( unsigned ) ( range * prob_high / prob_count - 1 ); - low = low + ( unsigned ) ( range * prob_low / prob_count ); + unsigned long long range = high - low + 1; + high = low + range * prob_high / prob_count - 1; + low = low + range * prob_low / prob_count; for ( ; ; ) { if ( high < one_half || low >= one_half ) put_bit_plus_pending(result, low >= one_half, pending_bits); @@ -68,6 +67,9 @@ public: high <<= 1; high++; low <<= 1; + + low &= max_code; + high &= max_code; } } pending_bits++; diff --git a/alib2algo/src/stringology/compression/ArithmeticDecompression.h b/alib2algo/src/stringology/compression/ArithmeticDecompression.h index 9a63d3cf30a9d424b66624c2b8152ddeabd632b4..27d90ae62aa7bd773d93808814c1974ac532b93c 100644 --- a/alib2algo/src/stringology/compression/ArithmeticDecompression.h +++ b/alib2algo/src/stringology/compression/ArithmeticDecompression.h @@ -7,19 +7,18 @@ #include "ArithmeticModel.h" +#include <string/LinearString.h> + namespace stringology { namespace compression { class AdaptiveIntegerArithmeticDecompression { public: - static ext::string decompress ( ext::vector < bool > &source ) { - ext::set < char > alphabet; - for ( int i = 0; i < 256; ++ i ) - alphabet.insert ( i ); - ArithmeticModel < char > model ( alphabet ); - - ext::string result; + template < class SymbolType > + static string::LinearString < SymbolType > decompress ( const ext::vector < bool > & source, const ext::set < SymbolType > & alphabet ) { + ArithmeticModel < SymbolType > model ( alphabet ); + ext::vector < SymbolType > result; unsigned valid_bits = sizeof ( unsigned long long ) * 
8 / 2; @@ -59,17 +58,17 @@ public: if ( model.isEof ( scaled_value ) ) break; - char c; unsigned prob_low, prob_high; unsigned prob_count = model.getCount ( ); - std::tie ( prob_low, prob_high, c ) = model.getChar ( scaled_value ); + SymbolType c = model.getChar ( scaled_value, prob_low, prob_high ); model.update ( c ); - result += c; - high = low + ( range * prob_high ) / prob_count - 1; - low = low + ( range * prob_low ) / prob_count; + result.push_back ( c ); + high = low + range * prob_high / prob_count - 1; + low = low + range * prob_low / prob_count; } - return result; + + return string::LinearString < SymbolType > ( alphabet, result ); } }; diff --git a/alib2algo/src/stringology/compression/ArithmeticModel.h b/alib2algo/src/stringology/compression/ArithmeticModel.h index 9586b74f8a120096bb51e84f816df701661b3615..298263fc451a825e4085967ce88c03bc3b412ff1 100644 --- a/alib2algo/src/stringology/compression/ArithmeticModel.h +++ b/alib2algo/src/stringology/compression/ArithmeticModel.h @@ -3,7 +3,6 @@ #include <stdexcept> #include <alib/map> -#include <alib/variant> #include <alib/set> template < class SymbolType > @@ -19,35 +18,36 @@ public: m_global_high = frequency + 1; } - void update ( const ext::variant < void, SymbolType > & symbol ) { + void update ( const SymbolType & symbol ) { for ( auto i = m_high_cumulative_frequency.find ( symbol ); i != m_high_cumulative_frequency.end ( ) ; ++ i ) i->second += 1; m_global_high += 1; } - std::tuple < unsigned, unsigned > getProbability ( const SymbolType & c ) const { + void getProbability ( const SymbolType & c, unsigned & low_prob, unsigned & high_prob ) const { auto i = m_high_cumulative_frequency.find ( c ); - unsigned low_prob = 0; + high_prob = i->second; + low_prob = 0; if ( i != m_high_cumulative_frequency.begin ( ) ) low_prob = std::prev ( i )->second; - - return std::make_tuple ( low_prob, i->second ); } - std::tuple < unsigned, unsigned > getProbabilityEof ( ) const { - return std::make_tuple ( 
m_global_high - 1, m_global_high ); + void getProbabilityEof ( unsigned & low_prob, unsigned & high_prob ) const { + low_prob = m_global_high - 1; + high_prob = m_global_high; } - std::tuple < unsigned, unsigned, SymbolType > getChar ( unsigned scaled_value ) const { + SymbolType getChar ( unsigned scaled_value, unsigned & low_prob, unsigned & high_prob ) const { for ( auto i = m_high_cumulative_frequency.begin ( ); i != m_high_cumulative_frequency.end ( ); ++ i ) if ( scaled_value < i->second ) { - unsigned low_prob = 0; + high_prob = i->second; + low_prob = 0; if ( i != m_high_cumulative_frequency.begin ( ) ) low_prob = std::prev ( i )->second; - return std::make_tuple ( low_prob, i->second, i->first ); + return i->first; } throw std::logic_error("error"); } diff --git a/alib2algo/test-src/stringology/compression/ArithmeticCompressionTest.cpp b/alib2algo/test-src/stringology/compression/ArithmeticCompressionTest.cpp index 869eb81332519238b62905adb8e8f80746891897..f90e719a4068fabfb92a9f72439b56dacd560570 100644 --- a/alib2algo/test-src/stringology/compression/ArithmeticCompressionTest.cpp +++ b/alib2algo/test-src/stringology/compression/ArithmeticCompressionTest.cpp @@ -13,12 +13,15 @@ void ArithmeticCompressionTest::tearDown() { } void ArithmeticCompressionTest::basics() { - ext::string input ( "abbabbabaae2378 8723 babababb ab bapobababbbabaaabbafjfjdjlvldsuiueqwpomvdhgataewpvdihviasubababbba 5475 baaabba" ); + ext::string rawInput ( "abbabbabaae123456789r0 8723 babababb ab bapobababbbabaaabbafjfjdjlvldsuiueqwpomvdhgataewpvdihviasubababbba 5475 baaabba" ); + string::LinearString < char > input ( rawInput ); + ext::vector < bool > compressed = stringology::compression::AdaptiveIntegerArithmeticCompression::compress ( input ); std::cout << "compressed = " << compressed << std::endl; - ext::string output = stringology::compression::AdaptiveIntegerArithmeticDecompression::decompress ( compressed ); + string::LinearString < char > output = 
stringology::compression::AdaptiveIntegerArithmeticDecompression::decompress ( compressed, input.getAlphabet ( ) ); - std::cout << "original= " << input << " decompressed = " << output << std::endl; + std::cout << "original= " << input << std::endl << "decompressed = " << output << std::endl; + std::cout << "compressed size = " << compressed.size ( ) << std::endl << "original_size = " << input.getContent ( ).size ( ) * 8 << std::endl; CPPUNIT_ASSERT ( input == output ); }