Increment gone wrong

**awsdert** · 07-24-2021

Chips are about to burn so I'll edit this after taking them out of the oven, short of it is that on the first incrment code should be 1, not 8

Code:

			for ( code = 0, cur_leng = 1; cur_leng <= max_leng; ++cur_leng )
			{
				for ( uint j = 0; j < list.used; ++j )
				{
					CODEWORD *word = _list + j;

					printf( "Code = " );
					SeeBits( &code, max_leng );
					putchar('\n');

					if ( word->leng != cur_leng )
						continue;

					word->code = code;

					printf( "_list[%u].code = ", j );
					SeeBits( &code, word->leng );
					putchar('\n');

					++code;
				}

				printf( "Final code for length %u = ", cur_leng );
				SeeBits( &code, max_leng );
				putchar('\n');

				while ( !(code >> cur_leng) )
				{
					uint value = code >> cur_leng;
					printf( "Code " );
					SeeBits( &code, max_leng );
					printf( " shifted by %u results in ", cur_leng );
					SeeBits( &value, max_leng );
					putchar('\n');

					++code;
				}

				printf( "Initial expected code for leng %u = ", cur_leng + 1 );
				SeeBits( &code, cur_leng + 1 );
				putchar('\n');

				for ( i = 0; i < list.used; ++i )
				{
					CODEWORD *word = _list + i;

					if ( word->leng >= cur_leng )
						continue;

					if ( code == (word->code << (cur_leng - word->code)) )
						code++;
				}

				printf( "Adjusted initial code for leng %u = ", cur_leng + 1 );
				SeeBits( &code, cur_leng + 1 );
				putchar('\n');
			}

Code:

./a.out aba.gz
PrintBytes( 0x559c1662c480, 41, 16 )
1F 8B  0  0  0  0  0  0  0  3 1D C6 49  1  0  0
40 C0 AC A3 7F 88 3D 3C 20 2A 97 9D 37 5E 1D  C
29 34 94 23  0  0  0
path = 'aba.gz', max readable bits = 328
gzip.magic = 1F8B, gzip.flags =  0, gzip.mtime =        0, zlib.xflags =        0, zlib.system =        0
gzip.flag_TEXT = false
gzip.flag_HCRC = false
gzip.flag_MORE = false
gzip.flag_NAME = false
gzip.flag_NOTE = false
gzip.flag_RESERVED = 0
last = true, type = 2
lengc = 260, distc = 7, codec = 18, left = 7
 0 _list[16].leng = 4, max_leng 4
 1 _list[17].leng = 4, max_leng 4
 2 _list[18].leng = 2, max_leng 4
 3 _list[ 0].leng = 0, max_leng 4
 4 _list[ 8].leng = 0, max_leng 4
 5 _list[ 7].leng = 0, max_leng 4
 6 _list[ 9].leng = 0, max_leng 4
 7 _list[ 6].leng = 0, max_leng 4
 8 _list[10].leng = 0, max_leng 4
 9 _list[ 5].leng = 0, max_leng 4
10 _list[11].leng = 0, max_leng 4
11 _list[ 4].leng = 4, max_leng 4
12 _list[12].leng = 0, max_leng 4
13 _list[ 3].leng = 0, max_leng 4
14 _list[13].leng = 0, max_leng 4
15 _list[ 2].leng = 1, max_leng 4
16 _list[14].leng = 0, max_leng 4
17 _list[ 1].leng = 4, max_leng 4
i = 18
_list[ 0] = _list[ 1], code = 0, leng = 4
_list[ 1] = _list[ 2], code = 0, leng = 1
_list[ 2] = _list[ 4], code = 0, leng = 4
_list[ 3] = _list[16], code = 0, leng = 4
_list[ 4] = _list[17], code = 0, leng = 4
_list[ 5] = _list[18], code = 0, leng = 2
Code = 0000
Code = 0000
_list[1].code = 0
Code = 1000
Code = 1000
Code = 1000
Code = 1000
Final code for length 1 = 1000
Code 1000 shifted by 1 results in 0000
Initial expected code for leng 2 = 01
Adjusted initial code for leng 2 = 01
Code = 0100
Code = 0100
Code = 0100
Code = 0100
Code = 0100
Code = 0100
_list[5].code = 01
Final code for length 2 = 1100
Code 1100 shifted by 2 results in 0000
Initial expected code for leng 3 = 001
Adjusted initial code for leng 3 = 001
Code = 0010
Code = 0010
Code = 0010
Code = 0010
Code = 0010
Code = 0010
Final code for length 3 = 0010
Code 0010 shifted by 3 results in 0000
Code 1010 shifted by 3 results in 0000
Code 0110 shifted by 3 results in 0000
Code 1110 shifted by 3 results in 0000
Initial expected code for leng 4 = 0001
Adjusted initial code for leng 4 = 0001
Code = 0001
_list[0].code = 0001
Code = 1001
Code = 1001
_list[2].code = 1001
Code = 0101
_list[3].code = 0101
Code = 1101
_list[4].code = 1101
Code = 0011
Final code for length 4 = 0011
Code 0011 shifted by 4 results in 0000
Code 1011 shifted by 4 results in 0000
Code 0111 shifted by 4 results in 0000
Code 1111 shifted by 4 results in 0000
Initial expected code for leng 5 = 00001
Adjusted initial code for leng 5 = 00001
...

Edit: Now that my chips are safe, the most relevant output is from this part:

Code:

Code = 0000
Code = 0000
_list[1].code = 0
Code = 1000
Code = 1000
Code = 1000
Code = 1000
Final code for length 1 = 1000
Code 1000 shifted by 1 results in 0000
Initial expected code for leng 2 = 01
Adjusted initial code for leng 2 = 01
Code = 0100
Code = 0100
Code = 0100
Code = 0100
Code = 0100
Code = 0100
_list[5].code = 01
Final code for length 2 = 1100
Code 1100 shifted by 2 results in 0000
Initial expected code for leng 3 = 001
Adjusted initial code for leng 3 = 001
Code = 0010
Code = 0010
Code = 0010
Code = 0010
Code = 0010
Code = 0010
Final code for length 3 = 0010
Code 0010 shifted by 3 results in 0000
Code 1010 shifted by 3 results in 0000
Code 0110 shifted by 3 results in 0000
Code 1110 shifted by 3 results in 0000
Initial expected code for leng 4 = 0001
Adjusted initial code for leng 4 = 0001
Code = 0001
_list[0].code = 0001
Code = 1001
Code = 1001
_list[2].code = 1001
Code = 0101
_list[3].code = 0101
Code = 1101
_list[4].code = 1101
Code = 0011
Final code for length 4 = 0011
Code 0011 shifted by 4 results in 0000
Code 1011 shifted by 4 results in 0000
Code 0111 shifted by 4 results in 0000
Code 1111 shifted by 4 results in 0000
Initial expected code for leng 5 = 00001
Adjusted initial code for leng 5 = 00001
...

The rest I provided just for context, code is an unsigned integer so there's no funny pointer business going on, there's no opotunity to increment code,SeeBits() look like this:

Code:

void SeeBits( void * data, uint bits )
{
	for ( uint i = 0; i < bits; i += bitsof(uintmax_t) )
	{
		uint left = bits - (i * bitsof(uintmax_t));
		uint stop = (left < bitsof(uintmax_t)) ? left : bitsof(uintmax_t);
		uintmax_t mask = 1;
		uintmax_t value = GetBits
		(
			data,
			bits,
			i * bitsof(uintmax_t),
			stop,
			false,
			false
		);

		for ( mask <<= (stop - 1); mask; mask >>= 1 )
			putchar( '0' + !!(value & mask) );
	}
}

And GetBits() looks like this:

Code:

uintmax_t GetBits( void * data, uint max, uint pos, uint bits, bool backwards, bool left_shifted )
{
	uchar *temp = data;
	uint end, i = 0;
	uintmax_t value = 0;

	if ( backwards )
	{
		end = (bits <= pos) ? pos - bits : 0;

		while ( pos > end )
		{
			uint B = --pos / CHAR_BIT;
			uint b = pos % CHAR_BIT;
			uchar bit = 1 << b;

			value <<= 1;
			value |= !!(temp[B] & bit);
			++i;
		}
	}
	else
	{
		end = pos + bits;
		end = (end <= max) ? end : max;

		while ( pos < end )
		{
			uint B = pos / CHAR_BIT;
			uint b = pos++ % CHAR_BIT;
			uchar bit = 1 << b;
			bool one = !!(temp[B] & bit);

			value <<= 1;
			value |= one;
			++i;
		}
	}

	if ( left_shifted )
		value <<= bits - i;

	return  value;
}

I see no chance for the value of "code" to be corrupted, any ideas?

**Salem** · 07-24-2021

1. Make your SeeBits / GetBits functions take const void * - just because.

2. Put a watchpoint on the variable which is troubling you.
Debugging – Types Of Data Breakpoints In GDB – mohit.io

**awsdert** · 07-24-2021

Turned out I had somehow flipped them in GetBits so SeeBits only saw the flipped version, I'll have to fix that later but for now I'll focus on the deflate algrotihm

**Salem** · 07-24-2021

Well GetBits is a bloated kitchen-sink function trying to do too much.

bool backwards, bool left_shifted are needless complications.

**awsdert** · 07-24-2021

For my scenario, they're not needless, I have used those values for various things, particularly quick checks on which way I should be reading the bits I'm on.

**hamster_nz** · 07-25-2021

Are you decoding PNG 'deflate' compression by any chance?

Inspired by your earlier posts I decided that I would attempt this too, and am making good progress.

Shame the RFC is sort of mostly backwards (LSB to the left, MSB to the right). But luckily I am dyslexic, making it no more bother than normal.

**awsdert** · 07-25-2021

Yes I will be doing that, finding this resource extremely helpful in that aspect, also lead me to find a bug in gzip where it sets the NAME flag incorrectly when giving shell input instead of a file

Understanding gzip

**hamster_nz** · 07-25-2021

I dpn't know if it is of any value, but here is my prototype for getting and consuming bits from the compressed bitstream.

Code:

static uint32_t grab_bits(struct Inflate_data *d) {
   uint32_t bits=0;
   // Take the next 24 bits out of the data buffer
   if(d->byte+0 < d->len) bits |= d->data[d->byte+0];
   if(d->byte+1 < d->len) bits |= d->data[d->byte+1] << 8;
   if(d->byte+2 < d->len) bits |= d->data[d->byte+2] << 16;
   if(d->bit == 0) return bits; // Don't need the 4th byte
   if(d->byte+3 < d->len) bits |= d->data[d->byte+3] << 24;
   return bits >> d->bit;
}


int consume_bits(struct Inflate_data *d, uint8_t count) {
   d->bit  += count;
   d->byte += d->bit>>3;
   d->bit  &= 0x7;
   if(verbose)
     printf("Consuming %i bits, now %d:%d\n",count, d->byte,d->bit);
   return (d->byte < d->len);
}

**awsdert** · 07-25-2021

Originally Posted by hamster_nz

I dpn't know if it is of any value, but here is my prototype for getting and consuming bits from the compressed bitstream.

Code:

static uint32_t grab_bits(struct Inflate_data *d) {
   uint32_t bits=0;
   // Take the next 24 bits out of the data buffer
   if(d->byte+0 < d->len) bits |= d->data[d->byte+0];
   if(d->byte+1 < d->len) bits |= d->data[d->byte+1] << 8;
   if(d->byte+2 < d->len) bits |= d->data[d->byte+2] << 16;
   if(d->bit == 0) return bits; // Don't need the 4th byte
   if(d->byte+3 < d->len) bits |= d->data[d->byte+3] << 24;
   return bits >> d->bit;
}


int consume_bits(struct Inflate_data *d, uint8_t count) {
   d->bit  += count;
   d->byte += d->bit>>3;
   d->bit  &= 0x7;
   if(verbose)
     printf("Consuming %i bits, now %d:%d\n",count, d->byte,d->bit);
   return (d->byte < d->len);
}

Here's a tip:

Code:

byte = pos / CHAR_BIT;
bit = pos % CHAR_BIT;

**hamster_nz** · 07-25-2021

Originally Posted by awsdert

Here's a tip:

Code:

byte = pos / CHAR_BIT;
bit = pos % CHAR_BIT;

It's true what they say, premature optimization is the source of all evil

....

(i.e. I am doing evil using >>3 and &7 )

**awsdert** · 07-26-2021

Originally Posted by hamster_nz

It's true what they say, premature optimization is the source of all evil

....

(i.e. I am doing evil using >>3 and &7 )

I wouldn't call it a source of evil but in this case I would call it more confusing than need be, possibly slower too due to multiple if statements, as a side note I this is what I chose for bit stream management:

Code:

uint IncStreamCount( STREAM *stream, uint bits )
{
	uint rem = (stream->fed >= bits) ? bits : 0;
	stream->got >>= rem;
	stream->fed -= rem;
	stream->num += rem;
	return rem;
}

uintmax_t CopyStreamBits( STREAM *stream, uint bits, bool inc_count )
{
	uintmax_t val = 0;

	if ( stream->fed < bitsof(uintmax_t) )
	{
		uint get = bitsof(uintmax_t) - stream->fed;

		get = (get <= stream->max - stream->pos)
			? get : stream->max - stream->pos;

		val = GetBits
		(
			stream->ptr,
			stream->max,
			stream->pos,
			get,
			false,
			false
		);

		stream->got |= val << (bitsof(uintmax_t) - get);
		stream->fed += get;
		stream->pos += get;
	}

	val = stream->got & ~(~0ULL << bits);

	if ( inc_count )
		IncStreamCount( stream, bits );

	return val;
}

Using a separate variable for current position versus stream position made it easier to retrieve bits multiple times before moving the stream pointer after identify what was what.

Edit: That "backwards" parameter of GetBits() was used for retrieving the target buffer size at the end of the *.gz file, the left_shifted parameter was solely for instances where I didn't want to complicate the local code further, it's already got unwieldly as it is (plan to clean up after), don't need it getting any further unwieldly.

**awsdert** · 07-26-2021

The comment system of the mentioned documentation in a previous post is lacking when it comes to preformatted text o posting it here than linking to it in a comment on the document.

If anyone happens to see any misinterpretations please do let me know.

Link to document:
Understanding gzip

Section I'm following as best I can:
Dynamic Huffman: Parsing the Huffman tree

Output:

Code:

./a.out aba.gz
path = 'aba.gz'
PrintStreamDetails( 0x7fff6097e150 ): ptr = 0x55af47400480, pos = 0, num = 0, max = 328, fed = 0
PrintBytes( 0x55af47400480, 41, 16 )
1F 8B  0  0  0  0  0  0  0  3 1D C6 49  1  0  0
40 C0 AC A3 7F 88 3D 3C 20 2A 97 9D 37 5E 1D  C
29 34 94 23  0  0  0
PrintStreamDetails( 0x7fff6097e150 ): ptr = 0x55af47400480, pos = 136, num = 80, max = 328, fed = 56
gzip.magic = 1F8B, gzip.format =  0, gzip.flags =  0, gzip.mtime =        0, zlib.xflags =        0, zlib.system =        3
gzip.flag_TEXT = false, gzip.flag_HCRC = false, gzip.flag_MORE = false, gzip.flag_NAME = false, gzip.flag_NOTE = false, gzip.flag_RESERVED = 0
PrintStreamDetails( 0x7fff6097e150 ): ptr = 0x55af47400480, pos = 136, num = 80, max = 328, fed = 56
last = true, type = 2
lengc = 260, distc = 7, codec = 18, left = 60
pos = 212, max = 328, count = 267
left = 61, byte = 18, bit = 7
Code Table:
main.c:647: _list[ 0]: from =  2, more = 0, copy =  0, leng = 1, fill = 'ÿ', code = 0
main.c:647: _list[ 1]: from = 18, more = 7, copy = 11, leng = 2, fill = 'ÿ', code = 10
main.c:647: _list[ 2]: from =  1, more = 0, copy =  0, leng = 4, fill = 'ÿ', code = 1100
main.c:647: _list[ 3]: from =  4, more = 0, copy =  0, leng = 4, fill = 'ÿ', code = 1101
main.c:647: _list[ 4]: from = 16, more = 2, copy =  3, leng = 4, fill = 'ÿ', code = 1110
main.c:647: _list[ 5]: from = 17, more = 3, copy =  3, leng = 4, fill = 'ÿ', code = 1111
main.c:742: _list[ 1]: from = 18, more = 7, copy = 11, leng = 2, fill = 'ÿ', code = 10
i = 0010 j = 1, copy = 97
lit = 'ÿ' (0), sym = 'ÿ' (0)
...
lit = '`' (96), sym = 'ÿ' (0)
main.c:742: _list[ 2]: from =  1, more = 0, copy =  0, leng = 4, fill = 'ÿ', code = 1100
lit = 'a' (97), sym = 'a' (97)
main.c:742: _list[ 0]: from =  2, more = 0, copy =  0, leng = 1, fill = 'ÿ', code = 0
lit = 'b' (98), sym = 'b' (98)
main.c:742: _list[ 1]: from = 18, more = 7, copy = 11, leng = 2, fill = 'ÿ', code = 10
i = 0010 j = 1, copy = 138
lit = 'c' (99), sym = 'ÿ' (0)
...
lit = 'ì' (236), sym = 'ÿ' (0)
main.c:742: _list[ 1]: from = 18, more = 7, copy = 11, leng = 2, fill = 'ÿ', code = 10
i = 0010 j = 1, copy = 19
lit = 'í' (237), sym = 'ÿ' (0)
...
lit = 'ÿ' (255), sym = 'ÿ' (0)
main.c:742: _list[ 3]: from =  4, more = 0, copy =  0, leng = 4, fill = 'ÿ', code = 1101
lit = '
' (256), sym = '
' (256)
main.c:742: _list[ 4]: from = 16, more = 2, copy =  3, leng = 4, fill = 'ÿ', code = 1110
i = 1110 j = 4, copy = 3
lit = '' (257), sym = '' (257)
lit = '' (258), sym = '' (257)
lit = '' (259), sym = '' (257)
main.c:742: _list[ 0]: from =  2, more = 0, copy =  0, leng = 1, fill = 'ÿ', code = 0
lit = '' (260), sym = '' (260)
main.c:742: _list[ 5]: from = 17, more = 3, copy =  3, leng = 4, fill = 'ÿ', code = 1111
i = 1111 j = 5, copy = 3
lit = '' (261), sym = 'ÿ' (0)
lit = '' (262), sym = 'ÿ' (0)
lit = '' (263), sym = 'ÿ' (0)
main.c:742: _list[ 0]: from =  2, more = 0, copy =  0, leng = 1, fill = 'ÿ', code = 0
lit = '' (264), sym = '' (264)
main.c:742: _list[ 0]: from =  2, more = 0, copy =  0, leng = 1, fill = 'ÿ', code = 0
lit = '	' (265), sym = '	' (265)
main.c:742: _list[ 0]: from =  2, more = 0, copy =  0, leng = 1, fill = 'ÿ', code = 0
lit = '
' (266), sym = '
' (266)
PrintBytes( 0x55af474034a0, 0, 16 )

Handler:

Code:

void * RdHuffmanLengDistCodes
(
	STREAM *stream,
	BUFF *CodeWords,
	BUFF *Symbols,
	uint count,
	uint max_leng,
	uint start_with_literal
)
{
	uint prv = 0, lit = start_with_literal, pos = 0;
	CODEWORD *codewords = CodeWords->addr;
	HUFFMAN_SYMBOL *symbols, *symbol;

	symbols = Symbols->addr = calloc( count + 1, sizeof(HUFFMAN_SYMBOL) );

	if ( !symbols )
		return NULL;

	while ( pos < count && stream->num < stream->max )
	{
		CODEWORD *word = NULL;
		uint i = 0, j = 0, cur_leng = 1, copy = 1;
		uintmax_t val;

		for ( cur_leng = 1; cur_leng <= max_leng; ++cur_leng )
		{
			i = RevBits( CopyStreamBits( stream, cur_leng, false ), cur_leng );

			for ( j = 0; j < list.used; ++j )
			{
				word = codewords + j;

				if ( word->leng == cur_leng && word->code == i )
				{
					IncStreamCount( stream, cur_leng );
					break;
				}
			}

			if ( j < list.used )
				break;
		}

		if ( cur_leng > max_leng )
		{
			printf
			(
				"i = %u, j = %u, list.used = %u, cur_leng = %u\n",
				i, j, CodeWords->used, cur_leng
			);
			return NULL;
		}

		val = CopyStreamBits( stream, word->more, true );

		PrintCodeWord( word, __LINE__, j );

		if ( word->from >= 16 )
		{
			copy = (uint)(word->copy + val);
			printf( "i = " );
			SeeBits( &i, max_leng );
			printf( " j = %u, copy = %u\n", j, copy );
		}

		for ( uint s = 0; s < copy; ++s, ++pos, ++lit )
		{
			uint sym = 0;

			if ( word->from == 16 )
				sym = prv;
			else if ( word->from < 16 )
				sym = lit;

			symbol = symbols + pos;
			symbol->len = word->leng;
			symbol->sym = sym;
			symbol->val = pos;
			printf
			(
				"lit = '%c' (%u), sym = '%c' (%u)\n",
				lit ? lit : -1, lit,
				sym ? sym : -1, sym
			);
		}

		prv = lit;
	}

	return symbols;
}

How it's called:

Code:

			puts("Code Table:\n");

			for ( uint j = 0; j < list.used; ++j )
			{
				CODEWORD *word = _list + j;
				PrintCodeWord( word, __LINE__, j );
			}

			leng_symbols = RdHuffmanLengDistCodes
			(
				&stream,
				&list,
				&tree,
				lengc,
				max_leng,
				0
			);

			if ( !leng_symbols )
				return Return( ret, 1 );

			dist_symbols = RdHuffmanLengDistCodes
			(
				&stream,
				&list,
				&distances,
				distc,
				max_leng,
				lengc
			);

			if ( !dist_symbols )
				return Return( ret, 1 );
		}

**awsdert** · 07-26-2021

If anyone WAS trying to help (which I currently doubt) I found the information I needed:

RFC 1951 DEFLATE Compressed Data Format Specification ver 1.3

HDIST + 1 code lengths for the distance alphabet,

encoded using the code length Huffman code

That was way to easy to overlook, they need more colour coding... and a better naming scheme

**hamster_nz** · 07-27-2021

Here's the decoding of the first part of the first 'dynamic dictionary' data block in the image in the original post (the first uses a fixed dictionary), in preparation for reading the actual full dictionary:

Code:

Last block = false
Compressed (dynamic dictionary)
  HLIT 286, HDIST 20, HCLEN 18
     0
     3
     4
     2
     0
     0
     0
     3
     0
     2
     0
     4
     0
     5
     0
     4
     0
     5
Code length table
=================


    0: 2  00
    1: 5  11110
    2: 4  1100
    3: 5  11111
    4: 4  1101
    5: 2  01
    6: 3  100
    7: 0
    8: 0
    9: 0
   10: 0
   11: 0
   12: 0
   13: 0
   14: 0
   15: 0
   16: 0
   17: 3  101
   18: 4  1110

**hamster_nz** · 07-28-2021

I've got the hard parts of the "deflate" uncompressing written - now seems to inflate the original PNG data just fine (although I've still got to verify the ADLER32 checksum.

Once again, maybe it will help clear up ambiguities in the spec...