Hello,
Well, it's been a boring start to the easter holidays for me, so I thought I'd learn me a new l33t skill:- data compression. Having messed around with RLE lots in the past, and touched upon Huffman compression, I decided to look at what is arguably at the core of all modern data compression standards:- LZ77. An oldie but a goodie?
The way this program works is thus: It check the input data against a buffer of previous unique input data to see if there's a match. If there is, it writes a 16-bit code to the output file with this format:-
Code:
1 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1
| \_____________/ \___________/
Flag Offset Length
The flag indicates compressed data, the offset is 8 bits and the length of the uncompressed data 7 bits. Of course, if there isn't a match, it'll just output 16 zero bits and then the uncompressed data, which is a bit of a waste, but I'm not getting into bit streams just right now...
Anyway, I don't think I've got the concept of the "sliding window" right, in that I haven't defined a size or anything. This deffo works for short text files, but probably not for anything else just yet. Could anyone who's poked around this algorithm before give me some tips?
Code:
#include <stdio.h>
int main(int argc, char *argv[])
{
int iFlag, iDecompress = 0, iFileArg = 1, i, iLimit;
unsigned char *buffer, *bcursor, *window, *wcursor, *wsearch;
unsigned long ulSize;
unsigned short usHeader;
FILE *in, *out;
if (argc < 2 || argc > 3)
return -1;
if (strcmp(argv[1], "-d") == 0)
{
iDecompress = 1;
iFileArg = 2;
}
in = fopen(argv[iFileArg], "rb");
fseek(in, 0, SEEK_END);
ulSize = ftell(in);
fseek(in, 0, SEEK_SET);
buffer = malloc(ulSize);
bcursor = buffer;
window = malloc(ulSize);
wcursor = window;
fread(buffer, ulSize, 1, in);
fclose(in);
if (iDecompress)
{
sprintf(argv[iFileArg] + strlen(argv[iFileArg]) - 3, "doc");
out = fopen(argv[iFileArg], "wb");
while (bcursor - buffer < ulSize)
{
usHeader = *(unsigned short *)bcursor;
bcursor += 2;
if (usHeader & 0x8000)
{
wsearch = window + ((usHeader & 0x7F80) >> 7);
fwrite(wsearch, usHeader & 0x7F, 1, out);
}
else
{
putc(*bcursor, out);
*(wcursor++) = *(bcursor++);
}
}
}
else
{
sprintf(argv[iFileArg] + strlen(argv[iFileArg]) - 3, "lz7");
out = fopen(argv[iFileArg], "wb");
while (bcursor - buffer < ulSize)
{
iFlag = 0;
iLimit = wcursor - window;
if (iLimit > 2)
{
if (iLimit > 128)
iLimit = 128;
for (i=3;i<iLimit;i++)
{
for (wsearch=window;wsearch<=wcursor;wsearch++)
{
if ((wcursor - wsearch) < i)
break;
if (memcmp(wsearch, bcursor, i) == 0)
{
iFlag = 1;
usHeader = (unsigned short)0x8000 | ((wsearch - window) << 7) | i;
fwrite(&usHeader, sizeof(usHeader), 1, out);
bcursor += i;
break;
}
}
}
}
if (!iFlag)
{
usHeader = 0;
fwrite(&usHeader, sizeof(usHeader), 1, out);
putc(*bcursor, out);
*(wcursor++) = *(bcursor++);
}
}
}
free(window);
free(buffer);
fclose(out);
return 0;
}