forked from CTCaer/hekate
547 lines
17 KiB
C
547 lines
17 KiB
C
|
//
|
||
|
// Name: lz.c
|
||
|
// Author: Marcus Geelnard
|
||
|
// Description: LZ77 coder/decoder implementation.
|
||
|
// Reentrant: Yes
|
||
|
// $ATH_LICENSE_NULL$
|
||
|
//
|
||
|
// The LZ77 compression scheme is a substitutional compression scheme
|
||
|
// proposed by Abraham Lempel and Jakob Ziv in 1977. It is very simple in
|
||
|
// its design, and uses no fancy bit level compression.
|
||
|
//
|
||
|
// This is my first attempt at an implementation of a LZ77 coder/decoder.
|
||
|
//
|
||
|
// The principle of the LZ77 compression algorithm is to store repeated
|
||
|
// occurrences of strings as references to previous occurrences of the same
|
||
|
// string. The point is that the reference consumes less space than the
|
||
|
// string itself, provided that the string is long enough (in this
|
||
|
// implementation, the string has to be at least 4 bytes long, since the
|
||
|
// minimum coded reference is 3 bytes long). Also note that the term
|
||
|
// "string" refers to any kind of byte sequence (it does not have to be
|
||
|
// an ASCII string, for instance).
|
||
|
//
|
||
|
// The coder uses a brute force approach to finding string matches in the
|
||
|
// history buffer (or "sliding window", if you wish), which is very, very
|
||
|
// slow. I reckon the complexity is somewhere between O(n^2) and O(n^3),
|
||
|
// depending on the input data.
|
||
|
//
|
||
|
// There is also a faster implementation that uses a large working buffer
|
||
|
// in which a "jump table" is stored, which is used to quickly find
|
||
|
// possible string matches (see the source code for LZ_CompressFast() for
|
||
|
// more information). The faster method is an order of magnitude faster,
|
||
|
// but still quite slow compared to other compression methods.
|
||
|
//
|
||
|
// The upside is that decompression is very fast, and the compression ratio
|
||
|
// is often very good.
|
||
|
//
|
||
|
// The reference to a string is coded as a (length,offset) pair, where the
|
||
|
// length indicates the length of the string, and the offset gives the
|
||
|
// offset from the current data position. To distinguish between string
|
||
|
// references and literal strings (uncompressed bytes), a string reference
|
||
|
// is preceded by a marker byte, which is chosen as the least common byte
|
||
|
// symbol in the input data stream (this marker byte is stored in the
|
||
|
// output stream as the first byte).
|
||
|
//
|
||
|
// Occurrences of the marker byte in the stream are encoded as the marker
|
||
|
// byte followed by a zero byte, which means that occurrences of the marker
|
||
|
// byte have to be coded with two bytes.
|
||
|
//
|
||
|
// The lengths and offsets are coded in a variable length fashion, allowing
|
||
|
// values of any magnitude (up to 4294967295 in this implementation).
|
||
|
//
|
||
|
// With this compression scheme, the worst case compression result is
|
||
|
// (257/256)*insize + 1.
|
||
|
//
|
||
|
//------------------------------------------------------------------------
|
||
|
// Copyright (c) 2003-2006 Marcus Geelnard
|
||
|
//
|
||
|
// This software is provided 'as-is', without any express or implied
|
||
|
// warranty. In no event will the authors be held liable for any damages
|
||
|
// arising from the use of this software.
|
||
|
//
|
||
|
// Permission is granted to anyone to use this software for any purpose,
|
||
|
// including commercial applications, and to alter it and redistribute it
|
||
|
// freely, subject to the following restrictions:
|
||
|
//
|
||
|
// 1. The origin of this software must not be misrepresented; you must not
|
||
|
// claim that you wrote the original software. If you use this software
|
||
|
// in a product, an acknowledgment in the product documentation would
|
||
|
// be appreciated but is not required.
|
||
|
//
|
||
|
// 2. Altered source versions must be plainly marked as such, and must not
|
||
|
// be misrepresented as being the original software.
|
||
|
//
|
||
|
// 3. This notice may not be removed or altered from any source
|
||
|
// distribution.
|
||
|
//
|
||
|
// Marcus Geelnard
|
||
|
// marcus.geelnard at home.se
|
||
|
//
|
||
|
|
||
|
//
|
||
|
// This file has been altered from the original version.
|
||
|
//
|
||
|
|
||
|
/*************************************************************************
|
||
|
* Constants used for LZ77 coding
|
||
|
*************************************************************************/
|
||
|
|
||
|
/* Largest back-reference distance the compressors will consider (can be
   any size < 2^31). Smaller values make compression faster, larger values
   make it compress better. The default of 100000 is on the high side, so
   experiment to find what suits your data. */
#define LZ_MAX_OFFSET 100000
|
||
|
|
||
|
|
||
|
|
||
|
/*************************************************************************
|
||
|
* INTERNAL FUNCTIONS *
|
||
|
*************************************************************************/
|
||
|
|
||
|
|
||
|
/*************************************************************************
* _LZ_StringCompare() - Measure how far two byte sequences agree.
*  str1, str2 - The two byte sequences to compare.
*  minlen     - Index at which the comparison starts; bytes before this
*               index are not examined.
*  maxlen     - Upper bound for the comparison; index maxlen itself is
*               never read.
* Returns the index of the first differing byte at or after minlen, or
* maxlen if no difference is found before it. If minlen >= maxlen no
* bytes are compared and minlen is returned unchanged.
*************************************************************************/

static unsigned int _LZ_StringCompare( unsigned char * str1,
  unsigned char * str2, unsigned int minlen, unsigned int maxlen )
{
  unsigned int matched = minlen;

  while( matched < maxlen )
  {
    if( str1[ matched ] != str2[ matched ] )
    {
      break;
    }
    ++ matched;
  }

  return matched;
}
|
||
|
|
||
|
|
||
|
/*************************************************************************
* _LZ_WriteVarSize() - Write unsigned integer with variable number of
*                      bytes depending on value.
*  x   - Value to encode.
*  buf - Destination; receives between one and five bytes (for a 32-bit
*        unsigned int).
* The value is split into 7-bit groups which are stored most significant
* group first. Every byte except the last one has its top bit set.
* Returns the number of bytes written.
*************************************************************************/

static int _LZ_WriteVarSize( unsigned int x, unsigned char * buf )
{
  unsigned int remaining;
  int count, shift;

  /* One byte is always emitted; every further 7-bit group that is in
     use costs one more. */
  count = 1;
  for( remaining = x >> 7; remaining != 0; remaining >>= 7 )
  {
    ++ count;
  }

  /* Leading groups carry the continuation flag (0x80)... */
  for( shift = 7 * (count - 1); shift > 0; shift -= 7 )
  {
    *buf ++ = (unsigned char) (((x >> shift) & 0x0000007f) | 0x00000080);
  }

  /* ...and the final, least significant group does not. */
  *buf = (unsigned char) (x & 0x0000007f);

  return count;
}
|
||
|
|
||
|
|
||
|
/*************************************************************************
* _LZ_ReadVarSize() - Read unsigned integer with variable number of
*                     bytes depending on value.
*  x   - Receives the decoded value.
*  buf - Source bytes: 7 data bits per byte, most significant group
*        first, top bit set on every byte except the last.
* Returns the number of bytes consumed.
* Note: reading continues until a byte with a clear top bit is found;
* there is no length limit, so buf must contain such a byte.
*************************************************************************/

static int _LZ_ReadVarSize( unsigned int * x, unsigned char * buf )
{
  unsigned int value = 0;
  unsigned int consumed = 0;
  unsigned int byte;

  for( ;; )
  {
    byte = (unsigned int) buf[ consumed ++ ];

    /* Make room for, then append, the next seven payload bits */
    value = (value << 7) | (byte & 0x0000007f);

    /* A clear top bit marks the final byte of the value */
    if( (byte & 0x00000080) == 0 )
    {
      break;
    }
  }

  *x = value;

  return (int) consumed;
}
|
||
|
|
||
|
|
||
|
|
||
|
/*************************************************************************
|
||
|
* PUBLIC FUNCTIONS *
|
||
|
*************************************************************************/
|
||
|
|
||
|
|
||
|
/*************************************************************************
|
||
|
* LZ_Compress() - Compress a block of data using an LZ77 coder.
|
||
|
* in - Input (uncompressed) buffer.
|
||
|
* out - Output (compressed) buffer. This buffer must be 0.4% larger
|
||
|
* than the input buffer, plus one byte.
|
||
|
* insize - Number of input bytes.
|
||
|
* The function returns the size of the compressed data.
|
||
|
*************************************************************************/
|
||
|
|
||
|
int LZ_Compress( unsigned char *in, unsigned char *out,
|
||
|
unsigned int insize )
|
||
|
{
|
||
|
unsigned char marker, symbol;
|
||
|
unsigned int inpos, outpos, bytesleft, i;
|
||
|
unsigned int maxoffset, offset, bestoffset;
|
||
|
unsigned int maxlength, length, bestlength;
|
||
|
unsigned int histogram[ 256 ];
|
||
|
unsigned char *ptr1, *ptr2;
|
||
|
|
||
|
/* Do we have anything to compress? */
|
||
|
if( insize < 1 )
|
||
|
{
|
||
|
return 0;
|
||
|
}
|
||
|
|
||
|
/* Create histogram */
|
||
|
for( i = 0; i < 256; ++ i )
|
||
|
{
|
||
|
histogram[ i ] = 0;
|
||
|
}
|
||
|
for( i = 0; i < insize; ++ i )
|
||
|
{
|
||
|
++ histogram[ in[ i ] ];
|
||
|
}
|
||
|
|
||
|
/* Find the least common byte, and use it as the marker symbol */
|
||
|
marker = 0;
|
||
|
for( i = 1; i < 256; ++ i )
|
||
|
{
|
||
|
if( histogram[ i ] < histogram[ marker ] )
|
||
|
{
|
||
|
marker = i;
|
||
|
}
|
||
|
}
|
||
|
|
||
|
/* Remember the marker symbol for the decoder */
|
||
|
out[ 0 ] = marker;
|
||
|
|
||
|
/* Start of compression */
|
||
|
inpos = 0;
|
||
|
outpos = 1;
|
||
|
|
||
|
/* Main compression loop */
|
||
|
bytesleft = insize;
|
||
|
do
|
||
|
{
|
||
|
/* Determine most distant position */
|
||
|
if( inpos > LZ_MAX_OFFSET ) maxoffset = LZ_MAX_OFFSET;
|
||
|
else maxoffset = inpos;
|
||
|
|
||
|
/* Get pointer to current position */
|
||
|
ptr1 = &in[ inpos ];
|
||
|
|
||
|
/* Search history window for maximum length string match */
|
||
|
bestlength = 3;
|
||
|
bestoffset = 0;
|
||
|
for( offset = 3; offset <= maxoffset; ++ offset )
|
||
|
{
|
||
|
/* Get pointer to candidate string */
|
||
|
ptr2 = &ptr1[ -(int)offset ];
|
||
|
|
||
|
/* Quickly determine if this is a candidate (for speed) */
|
||
|
if( (ptr1[ 0 ] == ptr2[ 0 ]) &&
|
||
|
(ptr1[ bestlength ] == ptr2[ bestlength ]) )
|
||
|
{
|
||
|
/* Determine maximum length for this offset */
|
||
|
maxlength = (bytesleft < offset ? bytesleft : offset);
|
||
|
|
||
|
/* Count maximum length match at this offset */
|
||
|
length = _LZ_StringCompare( ptr1, ptr2, 0, maxlength );
|
||
|
|
||
|
/* Better match than any previous match? */
|
||
|
if( length > bestlength )
|
||
|
{
|
||
|
bestlength = length;
|
||
|
bestoffset = offset;
|
||
|
}
|
||
|
}
|
||
|
}
|
||
|
|
||
|
/* Was there a good enough match? */
|
||
|
if( (bestlength >= 8) ||
|
||
|
((bestlength == 4) && (bestoffset <= 0x0000007f)) ||
|
||
|
((bestlength == 5) && (bestoffset <= 0x00003fff)) ||
|
||
|
((bestlength == 6) && (bestoffset <= 0x001fffff)) ||
|
||
|
((bestlength == 7) && (bestoffset <= 0x0fffffff)) )
|
||
|
{
|
||
|
out[ outpos ++ ] = (unsigned char) marker;
|
||
|
outpos += _LZ_WriteVarSize( bestlength, &out[ outpos ] );
|
||
|
outpos += _LZ_WriteVarSize( bestoffset, &out[ outpos ] );
|
||
|
inpos += bestlength;
|
||
|
bytesleft -= bestlength;
|
||
|
}
|
||
|
else
|
||
|
{
|
||
|
/* Output single byte (or two bytes if marker byte) */
|
||
|
symbol = in[ inpos ++ ];
|
||
|
out[ outpos ++ ] = symbol;
|
||
|
if( symbol == marker )
|
||
|
{
|
||
|
out[ outpos ++ ] = 0;
|
||
|
}
|
||
|
-- bytesleft;
|
||
|
}
|
||
|
}
|
||
|
while( bytesleft > 3 );
|
||
|
|
||
|
/* Dump remaining bytes, if any */
|
||
|
while( inpos < insize )
|
||
|
{
|
||
|
if( in[ inpos ] == marker )
|
||
|
{
|
||
|
out[ outpos ++ ] = marker;
|
||
|
out[ outpos ++ ] = 0;
|
||
|
}
|
||
|
else
|
||
|
{
|
||
|
out[ outpos ++ ] = in[ inpos ];
|
||
|
}
|
||
|
++ inpos;
|
||
|
}
|
||
|
|
||
|
return outpos;
|
||
|
}
|
||
|
|
||
|
|
||
|
/*************************************************************************
|
||
|
* LZ_CompressFast() - Compress a block of data using an LZ77 coder.
|
||
|
* in - Input (uncompressed) buffer.
|
||
|
* out - Output (compressed) buffer. This buffer must be 0.4% larger
|
||
|
* than the input buffer, plus one byte.
|
||
|
* insize - Number of input bytes.
|
||
|
* work - Pointer to a temporary buffer (internal working buffer), which
|
||
|
* must be able to hold (insize+65536) unsigned integers.
|
||
|
* The function returns the size of the compressed data.
|
||
|
*************************************************************************/
|
||
|
|
||
|
int LZ_CompressFast( unsigned char *in, unsigned char *out,
|
||
|
unsigned int insize, unsigned int *work )
|
||
|
{
|
||
|
unsigned char marker, symbol;
|
||
|
unsigned int inpos, outpos, bytesleft, i, index, symbols;
|
||
|
unsigned int offset, bestoffset;
|
||
|
unsigned int maxlength, length, bestlength;
|
||
|
unsigned int histogram[ 256 ], *lastindex, *jumptable;
|
||
|
unsigned char *ptr1, *ptr2;
|
||
|
|
||
|
/* Do we have anything to compress? */
|
||
|
if( insize < 1 )
|
||
|
{
|
||
|
return 0;
|
||
|
}
|
||
|
|
||
|
/* Assign arrays to the working area */
|
||
|
lastindex = work;
|
||
|
jumptable = &work[ 65536 ];
|
||
|
|
||
|
/* Build a "jump table". Here is how the jump table works:
|
||
|
jumptable[i] points to the nearest previous occurrence of the same
|
||
|
symbol pair as in[i]:in[i+1], so in[i] == in[jumptable[i]] and
|
||
|
in[i+1] == in[jumptable[i]+1], and so on... Following the jump table
|
||
|
gives a dramatic boost for the string search'n'match loop compared
|
||
|
to doing a brute force search. The jump table is built in O(n) time,
|
||
|
so it is a cheap operation in terms of time, but it is expensice in
|
||
|
terms of memory consumption. */
|
||
|
for( i = 0; i < 65536; ++ i )
|
||
|
{
|
||
|
lastindex[ i ] = 0xffffffff;
|
||
|
}
|
||
|
for( i = 0; i < insize-1; ++ i )
|
||
|
{
|
||
|
symbols = (((unsigned int)in[i]) << 8) | ((unsigned int)in[i+1]);
|
||
|
index = lastindex[ symbols ];
|
||
|
lastindex[ symbols ] = i;
|
||
|
jumptable[ i ] = index;
|
||
|
}
|
||
|
jumptable[ insize-1 ] = 0xffffffff;
|
||
|
|
||
|
/* Create histogram */
|
||
|
for( i = 0; i < 256; ++ i )
|
||
|
{
|
||
|
histogram[ i ] = 0;
|
||
|
}
|
||
|
for( i = 0; i < insize; ++ i )
|
||
|
{
|
||
|
++ histogram[ in[ i ] ];
|
||
|
}
|
||
|
|
||
|
/* Find the least common byte, and use it as the marker symbol */
|
||
|
marker = 0;
|
||
|
for( i = 1; i < 256; ++ i )
|
||
|
{
|
||
|
if( histogram[ i ] < histogram[ marker ] )
|
||
|
{
|
||
|
marker = i;
|
||
|
}
|
||
|
}
|
||
|
|
||
|
/* Remember the marker symbol for the decoder */
|
||
|
out[ 0 ] = marker;
|
||
|
|
||
|
/* Start of compression */
|
||
|
inpos = 0;
|
||
|
outpos = 1;
|
||
|
|
||
|
/* Main compression loop */
|
||
|
bytesleft = insize;
|
||
|
do
|
||
|
{
|
||
|
/* Get pointer to current position */
|
||
|
ptr1 = &in[ inpos ];
|
||
|
|
||
|
/* Search history window for maximum length string match */
|
||
|
bestlength = 3;
|
||
|
bestoffset = 0;
|
||
|
index = jumptable[ inpos ];
|
||
|
while( (index != 0xffffffff) && ((inpos - index) < LZ_MAX_OFFSET) )
|
||
|
{
|
||
|
/* Get pointer to candidate string */
|
||
|
ptr2 = &in[ index ];
|
||
|
|
||
|
/* Quickly determine if this is a candidate (for speed) */
|
||
|
if( ptr2[ bestlength ] == ptr1[ bestlength ] )
|
||
|
{
|
||
|
/* Determine maximum length for this offset */
|
||
|
offset = inpos - index;
|
||
|
maxlength = (bytesleft < offset ? bytesleft : offset);
|
||
|
|
||
|
/* Count maximum length match at this offset */
|
||
|
length = _LZ_StringCompare( ptr1, ptr2, 2, maxlength );
|
||
|
|
||
|
/* Better match than any previous match? */
|
||
|
if( length > bestlength )
|
||
|
{
|
||
|
bestlength = length;
|
||
|
bestoffset = offset;
|
||
|
}
|
||
|
}
|
||
|
|
||
|
/* Get next possible index from jump table */
|
||
|
index = jumptable[ index ];
|
||
|
}
|
||
|
|
||
|
/* Was there a good enough match? */
|
||
|
if( (bestlength >= 8) ||
|
||
|
((bestlength == 4) && (bestoffset <= 0x0000007f)) ||
|
||
|
((bestlength == 5) && (bestoffset <= 0x00003fff)) ||
|
||
|
((bestlength == 6) && (bestoffset <= 0x001fffff)) ||
|
||
|
((bestlength == 7) && (bestoffset <= 0x0fffffff)) )
|
||
|
{
|
||
|
out[ outpos ++ ] = (unsigned char) marker;
|
||
|
outpos += _LZ_WriteVarSize( bestlength, &out[ outpos ] );
|
||
|
outpos += _LZ_WriteVarSize( bestoffset, &out[ outpos ] );
|
||
|
inpos += bestlength;
|
||
|
bytesleft -= bestlength;
|
||
|
}
|
||
|
else
|
||
|
{
|
||
|
/* Output single byte (or two bytes if marker byte) */
|
||
|
symbol = in[ inpos ++ ];
|
||
|
out[ outpos ++ ] = symbol;
|
||
|
if( symbol == marker )
|
||
|
{
|
||
|
out[ outpos ++ ] = 0;
|
||
|
}
|
||
|
-- bytesleft;
|
||
|
}
|
||
|
}
|
||
|
while( bytesleft > 3 );
|
||
|
|
||
|
/* Dump remaining bytes, if any */
|
||
|
while( inpos < insize )
|
||
|
{
|
||
|
if( in[ inpos ] == marker )
|
||
|
{
|
||
|
out[ outpos ++ ] = marker;
|
||
|
out[ outpos ++ ] = 0;
|
||
|
}
|
||
|
else
|
||
|
{
|
||
|
out[ outpos ++ ] = in[ inpos ];
|
||
|
}
|
||
|
++ inpos;
|
||
|
}
|
||
|
|
||
|
return outpos;
|
||
|
}
|
||
|
|
||
|
|
||
|
/*************************************************************************
* LZ_Uncompress() - Uncompress a block of data using an LZ77 decoder.
*  in     - Input (compressed) buffer.
*  out    - Output (uncompressed) buffer. This buffer must be large
*           enough to hold the uncompressed data (the function has no
*           way of checking this).
*  insize - Number of input bytes.
* The function returns the number of bytes written to out.
*
* Decoding stops early, returning the number of bytes produced so far,
* when the stream is detected to be malformed: a marker byte at the very
* end of the input, a back-reference whose encoding runs past insize, or
* a back-reference with an offset of zero or an offset that points before
* the start of the output.
*************************************************************************/

int LZ_Uncompress( unsigned char *in, unsigned char *out,
  unsigned int insize )
{
  unsigned char marker, symbol;
  unsigned int  i, inpos, outpos, length, offset;

  /* Do we have anything to uncompress? */
  if( insize < 1 )
  {
    return 0;
  }

  /* Get marker symbol from input stream */
  marker = in[ 0 ];
  inpos = 1;

  /* Main decompression loop. The condition is tested before the first
     read so that an input holding only the marker byte yields no output
     instead of reading in[ 1 ]. */
  outpos = 0;
  while( inpos < insize )
  {
    symbol = in[ inpos ++ ];
    if( symbol != marker )
    {
      /* No marker, plain copy */
      out[ outpos ++ ] = symbol;
      continue;
    }

    /* We had a marker byte; it must be followed by at least one byte */
    if( inpos >= insize )
    {
      break;
    }

    if( in[ inpos ] == 0 )
    {
      /* It was a single occurrence of the marker byte */
      out[ outpos ++ ] = marker;
      ++ inpos;
      continue;
    }

    /* Extract true length and offset. _LZ_ReadVarSize() itself is not
       bounded by insize, so verify after each value that the encoding
       did not run past the input. */
    inpos += _LZ_ReadVarSize( &length, &in[ inpos ] );
    if( inpos >= insize )
    {
      break;
    }
    inpos += _LZ_ReadVarSize( &offset, &in[ inpos ] );
    if( inpos > insize )
    {
      break;
    }

    /* The reference must point into data that has already been
       produced; otherwise out[ outpos - offset ] would wrap around and
       read outside the output buffer. */
    if( (offset == 0) || (offset > outpos) )
    {
      break;
    }

    /* Copy corresponding data from history window */
    for( i = 0; i < length; ++ i )
    {
      out[ outpos ] = out[ outpos - offset ];
      ++ outpos;
    }
  }

  return (int) outpos;
}
|