LZ77 基本概述-繁依Fanyi

LZ77 编码简介

  • LZ 编码由以色列研究者 Jacob Ziv 和 Abraham Lempel 提出,是无损压缩的核心思想。LZ 是一个系列的算法,而其中最基本的就是两人在 1977年所发表的论文《A Universal Algorithm for Sequential Compression》 中提出的 LZ77 算法。
  • LZ77 压缩是一种基于字典及滑动窗口的无损压缩技术,广泛应用于通信传输。
  • LZ77 算法不同于 Huffman 编码等基于统计的数据压缩编码,需要得到先验知识——信源的字符频率,然后进行压缩。
  • LZ77的核心思想:利用数据的重复结构信息来进行数据压缩。

LZ77 的基本原理

  • LZ77 以经常出现的字母组合(或较长的字符串)构建字典中的数据项,并且使用较短的数字(或符号)编码来代替比较复杂的数据项。数据压缩时,将从待压缩数据中读入的源数据与字典中的数据项进行匹配,从中检索出相应的代码并输出。从而实现数据的压缩
  • 在 LZ77 方法中,词典就是先前已编码序列的一部分。编码器通过一个滑动窗口来查看输入序列,如下图所示。

在这里插入图片描述

  • 这个滑动窗口包括两个部分:查找缓冲区(Search Buffer) 和 先行缓冲区(Look Ahead Buffer)
    • 查找缓冲区:包含了最近已编码序列的一部分
    • 先行缓冲区:包含待编码序列的下一部分
      这里查找缓冲区包含了 8 个符号,先行缓冲区包含 7 个符号。但在实际情况中,缓冲区要大很多。

三元组参数解释:

  • ① 匹配指针 先在 查找缓冲区 中找到移动指针,知道找到与先行缓冲区第一个字符a相匹配字符a。此时该指针与先行缓冲区的距离称为 偏移量(off) 。这里的 偏移量off 就是7。
  • ② 编码器之后查看指针位置之后的符号,查看其是否与先行缓冲区的符号相匹配。从第一个符号(匹配指针以开始所指向的位置)开始,与先行缓冲区的符号匹配,匹配到的连续符号的长度称为 匹配长度(len)。例如这里,从匹配指针所指的位置开始的符号串 abra 与 先行缓冲区中的符号串 abra 相匹配,下一位查找缓冲区的 x 与 先行缓冲区 a 不匹配,所以这里的 匹配长度len 是 4.。
  • ③ 编码器在查找缓冲区中搜素最长匹配串。找到最长的匹配串后,编码器即可用三元组 对其进行编码。这里 off 是偏移量 ,len 是匹配长度,c 是先行缓冲区中跟在该匹配项串之后的符号的码字。例如这里 匹配串 是 abra,则先行缓冲区匹配串后的码字是 r

LZ77 算法

  • LZ77 算法执行流程如下:
    步骤 1:从输入的待压缩数据的起始位置,读取未编码的源数据,从滑动窗口的字典数据项中查找最长的匹配字符串。若结果为 T,则执行步骤 2,若结果为 F,则执行步骤 3;
    步骤 2:输出函数 F(off,len,c)。然后将窗口向后滑动到 len++,继续步骤 1;
    步骤 3:输出函数 F(0,0,c),其中 c 为下一个字符。并且窗口向后滑动(len + 1)个字符,执行步骤 1。

  • 下面是代码实现:

    /* PROG1.C                                                    */
    /* Simple Hashing LZ77 Sliding Dictionary Compression Program */
    /* By Rich Geldreich, Jr. October, 1993                       */
    /* Originally compiled with QuickC v2.5 in the small model.   */
    #include 
    #include 
    #include 
    #include 
    
    /* set this to 1 for a greedy encoder */
    #define GREEDY    0
    
    /* ratio vs. speed constant */
    /* the larger this constant, the better the compression */
    #define MAXCOMPARES 75
    
    /* unused entry flag */
    #define NIL       0xFFFF
    
    /* bits per symbol- normally 8 for general purpose compression */
    #define CHARBITS  8
    
    /* minimum match length & maximum match length */
    #define THRESHOLD 2
    #define MATCHBITS 4
    #define MAXMATCH  ((1 << MATCHBITS) + THRESHOLD - 1)
    
    /* sliding dictionary size and hash table's size */
    /* some combinations of HASHBITS and THRESHOLD values will not work
             correctly because of the way this program hashes strings */
    #define DICTBITS  13
    #define HASHBITS  10
    #define DICTSIZE  (1 << DICTBITS)
    #define HASHSIZE  (1 << HASHBITS)
    
    /* # bits to shift after each XOR hash */
    /* this constant must be high enough so that only THRESHOLD + 1
             characters are in the hash accumulator at one time */
    #define SHIFTBITS ((HASHBITS + THRESHOLD) / (THRESHOLD + 1))
    
    /* sector size constants */
    #define SECTORBIT 10
    #define SECTORLEN (1 << SECTORBIT)
    #define SECTORAND ((0xFFFF << SECTORBIT) & 0xFFFF)
    
    /* dictionary plus MAXMATCH extra chars for string comparisions */
    unsigned char
            dict[DICTSIZE + MAXMATCH];
    
    /* hashtable & link list table */
    unsigned int
            hash[HASHSIZE],
            nextlink[DICTSIZE];
    
    /* misc. global variables */
    unsigned int
            matchlength,
            matchpos,
            bitbuf,
            bitsin,
            masks[17] = {0,1,3,7,15,31,63,127,255,511,1023,2047,4095,8191,16383,32767,65535};
    
    FILE *infile, *outfile;
    
    /* writes multiple bit codes to the output stream */
    void SendBits(unsigned int bits, unsigned int numbits)
    {
    
            bitbuf |= (bits << bitsin);
    
            bitsin += numbits;
    
            if (bitsin > 16)         /* special case when # bits in buffer exceeds 16 */
            {
                    if (putc(bitbuf & 0xFF, outfile) == EOF)
                    {
                            printf("nerror writing to output file");
                            exit(EXIT_FAILURE);
                    }
                    bitbuf = bits >> (8 - (bitsin - numbits));
                    bitsin -= 8;
            }
    
            while (bitsin >= 8)
            {
                    if (putc(bitbuf & 0xFF, outfile) == EOF)
                    {
                            printf("nerror writing to output file");
                            exit(EXIT_FAILURE);
                    }
                    bitbuf >>= 8;
                    bitsin -= 8;
            }
    
    }
    
    /* reads multiple bit codes from the input stream */
    unsigned int ReadBits(unsigned int numbits)
    {
    
            register unsigned int i;
    
            i = bitbuf >> (8 - bitsin);
    
            while (numbits > bitsin)
            {
                    if ((bitbuf = getc(infile)) == EOF)
                    {
                            printf("nerror reading from input file");
                            exit(EXIT_FAILURE);
                    }
                    i |= (bitbuf << bitsin);
                    bitsin += 8;
            }
    
            bitsin -= numbits;
    
            return (i & masks[numbits]);
    
    }
    
    /* sends a match to the output stream */
    void SendMatch(unsigned int matchlen, unsigned int matchdistance)
    {
            SendBits(1, 1);
    
            SendBits(matchlen - (THRESHOLD + 1), MATCHBITS);
    
            SendBits(matchdistance, DICTBITS);
    }
    
    /* sends one character (or literal) to the output stream */
    void SendChar(unsigned int character)
    {
            SendBits(0, 1);
    
            SendBits(character, CHARBITS);
    }
    
    /* initializes the search structures needed for compression */
    void InitEncode(void)
    {
            register unsigned int i;
    
            for (i = 0; i < HASHSIZE; i++) hash[i] = NIL;
    
            nextlink[DICTSIZE] = NIL;
    
    }
    
    /* loads dictionary with characters from the input stream */
    unsigned int LoadDict(unsigned int dictpos)
    {
            register unsigned int i, j;
    
            if ((i = fread(&dict[dictpos], sizeof (char), SECTORLEN, infile)) == EOF)
            {
                    printf("nerror reading from input file");
                    exit(EXIT_FAILURE);
            }
    
            /* since the dictionary is a ring buffer, copy the characters at
                     the very start of the dictionary to the end */
            if (dictpos == 0)
            {
                    for (j = 0; j < MAXMATCH; j++) dict[j + DICTSIZE] = dict[j];
            }
    
            return i;
    }
    
    /* deletes data from the dictionary search structures */
    /* this is only done when the number of bytes to be
             compressed exceeds the dictionary's size */
    void DeleteData(unsigned int dictpos)
    {
    
            register unsigned int i, j;
    
            j = dictpos;        /* put dictpos in register for more speed */
    
            /* delete all references to the sector being deleted */
    
            for (i = 0; i < DICTSIZE; i++)
                    if ((nextlink[i] & SECTORAND) == j) nextlink[i] = NIL;
    
            for (i = 0; i < HASHSIZE; i++)
                    if ((hash[i] & SECTORAND) == j) hash[i] = NIL;
    
    }
    
    /* hash data just entered into dictionary */
    /* XOR hashing is used here, but practically any hash function will work */
    void HashData(unsigned int dictpos, unsigned int bytestodo)
    {
            register unsigned int i, j, k;
    
            if (bytestodo <= THRESHOLD)   /* not enough bytes in sector for match? */
                    for (i = 0; i < bytestodo; i++) nextlink[dictpos + i] = NIL;
            else
            {
                    /* matches can't cross sector boundries */
                    for (i = bytestodo - THRESHOLD; i < bytestodo; i++)
                            nextlink[dictpos + i] = NIL;
    
                    j = (((unsigned int)dict[dictpos]) << SHIFTBITS) ^ dict[dictpos + 1];
    
                    k = dictpos + bytestodo - THRESHOLD;  /* calculate end of sector */
    
                    for (i = dictpos; i < k; i++)
                    {
                            nextlink[i] = hash[j = (((j << SHIFTBITS) & (HASHSIZE - 1)) ^ dict[i + THRESHOLD])];
                            hash[j] = i;
                    }
            }
    }
    
    /* finds match for string at position dictpos */
    /* this search code finds the longest AND closest
             match for the string at dictpos */
    void FindMatch(unsigned int dictpos, unsigned int startlen)
    {
            register unsigned int i, j, k;
            unsigned char l;
    
            i = dictpos; matchlength = startlen; k = MAXCOMPARES;
            l = dict[dictpos + matchlength];
    
            do
            {
                    if ((i = nextlink[i]) == NIL) return;   /* get next string in list */
    
                    if (dict[i + matchlength] == l)        /* possible larger match? */
                    {
                            for (j = 0; j < MAXMATCH; j++)          /* compare strings */
                                    if (dict[dictpos + j] != dict[i + j]) break;
    
                            if (j > matchlength)  /* found larger match? */
                            {
                                    matchlength = j;
                                    matchpos = i;
                                    if (matchlength == MAXMATCH) return;  /* exit if largest possible match */
                                    l = dict[dictpos + matchlength];
                            }
                    }
            }
            while (--k);  /* keep on trying until we run out of chances */
    
    }
    
    /* finds dictionary matches for characters in current sector */
    void DictSearch(unsigned int dictpos, unsigned int bytestodo)
    {
    
            register unsigned int i, j;
    
    #if (GREEDY == 0)
    
            unsigned int matchlen1, matchpos1;
    
            /* non-greedy search loop (slow) */
    
            i = dictpos; j = bytestodo;
    
            while (j) /* loop while there are still characters left to be compressed */
            {
                    FindMatch(i, THRESHOLD);
    
                    if (matchlength > THRESHOLD)
                    {
                            matchlen1 = matchlength;
                            matchpos1 = matchpos;
    
                            for ( ; ; )
                            {
                                    FindMatch(i + 1, matchlen1);
    
                                    if (matchlength > matchlen1)
                                    {
                                            matchlen1 = matchlength;
                                            matchpos1 = matchpos;
                                            SendChar(dict[i++]);
                                            j--;
                                    }
                                    else
                                    {
                                            if (matchlen1 > j)
                                            {
                                                    matchlen1 = j;
                                                    if (matchlen1 <= THRESHOLD) { SendChar(dict[i++]); j--; break; }
                                            }
    
                                            SendMatch(matchlen1, (i - matchpos1) & (DICTSIZE - 1));
                                            i += matchlen1;
                                            j -= matchlen1;
                                            break;
                                    }
                            }
                    }
                    else
                    {
                            SendChar(dict[i++]);
                            j--;
                    }
            }
    
    #else
    
            /* greedy search loop (fast) */
    
            i = dictpos; j = bytestodo;
    
            while (j) /* loop while there are still characters left to be compressed */
            {
                    FindMatch(i, THRESHOLD);
    
                    if (matchlength > j) matchlength = j;     /* clamp matchlength */
    
                    if (matchlength > THRESHOLD)  /* valid match? */
                    {
                            SendMatch(matchlength, (i - matchpos) & (DICTSIZE - 1));
                            i += matchlength;
                            j -= matchlength;
                    }
                    else
                    {
                            SendChar(dict[i++]);
                            j--;
                    }
            }
    
    #endif
    
    }
    
    /* main encoder */
    void Encode (void)
    {
            unsigned int dictpos, deleteflag, sectorlen;
            unsigned long bytescompressed;
    
            InitEncode();
    
            dictpos = deleteflag = 0;
    
            bytescompressed = 0;
    
            while (1)
            {
                    /* delete old data from dictionary */
                    if (deleteflag) DeleteData(dictpos);
    
                    /* grab more data to compress */
                    if ((sectorlen = LoadDict(dictpos)) == 0) break;
    
                    /* hash the data */
                    HashData(dictpos, sectorlen);
    
                    /* find dictionary matches */
                    DictSearch(dictpos, sectorlen);
    
                    bytescompressed += sectorlen;
    
                    printf("r%ld", bytescompressed);
    
                    dictpos += SECTORLEN;
    
                    /* wrap back to beginning of dictionary when its full */
                    if (dictpos == DICTSIZE)
                    {
                            dictpos = 0;
                            deleteflag = 1;   /* ok to delete now */
                    }
            }
    
            /* Send EOF flag */
            SendMatch(MAXMATCH + 1, 0);
    
            /* Flush bit buffer */
            if (bitsin) SendBits(0, 8 - bitsin);
    
            return;
    }
    
    /* main decoder */
    void Decode (void)
    {
    
            register unsigned int i, j, k;
            unsigned long bytesdecompressed;
    
            i = 0;
            bytesdecompressed = 0;
    
            for ( ; ; )
            {
                    if (ReadBits(1) == 0)   /* character or match? */
                    {
                            dict[i++] = ReadBits(CHARBITS);
                            if (i == DICTSIZE)
                            {
                                    if (fwrite(&dict, sizeof (char), DICTSIZE, outfile) == EOF)
                                    {
                                            printf("nerror writing to output file");
                                            exit(EXIT_FAILURE);
                                    }
                                    i = 0;
                                    bytesdecompressed += DICTSIZE;
                                    printf("r%ld", bytesdecompressed);
                            }
                    }
                    else
                    {
                            /* get match length from input stream */
                            k = (THRESHOLD + 1) + ReadBits(MATCHBITS);
    
                            if (k == (MAXMATCH + 1))      /* Check for EOF flag */
                            {
                                    if (fwrite(&dict, sizeof (char), i, outfile) == EOF)
                                    {
                                            printf("nerror writing to output file");
                                            exit(EXIT_FAILURE);
                                    }
                                    bytesdecompressed += i;
                                    printf("r%ld", bytesdecompressed);
                                    return;
                            }
    
                            /* get match position from input stream */
                            j = ((i - ReadBits(DICTBITS)) & (DICTSIZE - 1));
    
                            if ((i + k) >= DICTSIZE)
                            {
                                    do
                                    {
                                            dict[i++] = dict[j++];
                                            j &= (DICTSIZE - 1);
                                            if (i == DICTSIZE)
                                            {
                                                    if (fwrite(&dict, sizeof (char), DICTSIZE, outfile) == EOF)
                                                    {
                                                            printf("nerror writing to output file");
                                                            exit(EXIT_FAILURE);
                                                    }
                                                    i = 0;
                                                    bytesdecompressed += DICTSIZE;
                                                    printf("r%ld", bytesdecompressed);
                                            }
                                    }
                                    while (--k);
                            }
                            else
                            {
                                    if ((j + k) >= DICTSIZE)
                                    {
                                            do
                                            {
                                                    dict[i++] = dict[j++];
                                                    j &= (DICTSIZE - 1);
                                            }
                                            while (--k);
                                    }
                                    else
                                    {
                                            do
                                            {
                                                    dict[i++] = dict[j++];
                                            }
                                            while (--k);
                                    }
                            }
                    }
            }
    
    }
    
    int main(int argc, char *argv[])
    {
            char *s;
    
            if (argc != 4)
            {
                    printf("n'prog1 e file1 file2' encodes file1 into file2.n"
                                                     "'prog1 d file2 file1' decodes file2 into file1.n");
                    return EXIT_FAILURE;
            }
            if ((s = argv[1], s[1] || strpbrk(s, "DEde") == NULL)
             || (s = argv[2], (infile  = fopen(s, "rb")) == NULL)
             || (s = argv[3], (outfile = fopen(s, "wb")) == NULL)) {
                    printf("??? %sn", s);  return EXIT_FAILURE;
            }
    
            /* allocate 4k I/O buffers for a little speed */
            setvbuf( infile, NULL, _IOFBF, 4096);
            setvbuf( outfile, NULL, _IOFBF, 4096);
    
            if (toupper(*argv[1]) == 'E')
            {
                    printf("Compressing %s to %sn", argv[2], argv[3]);
                    Encode();
            }
            else
            {
                    printf("Decompressing %s to %sn", argv[2], argv[3]);
                    Decode();
            }
    
            fclose(infile);  fclose(outfile);
    
            return EXIT_SUCCESS;
    }
    

参考文献

© 版权声明
THE END
喜欢就支持一下吧
点赞11 分享
评论 抢沙发

请登录后发表评论

    暂无评论内容