1. 程式人生 > 其它 >霍夫曼編碼及檔案壓縮

霍夫曼編碼及檔案壓縮

技術標籤:雜記 演算法

霍夫曼編碼

哈夫曼編碼(Huffman Coding),又稱霍夫曼編碼,是一種編碼方式,哈夫曼編碼是可變字長編碼(VLC)的一種。Huffman於1952年提出一種編碼方法,該方法完全依據字元出現概率來構造異字頭的平均長度最短的碼字,有時稱之為最佳編碼,一般就叫做Huffman編碼(有時也稱為霍夫曼編碼)----源自百度百科

以字串“this is a test”為例,生成霍夫曼編碼的步驟如下:

  1. 計算各字元的權重,這裡直接用字元出現的次數表示。
t 3
h 1
i 2
s 3
\space  3
a 1
e 1
  2. 根據各字元的權重生成霍夫曼樹
14
|---8
|   |---t(3)
|   |---5
|       |---i(2)
|       |---3
|           |---h(1)
|           |---2
|               |---e(1)
|               |---a(1)
|---6
    |---\space(3)
    |---s(3)
  3. 根據霍夫曼樹生成霍夫曼編碼
t 10
h 1110
i 110
s 01
\space  00
a 11111
e 11110

霍夫曼編碼檔案壓縮

以上得到霍夫曼編碼後可以對檔案進行壓縮,繼續以字串“this is a test”為例,步驟如下:

  1. 使用霍夫曼編碼代替字串中的字元:
10111011001001100100111110010111100110
  2. 對末尾不滿8位的部分以0填充
1011101100100110010011111001011110011000
  3. 將替換填充好的程式碼放入壓縮檔案中即實現了檔案壓縮
10111011 00100110 01001111 10010111 10011000
———————— ———————— ———————— ———————— ————————
  0xbb     0x26     0x4f     0x97     0x98

經過以上壓縮,檔案由14位元組轉為了5位元組,解壓即為逆過程。

檔案壓縮和解壓具體實現

#include<stdio.h> 
#include<string.h>
#include <stdlib.h>

/*
 * Huffman-coding file compressor / decompressor (demo program).
 *
 * compress() makes two passes over the input: the first counts byte
 * frequencies, the second replaces every byte by its Huffman code and packs
 * the resulting bit stream into bytes (most significant bit first, final byte
 * zero-padded).
 *
 * Limitation: the compressed file carries no header.  The tree and the exact
 * bit length live only in the globals below, so unCompress() can only decode
 * a file produced by compress() earlier in the same process.
 */

/* A Huffman tree node; also used as a (byte value, frequency) record. */
typedef struct {
    unsigned char ch;        /* byte value; meaningful for leaves only */
    long int weight;         /* occurrence count, or sum of both children */
    int left, right, parent; /* indices into huffmanTree, -1 when absent */
} node;

/* One code-table entry: a byte value and its code as a string of '0'/'1'. */
typedef struct {
    unsigned char ch;
    char *cd;
} code;

int count;              /* number of distinct byte values in the input */
long int len, sumBytes; /* len: encoded length in bits; sumBytes: input size in bytes */
node huffmanNode[256];  /* (byte, frequency) pairs in order of first appearance */
node *huffmanTree;      /* 2*count-1 nodes: leaves first, root last */
code *huffmanCode;      /* count entries, parallel to the leaves of huffmanTree */

/* Number of entries currently allocated in huffmanCode (needed to free it). */
static int huffmanCodeEntries;

/* Release the current code table, if any. */
static void freeHuffmanCode(void)
{
    if (huffmanCode != NULL) {
        for (int i = 0; i < huffmanCodeEntries; i++) {
            free(huffmanCode[i].cd);
        }
        free(huffmanCode);
    }
    huffmanCode = NULL;
    huffmanCodeEntries = 0;
}

/*
 * Count how often each byte value occurs in the file at filePath.
 *
 * Fills huffmanNode[0..count-1] in order of first appearance (this order
 * determines tie-breaking in the tree, and therefore the codes) and sets
 * count and sumBytes.
 *
 * Returns 0 on success, -1 if the file cannot be opened or read.
 */
int getWeight(const char *filePath)
{
    FILE *fp = fopen(filePath, "rb");
    if (fp == NULL) {
        printf("can not open file %s\n", filePath);
        return -1;
    }

    long int freq[256] = {0};
    unsigned char firstSeen[256];
    int c;

    count = 0;
    sumBytes = 0;
    memset(huffmanNode, 0, sizeof huffmanNode);

    /* fgetc returns int so that EOF can be told apart from the byte 0xFF. */
    while ((c = fgetc(fp)) != EOF) {
        if (freq[c]++ == 0) {
            firstSeen[count++] = (unsigned char)c;
        }
        sumBytes++;
    }

    int readFailed = ferror(fp);
    fclose(fp);
    if (readFailed) {
        printf("can not read file %s\n", filePath);
        return -1;
    }

    for (int i = 0; i < count; i++) {
        huffmanNode[i].ch = firstSeen[i];
        huffmanNode[i].weight = freq[firstSeen[i]];
    }
    return 0;
}

/*
 * Build the Huffman tree for the frequencies in huffmanNode[0..count-1].
 *
 * On success huffmanTree holds 2*count-1 nodes (leaves at 0..count-1, root at
 * 2*count-2).  If count is 0 or memory runs out, huffmanTree is set to NULL.
 */
void createHuffmanTree(void)
{
    if (count <= 0) {
        free(huffmanTree);
        huffmanTree = NULL;
        return;
    }

    int total = 2 * count - 1;
    /* Keep the old pointer until realloc succeeds so it is never leaked. */
    node *tree = realloc(huffmanTree, (size_t)total * sizeof *tree);
    if (tree == NULL) {
        printf("out of memory while building huffman tree\n");
        free(huffmanTree);
        huffmanTree = NULL;
        return;
    }
    huffmanTree = tree;

    for (int i = 0; i < total; i++) {
        tree[i].ch = (i < count) ? huffmanNode[i].ch : 0;
        tree[i].weight = (i < count) ? huffmanNode[i].weight : 0;
        tree[i].left = tree[i].right = tree[i].parent = -1;
    }

    for (int i = count; i < total; i++) {
        /*
         * Find the two lightest parentless nodes.  Index -1 means "none yet",
         * which avoids a magic upper bound on the weight.  The "<=" makes
         * later nodes win ties; the resulting codes depend on that.
         */
        int x1 = -1, x2 = -1;
        for (int j = 0; j < i; j++) {
            if (tree[j].parent != -1) {
                continue;
            }
            if (x1 == -1 || tree[j].weight <= tree[x1].weight) {
                x2 = x1;
                x1 = j;
            } else if (x2 == -1 || tree[j].weight <= tree[x2].weight) {
                x2 = j;
            }
        }
        tree[x1].parent = tree[x2].parent = i;
        tree[i].left = x1;
        tree[i].right = x2;
        tree[i].weight = tree[x1].weight + tree[x2].weight;
    }
}

/*
 * Derive the code of every leaf by walking from the leaf up to the root
 * ('0' for a left edge, '1' for a right edge) and store the table in
 * huffmanCode.  Any previous table is freed.  On allocation failure
 * huffmanCode is left NULL.
 */
void genHuffmanCode(void)
{
    if (huffmanTree == NULL) {
        printf("huffman tree is null! can not generate huffman code\n");
        return;
    }

    freeHuffmanCode();

    /* A code is at most count-1 bits long (1 bit when count == 1). */
    char *temp = malloc((size_t)count + 1);
    code *table = calloc((size_t)count, sizeof *table);
    if (temp == NULL || table == NULL) {
        printf("out of memory while generating huffman code\n");
        free(temp);
        free(table);
        return;
    }
    temp[count] = '\0';
    huffmanCode = table;
    huffmanCodeEntries = count;

    for (int i = 0; i < count; i++) {
        int start = count;
        int cur = i;
        int p = huffmanTree[cur].parent;

        while (p != -1) {
            temp[--start] = (huffmanTree[p].left == cur) ? '0' : '1';
            cur = p;
            p = huffmanTree[cur].parent;
        }
        /* A lone symbol is the root itself; give it a 1-bit code so that
           the data does not encode to zero bits. */
        if (start == count) {
            temp[--start] = '0';
        }

        size_t codeLen = (size_t)(count - start);
        table[i].ch = huffmanTree[i].ch;
        table[i].cd = malloc(codeLen + 1);
        if (table[i].cd == NULL) {
            printf("out of memory while generating huffman code\n");
            free(temp);
            freeHuffmanCode();
            return;
        }
        memcpy(table[i].cd, &temp[start], codeLen + 1);
    }

    free(temp);
}

/*
 * Compress file_in into file_out.
 *
 * Sets the global len to the number of significant bits written; the final
 * byte is zero-padded.  The tree, code table and len are kept in memory
 * because unCompress() needs them.
 *
 * Returns 0 on success, -1 on any I/O or allocation failure.
 */
int compress(const char *file_in, const char *file_out)
{
    len = 0;
    if (getWeight(file_in) < 0) {
        return -1;
    }

    if (count > 0) {
        createHuffmanTree();
        genHuffmanCode();
        if (huffmanTree == NULL || huffmanCode == NULL) {
            return -1;
        }
    } else {
        /* Empty input: nothing to encode, drop any stale table. */
        freeHuffmanCode();
    }

    /* Direct byte -> code lookup instead of a linear search per input byte. */
    const char *lookup[256] = {0};
    for (int i = 0; i < count; i++) {
        lookup[huffmanCode[i].ch] = huffmanCode[i].cd;
    }

    FILE *fp_in = fopen(file_in, "rb");
    if (fp_in == NULL) {
        printf("can not open file %s\n", file_in);
        return -1;
    }
    FILE *fp_out = fopen(file_out, "wb");
    if (fp_out == NULL) {
        printf("can not create %s!\n", file_out);
        fclose(fp_in);
        return -1;
    }

    int rc = 0;
    unsigned int acc = 0; /* bits collected for the next output byte */
    int nbits = 0;        /* number of bits currently in acc (0..7) */
    int c;

    while (rc == 0 && (c = fgetc(fp_in)) != EOF) {
        const char *cd = lookup[c];
        if (cd == NULL) {
            /* Byte not seen during the counting pass: the file changed. */
            printf("can not read file %s\n", file_in);
            rc = -1;
            break;
        }
        for (; *cd != '\0'; cd++) {
            acc = (acc << 1) | (unsigned int)(*cd - '0');
            len++;
            if (++nbits == 8) {
                if (fputc((int)acc, fp_out) == EOF) {
                    rc = -1;
                    break;
                }
                acc = 0;
                nbits = 0;
            }
        }
    }

    if (rc == 0 && ferror(fp_in)) {
        printf("can not read file %s\n", file_in);
        rc = -1;
    }
    /* Left-align the remaining bits and pad the last byte with zeros. */
    if (rc == 0 && nbits > 0 && fputc((int)(acc << (8 - nbits)), fp_out) == EOF) {
        rc = -1;
    }

    fclose(fp_in);
    /* fclose flushes buffered data, so its result matters for the output. */
    if (fclose(fp_out) != 0) {
        rc = -1;
    }

    if (rc != 0) {
        len = 0;
        return -1;
    }
    printf("壓縮完畢!\n");
    return 0;
}

/*
 * Decompress file_in into file_out using the Huffman tree and bit length
 * (len) left in memory by the preceding compress() call.
 *
 * The global len is not modified, so the call can be repeated.
 *
 * Returns 0 on success, -1 on I/O failure, truncated input, or when no tree
 * is available.
 */
int unCompress(const char *file_in, const char *file_out)
{
    if (len > 0 && (huffmanTree == NULL || count <= 0)) {
        printf("huffman tree is null! can not uncompress\n");
        return -1;
    }

    FILE *fp_in = fopen(file_in, "rb");
    if (fp_in == NULL) {
        perror("file");
        return -1;
    }
    FILE *fp_out = fopen(file_out, "wb");
    if (fp_out == NULL) {
        perror("file");
        fclose(fp_in);
        return -1;
    }

    int rc = 0;
    long int remaining = len; /* significant bits still to decode */
    int root = 2 * count - 2;
    int cur = root;

    while (rc == 0 && remaining > 0) {
        int byte = fgetc(fp_in);
        if (byte == EOF) {
            printf("unexpected end of file %s\n", file_in);
            rc = -1;
            break;
        }

        /* Only the leading bits of the final byte carry data. */
        int usable = (remaining < 8) ? (int)remaining : 8;
        for (int bit = 7; bit > 7 - usable; bit--) {
            /* With a single symbol the root is the leaf: every bit is one
               symbol.  Otherwise follow the edge selected by the bit. */
            if (count > 1) {
                cur = ((byte >> bit) & 1) ? huffmanTree[cur].right
                                          : huffmanTree[cur].left;
            }
            if (huffmanTree[cur].left == -1) {
                if (fputc(huffmanTree[cur].ch, fp_out) == EOF) {
                    rc = -1;
                    break;
                }
                cur = root;
            }
        }
        remaining -= usable;
    }

    fclose(fp_in);
    if (fclose(fp_out) != 0) {
        rc = -1;
    }

    if (rc != 0) {
        return -1;
    }
    printf("解壓完畢!\n");
    return 0;
}

/* Demo: compress test.txt, then restore it to test_uzip.txt. */
int main(void)
{
    if (compress("test.txt", "test.txt.myZip") != 0) {
        return EXIT_FAILURE;
    }
    if (unCompress("test.txt.myZip", "test_uzip.txt") != 0) {
        return EXIT_FAILURE;
    }
    return 0;
}

程式碼效果:
在這裡插入圖片描述

壓縮演算法目前可用於壓縮txt,jpg,pdf等各類檔案,但是對除txt外的檔案的壓縮效果不好,且壓縮檔案存在大小限制,估計不超過2M,僅比較適用於txt文件的壓縮,有很大優化空間。