霍夫曼編碼及檔案壓縮
阿新 • • 發佈:2021-01-22
霍夫曼編碼
哈夫曼編碼(Huffman Coding),又稱霍夫曼編碼,是一種編碼方式,哈夫曼編碼是可變字長編碼(VLC)的一種。Huffman於1952年提出一種編碼方法,該方法完全依據字元出現概率來構造異字頭的平均長度最短的碼字,有時稱之為最佳編碼,一般就叫做Huffman編碼(有時也稱為霍夫曼編碼)----源自百度百科
以字串“this is a test”為例,生成霍夫曼編碼的步驟如下:
- 計算各字元的權重,這裡直接用字元出現的次數表示。
t 3
h 1
i 2
s 3
\space 3
a 1
e 1
- 根據各字元的權重生成霍夫曼樹
14
|---8
|   |---t(3)
|   |---5
|       |---i(2)
|       |---3
|           |---h(1)
|           |---2
|               |---e(1)
|               |---a(1)
|---6
|   |---\space(3)
|   |---s(3)
- 根據霍夫曼樹生成霍夫曼編碼
t 10
h 1110
i 110
s 01
\space 00
a 11111
e 11110
霍夫曼編碼檔案壓縮
以上得到霍夫曼編碼後可以對檔案進行壓縮,繼續以字串“this is a test”為例,步驟如下:
- 使用霍夫曼編碼代替字串中的字元:
10111011001001100100111110010111100110
- 對不滿8位的進行填充
1011101100100110010011111001011110011000
- 將替換填充好的程式碼放入壓縮檔案中即實現了檔案壓縮
10111011 00100110 01001111 10010111 10011000
———————— ———————— ———————— ———————— ————————
0xbb 0x26 0x4f 0x97 0x98
經過以上壓縮,檔案由14位元組轉為了5位元組,解壓即為逆過程。
檔案壓縮和解壓具體實現
#include<stdio.h>
#include<string.h>
#include<stdlib.h>
/*
 * One entry of the frequency table and of the Huffman tree array.
 * Tree links are array indices; -1 marks "no such link".
 */
typedef struct {
    unsigned char ch;   /* byte value represented by a leaf */
    long int weight;    /* occurrence count; for inner nodes, sum of both children */
    int left;           /* index of the left child, or -1 */
    int right;          /* index of the right child, or -1 */
    int parent;         /* index of the parent node, or -1 */
} node;
/* Maps one byte value to its Huffman code. */
typedef struct {
    unsigned char ch;   /* byte value being encoded */
    char *cd;           /* NUL-terminated string of '0'/'1' characters */
} code;
/* Number of distinct byte values recorded by getWeight(). */
int count;

/* Total number of code bits emitted by compress(); read back by unCompress(). */
long int len;

/* Size in bytes of the file most recently scanned by getWeight(). */
long int sumBytes;

/* Frequency table; only the first `count` entries are in use. */
node huffmanNode[256];

/* Heap-allocated tree array built by createHuffmanTree(). */
node *huffmanTree;

/* Heap-allocated code table built by genHuffmanCode(). */
code *huffmanCode;
/*
 * Scan the file at filePath and build the byte-frequency table.
 *
 * On success huffmanNode[0..count-1] holds one entry per distinct byte value,
 * in order of first appearance, `count` is the number of distinct values and
 * `sumBytes` is the number of bytes that were actually read.
 *
 * Returns 0 on success, -1 if the file cannot be opened, measured or read.
 */
int getWeight(char*filePath){
    FILE *fp = fopen(filePath, "rb");
    if (fp == NULL) {
        printf("can not open file %s\n", filePath);
        return -1;
    }

    /* slot[b] is the index of byte b inside huffmanNode, or -1 if unseen.
       This replaces a linear search per input byte while keeping the same
       first-appearance ordering of the table. */
    int slot[256];
    for (int b = 0; b < 256; b++) {
        slot[b] = -1;
    }

    count = 0;
    sumBytes = 0;
    memset(huffmanNode, 0, sizeof(node) * 256);

    /* Determine the file size; every step can fail (e.g. on a pipe). */
    long int size = -1;
    if (fseek(fp, 0, SEEK_END) == 0) {
        size = ftell(fp);
    }
    if (size < 0 || fseek(fp, 0, SEEK_SET) != 0) {
        printf("can not read file %s\n", filePath);
        fclose(fp);
        return -1;
    }

    long int seen = 0;
    int c;
    /* Stop at EOF as well, so a short read is never counted as 0xFF bytes. */
    while (seen < size && (c = fgetc(fp)) != EOF) {
        int idx = slot[c];
        if (idx < 0) {
            idx = count++;
            slot[c] = idx;
            huffmanNode[idx].ch = (unsigned char)c;
        }
        huffmanNode[idx].weight++;
        seen++;
    }

    if (ferror(fp)) {
        printf("can not read file %s\n", filePath);
        fclose(fp);
        return -1;
    }
    fclose(fp);

    /* Keep the byte total consistent with the weights that were counted. */
    sumBytes = seen;
    return 0;
}
void createHuffmanTree(){
int min1,min2;
int x1,x2,i,j;
huffmanTree=(node*)realloc(huffmanTree,(2*count-1)*sizeof(node));
for(i=0;i<count;i++){
huffmanTree[i].ch=huffmanNode[i].ch;
huffmanTree[i].weight=huffmanNode[i].weight;
huffmanTree[i].left=huffmanTree[i].right=huffmanTree[i].parent=-1;
}
for(;i<2*count-1;i++){
huffmanTree[i].ch=0;
huffmanTree[i].weight=0;
huffmanTree[i].left=huffmanTree[i].right=huffmanTree[i].parent=-1;
}
for(i=count;i<2*count-1;i++){
min1=min2=999999;
x1=x2=0;
for(j=0;j<i;j++){
if(huffmanTree[j].parent==-1&&huffmanTree[j].weight<=min1){
min2=min1;x2=x1;
min1=huffmanTree[j].weight;x1=j;
}
else if(huffmanTree[j].parent==-1&&huffmanTree[j].weight<=min2){
min2=huffmanTree[j].weight;x2=j;
}
}
huffmanTree[x1].parent=huffmanTree[x2].parent=i;
huffmanTree[i].left=x1;huffmanTree[i].right=x2;
huffmanTree[i].weight=min1+min2;
}
}
void genHuffmanCode(){
if(huffmanTree==NULL){
printf("huffman tree is null! can not generate huffman code\n");
return;
}
int cur,p,start;
char*temp=(char*)malloc(sizeof(char)*(count+1));
temp[count]='\0';
huffmanCode=(code*)malloc(sizeof(code)*count);
for(int i=0;i<count;i++){
cur=i;
p=huffmanTree[i].parent;
start=count;
while(p!=-1){
if(huffmanTree[p].left==cur)temp[--start]='0';
else temp[--start]='1';
cur=p;p=huffmanTree[cur].parent;
}
huffmanCode[i].ch=huffmanTree[i].ch;
huffmanCode[i].cd=(char*)malloc(sizeof(char)*(count-start+1));
strcpy(huffmanCode[i].cd,&temp[start]);
}
}
int compress(char*file_in,char*file_out){
len=0;
int i,j,k=0,sum;
unsigned char ch,temp[264];
if(getWeight(file_in)<0)return -1;
createHuffmanTree();
genHuffmanCode();
FILE*fp_in=fopen(file_in,"rb");
FILE*fp_out=fopen(file_out,"wb");
if(fp_out==NULL){
printf("can not create %s!\n",file_out);
return -1;
}
long int flag=0;
while(flag<sumBytes){
flag++;
ch=fgetc(fp_in);
for(i=0;i<count;i++){
if(huffmanCode[i].ch==ch){
len+=strlen(huffmanCode[i].cd);
for(j=0;j<strlen(huffmanCode[i].cd);j++)
temp[k++]=huffmanCode[i].cd[j]-'0';
while(k>=8){
sum=0;
for(j=0;j<8;j++)sum=sum*2+(temp[j]&0x1);
for(j=8;j<k;j++)temp[j-8]=temp[j];
k=j-8;
fputc(sum,fp_out);
fflush(fp_out);
}
break;
}
}
}
if(k){
sum=0;
for(j=0;j<k;j++)sum=sum*2+(temp[j]&0x1);
sum=sum<<(8-k);
fputc(sum,fp_out);
fflush(fp_out);
}
fclose(fp_in);fclose(fp_out);
printf("壓縮完畢!\n");
return 0;
}
/*
 * Decode file_in (written by compress()) into file_out.
 *
 * Decoding uses the global code table `huffmanCode`, the entry count
 * `count` and the bit total `len`, all of which must still hold the values
 * produced by the matching compress() call. `len` is only read, so the
 * function may be called more than once.
 *
 * Bits are taken most-significant-bit first; only the first `len` bits of
 * the file are meaningful, the rest of the last byte is padding.
 *
 * Returns 0 on success, -1 if a file cannot be opened, the code table is
 * missing, the input ends early or does not match the table, or writing
 * fails.
 */
int unCompress(char*file_in,char*file_out){
    size_t cdLen[256];      /* cached length of every code string */
    char window[512];       /* undecoded bits as '0'/'1' characters */
    int used = 0;           /* number of valid characters in window */
    long int remaining = len;
    int rc = 0;

    /* Open the input first so a missing input does not truncate the output
       and perror() reports the error of the call that actually failed. */
    FILE *fp_in = fopen(file_in, "rb");
    if (fp_in == NULL) {
        perror("file");
        return -1;
    }
    FILE *fp_out = fopen(file_out, "wb");
    if (fp_out == NULL) {
        perror("file");
        fclose(fp_in);
        return -1;
    }

    if (remaining > 0 && (huffmanCode == NULL || count <= 0 || count > 256)) {
        printf("no huffman code table available for %s\n", file_in);
        fclose(fp_in);
        fclose(fp_out);
        return -1;
    }
    for (int i = 0; remaining > 0 && i < count; i++) {
        cdLen[i] = strlen(huffmanCode[i].cd);
    }

    while (remaining > 0 && rc == 0) {
        int c = fgetc(fp_in);
        if (c == EOF) {
            printf("unexpected end of %s\n", file_in);
            rc = -1;
            break;
        }

        /* The last byte may carry fewer than 8 meaningful bits. */
        int take = (remaining >= 8) ? 8 : (int)remaining;
        remaining -= 8;

        if (used + take > (int)sizeof window) {
            /* Only reachable when the bits match no code in the table. */
            printf("%s does not match the huffman code table\n", file_in);
            rc = -1;
            break;
        }
        for (int b = 0; b < take; b++) {
            window[used++] = (char)(((c >> (7 - b)) & 0x01) + '0');
        }

        /* Emit symbols for as long as some code is a prefix of the window. */
        for (;;) {
            int i;
            for (i = 0; i < count; i++) {
                size_t n = cdLen[i];
                /* n > 0 guards against an empty code matching forever. */
                if (n > 0 && (size_t)used >= n &&
                    memcmp(huffmanCode[i].cd, window, n) == 0) {
                    if (fputc(huffmanCode[i].ch, fp_out) == EOF) {
                        printf("can not write %s!\n", file_out);
                        rc = -1;
                    }
                    memmove(window, window + n, (size_t)used - n);
                    used -= (int)n;
                    break;
                }
            }
            if (i == count || rc != 0) {
                break;
            }
        }
    }

    fclose(fp_in);
    /* fclose flushes buffered output, so its result matters for writes. */
    if (fclose(fp_out) != 0 && rc == 0) {
        printf("can not write %s!\n", file_out);
        rc = -1;
    }
    if (rc == 0) {
        printf("解壓完畢!\n");
    }
    return rc;
}
/*
 * Demo driver: compress test.txt into test.txt.myZip, then decode it again
 * into test_uzip.txt. Both steps share in-memory state, so decoding is
 * skipped when compression fails.
 *
 * Returns 0 when both steps succeed, 1 otherwise.
 */
int main(){
    if (compress("test.txt","test.txt.myZip") != 0) {
        return 1;
    }
    if (unCompress("test.txt.myZip","test_uzip.txt") != 0) {
        return 1;
    }
    return 0;
}
程式碼效果:
壓縮演算法目前可用於壓縮txt,jpg,pdf等各類檔案,但是對除txt外的檔案的壓縮效果不好,且壓縮檔案存在大小限制,估計不超過2M,僅比較適用於txt文件的壓縮,有很大優化空間。