霍夫曼編碼及檔案壓縮

阿新 • • 發佈：2021-01-22

霍夫曼編碼

哈夫曼編碼(Huffman Coding)，又稱霍夫曼編碼，是一種編碼方式，哈夫曼編碼是可變字長編碼(VLC)的一種。Huffman於1952年提出一種編碼方法，該方法完全依據字元出現概率來構造異字頭的平均長度最短的碼字，有時稱之為最佳編碼，一般就叫做Huffman編碼（有時也稱為霍夫曼編碼）----源自百度百科

以字串“this is a test”為例，生成霍夫曼編碼的步驟如下：

計算各字元的權重，這裡直接用字元出現的次數表示。

t 3
h 1
i 2
s 3
\space  3
a 1
e 1

根據各字元的權重生成霍夫曼樹

14
|---8
|   |---t(3)
|   |---5
|       |---i(2)
|       |---3
|           |---h(1)
|           |---2
|               |---e(1)
|               |---a(1)
|---6
|   |---\space(3)
|   |---s(3)

根據霍夫曼樹生成霍夫曼編碼

t 10
h 1110
i 110
s 01
\space  00
a 11111
e 11110

霍夫曼編碼檔案壓縮

以上得到霍夫曼編碼後可以對檔案進行壓縮，繼續以字串“this is a test”為例，步驟如下：

使用霍夫曼編碼代替字串中的字元：

10111011001001100100111110010111100110

對不滿8位的進行填充

1011101100100110010011111001011110011000

將替換填充好的程式碼放入壓縮檔案中即實現了檔案壓縮

10111011 00100110 01001111 10010111 10011000
———————— ———————— ———————— ———————— ————————
  0xbb      0x26   0x4f      0x97     0x98

經過以上壓縮，檔案由14位元組轉為了5位元組，解壓即為逆過程。

檔案壓縮和解壓具體實現

#include<stdio.h> 
#include<string.h> 

#include<stdlib.h>

typedef struct{
	unsigned char ch;
	long int weight;
	int left,right,parent;
}node;

/* Maps one byte value to its Huffman code. */
typedef struct{
	/* Byte value being encoded. */
	unsigned char ch;
	/* Heap-allocated, NUL-terminated string of '0'/'1' characters (built by genHuffmanCode). */
	char*cd;
}code;

/* Number of distinct byte values found by getWeight. */
int count;
/*
 * len:      number of code bits produced by compress; unCompress reads it to
 *           know how many bits of the compressed file are meaningful.
 * sumBytes: size in bytes of the input file, set by getWeight.
 */
long int len,sumBytes;
/* Frequency records; only the first `count` entries are in use. */
node huffmanNode[256];
/* Heap array of 2*count-1 tree nodes, (re)allocated by createHuffmanTree. */
node*huffmanTree;
/* Heap array of `count` code entries, allocated by genHuffmanCode. */
code*huffmanCode;

/*
 * Count how often each byte value occurs in the file at filePath.
 *
 * Results go into file-scope state:
 *   huffmanNode[0..count-1]  one record per distinct byte, in order of first
 *                            appearance, with its occurrence count in .weight
 *   count                    number of distinct byte values seen
 *   sumBytes                 number of bytes read from the file
 *
 * Returns 0 on success, -1 if the file cannot be opened or a read fails.
 */
int getWeight(char*filePath){
	FILE *fp=fopen(filePath,"rb");
	if(fp==NULL){
		printf("can not open file %s\n",filePath);
		return -1;
	}

	/* slot[b] is the index of byte b in huffmanNode, or -1 if not seen yet. */
	int slot[256];
	for(int i=0;i<256;i++)
		slot[i]=-1;

	count=0;
	sumBytes=0;
	memset(huffmanNode,0,sizeof(node)*256);

	/*
	 * Read until EOF instead of trusting a size obtained up front: fgetc
	 * returns int so that EOF can be told apart from the byte 0xFF, and a
	 * short read never gets counted as data.
	 */
	int c;
	while((c=fgetc(fp))!=EOF){
		sumBytes++;
		if(slot[c]<0){
			slot[c]=count;
			huffmanNode[count].ch=(unsigned char)c;
			huffmanNode[count].weight=1;
			count++;
		}
		else{
			huffmanNode[slot[c]].weight++;
		}
	}

	if(ferror(fp)){
		printf("can not read file %s\n",filePath);
		fclose(fp);
		return -1;
	}
	fclose(fp);
	return 0;
}

void createHuffmanTree(){
	int min1,min2;
	int x1,x2,i,j;
	
	huffmanTree=(node*)realloc(huffmanTree,(2*count-1)*sizeof(node));
	for(i=0;i<count;i++){
		huffmanTree[i].ch=huffmanNode[i].ch;
		huffmanTree[i].weight=huffmanNode[i].weight;
		huffmanTree[i].left=huffmanTree[i].right=huffmanTree[i].parent=-1;
	}
	for(;i<2*count-1;i++){
		huffmanTree[i].ch=0;
		huffmanTree[i].weight=0;
		huffmanTree[i].left=huffmanTree[i].right=huffmanTree[i].parent=-1;
	}
	
	for(i=count;i<2*count-1;i++){
		min1=min2=999999;
		x1=x2=0;
		for(j=0;j<i;j++){
			if(huffmanTree[j].parent==-1&&huffmanTree[j].weight<=min1){
				min2=min1;x2=x1;
				min1=huffmanTree[j].weight;x1=j;
			}
			else if(huffmanTree[j].parent==-1&&huffmanTree[j].weight<=min2){
				min2=huffmanTree[j].weight;x2=j;
			}
		}
		huffmanTree[x1].parent=huffmanTree[x2].parent=i;
		huffmanTree[i].left=x1;huffmanTree[i].right=x2;
		huffmanTree[i].weight=min1+min2;
	}
}

void genHuffmanCode(){
	if(huffmanTree==NULL){
		printf("huffman tree is null! can not generate huffman code\n");
		return;
	}
	
	int cur,p,start;
	char*temp=(char*)malloc(sizeof(char)*(count+1));
	temp[count]='\0';	
	huffmanCode=(code*)malloc(sizeof(code)*count);
	for(int i=0;i<count;i++){
		cur=i;
		p=huffmanTree[i].parent;
		start=count;
		while(p!=-1){
			if(huffmanTree[p].left==cur)temp[--start]='0';
			else temp[--start]='1';
			cur=p;p=huffmanTree[cur].parent;
		}
		huffmanCode[i].ch=huffmanTree[i].ch;
		huffmanCode[i].cd=(char*)malloc(sizeof(char)*(count-start+1));
		strcpy(huffmanCode[i].cd,&temp[start]);
	}
}

int compress(char*file_in,char*file_out){
	len=0;
	int i,j,k=0,sum;
	unsigned char ch,temp[264];
	
	if(getWeight(file_in)<0)return -1;
	createHuffmanTree();
	genHuffmanCode();

	FILE*fp_in=fopen(file_in,"rb");
	FILE*fp_out=fopen(file_out,"wb");
	if(fp_out==NULL){
		printf("can not create %s!\n",file_out);
		return -1;
	}
	
	long int flag=0;
	while(flag<sumBytes){
		flag++;
		ch=fgetc(fp_in);
		for(i=0;i<count;i++){
			if(huffmanCode[i].ch==ch){
				len+=strlen(huffmanCode[i].cd);
				for(j=0;j<strlen(huffmanCode[i].cd);j++)
					temp[k++]=huffmanCode[i].cd[j]-'0';
					
				while(k>=8){
					sum=0;
					for(j=0;j<8;j++)sum=sum*2+(temp[j]&0x1);
					for(j=8;j<k;j++)temp[j-8]=temp[j];
					k=j-8;
					fputc(sum,fp_out);
					fflush(fp_out);
				}
				break;
			}
		}
	}
	if(k){
		sum=0;
		for(j=0;j<k;j++)sum=sum*2+(temp[j]&0x1);
		sum=sum<<(8-k);
		fputc(sum,fp_out);
		fflush(fp_out);
	}
	fclose(fp_in);fclose(fp_out);
	printf("壓縮完畢！\n");
	return 0;
}

/*
 * Decode file_in (as written by compress) into file_out.
 *
 * The compressed file carries no header, so decoding depends on state left
 * behind by compress in this process: huffmanCode/count provide the code
 * table and the global `len` gives the number of meaningful bits. `len` is
 * only read here, so the call can be repeated.
 *
 * Bits are consumed most-significant first. After every bit the pending bit
 * string is compared against the table; because Huffman codes are
 * prefix-free, the first exact match is the only possible symbol.
 *
 * Returns 0 on success, -1 if a file cannot be opened, the input ends early,
 * the bit stream matches no code, or writing fails.
 */
int unCompress(char*file_in,char*file_out){
	FILE*fp_in=fopen(file_in,"rb");
	if(fp_in==NULL){
		perror("file");
		return -1;
	}
	FILE*fp_out=fopen(file_out,"wb");
	if(fp_out==NULL){
		perror("file");
		fclose(fp_in);
		return -1;
	}

	int rc=-1;
	char pending[512];	/* bits read so far that do not yet form a code */
	int used=0;
	size_t cdLen[256];	/* cached strlen of each code */
	long int remaining=len;

	if(remaining>0){
		if(huffmanCode==NULL||count<=0||count>256){
			printf("no huffman code available, can not decode %s\n",file_in);
			goto out;
		}
		for(int i=0;i<count;i++)
			cdLen[i]=strlen(huffmanCode[i].cd);
	}

	while(remaining>0){
		int c=fgetc(fp_in);
		if(c==EOF){
			printf("unexpected end of file %s\n",file_in);
			goto out;
		}
		/* The final byte may hold fewer than 8 meaningful bits. */
		int valid=remaining<8?(int)remaining:8;
		remaining-=valid;

		for(int bit=0;bit<valid;bit++){
			if(used==(int)sizeof(pending)){
				/* Longer than any possible code: the data is not ours. */
				printf("invalid compressed data in %s\n",file_in);
				goto out;
			}
			pending[used++]=(char)(((c>>(7-bit))&0x01)+'0');

			for(int i=0;i<count;i++){
				if(cdLen[i]==(size_t)used&&
				   memcmp(huffmanCode[i].cd,pending,cdLen[i])==0){
					if(fputc(huffmanCode[i].ch,fp_out)==EOF){
						perror("file");
						goto out;
					}
					used=0;
					break;
				}
			}
		}
	}
	rc=0;

out:
	fclose(fp_in);
	/* fclose flushes buffered output, so its result matters for fp_out. */
	if(fclose(fp_out)!=0&&rc==0){
		perror("file");
		rc=-1;
	}
	if(rc==0)
		printf("解壓完畢！\n");
	return rc;
}

/*
 * Demo driver: compress test.txt, then decompress the result again.
 * Decompression is skipped when compression fails, because unCompress
 * depends on the state compress leaves behind. Exits non-zero on failure.
 */
int main(void){
	if(compress("test.txt","test.txt.myZip")!=0)
		return EXIT_FAILURE;
	if(unCompress("test.txt.myZip","test_uzip.txt")!=0)
		return EXIT_FAILURE;
	return 0;
}