Huffman編碼實現壓縮、解壓檔案
阿新 • • 發佈:2018-12-30
Huffman編碼:根據詞頻構建Huffman樹,實現對文字的字首編碼。
1、統計文字中每個字元出現的次數,放入優先佇列中,構建一棵空的二叉樹;
2、取出頻率最小的兩個字元a、b,字元a、b的頻率分別作為此二叉樹的左右結點,左結點的編號為0,右結點的編號為1(與下方程式碼一致:左子結點記為'0',右子結點記為'1'),其頻率之和(fa + fb)作為該二叉樹的父親節點,放入優先佇列,並將fa 、fb 從優先佇列中除去;
3、重複第二步操作,直至優先佇列中只剩下一個數,即為此Huffman樹的根節點。
4、從根節點到每個葉節點(文字中出現的字元)的“路徑”,即0、1序列串就是該字元的字首編碼。
注:這種編碼方式保證了,任意一個字元的編碼都不會是其他字元編碼的字首,這樣在解碼過程中就不會混淆。
資料結構:
為方便記錄每個字元的字首編碼,在構建Huffman樹過程中,需要儲存每一個結點的父親節點、左右兒子結點、葉節點對應字元、當前結點頻率。
壓縮過程:
1、首先構建Huffman樹,獲得每個字元對應的字首編碼;
2、將字元及其對應的字首編碼等壓縮資訊寫入壓縮文件中,便於解碼;
3、掃描文字,將文字中的字元轉換成0、1串,每八位,即一個位元組對應的字元儲存到壓縮檔案中。
注:如果最後儲存的0、1串不足八位,則在末尾補0,然後將補的位數資訊寫入壓縮檔案中。
解壓過程:
1、讀取壓縮資訊;
2、掃描壓縮文字,將每個字元轉化成0、1串,匹配字元的字首編碼,轉化成原始檔案。
注:解碼時需刪除之前補充的位數
一點體會:
1、總在迴圈內,動態申請陣列,會導致程式崩潰;
2、千萬不要在迴圈內,每次都呼叫strlen函式,我表示沒能深入瞭解此函式內涵,導致程式慢的要死;
3、原文字越大,壓縮率越高,對於一個2M的檔案,壓縮率大約在45%左右;
4、感謝領導傾情指點,比賽加油!
壓縮過程程式原始碼:
// [Article heading fused onto this line by extraction, kept verbatim:] 解壓過程程式原始碼:
// NOTE(review): that heading means "decompression program source", but the
// program below is the COMPRESSOR. The decompressor is the second program,
// which starts at the next `#include <iostream>`.
//
// Huffman file compressor.
//
// Layout of the compressed file (native byte order and native sizes of
// int / long long; the decompressor reads the fields back the same way):
//   offset 0            int   number of '0' bits appended to the last data byte
//   offset sizeof(int)  int   OFFSET, i.e. where the rest of the header starts
//   offset OFFSET       LL    size of the original file in bytes
//                       int   number of distinct byte values
//                       per distinct byte value:
//                           char  the byte
//                           int   length of its code
//                           the code itself as '0'/'1' characters
//   after the table     the encoded bit stream, 8 bits per byte, first bit in
//                       the most significant position
#include <iostream>
#include <fstream>
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <queue>
#include <vector>
#include <algorithm>
#include <time.h>
using namespace std;
typedef long long LL;

// capacity of the file-path buffers in main()
const int FILE_LENGTH = 1000;
// maximal number of bytes read from the input file per batch
const long long MAX_MEMORY = 3 * 1024 * 1024;
// size of the per-byte-value tables (only indices 0..255 are ever used)
const int KIND_OF_CHARACTER = 260;
// capacity of one Huffman code string, including the terminator
const int HUFFMAN_CODE_LENGTH = 1000;
// position of the original file size inside the compressed file
const int OFFSET = 20;
// the bit stream is stored 8 bits per byte
const int nBits = 8;

struct Node {
    char c;                      // byte value (meaningful for leaves)
    int parent, lChild, rChild;  // indices into the tree array
    int iNode;                   // index of this node in the tree array
    LL number;                   // frequency of the byte / sum of the subtree
    // Reversed on purpose: std::priority_queue then pops the node with the
    // SMALLEST frequency first.
    friend bool operator < (Node a, Node b) {
        return a.number > b.number;
    }
} node[KIND_OF_CHARACTER];

// HuffmanCode[b] is the code of byte value b as a NUL-terminated '0'/'1' string.
char HuffmanCode[KIND_OF_CHARACTER][HUFFMAN_CODE_LENGTH];

void CountKinds(); // for test
int BuildHuffmanTree();
void CompressFile(const char *filePath, const char *outPutFilePath, int numberOfNode);
void BitToInt(ofstream &outPut, char *HTstr, LL len);

// Counts byte frequencies of the (hard-coded) input file, builds the Huffman
// codes and writes the compressed file. Returns non-zero if the input cannot
// be opened.
int main() {
    char filePath[FILE_LENGTH] = "graph.txt"; // "Aesop_Fables.txt"; "graph.txt"; "1.txt";
    char compressFilePath[FILE_LENGTH] = "result.txt";

    ifstream readIn;
    readIn.open(filePath, ios::binary);
    if (!readIn.is_open()) {
        cout << "OPEN FAILED!" << endl;
        return EXIT_FAILURE;
    }

    // get size of file
    readIn.seekg(0, ios::end);
    const LL fileSize = static_cast<LL>(readIn.tellg());
    readIn.seekg(0, ios::beg);
    cout << "fileSize" << fileSize << endl;

    // Read the file in batches of at most MAX_MEMORY bytes. The batch buffer
    // is allocated once, and the byte count comes from the stream rather than
    // strlen(), so NUL bytes inside the file are counted like any other byte.
    vector<char> batch(static_cast<size_t>(MAX_MEMORY));
    LL remaining = fileSize;
    while (remaining > 0) {
        const LL wanted = min(remaining, MAX_MEMORY);
        readIn.read(&batch[0], static_cast<streamsize>(wanted));
        const LL got = static_cast<LL>(readIn.gcount());
        if (got <= 0) break;
        for (LL j = 0; j < got; j++) {
            // Index with the unsigned value: a plain char may be negative.
            const unsigned char byte = static_cast<unsigned char>(batch[j]);
            node[byte].number++;
            node[byte].c = static_cast<char>(byte);
        }
        remaining -= got;
    }
    readIn.close();
    // CountKinds();

    // build Huffman tree, then compress the file using the resulting codes
    const int numberOfNode = BuildHuffmanTree();
    CompressFile(filePath, compressFilePath, numberOfNode);
    return 0;
}

// Builds the Huffman tree from the frequencies in node[] and fills
// HuffmanCode[] for every byte value that occurs.
// Returns the number of distinct byte values (0 for an empty input).
int BuildHuffmanTree() {
    // Leaves occupy [0, numberOfNode); internal nodes are appended after them.
    // The vector value-initializes every node, so no field is left undefined.
    vector<Node> HT(2 * KIND_OF_CHARACTER);

    // put all kinds of character into priority queue
    priority_queue<Node> q;
    int numberOfNode = 0;
    for (int i = 0; i < KIND_OF_CHARACTER; i++) {
        if (node[i].number != 0) {
            node[i].iNode = numberOfNode;
            node[i].c = static_cast<char>(i);
            q.push(node[i]);
            HT[numberOfNode] = node[i];
            numberOfNode++;
        }
    }
    cout << numberOfNode << endl;
    if (numberOfNode == 0) {
        return 0; // empty input: there is no root to mark
    }

    int jNode = numberOfNode;
    while (q.size() > 1) {
        // take the two lightest nodes and hang them under a new parent
        const Node leftNode = q.top();
        q.pop();
        const Node rightNode = q.top();
        q.pop();
        const int l = leftNode.iNode;
        const int r = rightNode.iNode;
        HT[l].parent = jNode;
        HT[r].parent = jNode;
        HT[jNode].c = ' ';
        HT[jNode].iNode = jNode;
        HT[jNode].lChild = l;
        HT[jNode].rChild = r;
        HT[jNode].number = leftNode.number + rightNode.number;
        q.push(HT[jNode]);
        jNode++;
    }
    HT[jNode - 1].parent = -1; // the last node created is the root

    // Walk from each leaf up to the root: '0' when coming from a left child,
    // '1' otherwise. The walk yields the code backwards, so reverse it.
    for (int i = 0; i < numberOfNode; i++) {
        char *code = HuffmanCode[static_cast<unsigned char>(HT[i].c)];
        int k = 0;
        int l = i;
        for (int j = HT[i].parent; j != -1; j = HT[j].parent) {
            code[k++] = (HT[j].lChild == l) ? '0' : '1';
            l = j;
        }
        if (k == 0) {
            // Only one distinct byte value: the leaf is the root. Give it a
            // one-bit code so every input byte still produces output.
            code[k++] = '0';
        }
        reverse(code, code + k);
        code[k] = '\0';
        cout << HT[i].c << " " << code << endl;
    }
    cout << numberOfNode << endl;
    return numberOfNode;
}

// Writes the header, the code table and the encoded bit stream of `filePath`
// to `outPutFilePath`. `numberOfNode` is the number of distinct byte values,
// as returned by BuildHuffmanTree(). Exits with a failure status when a file
// cannot be opened or the output cannot be written.
void CompressFile(const char *filePath, const char *outPutFilePath, int numberOfNode) {
    // scan characters in input file once more
    ifstream readIn;
    readIn.open(filePath, ios::binary);
    if (!readIn.is_open()) {
        cout << "OPEN FAILED!" << endl;
        exit(EXIT_FAILURE);
    }
    ofstream outPut;
    outPut.open(outPutFilePath, ios::binary);
    if (!outPut.is_open()) {
        cout << "OPEN FAILED!" << endl;
        exit(EXIT_FAILURE);
    }

    // get size of file
    readIn.seekg(0, ios::end);
    const LL fileSize = static_cast<LL>(readIn.tellg());
    readIn.seekg(0, ios::beg);

    // Always write the fixed header. BitToInt() overwrites the padding count
    // later if the final byte needs padding; writing it here as well means
    // OFFSET is recorded even when the bit count is a multiple of nBits.
    const int noPadding = 0;
    outPut.write(reinterpret_cast<const char *>(&noPadding), sizeof(int));
    outPut.write(reinterpret_cast<const char *>(&OFFSET), sizeof(int));
    const int gap = OFFSET - 2 * static_cast<int>(sizeof(int));
    if (gap > 0) {
        const char zeroFill[OFFSET] = {0};
        outPut.write(zeroFill, gap); // explicit zeros instead of seeking past the end
    }
    outPut.write(reinterpret_cast<const char *>(&fileSize), sizeof(LL));
    outPut.write(reinterpret_cast<const char *>(&numberOfNode), sizeof(int));

    // record each character and its Huffman code
    int codeLength[KIND_OF_CHARACTER];
    for (int i = 0; i < KIND_OF_CHARACTER; i++) {
        codeLength[i] = static_cast<int>(strlen(HuffmanCode[i]));
        if (node[i].number != 0) {
            const char symbol = static_cast<char>(i);
            const int bits = codeLength[i];
            outPut.write(&symbol, sizeof(char));
            outPut.write(reinterpret_cast<const char *>(&bits), sizeof(int));
            outPut.write(HuffmanCode[i], bits);
        }
    }

    // Encode in batches. `bits` collects '0'/'1' characters; once it holds
    // more than MAX_MEMORY of them, all complete bytes are flushed and the
    // (fewer than nBits) left-over bits move to the front.
    vector<char> input(static_cast<size_t>(MAX_MEMORY));
    vector<char> bits(static_cast<size_t>(MAX_MEMORY) + HUFFMAN_CODE_LENGTH + nBits + 1);
    LL len = 0;
    LL remaining = fileSize;
    while (remaining > 0) {
        const LL wanted = min(remaining, MAX_MEMORY);
        readIn.read(&input[0], static_cast<streamsize>(wanted));
        const LL got = static_cast<LL>(readIn.gcount());
        if (got <= 0) break;
        for (LL j = 0; j < got; j++) {
            const unsigned char byte = static_cast<unsigned char>(input[j]);
            memcpy(&bits[len], HuffmanCode[byte], codeLength[byte]);
            len += codeLength[byte];
            if (len > MAX_MEMORY) {
                const LL leftBits = len % nBits;
                const LL changeLength = len - leftBits;
                BitToInt(outPut, &bits[0], changeLength);
                // the ranges may overlap, so memmove rather than strcpy
                memmove(&bits[0], &bits[changeLength], static_cast<size_t>(leftBits));
                len = leftBits;
            }
        }
        remaining -= got;
    }
    // flush what is left; BitToInt pads the last byte and records the padding
    if (len != 0) {
        BitToInt(outPut, &bits[0], len);
    }

    readIn.close();
    outPut.close();
    if (outPut.fail()) {
        cout << "WRITE FAILED!" << endl;
        exit(EXIT_FAILURE);
    }
}

// Packs `len` characters ('0'/'1') from HTstr into bytes, most significant bit
// first, and appends them to outPut. If len is not a multiple of nBits, the
// string is padded in place with '0' characters and the number of padding bits
// (plus OFFSET) is written at the start of the file. HTstr must therefore have
// room for at least len + nBits characters.
void BitToInt(ofstream &outPut, char *HTstr, LL len) {
    if (len <= 0) {
        return;
    }
    int bitsToAdd = 0;
    if (len % nBits != 0) {
        bitsToAdd = nBits - static_cast<int>(len % nBits);
        // record the padding in the header, then return to the write position
        const streampos pos = outPut.tellp();
        outPut.seekp(0, ios::beg);
        outPut.write(reinterpret_cast<const char *>(&bitsToAdd), sizeof(int));
        outPut.write(reinterpret_cast<const char *>(&OFFSET), sizeof(int));
        outPut.seekp(pos, ios::beg);
        for (int k = 0; k < bitsToAdd; k++) {
            HTstr[len + k] = '0';
        }
        HTstr[len + bitsToAdd] = '\0';
    }

    // Build each byte in an unsigned char so the stored value does not depend
    // on the byte order of a wider integer.
    const LL total = len + bitsToAdd;
    vector<char> packed(static_cast<size_t>(total / nBits));
    for (LL i = 0; i < total; i += nBits) {
        unsigned char byte = 0;
        for (int b = 0; b < nBits; b++) {
            byte = static_cast<unsigned char>((byte << 1) | (HTstr[i + b] == '1' ? 1 : 0));
        }
        packed[static_cast<size_t>(i / nBits)] = static_cast<char>(byte);
    }
    outPut.write(&packed[0], static_cast<streamsize>(packed.size()));
}

// Debug helper: prints every byte value that occurs together with its
// frequency, followed by the number of distinct byte values.
void CountKinds() {
    int kinds = 0;
    for (int i = 0; i < KIND_OF_CHARACTER; i++) {
        if (node[i].number != 0) {
            printf("%c ", node[i].c);
            cout << node[i].c << " " << node[i].number << endl;
            kinds++;
        }
    }
    cout << kinds << endl;
}
#include <algorithm>
#include <cstdlib>
#include <cstring>
#include <fstream>
#include <iostream>
#include <vector>
using namespace std;
// 64-bit integer used for file sizes and byte counts
typedef long long LL;

const long long MAX_MEMORY = 1024LL * 1024LL; // maximal number of bytes read from the file per batch
const int KIND_OF_CHARACTER = 1 << 8;         // number of kinds of character
const int HUFFMAN_CODE_LENGTH = 1000;         // the maximal length of a Huffman code
const int FILE_LENGTH = 1000;                 // capacity of the file-path buffers
// One entry of the code table stored in the compressed file's header.
struct Node {
    char c;                                // the character this entry decodes to
    char Huffmancode[HUFFMAN_CODE_LENGTH]; // its code as a NUL-terminated bit string
};

// encoding information, filled in by GetCompressInformation()
Node node[KIND_OF_CHARACTER];
// Number of bits taken from each stored byte. DecompressFile() lowers it by
// bitsAdded when it reaches the last byte of the compressed data.
int nBits = 8;

// Header fields read by GetCompressInformation().
int bitsAdded = 0;       // first int of the compressed file
int OFFSET = 0;          // second int of the file; the size/count fields are read from this position
LL originalFileSize = 0; // the size of the original file
int numberOfNode = 0;    // number of kinds of character

void DecompressFile(ifstream &readIn, ofstream &writeOut, int maxEncodingLength);
int GetCompressInformation(ifstream &readIn);
// Decompresses the hard-coded file "result.txt" into "decompressResult.txt".
// Returns 0 on success and a non-zero status when a file cannot be opened or
// the output cannot be written, so callers and scripts can detect the failure.
int main() {
    char compressFilePath[FILE_LENGTH] = "result.txt"; //graph.txt "1.txt";
    char decompressFilePath[FILE_LENGTH] = "decompressResult.txt";

    ifstream readIn;
    readIn.open(compressFilePath, ios::binary);
    if (!readIn.is_open()) {
        cout << "OPEN FAILED!" << endl;
        return 1;
    }

    ofstream writeOut;
    writeOut.open(decompressFilePath, ios::binary);
    if (!writeOut.is_open()) {
        cout << "OPEN FAILED!" << endl;
        return 1;
    }

    // get information of compressed file
    const int maxEncodingLength = GetCompressInformation(readIn);

    // decompress File
    DecompressFile(readIn, writeOut, maxEncodingLength);

    readIn.close();
    writeOut.close();
    // close() sets failbit when the final flush fails; report it instead of
    // silently leaving a truncated output file behind.
    if (writeOut.fail()) {
        cout << "WRITE FAILED!" << endl;
        return 1;
    }
    return 0;
}
int GetCompressInformation(ifstream &readIn){
readIn.read((char *)&bitsAdded, sizeof(int));
readIn.read((char *)&OFFSET, sizeof(int));
readIn.seekg(OFFSET, ios::beg);
readIn.read((char *)&originalFileSize, sizeof(LL));
readIn.read((char *)&numberOfNode, sizeof(int));
cout << originalFileSize << " " << numberOfNode << endl;
//record the character and its Huffman code
int maxEncodingLength = 0;
for (int i = 0; i < numberOfNode; i++) {
readIn.read((char *)&node[i].c, sizeof(char));
int bits;
readIn.read((char *)&bits, sizeof(int));
readIn.read((char *)&node[i].Huffmancode, bits*sizeof(char));
node[i].Huffmancode[bits] = '\0';
cout << node[i].c << " " << node[i].Huffmancode << endl;
if (maxEncodingLength < strlen(node[i].Huffmancode)) {
maxEncodingLength = strlen(node[i].Huffmancode);
}
}
cout << " maxEncodingLength :" << maxEncodingLength << endl;
return maxEncodingLength;
}
void DecompressFile(ifstream &readIn, ofstream &writeOut, int maxEncodingLength){
//get size of compressed file
streampos curPos = readIn.tellg();
readIn.seekg(0, ios::end);
LL compressedFileSize = (LL)(readIn.tellg() - curPos);
readIn.seekg(curPos, ios::beg);
cout << "size of compressed file : " << compressedFileSize << endl;
//read data in batches, each time read MAX_MEMORY characters
int nTimes = (int)(compressedFileSize / MAX_MEMORY);
if (compressedFileSize % MAX_MEMORY != 0) nTimes++;
char *str = (char *)calloc(1, (MAX_MEMORY + HUFFMAN_CODE_LENGTH)* sizeof(char));
int lenOfChar = 0;
for (int j = 1; j <= nTimes; j++) {
LL numberOfCharacter = MAX_MEMORY;
if (j == nTimes) {
numberOfCharacter = compressedFileSize % MAX_MEMORY;
}
char *strTemp = (char *)calloc(1, (2*HUFFMAN_CODE_LENGTH) * sizeof(char));
char *buf = (char *)calloc(1, (MAX_MEMORY + HUFFMAN_CODE_LENGTH)* sizeof(char));
readIn.read(buf, numberOfCharacter * sizeof(char));
//cout<<buf<<endl;
//printf("%d\n", ascII);
int lenOfStrTemp = 0;
for (int k = 0; k < numberOfCharacter; k++) {
// convert it to binary bits
unsigned char ascII = buf[k];
char huffmanString[3*nBits];
for (int i = nBits - 1; i >= 0; i--) {
huffmanString[i] = ascII % 2 + '0';
ascII = ascII / 2;
}
//if read last character, then minus bits which is added
if ((j == nTimes) && (k == numberOfCharacter - 1)) {
// printf("ascII:%d\n", ascII);
nBits = nBits - bitsAdded;
}
huffmanString[nBits] = '\0';
// cout<<huffmanString<<endl;
strcpy(strTemp + lenOfStrTemp, huffmanString);
lenOfStrTemp += strlen(huffmanString);
//convert bit to char
LL comparePosition = 0;
while (1) {
bool flag = false;
for (int z = 0; z < numberOfNode; z++) {
//if(strlen(node[z].Huffmancode) > strlen(strcmp)) continue;
int lenHuffmanCode = strlen(node[z].Huffmancode);
if (!memcmp(node[z].Huffmancode, strTemp, lenHuffmanCode)) {
str[lenOfChar] = node[z].c;
str[lenOfChar+1] = '\0';
lenOfChar ++;
//cout<<"strTempF:"<<strTemp<<endl;
strcpy(strTemp, strTemp+lenHuffmanCode);
lenOfStrTemp = strlen(strTemp);
//cout<<"strTemp:"<<strTemp<<endl;
flag = true;
break;
//comparePosition += lenHuffmanCode;
}
}
if (!flag || (lenOfStrTemp == 0)) break;
}
//if length of str is larger than limited memory, write into decompressed file
if (lenOfChar > MAX_MEMORY) {
writeOut.write(str, lenOfChar * sizeof(char));
//apply a new memory will result in crash
//free(str);
//char *str = (char *)calloc(1, (MAX_MEMORY + HUFFMAN_CODE_LENGTH)* sizeof(char));
strcpy(str, "");
lenOfChar = 0;
}
}
free(buf);
free(strTemp);
}
//cout<<str<<endl;
if (lenOfChar != 0){
writeOut.write(str, lenOfChar * sizeof(char));
free(str);
}
}