基於哈夫曼樹的檔案壓縮
阿新 • • 發佈:2019-01-07
基本思想:
壓縮:
1、統計出檔案中相同字元出現的次數
2、獲取哈夫曼編碼
次數作為權值構建哈夫曼樹
3、重新編碼,寫回壓縮檔案
儲存檔頭資訊(寫在壓縮檔案開頭,供解壓縮時使用):
原始檔字尾
編碼資訊的行數
每個字元的權
儲存編碼
解壓縮:
1、獲取原檔案字尾
2、獲取每個字元出現的次數,即權值
3、利用之前獲取的權值,還原哈夫曼樹
4、找到對應的葉子節點,將資訊儲存到解壓檔案中
在寫壓縮檔案之前,首先需要實現堆和哈夫曼樹
1,建堆
#include<cassert>
#include<cstddef>
#include<cstdio>
#include<cstdlib>
#include<iostream>
#include<string>
#include<utility>
#include<vector>
using namespace std ;
//利用仿函式的特性實現程式碼的複用性
// "Less than" comparator functor: returns true when l < r.
//
// operator() is const-qualified so the functor can be invoked through a
// const object or a const reference; it holds no state, so the call never
// modifies anything.
template<class T>
struct Small
{
	bool operator()(const T& l, const T& r) const
	{
		return l < r;
	}
};
// "Greater than" comparator functor: returns true when l > r.
// const-qualified so it can be called through a const object.
template<class T>
struct Large
{
	bool operator()(const T& l, const T& r) const
	{
		return l > r;
	}
};

// Binary heap stored in a std::vector (children of index i live at 2i+1 and
// 2i+2).
//
// Compare decides which element wins the top slot: an element is moved above
// its parent whenever com(child, parent) is true. With the default Large<T>
// that means larger elements rise, i.e. the default is a MAX-heap; pass a
// "less than" comparator (e.g. Small<T>) to get a min-heap.
template<class T, class Compare = Large<T>>
class Heap
{
public:
	// Creates an empty heap.
	Heap()
	{}

	// Builds a heap from the first `size` elements of `a`.
	// A non-positive `size` yields an empty heap (and `a` may then be null).
	Heap(const T *a, int size)
	{
		assert(a != nullptr || size <= 0);
		if (size <= 0)
			return;
		_a.assign(a, a + size);
		// Heapify bottom-up, starting at the last node that has a child.
		for (int j = (size - 2) / 2; j >= 0; --j)
		{
			adjust_down(j);
		}
	}

	// Inserts x and restores the heap order.
	void Push(const T& x)
	{
		_a.push_back(x);
		adjust_up(static_cast<int>(_a.size() - 1));
	}

	// Removes the top element. The heap must not be empty.
	void Pop()
	{
		assert(!_a.empty());
		using std::swap;
		// Move the top to the back so it can be dropped in O(1), then sift
		// the element that replaced it back down.
		swap(_a.front(), _a.back());
		_a.pop_back();
		adjust_down(0);
	}

	// Number of stored elements.
	std::size_t Size() const
	{
		return _a.size();
	}

	// True when the heap holds no elements.
	bool Empty() const
	{
		return _a.empty();
	}

	// Returns the top element. The heap must not be empty.
	const T& Top() const
	{
		assert(!_a.empty());
		return _a[0];
	}

	// Prints the elements in storage order, separated by spaces, followed by
	// a newline.
	void Display() const
	{
		for (std::size_t i = 0; i < _a.size(); ++i)
		{
			std::cout << _a[i] << " ";
		}
		std::cout << std::endl;
	}

	// Sifts the element at index `root` down until neither child should be
	// above it. Out-of-range indices are ignored.
	void adjust_down(int root)
	{
		if (root < 0)
			return;
		using std::swap;
		Compare com;
		const std::size_t count = _a.size();
		std::size_t parent = static_cast<std::size_t>(root);
		std::size_t child = parent * 2 + 1; // left child
		while (child < count)
		{
			// The loop condition only guarantees the left child exists, so the
			// right child must be bounds-checked before it is compared.
			if (child + 1 < count && com(_a[child + 1], _a[child]))
			{
				++child;
			}
			if (!com(_a[child], _a[parent]))
				break;
			swap(_a[child], _a[parent]);
			parent = child;
			child = parent * 2 + 1;
		}
	}

	// Sifts the element at index `child` up until its parent should stay
	// above it or it reaches the root. Out-of-range indices are ignored.
	void adjust_up(int child)
	{
		if (child < 0 || static_cast<std::size_t>(child) >= _a.size())
			return;
		using std::swap;
		Compare com;
		while (child > 0)
		{
			const int parent = (child - 1) / 2;
			// Stop only once the order holds; the element may need to climb
			// several levels.
			if (!com(_a[child], _a[parent]))
				break;
			swap(_a[child], _a[parent]);
			child = parent;
		}
	}

protected:
	std::vector<T> _a; // heap elements in level order
};
2,構建哈夫曼樹
#include "heap.h"
// One node of a Huffman tree: a weight plus links to both children and to
// the parent. A freshly constructed node is detached (all links are null).
template<class T>
struct HuffmanTreeNode
{
	T _weight;                              // weight carried by this node
	HuffmanTreeNode<T> *_left = nullptr;    // left child
	HuffmanTreeNode<T> *_right = nullptr;   // right child
	HuffmanTreeNode<T> *_parent = nullptr;  // parent node

	// Creates a detached node holding a copy of w (a value-initialised T when
	// no argument is given).
	HuffmanTreeNode(const T& w = T())
		: _weight(w)
	{}
};
template<class T>
class HuffmanTree
{
typedef HuffmanTreeNode<T> Node;
public:
HuffmanTree()
:_root(NULL)
{}
HuffmanTree(const T* a, size_t size)
:_root(NULL)
{
//定義一個內部類
struct NodeLess
{
bool operator()(Node *l, Node *r)const
{
return l->_weight < r->_weight;
}
};
Heap<Node *, NodeLess> minHeap;
//建立結點並放入vector中
for (size_t i = 0; i<size; ++i)
{
Node *tmp = new Node(a[i]);
minHeap.Push(tmp);
}
//取出較小的兩個結點作為左右孩子並構建父結點
while (minHeap.Size() > 1)
{
Node *left = minHeap.Top();
minHeap.Pop();
Node *right = minHeap.Top();
minHeap.Pop();
Node *parent = new Node(left->_weight + right->_weight);
parent->_left = left;
parent->_right = right;
left-> = p_parentarent;
right->_parent = parent;
minHeap.Push(parent);
}
_root = minHeap.Top();
}
HuffmanTree(const T* a, size_t size, const T& invalid)
{
struct NodeLess
{
bool operator()(Node *l, Node *r)const
{
return l->_weight < r->_weight;
}
};
Heap<Node *, NodeLess> minHeap;
//建立結點並放入vector中
for (size_t i = 0; i<size; ++i)
{
if (a[i] != invalid)
{
Node *tmp = new Node(a[i]);
minHeap.Push(tmp);
}
}
//取出較小的兩個結點作為左右孩子並構建父結點
while (minHeap.Size() > 1)
{
Node *left = minHeap.Top();
minHeap.Pop();
Node *right = minHeap.Top();
minHeap.Pop();
Node *parent = new Node(left->_weight + right->_weight);
parent->_left = left;
parent->_right = right;
left->_parent = parent;
right->_parent = parent;
minHeap.Push(parent);
}
_root = minHeap.Top();
}
Node* GetRoot()
{
return _root;
}
void Destroy(Node* &root)
{
if (root == NULL)
return;
Destroy(root->_left);
Destroy(root->_rihgt);
delete root;
root = NULL:
return;
}
protected:
Node *_root;
};
3,生成哈夫曼編碼,並進行壓縮和解壓縮
#include<string>
#include<Windows.h>
#include<assert.h>
#include "huffman_tree.h"
using namespace std;
typedef long long Type;

// Per-byte bookkeeping used as the weight type of the Huffman tree.
// All comparisons and the addition look only at _count; _ch and _code are
// carried along as payload.
struct CharInfo
{
	unsigned char _ch;   // the byte value this entry describes
	Type _count;         // number of occurrences
	std::string _code;   // Huffman code as a string of '0'/'1' characters

	// Creates an entry with the given count, byte value 0 and an empty code.
	CharInfo(Type count = 0)
		:_ch(0)
		, _count(count)
		, _code("")
	{}

	// Sum of two entries: a fresh entry whose count is the sum of both counts
	// (byte value and code are left at their defaults).
	CharInfo operator + (const CharInfo& fc)const
	{
		return CharInfo(_count + fc._count);
	}

	// True when the counts differ. Takes the argument by const reference so
	// that a comparison does not copy the code string.
	bool operator != (const CharInfo& fc)const
	{
		return _count != fc._count;
	}

	// Orders entries by ascending count.
	bool operator < (const CharInfo& fc)const
	{
		return _count < fc._count;
	}
};
class FileCompress
{
protected:
CharInfo _infos[256];
public:
//預設的建構函式
FileCompress()
{
for (size_t i = 0; i<256; ++i)
{
_infos[i]._ch = i;
}
}
//生成Huffman_code函式
void GenerateHufffmanCode(HuffmanTreeNode<CharInfo> * root, string code)
{
if (root == NULL)return;
if (root->_left == NULL&&root->_right == NULL)//葉子節點
{
_infos[root->_weight._ch]._code = code;
return;
}
GenerateHufffmanCode(root->_left, code + '0');
GenerateHufffmanCode(root->_right, code + '1');
}
string Compress(const char *filename)
{
assert(filename);
FILE *pf = fopen(filename, "rb");
assert(pf);
//fgetc函式的作用是意為從檔案指標stream指向的檔案中讀取一個字元,讀取一個位元組後,游標位置後移一個位元組,返回值為他所讀到的字元,因為返回值要能表示-1,所以返回值型別是int
unsigned char ch = fgetc(pf);
//統計字元出現的次數
while (!feof(pf))//feof檢測檔案流上的結束標誌
{
_infos[ch]._count++;
ch = fgetc(pf);
}
//以該字元出現的次數構建一顆HuffmanTree.
CharInfo invalid; //非法值
HuffmanTree<CharInfo> ht(_infos, 256, invalid);
//生成Huffman編碼
string code;
GenerateHufffmanCode(ht.GetRoot(), code);
//壓縮檔案
fseek(pf, 0, SEEK_SET); //回到檔案頭
string compressfile = filename;
compressfile += ".compress"; //壓縮後的檔名
FILE *fin = fopen(compressfile.c_str(), "wb");
assert(fin);
size_t pos = 0; //記錄位數
unsigned char value = 0;
ch = fgetc(pf);
while (!feof(pf))
{
string &code = _infos[ch]._code;
for (size_t i = 0; i<code.size(); ++i)
{
value <<= 1;
if (code[i] == '1')
value |= 1;
else
value |= 0; //do-nothing
++pos;
if (pos == 8) //滿一個位元組
{
fputc(value, fin);
value = 0;
pos = 0;
}
}
ch = fgetc(pf);
}
if (pos) //解決不足8位的情況.
{
value <<= (8 - pos);
fputc(value, fin);
}
//配置檔案--便於重建Huffman樹
string configfilename = filename;
configfilename += ".config";
FILE *finconfig = fopen(configfilename.c_str(), "wb");
assert(finconfig);
string line;
char buff[128];
for (size_t i = 0; i<256; ++i)
{
//一行一行的讀
if (_infos[i]._count)
{
line += _infos[i]._ch;
line += ",";
line += _itoa(_infos[i]._count, buff, 10);
line += "\n";
//fputs(line.c_str(),finconfig);
fwrite(line.c_str(), 1, line.size(), finconfig);
line.clear();
}
}
fclose(pf);
fclose(fin);
fclose(finconfig);
return compressfile;
}
string UnCompress(const char *filename)
{
assert(filename);
string configfilename = filename;
size_t index = configfilename.rfind(".");
configfilename = configfilename.substr(0, index);
configfilename += ".config";
FILE *foutconfig = fopen(configfilename.c_str(), "rb");
assert(foutconfig);
string line;
//讀取配置檔案--獲取字元出現的次數
unsigned char ch = 0;
while (ReadLine(foutconfig, line))
{
if (line.empty())
{
line += '\n';
continue;
}
//讀到空行
ch = line[0];
_infos[ch]._count = atoi(line.substr(2).c_str());
line.clear();
}
//構建Huffman樹
CharInfo invalid;
HuffmanTree<CharInfo> hft(_infos, 256, invalid);
//根結點的權值也就是字元出現的次數總和
HuffmanTreeNode<CharInfo> *root = hft.GetRoot();
Type charcount = root->_weight._count;
//解壓縮
string uncompressfilename = filename;
index = uncompressfilename.rfind(".");
uncompressfilename = uncompressfilename.substr(0, index);
uncompressfilename += ".uncompress";
FILE *fin = fopen(uncompressfilename.c_str(), "wb");
assert(fin);
//由壓縮檔案還原檔案
string compressfilename = filename;
FILE *fout = fopen(compressfilename.c_str(), "rb");
assert(fout);
HuffmanTreeNode<CharInfo> *cur = root;
int pos = 7;
ch = fgetc(fout);
while (charcount > 0)
{
while (cur)
{
if (cur->_left == NULL && cur->_right == NULL)
{
//葉子結點
fputc(cur->_weight._ch, fin);
cur = root;
--charcount;
if (charcount == 0) //所有的字元都處理完成
break;
}
if (ch & (1 << pos)) //檢查字元的每個位
cur = cur->_right; //1向右走
else
cur = cur->_left; //0向左走
--pos;
if (pos < 0) //一個位元組解壓完成
{
ch = fgetc(fout);
pos = 7;
}
}
}
fclose(foutconfig);
fclose(fin);
fclose(fout);
return uncompressfilename;
}
//讀取一行字元並放在line中
bool ReadLine(FILE *fout, string& line)
{
int ch = fgetc(fout);
if (ch == EOF)
return false;
while (ch != EOF && ch != '\n')
{
line += ch;
ch = fgetc(fout);
}
return true;
}
};
4,測試
#include"huffman_code.h"
void testFileCompress()
{
FileCompress fc;
fc.Compress("1.png");
fc.UnCompress("1.png.compress");
}
// Entry point: runs the compression round-trip test, then hands "pause" to
// the command interpreter before exiting with status 0.
int main()
{
	testFileCompress();

	system("pause");
	return 0;
}