1. 程式人生 > >基於Huffman編碼的檔案壓縮

基於Huffman編碼的檔案壓縮

檔案壓縮有很多種演算法,本文介紹的是基於 Huffman 演算法的檔案壓縮。
對於 Huffman 壓縮,最重要的就是建立 Huffman 樹與重建 Huffman 樹;本文對如何建立 Huffman 樹不做重點討論。
首先將原始檔遍歷一遍,統計其中每個字元出現的次數,並將其儲存在下面的結構體中:

// Statistics kept for one byte value: the byte itself, how many times it
// occurs in the input, and the Huffman code assigned to it.
//
// Copying and assignment use the compiler-generated members, which copy
// _ch, _count and coding exactly like the former hand-written operator=.
struct FileInfo
{
    // Creates an entry for byte `ch` with an occurrence count of zero and an
    // empty code.
    FileInfo(unsigned char ch = 0)
        : _ch(ch)
        , _count(0)
    {}

    // Equality looks ONLY at the occurrence count; the byte value and the
    // code are deliberately ignored.
    bool operator==(const FileInfo& x) const
    {
        return !(*this != x);
    }

    bool operator!=(const FileInfo& x) const
    {
        return this->_count != x._count;
    }

    unsigned char _ch;      // the byte value this entry describes
    long long _count;       // number of occurrences of that byte
    std::string coding;     // Huffman code for that byte, as a string of '0'/'1'
};

// Combines two entries into one whose count is the sum of both counts.
// The result carries the default byte value and an empty code.
FileInfo operator+(const FileInfo& left, const FileInfo& right)
{
    FileInfo ret;
    ret._count = left._count + right._count;
    return ret;
}

// Orders entries by occurrence count.
bool operator<(const FileInfo& left, const FileInfo& right)
{
    return left._count < right._count;
}

檔案壓縮的類

// Huffman-based file compressor / decompressor.
class CompressedFile
{
public:
    // Labels every slot of the statistics table with the byte value it
    // stands for (slot i describes byte i).
    CompressedFile()
    {
        size_t byteValue = 0;
        while (byteValue < 256)
        {
            _FileInfo[byteValue]._ch = byteValue;
            ++byteValue;
        }
    }

    // Compresses the file named `readname`.
    void Compressed(string& readname);

    // Decompresses the file named `readname`.
    void UnCompressed(const string& readname);

private:
    // One entry per possible byte value (256 in total).
    FileInfo _FileInfo[256];
};

壓縮檔案

// Compresses the file `readname` into a sibling file whose extension is
// replaced by ".huf".
//
// The input is read twice: once to count how often each byte value occurs,
// and once to emit the Huffman code of every byte, packed MSB-first.
//
// Output layout:
//   line 1 : whatever GetPost(readname) returns (presumably the original
//            file extension - confirm against GetPost)
//   line 2 : number of distinct byte values that occur
//   then   : one "<byte>,<count>" line per occurring byte value
//   then   : the packed code bits; the last byte is zero-padded on the right
//
// If either file cannot be opened the reason is reported with perror() and
// the function returns without doing anything else (debug builds still stop
// on the assert first).
void Compressed(string& readname)
    {
        FILE* readfile = fopen(readname.c_str(), "rb");
        assert(readfile);
        if (!readfile)                              // assert is compiled out under NDEBUG
        {
            perror(readname.c_str());
            return;
        }

        // Pass 1: count occurrences, reading 1 KiB at a time to limit I/O calls.
        unsigned char readstr[1024];
        size_t readcount = fread(readstr, 1, 1024, readfile);
        while (readcount != 0)
        {
            for (size_t i = 0; i < readcount; i++)
            {
                _FileInfo[readstr[i]]._count++;
            }
            readcount = fread(readstr, 1, 1024, readfile);
        }

        FileInfo invalue;
        HuffmanTree<FileInfo> ht(_FileInfo, 256, invalue);      // build the Huffman tree
        Coding(ht);                                             // derive the per-byte codes
        fseek(readfile, 0, SEEK_SET);                           // rewind for the second pass

        // Header: GetPost() result, then the frequency table.
        string FileHead = GetPost(readname);

        string CompressedFileName = readname.substr(0, readname.find_last_of('.'));
        CompressedFileName += ".huf";

        FileHead += '\n';

        size_t valuecount = 0;      // number of distinct byte values present
        string strvalue;            // the "<byte>,<count>\n" lines

        for (size_t i = 0; i < 256; i++)
        {
            if (_FileInfo[i]._count)
            {
                char strptmp[20] = { 0 };
                // NOTE: _i64toa / _itoa are Microsoft CRT functions, not standard C++.
                _i64toa(_FileInfo[i]._count, strptmp, 10);
                strvalue += _FileInfo[i]._ch;
                strvalue += ',';
                strvalue += strptmp;
                strvalue += '\n';
                valuecount++;
            }
        }

        char str[10] = { 0 };
        _itoa(static_cast<int>(valuecount), str, 10);   // at most 256, always fits

        FileHead += str;
        FileHead += '\n';
        FileHead += strvalue;

        FILE* writefile = fopen(CompressedFileName.c_str(), "wb");
        assert(writefile);
        if (!writefile)
        {
            perror(CompressedFileName.c_str());
            fclose(readfile);
            return;
        }

        fwrite(FileHead.c_str(), 1, FileHead.length(), writefile);

        // Pass 2: emit the code bits.
        unsigned char writestr[1024];
        size_t writecount = 0;
        size_t idx = 0;                     // bits accumulated in `value` so far
        unsigned char value = 0;            // byte being assembled; unsigned so shifts are well defined
        readcount = fread(readstr, 1, 1024, readfile);

        // Crude progress bar: one '*' every `arv` chunks of 1 KiB. The root
        // weight is the sum of all counts, i.e. the input size in bytes.
        // The root pointer is checked because an empty input may yield no tree.
        HuffmanNode<FileInfo>* _root = ht.GetRoot();
        long long weight = _root ? _root->_weight._count / 1024 : 0;
        long long arv = weight / 100;
        long long k = 0;
        while (readcount)
        {
            k++;
            if (k == arv)
            {
                cout << '*';
                k = 0;
            }
            for (size_t i = 0; i < readcount; i++)
            {
                // Bind by reference: copying the code for every input byte is wasted work.
                const string& coding = _FileInfo[readstr[i]].coding;

                for (size_t j = 0; j < coding.length(); ++j)
                {
                    value <<= 1;
                    if (coding[j] == '1')
                        value |= 1;

                    if (++idx == 8)         // a full byte is ready
                    {
                        writestr[writecount++] = value;
                        if (writecount == 1024)
                        {
                            fwrite(writestr, 1, 1024, writefile);
                            writecount = 0;
                        }
                        idx = 0;
                        value = 0;
                    }
                }
            }

            readcount = fread(readstr, 1, 1024, readfile);
        }

        if (idx)                            // last byte only partly filled: left-align its bits
        {
            value <<= (8 - idx);
            writestr[writecount++] = value;
        }

        if (writecount)
        {
            fwrite(writestr, 1, writecount, writefile);
        }

        fclose(readfile);
        fclose(writefile);
    }


// Assigns a Huffman code to every leaf of `ht` (via GetLeaf) and prints how
// many leaves, i.e. distinct byte values, were encoded.
void Coding(HuffmanTree<FileInfo>& ht)
    {
        size_t leafTotal = 0;
        GetLeaf(ht.GetRoot(), leafTotal);       // only leaves receive a code
        cout << "有效字元個數為count = " << leafTotal << endl;
    }

    // Walks the subtree under `root` and stores, for every leaf, its Huffman
    // code in _FileInfo[<leaf byte>].coding. `count` is incremented once per
    // leaf visited.
    //
    // A code is the path from the tree root to the leaf: '0' for a left
    // branch, '1' for a right branch.
    void GetLeaf(HuffmanNode<FileInfo>* root, size_t& count)
    {
        if (root == NULL)
        {
            return;
        }

        GetLeaf(root->_pLeft, count);
        GetLeaf(root->_pRight, count);

        if (root->_pLeft != NULL || root->_pRight != NULL)
        {
            return;                             // internal node: nothing to record
        }

        count++;

        // Collect the branches from the leaf up to the root in a fresh string,
        // so a code left over from an earlier call is replaced, not extended.
        string path;
        HuffmanNode<FileInfo>* child = root;
        HuffmanNode<FileInfo>* parent = child->_pParent;
        while (parent)
        {
            path += (child == parent->_pLeft) ? '0' : '1';
            child = parent;
            parent = child->_pParent;
        }

        if (path.empty())
        {
            // The leaf has no parent, so it is the whole tree (only one distinct
            // byte value occurs). An empty code would make the encoder emit no
            // bits at all for it, so give it a one-bit code.
            path = "0";
        }
        else
        {
            // Gathered leaf-to-root; codes are read root-to-leaf.
            reverse(path.begin(), path.end());
        }

        _FileInfo[root->_weight._ch].coding = path;
    }

解壓檔案

void UnCompressed(const string& readname)
    {
        FILE* readfile = fopen(readname.c_str(), "rb");
        assert(readfile);

        string writename = readname.substr(0, readname.find_last_of('.'));  //首先讀到原始檔的字尾
        string strptmmp;
        GetLine(readfile, strptmmp);
        if (strptmmp.length())
        {
            writename += strptmmp;
        }

        int num = 0;
        strptmmp = "";
        GetLine(readfile,strptmmp);
        if (strptmmp.length())
        {
            num = atoi(strptmmp.c_str());
        }

        for (int i = 0; i < num; ++i)
        {

            strptmmp = "";
            GetLine(readfile,strptmmp);
            unsigned char ch = strptmmp[0];                        //必須強制裝換為無符號型不然出現負數,負數作為下標會導致程式崩潰,但是本程式會在這裡奔潰會在本函式退出是崩潰除錯了好長時間都沒有注意到
            _FileInfo[ch]._count = atoi(strptmmp.c_str()+2);       //獲取原始檔裡每個字元出現的次數
        }

        FILE* writefile = fopen(writename.c_str(), "wb");
        assert(writefile);

        FileInfo invalue;
        HuffmanTree<FileInfo> ht(_FileInfo, 256, invalue);     //根據讀到的原始檔裡的每個字元出現的次數重建Huffman樹
        HuffmanNode<FileInfo>* _root = ht.GetRoot();

        unsigned char* readstr = new unsigned char[1024];
        unsigned char* writestr = new unsigned char[1024];
        size_t writecount = 0;
        unsigned char ch;
        int pos = 7;                                      //標記該字元的每一位對其每一位進行處理
        long long filesize = _root->_weight._count;
        HuffmanNode<FileInfo> * root = _root;

        size_t readcount = fread(readstr, 1, 1024, readfile);
        while (readcount)
        {
            for (size_t i = 0; i < readcount;)
            {
                ch = readstr[i];

                while (NULL != root->_pLeft || NULL != root->_pRight)                 //找到葉子節點其中的字元就是要寫入的字元
                {
                    if (ch & (1 << pos--))
                        root = root->_pRight;                        
                    else
                        root = root->_pLeft;
                    if (pos < 0)
                    {
                        pos = 7;
                        ch = readstr[++i];
                        break;
                    }
                }

                if (NULL == root->_pLeft && NULL == root->_pRight)
                {
                    writestr[writecount++] = root->_weight._ch;
                    filesize--;
                    root = _root;
                    if (0 == filesize)                              //已經寫入了和原始檔大小相等的字元數退出該程式
                    {
                        fwrite(writestr, 1, writecount, writefile);
                        return;
                    }

                    if (1024 == writecount)
                    {
                        fwrite(writestr, 1, 1024, writefile);
                        writecount = 0;
                    }
                }
            }

            readcount = fread(readstr, 1, 1024, readfile);
        }
    }
void GetLine(FILE* fp,string& line)  //一次讀取一行
    {
        char first;
        if (!feof(fp))
        {
            first = fgetc(fp);
            line += first;
        }
        while (!feof(fp))
        {
            char ch = fgetc(fp);
            if (ch != '\n')
            {
                line += ch;
            }
            else
            {
                break;
            }
        }