資料結構之哈夫曼編碼
哈夫曼編碼是一種變長編碼,根據字元頻率確定編碼的長度:出現頻率越高的字元,編碼越短。在學習資料結構時,我們知道,通過貪心的策略自底向上構造二叉樹,最後得到哈夫曼樹。從根節點遍歷到各個葉子節點,便可以得到每個字元的編碼。
本文給出了經典教材《資料結構》一書上演算法6.12的具體實現細節。
型別定義
構造二叉樹的過程為:初始為全部字元的 \(n\) 個葉子節點,每次選擇權值最小的兩個根節點合併,形成新的節點,其權值為合併的兩節點權值之和。引入 `parent` 作為是否為根節點判斷的標誌(`parent` 為 0 表示該節點仍是根節點)。
\(n\) 個節點完成 \(n-1\) 次合併操作,形成共包含 \(2n-1\) 個節點的二叉樹,樹的根節點編號為 \(2n-1\)。
// Huffman tree node.
typedef struct
{
    char data;                  // character stored in the node
    double weight;              // node weight
    int parent, lchild, rchild; // indices of the parent and of the left/right children
} HfmTNode, *HuffmanTree;

// Huffman code table entry: records the mapping {character -> code}.
typedef struct
{
    char letter; // character
    char *code;  // code string for that character
} HfmCNode, *HuffmanCode;

// Huffman context: the tree, the code table and the alphabet they were built from.
typedef struct
{
    HuffmanTree tree;
    HuffmanCode code;
    int n;          // size of the character set
    char *letters;  // character set
    int *frequency; // character frequencies
    int rt;         // index of the tree root, i.e. the root is `tree[2n-1]`
} Huffman;
程式碼實現
參考《資料結構(C語言版)》P147 演算法 6.12。
哈夫曼編碼
要得到哈夫曼編碼,依次呼叫
- initHuffman(hfm, letters, frequency, n);
- buildHuffmanTree(hfm);
- getHuffmanCode(hfm);
// 初始化哈夫曼 void initHuffman(Huffman *hfm, const char *letters, const int frequency[], int n) { if (n<1) return; int m = 2*n-1; hfm->n = n; hfm->letters = (char*)malloc((n+1)*sizeof(char)); hfm->frequency = (int*)malloc((n+1)*sizeof(int)); hfm->tree = (HuffmanTree)malloc((m+1)* sizeof(HfmTNode)); hfm->rt = m; for (int i=1;i<=n;i++) { hfm->letters[i] = letters[i-1]; hfm->frequency[i] = frequency[i-1]; } for (int i=1;i<=n;i++) hfm->tree[i] = (HfmTNode){letters[i-1], frequency[i-1], 0, 0, 0}; for (int i=n+1;i<2*n;i++) hfm->tree[i] = (HfmTNode){0, 0, 0, 0, 0}; for(int i=n+1;i<=m;i++) { hfm->tree[i].weight = 0; hfm->tree[i].lchild = hfm->tree[i].rchild = hfm->tree[i].parent = 0; } } // 建立哈夫曼樹 void buildHuffmanTree(Huffman *hfm) { // 建立哈夫曼樹 int n = hfm->n; int m = 2*n-1; for(int i=n+1;i<=m;i++) { int p1 = 1, p2 = 1; // p1記錄最小結點位置, p2記錄第二小 while(p1<=i-1 && hfm->tree[p1].parent) p1++; p2 = p1+1; while(p2<=i-1 && hfm->tree[p2].parent) p2++; for(int j=p1+1;j<=i-1;j++) { if (hfm->tree[j].parent) continue; // 非根節點 if(hfm->tree[j].weight<=hfm->tree[p1].weight) { p2 = p1, p1 = j; } else if(hfm->tree[j].weight<hfm->tree[p2].weight) { p2 = j; } } hfm->tree[i].weight = hfm->tree[p1].weight + hfm->tree[p2].weight; hfm->tree[i].lchild = p1; hfm->tree[i].rchild = p2; hfm->tree[p1].parent = i; hfm->tree[p2].parent = i; } } // 獲取哈夫曼編碼 void getHuffmanCode(Huffman *hfm) { // 求赫夫曼編碼 int n = hfm->n; hfm->code = (HuffmanCode)malloc((n+1)*sizeof(HfmCNode)); for (int i=1;i<=n;i++) hfm->code[i] = (HfmCNode){hfm->letters[i], ""}; char *code = (char *)malloc(n*sizeof(char)); code[n-1] = '\0'; for(int i=1;i<=n;i++) { int start = n-1; int c = i, f = hfm->tree[i].parent; while(f) { if(c==hfm->tree[f].lchild) code[--start] = '0'; else code[--start] = '1'; c = f; f = hfm->tree[c].parent; } hfm->code[i].code = (char*)malloc((n-start)*sizeof(char)); strcpy(hfm->code[i].code, &code[start]); } free(code); } // 凹入表示法輸出 void showHuffmanTree(Huffman *hfm, int rt=-1, int level=0) { if (rt==0) return ; if 
(rt==-1) { printf("HuffmanCode:\n"); for (int i=1;i<=hfm->n;i++) { // printf("%c\n", hfm->letters[i]); // printf("%c\n", hfm->tree[i].data); printf("%c:%s\n", hfm->code[i].letter, hfm->code[i].code); } rt = hfm->rt; printf("HuffmanTree:\n"); } int i; for(i=0;i<level;i++) printf(" "); if (hfm->tree[rt].data==0) printf("**\n"); else printf("%c:%s\n", hfm->tree[rt].data, hfm->code[rt].code); showHuffmanTree(hfm, hfm->tree[rt].lchild, level+1); showHuffmanTree(hfm, hfm->tree[rt].rchild, level+1); }
編碼與譯碼
圖方便,直接使用了 C++ 的 `string` 型別,而不是 C 風格字串(本質上是 `char*` 字元陣列)。
// Encode a NUL-terminated text with the code table in hfm->code.
// Returns the concatenation of the codes of all characters of `input`.
// Characters that are not part of the alphabet have no code and are skipped.
string Encode(Huffman *hfm, const char *input)
{
    string output = "";
    for (int pos = 0; input[pos]; pos++)
    {
        char c = input[pos];
        // Linear search of the code table for this character.
        for (int k = 1; k <= hfm->n; k++)
        {
            if (hfm->code[k].letter == c)
            {
                output += hfm->code[k].code;
                break;
            }
        }
    }
    return output;
}
// Decode a NUL-terminated string of '0'/'1' digits by walking the tree.
// Starting at the root, '0' descends to the left child and '1' to the right
// child; each time a leaf is reached its character is appended and the walk
// restarts at the root. Any other character in `input` is ignored.
string Decode(Huffman *hfm, const char *input)
{
    string output = "";

    // With a single-symbol alphabet the root itself is a leaf and has no
    // children to descend into, so there is nothing to decode.
    if (hfm->rt <= hfm->n) return output;

    int p = hfm->rt;
    for (int i = 0; input[i]; i++)
    {
        char c = input[i];
        if (c == '0') p = hfm->tree[p].lchild;
        else if (c == '1') p = hfm->tree[p].rchild;
        else continue; // not a code digit

        if (p <= hfm->n) // reached a leaf
        {
            output += hfm->tree[p].data;
            p = hfm->rt;
        }
    }
    return output;
}
功能測試
// Count character frequencies in a text file and build the Huffman tree and
// code table from them.
//   filename - file to analyse
//   hfm      - context to initialise; left untouched when the file cannot be opened
// The alphabet is fixed: 2*26 letters, space, comma, period, semicolon, single
// quote and double quote. Every other character in the file is ignored.
void readTxt2Huffman(const char *filename, Huffman *hfm)
{
    static const char letters[] = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ ,.;\'\"";
    int frequency[sizeof letters - 1] = {0}; // one counter per alphabet character

    FILE *fp = fopen(filename, "r");
    if (fp == NULL) return;

    // fgetc returns int so that EOF can be told apart from every byte value;
    // the loop also ends on a read error instead of spinning forever.
    int c;
    while ((c = fgetc(fp)) != EOF)
    {
        // strchr would match the string terminator for c == 0, so exclude it.
        const char *hit = (c != '\0') ? strchr(letters, c) : NULL;
        if (hit != NULL)
            frequency[hit - letters]++;
    }
    fclose(fp);

    initHuffman(hfm, letters, frequency, (int)(sizeof letters - 1));
    buildHuffmanTree(hfm);
    getHuffmanCode(hfm);
}
// Read a whole file into a newly allocated, NUL-terminated string.
//   filename - file to read (opened in text mode)
// Returns a heap-allocated string the caller releases with free(). When the
// file cannot be opened (a message is printed) or its size cannot be
// determined, the result is an empty heap-allocated string. NULL is returned
// only when memory is exhausted.
char* readText(const char* filename)
{
    FILE *pf = fopen(filename, "r");
    if (pf == NULL)
    {
        printf("檔案%s不存在\n", filename);
        return (char *)calloc(1, sizeof(char));
    }

    char *text = NULL;
    long lSize = -1;
    if (fseek(pf, 0, SEEK_END) == 0)
        lSize = ftell(pf);

    if (lSize >= 0)
    {
        text = (char *)malloc((size_t)lSize + 1);
        if (text != NULL)
        {
            rewind(pf);
            // Terminate after what was actually read: in text mode this can be
            // fewer characters than the size reported by ftell.
            size_t got = fread(text, sizeof(char), (size_t)lSize, pf);
            text[got] = '\0';
        }
    }
    else
    {
        text = (char *)calloc(1, sizeof(char));
    }

    fclose(pf);
    return text;
}
// Test driver: builds the Huffman code from article.txt, prints the code table
// and tree, then encodes the file content and decodes it again.
int main()
{
    /*
    Small fixed example:
    Huffman hfm;
    int w[6] = {1, 2, 3, 4, 6, 8};
    initHuffman(&hfm, "abcdef", w, 6);
    buildHuffmanTree(&hfm);
    getHuffmanCode(&hfm);
    for (int i=1;i<=6;i++)
    {
    printf("%c\n", hfm.letters[i]);
    printf("%c\n", hfm.tree[i].data);
    printf("%s\n", hfm.code[i].code);
    }
    showHuffmanTree(&hfm);
    cout<<Encode(&hfm, "bacbefd")<<endl;
    cout<<Decode(&hfm, "100110001011001011100")<<endl;
    */
    const char *filename = "article.txt";

    // Pre-set the pointers so a failed build can be detected below.
    Huffman hfm;
    hfm.tree = NULL;
    hfm.code = NULL;
    readTxt2Huffman(filename, &hfm);
    if (hfm.tree == NULL || hfm.code == NULL)
    {
        printf("無法由檔案%s建立哈夫曼樹\n", filename);
        return 1;
    }
    showHuffmanTree(&hfm);

    // Use the returned buffer directly; copying it into a fixed-size array
    // would overflow for files larger than that array.
    // Intentionally not freed: the pointer is only borrowed here and the
    // process exits right after.
    const char *text = readText(filename);
    if (text == NULL)
        return 1;

    string text_encode = Encode(&hfm, text);
    cout<<text_encode<<endl;
    cout<<Decode(&hfm, text_encode.c_str())<<endl;
    return 0;
}
問題紀錄
- 任務一需要從控制檯讀入,需要按 Ctrl+Z 終止輸入,並用 `2==scanf()` 作為判斷條件跳出迴圈。
- 分配記憶體使用 `malloc`,單塊記憶體大小為 `sizeof(xxx)`。曾經寫錯了型別,導致程式無輸出也沒有報錯,花費很長時間才定位到錯誤。正確的寫法為:
  `hfm->code = (HuffmanCode)malloc((n+1)*sizeof(HfmCNode))`
- 讀取文章能正常建立哈夫曼樹並編碼,但譯碼過程出錯。通過輸出譯碼過程,檢查到字符集(包含小寫)與譯碼規則不一致,需要對大小寫特判。完善字符集後,以包含大小寫和各種符號的字符集作為輸入,便可直接譯碼得到原始輸入。
小結
本人學習《資料結構》這門課是在大一C語言剛結束之後,彼時對C語言的核心——指標還沒完全琢磨透徹。學習資料結構也僅僅按部就班完成了書上的課程實驗,現在回頭看過去寫的程式碼,不僅程式碼風格凌亂,也存在記憶體洩漏的隱患。本次幫學弟寫作業的同時,順便重構了過去的程式碼。最近需要用C/C++進行k-means的演算法優化,也藉此好好熟悉一番傳統的C/C++。
(完)