trie樹-《演算法導論》學習筆記十四
阿新 • • 發佈:2019-02-12
引用一下百度百科的話吧:
Trie樹,又稱單詞查詢樹,是一種樹形結構,是一種雜湊樹的變種。典型應用是用於統計,排序和儲存大量的字串(但不僅限於字串),所以經常被搜尋引擎系統用於文字詞頻統計。它的優點是:利用字串的公共字首來減少查詢時間,最大限度地減少無謂的字串比較,查詢效率比雜湊樹高。
這裡構建了一棵字典樹,每個結點有52個孩子指標,對應26個小寫字母和26個大寫字母,根節點不儲存資料,一個單詞從第一個字母開始經由根結點走對應分支進行插入和統計。
trie樹結點衛星資料包含了字母、出現次數、是否構成一個單詞,孩子指標就是一個52大小的trie樹結點指標陣列。
實現了幾個操作:
1. 插入單詞
遍歷每個字母,從根結點出發,如果結點對應字母的孩子結點為空,就建立結點(出現次數初始為0);當走到單詞的最後一個字母時,將該結點的出現次數+1,並把該結點的「是否構成一個單詞」欄位標識為構成
2. 遍歷樹,並列印所有單詞和每個單詞出現次數
3. 統計樹,按給定的數字統計出現次數前幾的單詞
樹統計,與遍歷類似,用遞迴(深度優先)實現,並傳入一個大於單詞最大長度的陣列來儲存每個分支的單詞,如果遇到結點能構成一個單詞,就判斷該單詞的出現次數,並以插入排序的方式插入建立的統計連結串列(類似打撲克的插排序);
統計連結串列有更新操作,根據輸入的統計前幾的數字來維護這個連結串列該去掉哪些結點,該更新哪些結點的順序等
獲取單詞來源為編寫的一個簡單單詞隨機生成程式碼,寫入一個檔案中,可指定單詞最大長度,全大寫/全小寫/大小寫均有,單詞個數,單詞範圍(只支援a-*或A-*,例如5,就是生成a-e/A-E的單詞)
貼程式碼:
隨機生成單詞
#include <stdio.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <time.h>
#include <unistd.h>
#include <errno.h>
#include <string.h>
#include <stdlib.h>

/*
 * Random word generator: writes newline-terminated random words to
 * "word.txt" in the current directory.
 */

/* Size of the scratch buffer holding one word plus its trailing '\n'. */
enum { WORD_BUF_SIZE = 20 };
/* Every generated word has at least this many letters. */
enum { MIN_WORD_LEN = 3 };

/* Word length varies in [MIN_WORD_LEN, MIN_WORD_LEN + word_len - 1]. */
int word_len = 0;
/* ASCII code of 'A'. */
int upper_low = 65;
/* ASCII code of 'a'. */
int lowwer_low = 97;
// if letter_size = 5
// it will generate a-e or A-E letter.
int letter_size = 0;

/*
 * Return a random offset in [0, letter_size).
 * Returns 0 when letter_size is not positive, so that a bad configuration
 * cannot trigger a division by zero.
 */
int random_letter()
{
    if ( letter_size < 1 ) {
        return 0;
    }
    return rand() % letter_size;
}

/*
 * Fill `word` with a random word and return its length.
 *
 * opt == 0: lowercase only; opt == 1: uppercase only; anything else: each
 * letter is independently lower- or uppercase.
 * The caller must provide room for MIN_WORD_LEN + word_len - 1 letters.
 * No terminator is written.
 */
int random_word( char *word, int opt )
{
    /* A non-positive word_len would make the modulo below divide by zero. */
    int span = ( word_len < 1 ) ? 1 : word_len;
    int length = rand() % span + MIN_WORD_LEN;

    /* Letters are produced from the last position down to the first. */
    for ( int pos = length - 1; pos >= 0; pos-- ) {
        int base;
        if ( opt == 0 ) {
            base = lowwer_low;
        } else if ( opt == 1 ) {
            base = upper_low;
        } else {
            base = ( rand() % 2 == 0 ) ? lowwer_low : upper_low;
        }
        word[pos] = (char)( random_letter() + base );
    }
    return length;
}

/* Write exactly `len` bytes, retrying on short writes and EINTR. Returns 0 or -1. */
static int write_all( int fd, const char *data, size_t len )
{
    while ( len > 0 ) {
        ssize_t written = write( fd, data, len );
        if ( written < 0 ) {
            if ( errno == EINTR ) {
                continue;
            }
            return -1;
        }
        data += written;
        len -= (size_t)written;
    }
    return 0;
}

/*
 * Generate `word_num` random words and write them to `fd`, one per line.
 * Terminates the process with a failure status if a write fails.
 */
void gen_word( int fd, int word_num, int opt )
{
    /* The longest word plus its '\n' must fit into the scratch buffer. */
    if ( word_len < 1 || word_len + MIN_WORD_LEN - 1 > WORD_BUF_SIZE - 1 ) {
        fprintf( stderr, "word's length must be in 1..%d\n",
                 WORD_BUF_SIZE - MIN_WORD_LEN );
        return;
    }

    char word[WORD_BUF_SIZE] = {0};
    while ( word_num-- > 0 ) {
        memset( word, 0, sizeof word );
        int length = random_word( word, opt );
        word[length] = '\n';
        if ( write_all( fd, word, (size_t)length + 1 ) != 0 ) {
            perror( "write word.txt" );
            exit( EXIT_FAILURE );
        }
    }
}

/*
 * Usage: <prog> <word length> <number of words> <letter range> <case option>
 * Validates the arguments, then (re)creates word.txt and fills it.
 */
int main( int argc, char **argv )
{
    if ( argc != 5 ) {
        printf("please input " "word's length & " "words' number & " "word's range & " "gen_case(0:lowwer case,1:upper case,other:both\n");
        exit( EXIT_FAILURE );
    }

    word_len = atoi( argv[1] );
    int word_num = atoi( argv[2] );
    letter_size = atoi( argv[3] );
    int opt = atoi( argv[4] );

    if ( word_len < 1 || word_len + MIN_WORD_LEN - 1 > WORD_BUF_SIZE - 1 ) {
        fprintf( stderr, "word's length must be in 1..%d\n",
                 WORD_BUF_SIZE - MIN_WORD_LEN );
        exit( EXIT_FAILURE );
    }
    if ( letter_size < 1 || letter_size > 26 ) {
        fprintf( stderr, "word's range must be in 1..26\n" );
        exit( EXIT_FAILURE );
    }
    if ( word_num < 0 ) {
        fprintf( stderr, "words' number must not be negative\n" );
        exit( EXIT_FAILURE );
    }

    srand( (unsigned)time( NULL ) );

    /* O_CREAT so the first run works even when word.txt does not exist yet. */
    int fd = open( "word.txt", O_WRONLY | O_CREAT | O_TRUNC, 0644 );
    if ( fd < 0 ) {
        perror( "open word.txt" );
        exit( EXIT_FAILURE );
    }

    gen_word( fd, word_num, opt );

    if ( close( fd ) != 0 ) {
        perror( "close word.txt" );
        exit( EXIT_FAILURE );
    }
    return 0;
}
trie樹
#include <stdio.h>
#include <unistd.h>
#include <stdlib.h>
#include <string.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <errno.h>
/* Number of child slots per trie node: 26 uppercase plus 26 lowercase letters. */
#define MAX_CHILD_NUM 52
/* ASCII codes of 'A' and 'Z': bounds of the accepted uppercase range. */
#define UPPER_LOW 65
#define UPPER_UP 90
/* ASCII codes of 'a' and 'z': bounds of the accepted lowercase range. */
#define LOWER_LOW 97
#define LOWER_UP 122
/*
 * Trace helper: prints "[function/line]:" followed by the formatted message
 * and a newline on stdout.
 * NOTE(review): the named variadic parameter `arg...` together with `##arg`
 * is a GNU extension, not ISO C - confirm the intended compilers support it.
 */
#define PRINT(format, arg...) \
do { \
printf("[%s/%d]:", __func__, __LINE__); \
printf(format, ##arg); \
printf("\n"); \
}while(0)
/* One trie node; a path of nodes from the root spells out a word prefix. */
typedef struct trieTreeNode {
/* Letter this node stands for (the root is created with the placeholder -1). */
char letter;
/* Number of inserted words that end exactly at this node. */
int count;
/* Non-zero once at least one inserted word ends at this node. */
int is_word;
/* Children indexed by trans_letter_2_index(): 0-25 for 'A'-'Z', 26-51 for 'a'-'z'. */
struct trieTreeNode *next[MAX_CHILD_NUM];
} trieTreeNode;
/* Handle for a whole trie; it only holds the root node. */
typedef struct trieTree {
/* Root node; it carries no letter of its own, words start at its children. */
trieTreeNode *root;
} trieTree;
typedef struct count_data {
int order;
int count;
char string[20];
struct count_data *next;
} count_data;
/*
 * Map an ASCII letter to its child-slot index inside a trie node.
 *
 * Uppercase 'A'-'Z' map to 0-25, lowercase 'a'-'z' map to 26-51.
 * Any other character is a fatal input error: a message is printed and the
 * process terminates with a failure status so that callers and scripts can
 * tell the run did not succeed.
 */
int trans_letter_2_index(
    char letter )
{
    if ( letter >= UPPER_LOW && letter <= UPPER_UP ) {
        return letter - UPPER_LOW;
    }
    if ( letter >= LOWER_LOW && letter <= LOWER_UP ) {
        /* Lowercase letters occupy the second half of the child array. */
        return letter - LOWER_LOW + 26;
    }
    PRINT("error letter input:%c", letter);
    exit( EXIT_FAILURE );
}
/*
 * Allocate a trie node for `letter`.
 *
 * The node starts with count 0, is_word 0 and all child slots empty
 * (the memory comes zero-filled from calloc). The caller owns the node.
 * Never returns NULL: running out of memory terminates the process with a
 * failure status, so callers may use the result without checking it.
 */
trieTreeNode *create_node(
    char letter )
{
    trieTreeNode *node = calloc( 1, sizeof *node );
    if ( node == NULL ) {
        PRINT("out of memory while creating node for letter:%c", letter);
        exit( EXIT_FAILURE );
    }
    node->letter = letter;
    /* Hand the node back to the caller; without this the result is undefined. */
    return node;
}
/*
 * Insert `word` into the trie rooted at `root`.
 *
 * Walks one node per letter, creating missing nodes on the way. Only the
 * node of the final letter is touched for counting: its count is increased
 * by one and it is flagged as a complete word. An empty word changes
 * nothing; a NULL root is reported and ignored.
 */
void insert(
    trieTreeNode *root,
    char *word )
{
    if ( !root ) {
        PRINT("root node is null.");
        return;
    }

    trieTreeNode *node = root;
    for ( char *p = word; *p != '\0'; p++ ) {
        int slot = trans_letter_2_index( *p );
        if ( !node->next[slot] ) {
            node->next[slot] = create_node( *p );
        }
        node = node->next[slot];
        /* The last letter marks the end of the word. */
        if ( p[1] == '\0' ) {
            node->count++;
            node->is_word = 1;
        }
    }
}
/*
 * Free `node` and every list entry that follows it.
 * Passing NULL is allowed and does nothing.
 */
void delete_list_all_node(
    count_data *node )
{
    for ( count_data *following = NULL; node != NULL; node = following ) {
        /* Remember the successor before the current entry is released. */
        following = node->next;
        free( node );
    }
}
/*
 * Debug helper: print every entry that follows `node`, one per line,
 * framed by blank lines. `node` itself is skipped (it is treated as the
 * list head) and must not be NULL.
 */
void print_list_all_node(
    count_data *node )
{
    printf("\n");
    for ( count_data *entry = node->next; entry != NULL; entry = entry->next ) {
        printf("[%d],count:%d\tword:%s\n",
               entry->order, entry->count, entry->string);
    }
    printf("\n");
}
/*
 * Re-rank the list entries that follow a freshly inserted entry.
 *
 * `insert_node` must already be linked into the list with its own `order`
 * set. Walking forward, every entry gets the order of its predecessor when
 * the counts are equal, or one less when its count is smaller. As soon as
 * an order would drop below 1 that entry and everything after it is freed
 * and the list is cut there.
 *
 * A list that is not sorted by descending count, or an inserted entry whose
 * order is below 1, indicates a broken invariant: a message is printed and
 * the process terminates with a failure status.
 */
void update_insert_node(
    count_data *insert_node )
{
    if ( !insert_node->next ) {
        return;
    }

    if ( insert_node->order < 1 ) {
        PRINT("ERROR!!!!!");
        exit( EXIT_FAILURE );
    }

    if ( insert_node->order == 1 ) {
        /* The new entry holds the last rank, nothing may follow it. */
        delete_list_all_node( insert_node->next );
        insert_node->next = NULL;
        return;
    }

    count_data *prev = insert_node;
    for ( count_data *cur = prev->next; cur != NULL; prev = cur, cur = cur->next ) {
        if ( cur->count > prev->count ) {
            PRINT("ERROR!!!cur->count:%d, pre->count:%d",
                  cur->count, prev->count);
            exit( EXIT_FAILURE );
        }
        cur->order = ( cur->count < prev->count ) ? prev->order - 1
                                                   : prev->order;
        if ( cur->order < 1 ) {
            /* Ranks below 1 are outside the requested top list. */
            delete_list_all_node( cur );
            prev->next = NULL;
            break;
        }
    }
}
/*
 * Insert the word held in `tmp_word[0..tail-1]` with occurrence count
 * `cur_count` into the "top words" list that hangs off `head`.
 *
 * The list is kept sorted by descending count. The first entry carries
 * order `top_num`; each smaller distinct count carries the order of the
 * previous distinct count minus one, and entries whose order would be
 * below 1 are not kept.
 *
 * tmp_word : scratch buffer; a terminating NUL is written at index `tail`,
 *            so it must have room for tail + 1 bytes.
 * head     : sentinel entry; only its `next` pointer is used.
 *
 * Words that do not fit into count_data.string are reported and skipped.
 * Running out of memory terminates the process with a failure status.
 */
void list_insert(
    char *tmp_word,
    int cur_count,
    int tail,
    count_data *head,
    int top_num )
{
    count_data *new_data = NULL;

    /* The word plus its terminator must fit into the entry's string field. */
    if ( tail < 0 || (size_t)tail >= sizeof new_data->string ) {
        PRINT("word too long for the top list, tail:%d", tail);
        return;
    }
    tmp_word[tail] = '\0';

    new_data = malloc( sizeof *new_data );
    if ( new_data == NULL ) {
        PRINT("out of memory while recording word:%s", tmp_word);
        exit( EXIT_FAILURE );
    }
    new_data->count = cur_count;
    new_data->order = top_num;
    new_data->next = NULL;
    memcpy( new_data->string, tmp_word, (size_t)tail + 1 );

    count_data *first = head->next;

    /* Empty list: the word becomes the first entry with the top rank. */
    if ( first == NULL ) {
        head->next = new_data;
        return;
    }

    /* New maximum: take over the top rank and push everything else down. */
    if ( cur_count > first->count ) {
        new_data->order = first->order;
        new_data->next = first;
        head->next = new_data;
        update_insert_node( new_data );
        return;
    }

    /* Find the last entry whose count is not smaller than the new one. */
    count_data *prev = first;
    while ( prev->next != NULL && prev->next->count >= cur_count ) {
        prev = prev->next;
    }

    /*
     * Equal counts share a rank; a strictly smaller count ranks one lower
     * than its predecessor. Copying the predecessor's rank unconditionally
     * would keep more distinct counts than requested.
     */
    new_data->order = ( prev->count == cur_count ) ? prev->order
                                                   : prev->order - 1;
    if ( new_data->order < 1 ) {
        /* Outside the requested top list, and so is anything after prev. */
        free( new_data );
        delete_list_all_node( prev->next );
        prev->next = NULL;
        return;
    }

    new_data->next = prev->next;
    prev->next = new_data;
    /* Re-rank (and possibly drop) the entries behind the new one. */
    update_insert_node( new_data );
}
/*
 * Depth-first helper of find_top_count().
 *
 * Stores the letter of `root` at tmp_word[tail], offers the word spelled so
 * far to list_insert() when `root` ends a word, then descends into every
 * child with the next buffer position. A NULL `root` is ignored.
 * tmp_word must be large enough for the deepest path plus a terminator.
 */
void find_top_count1(
    trieTreeNode *root,
    char *tmp_word,
    int tail,
    count_data *head,
    int top_num )
{
    if ( root == NULL ) {
        return;
    }

    tmp_word[tail] = root->letter;
    int depth = tail + 1;

    if ( root->is_word ) {
        list_insert( tmp_word, root->count, depth, head, top_num );
    }

    for ( int slot = 0; slot < MAX_CHILD_NUM; ++slot ) {
        find_top_count1( root->next[slot], tmp_word, depth, head, top_num );
    }
}
/*
 * Print the most frequent words stored in the trie rooted at `root`.
 *
 * Builds a temporary ranked list by visiting every word, prints each
 * remaining entry with its rank label and count, and frees the list again.
 * Does nothing when `root` is NULL or `top_num` is smaller than 1, because
 * no valid rank exists in that case.
 * Running out of memory terminates the process with a failure status.
 */
void find_top_count(
    trieTreeNode *root,
    int top_num )
{
    if ( !root || top_num < 1 ) {
        return;
    }

    /* Zero-filled sentinel: the list must start out empty (next == NULL). */
    count_data *head = calloc( 1, sizeof *head );
    if ( head == NULL ) {
        PRINT("out of memory while building the top list");
        exit( EXIT_FAILURE );
    }

    char tmp_word[20];
    for ( int i = 0; i < MAX_CHILD_NUM; i++ ) {
        /* Start each top-level branch with a clean scratch buffer. */
        memset( tmp_word, 0, sizeof tmp_word );
        find_top_count1( root->next[i], tmp_word, 0, head, top_num );
    }

    printf("出現次數最大前%d次的單詞:\n", top_num);
    count_data *p = head->next;
    while ( p != NULL ) {
        count_data *free_p = p;
        printf("前%d,count:%d\t%s\n", p->order, p->count, p->string);
        p = p->next;
        free( free_p );
    }
    free( head );
}
/*
 * Depth-first helper of tree_walk().
 *
 * Stores the letter of `root` at tmp_word[tail]; when `root` ends a word
 * the count and the letters collected so far are printed on one line.
 * Afterwards every child is visited with the next buffer position.
 * A NULL `root` is ignored.
 */
void tree_walk1(
    trieTreeNode *root,
    char *tmp_word,
    int tail )
{
    if ( root == NULL ) {
        return;
    }

    tmp_word[tail] = root->letter;
    int depth = tail + 1;

    if ( root->is_word ) {
        printf("count:%d\t", root->count);
        for ( const char *p = tmp_word; p < tmp_word + depth; ++p ) {
            putchar( *p );
        }
        putchar( '\n' );
    }

    for ( int slot = 0; slot < MAX_CHILD_NUM; ++slot ) {
        tree_walk1( root->next[slot], tmp_word, depth );
    }
}
/*
 * Print every word stored in the trie rooted at `root` together with its
 * occurrence count. The root itself holds no letter, so the walk starts at
 * each of its children with a fresh, zeroed 20-byte scratch buffer.
 * A NULL `root` is ignored.
 */
void tree_walk(
    trieTreeNode *root )
{
    if ( root == NULL ) {
        return;
    }

    for ( int slot = 0; slot < MAX_CHILD_NUM; ++slot ) {
        char scratch[20] = {0};
        tree_walk1( root->next[slot], scratch, 0 );
    }
}
int main(
int argc,
char **argv )
{
if ( argc != 3 ) {
PRINT("USAGE: please input words file & top number");
exit( 0 );
}
char *file_name = argv[1];
int top_num = atoi( argv[2] );
trieTree *tree = ( trieTree * )malloc( sizeof(trieTree) );
tree->root = create_node( -1 );
int fd = open(file_name, O_RDONLY);
if ( fd < 0 ) {
PRINT("OPEN FILE %s ERROR!!!(%s)", file_name, (char *)strerror(errno));
exit( 0 );
}
// 每次讀取檔案的緩衝區
char buf[1024 * 10] = {0};
// 每次讀取的大小
int read_len = 1024;
// 讀取的返回值
int read_bytes = 0;
// 從讀取的緩衝區每次提取'\n' - '\n'之間的單詞
char tmp_word[20] = {0};
// 讀取檔案緩衝區如果出現單詞隔斷,剩餘部分在下一次
// read才能讀到,這個index做單詞繼續拼接
int tmp_index = 0;
while ( 1 ) {
memset( buf, 0, read_len );
read_bytes = read( fd, buf, read_len );
if ( read_bytes <= 0 )
break;
//printf("readbytes:%d------\n%s\n", read_bytes, buf);
int cur = 0;
while ( cur < read_bytes ) {
// 單詞檔案最後一個單詞末尾一定要有'\n'
if ( buf[cur] == '\n' ) {
tmp_word[tmp_index] = '\0';
//printf("insert word:%s\n", tmp_word);
insert( tree->root, tmp_word );
memset( tmp_word, 0, 20 );
tmp_index = 0;
} else {
tmp_word[tmp_index] = buf[cur];
tmp_index++;
}
cur++;
}
}
printf("\n========================================\n");
tree_walk( tree->root );
find_top_count( tree->root, top_num );
close( fd );
return 0;
}
trie樹的程式碼使用:./xxx word.txt 10即統計出現次數前10的單詞,並列印單詞和次數
例如對生成了10000個單詞的word.txt檔案,統計前5:
./xxx word.txt 5