LSH (Locality-Sensitive Hashing) for Text Similarity Matching
Published: 2019-02-17
# @Time    : 2017/10/19 10:09
# @Author  : Jalin Hu
# @File    : main.py
# @Software: PyCharm
import os
import collections

import jieba
from lshash.lshash import LSHash


def textprocess(foldpath):
    """Tokenize every paper and build the corpus vocabulary.

    Parameters:
        foldpath - path to the folder that holds the papers
    Returns:
        datalist      - one word list per paper (jieba tokens)
        classlist     - the paper names, in the same order as datalist
        all_word_list - all corpus words, sorted by descending frequency
    """
    datalist = []
    classlist = []
    vocabset = collections.defaultdict(int)
    filelist = os.listdir(foldpath)  # every file name under the paper folder
    for file in filelist:
        with open(os.path.join(foldpath, file), 'r', encoding='utf-8') as f:
            sequence = f.read()
        # use the file name (minus extension and bracket/quote characters) as the paper name
        key = os.path.splitext(file)[0].strip("[]'")
        datalist.append(jieba.lcut(sequence, cut_all=False))
        classlist.append(key)
        print(key, ': ok')
    # count word frequencies over the whole corpus
    for content in datalist:
        for word in content:
            vocabset[word] += 1
    # sort the vocabulary by descending frequency
    all_word_sorted = sorted(vocabset.items(), key=lambda e: e[1], reverse=True)
    all_word_list, all_word_nums = zip(*all_word_sorted)
    return datalist, classlist, list(all_word_list)


def make_word_set(word_file):
    """Read a file line by line and return its contents as a deduplicated set (stop words)."""
    word_set = set()
    with open(word_file, 'r', encoding='utf-8') as f:
        for line in f.readlines():
            word = line.strip()
            if len(word) > 0:
                word_set.add(word)
    return word_set


def word_dict(all_words_list, deleteN, stopwords_set):
    """Select the feature vocabulary.

    Parameters:
        all_words_list - corpus vocabulary sorted by descending frequency
        deleteN        - drop the deleteN most frequent words
        stopwords_set  - stop-word set
    Returns:
        feature_words  - the selected feature words
    """
    feature_words = []
    for i in range(deleteN, len(all_words_list)):
        word = all_words_list[i]
        # keep words that are not pure digits, not stop words, and 2-4 characters long
        if not word.isdigit() and word not in stopwords_set and 1 < len(word) < 5:
            feature_words.append(word)
    return feature_words


def bagof_word2vec(vocablist, inputset):
    """Turn a word list into a bag-of-words count vector over vocablist.

    Parameters:
        vocablist - the feature vocabulary
        inputset  - the word list of one document
    Returns:
        returnvec - the count vector
    """
    returnvec = [0] * len(vocablist)
    for word in inputset:
        if word in vocablist:
            returnvec[vocablist.index(word)] += 1
        else:
            print('word:', word, 'is not in the list_vec')
    return returnvec


if __name__ == '__main__':
    # tokenize every paper and build the frequency-sorted vocabulary
    datalist, classlist, vocabset = textprocess('./paper')
    stop_word_file = './stopwords_cn.txt'
    stop_word_set = make_word_set(stop_word_file)
    feature_words = word_dict(vocabset, 0, stop_word_set)

    # vectorize every paper and index each vector in the LSH tables
    trainMat = []
    lsh = LSHash(hash_size=10, input_dim=len(feature_words))
    for postinDoc in datalist:
        trainMat_vec = bagof_word2vec(feature_words, postinDoc)
        trainMat.append(trainMat_vec)
        lsh.index(trainMat_vec)

    # vectorize the query document with the same feature vocabulary and query the index
    testfile = './test.txt'
    testlist = []
    with open(testfile, 'r', encoding='utf-8') as f:
        sequence = f.read()
    testlist.append(jieba.lcut(sequence, cut_all=False))
    testvect = bagof_word2vec(feature_words, testlist[0])

    re = lsh.query(testvect, num_results=1)
    print(list(re[0][0]))
    print(trainMat.index(list(re[0][0])))
    print('The most similar paper is:', classlist[trainMat.index(list(re[0][0]))])
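A note on the query step: LSHash.query returns a list of (stored_vector, distance) pairs rather than document indices, which is why the script converts re[0][0] back to a list and looks it up in trainMat. The snippet below is a minimal, self-contained sketch of that flow; the three 8-dimensional count vectors and hash_size=3 are made-up values for illustration only, not taken from the post. Because only vectors that land in the same hash bucket as the query are considered, the result list can come back empty, so the sketch guards against that case.

from lshash.lshash import LSHash

# Illustrative data only: three 8-dimensional bag-of-words count vectors.
docs = [
    [1, 0, 2, 0, 0, 1, 0, 0],   # doc 0
    [0, 3, 0, 1, 0, 0, 1, 0],   # doc 1
    [1, 0, 2, 0, 1, 1, 0, 0],   # doc 2, close to doc 0
]

# A small hash_size keeps the buckets coarse, so near neighbours are likely to collide.
lsh = LSHash(hash_size=3, input_dim=8)
for vec in docs:
    lsh.index(vec)

query = [1, 0, 1, 0, 0, 1, 0, 0]
results = lsh.query(query, num_results=1)

if not results:
    # the query only sees vectors that share a hash bucket, so it can return nothing
    print('no candidate found; try a smaller hash_size or more hash tables')
else:
    match, distance = results[0]          # (stored vector as a tuple, distance)
    print('nearest vector:', list(match), 'distance:', distance)
    print('document index:', docs.index(list(match)))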
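One design caveat with mapping the match back to a paper via trainMat.index: it returns the first paper whose vector equals the matched one, so two papers that happen to produce identical bag-of-words vectors would be indistinguishable. If the installed lshash version supports the optional extra_data argument to index() (an assumption worth checking against your version), the paper name can be stored alongside each vector and read straight off the query result, roughly like this:

# Sketch only: assumes lshash's index() accepts an extra_data argument.
for name, vec in zip(classlist, trainMat):
    lsh.index(vec, extra_data=name)

res = lsh.query(testvect, num_results=1)
if res:
    (vector, name), distance = res[0]   # stored value becomes (vector, extra_data)
    print('The most similar paper is:', name)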