nlp電商搜尋比賽學習筆記-第一輪
阿新 • • 發佈:2022-03-15
學習網址:https://github.com/datawhalechina/team-learning-data-mining/tree/master/ECommerceSearch
第一輪任務:
任務1:環境配置、實踐資料下載
- 任務內容:
- 從比賽官網下載資料集,並使用Python讀取資料
- 使用 jieba 對文字進行分詞
- 使用 TFIDF 對文字進行編碼
- 思考如何使用TFIDF計算文字相似度?
- 學習資料:https://coggle.club/blog/tianchi-open-search
1、讀取資料:
import pandas as pd
import numpy as np

# Read the tab-separated query file into a DataFrame.
# header=None: the first line is treated as data, so the columns get the
# default integer labels 0, 1, ...
data = pd.read_csv('train.query.txt', sep='\t', header=None)
2、使用jieba
import jieba
# Segment the value at row 0, column label 1 of `data` with jieba in
# precise mode (cut_all=False), then print the tokens joined by "| ".
first_query = data.iloc[0][1]
test2 = jieba.cut(first_query, cut_all=False)
segmented = "| ".join(test2)
print("精確模式: " + segmented)
3、使用TFIDF
class TfIdf(object):
    """Minimal TF-IDF vectorizer over a fixed corpus of raw text documents.

    The corpus is tokenized once at construction time; the vocabulary and
    the per-token document frequencies are derived from that single pass.
    Only tokens longer than one character are kept as features.

    Scores are computed as ``tf * idf`` where

    * ``tf  = count(token in doc) / number of tokens in doc`` (all tokens,
      including single-character ones, count towards the length), and
    * ``idf = ln((N + 1) / (df + 1)) + 1`` with ``N`` the corpus size and
      ``df`` the number of corpus documents containing the token.

    Args:
        corpus: A sized, re-iterable collection of documents (strings).
        tokenizer: Optional callable mapping a document to an iterable of
            tokens. Defaults to ``jieba.lcut``.

    Attributes:
        corpus: The corpus object passed in, stored unchanged.
        vocab: List of distinct feature tokens in first-seen order.

    Note:
        Corpus statistics are cached in ``__init__``; mutating ``corpus``
        afterwards does not update them.
    """

    def __init__(self, corpus, tokenizer=None):
        if tokenizer is None:
            # Imported lazily so a custom tokenizer can be used without jieba.
            import jieba
            tokenizer = jieba.lcut
        self.corpus = corpus
        self._tokenize = tokenizer
        # Tokenize every document exactly once and reuse the result.
        self._tokenized = [list(tokenizer(doc)) for doc in corpus]
        self._doc_freq = self._count_doc_freq(self._tokenized)
        self.vocab = self.get_vocab()

    @staticmethod
    def _count_doc_freq(tokenized_docs):
        """Return a dict mapping each token to the number of docs containing it."""
        doc_freq = {}
        for tokens in tokenized_docs:
            for token in set(tokens):
                doc_freq[token] = doc_freq.get(token, 0) + 1
        return doc_freq

    def get_vocab(self):
        """Return the distinct corpus tokens longer than one character.

        Tokens are listed in the order they first appear in the corpus.
        """
        # dict.fromkeys de-duplicates while preserving insertion order.
        return list(dict.fromkeys(
            token
            for tokens in self._tokenized
            for token in tokens
            if len(token) > 1
        ))

    def _score_tokens(self, tokens):
        """Return ``{token: tf * idf}`` for an already-tokenized document."""
        import math

        doclen = len(tokens)
        corpuslen = len(self._tokenized)

        counts = {}
        for token in tokens:
            if len(token) > 1:
                counts[token] = counts.get(token, 0) + 1

        item_tfidf = {}
        for word, count in counts.items():
            tf = count / doclen
            df = self._doc_freq.get(word, 0)
            # Smoothed IDF. The parentheses matter: the logarithm must be
            # taken of the ratio (N + 1) / (df + 1), not of N + 1 / (df + 1).
            idf = math.log((corpuslen + 1) / (df + 1)) + 1
            item_tfidf[word] = tf * idf
        return item_tfidf

    def get_tfidf(self, doc):
        """Compute TF-IDF scores for one raw document.

        Args:
            doc: The document text; it does not have to be part of the corpus.

        Returns:
            Dict mapping each token of ``doc`` longer than one character to
            its ``tf * idf`` score. Empty if the document has no such token.
        """
        return self._score_tokens(list(self._tokenize(doc)))

    def transform(self, item_tfidf):
        """Lay out a score dict as a dense row aligned with ``self.vocab``.

        Args:
            item_tfidf: Dict of token -> score, e.g. from ``get_tfidf``.

        Returns:
            ``numpy.ndarray`` of shape ``(1, len(self.vocab))``; tokens that
            are missing from ``item_tfidf`` get ``0.0``.
        """
        row = [item_tfidf.get(token, 0.0) for token in self.vocab]
        # Explicit (1, V) shape also works when the vocabulary is empty.
        return np.array(row, dtype=float).reshape(1, len(self.vocab))

    def run(self, mode=1):
        """Compute TF-IDF for every document in the corpus.

        Args:
            mode: ``0`` to get the per-document score dicts, ``1`` (default)
                to get the dense matrix.

        Returns:
            * ``mode == 0``: dict mapping document index -> score dict.
            * ``mode == 1``: ``numpy.ndarray`` of shape
              ``(len(corpus), len(self.vocab))`` with each row scaled to unit
              L2 norm. Rows with no vocabulary token stay all-zero.
            * any other value: ``None`` (kept for backward compatibility).
        """
        item_rst = {}
        rows = []
        for idx, tokens in enumerate(self._tokenized):
            item_tfidf = self._score_tokens(tokens)
            item_rst[idx] = item_tfidf
            if mode == 1:
                row = self.transform(item_tfidf)
                norm = np.linalg.norm(row)
                # L2 normalisation; skip all-zero rows to avoid 0/0 -> NaN.
                if norm > 0:
                    row /= norm
                rows.append(row)

        if mode == 0:
            return item_rst
        if mode == 1:
            if not rows:
                return np.empty((0, len(self.vocab)))
            return np.vstack(rows)
        return None