【自然語言處理】使用樸素貝葉斯進行語種檢測
阿新 • • 發佈:2019-11-15
首先看一下資料集:
基本上每行就是一句話及其所屬類別(這裡包含 English, French, German, Spanish, Italian 和 Dutch 共 6 種語言)
先匯入相應的包:
import os from sklearn.model_selection import train_test_split from sklearn.feature_extraction.text import CountVectorizer from sklearn.naive_bayes import MultinomialNB import re
首先讀取資料集:
def get_train_test_data():
    """Load the language dataset and split it into train/test sets.

    Each line of Database/data.csv is "<sentence>,<label>". The label is
    taken after the LAST comma, so sentences that themselves contain
    commas are kept intact (the original ``line.split(",")`` truncated
    them and could pick the wrong field as the label).

    Returns:
        (x_train, x_test, y_train, y_test): lists of sentences / labels.
    """
    # Directory of this file, so the script works regardless of cwd.
    path_dir = os.path.dirname(os.path.abspath(__file__))
    # os.path.join is portable; the original "\\Database\\data.csv"
    # concatenation only worked on Windows.
    data_path = os.path.join(path_dir, "Database", "data.csv")
    data = []    # sentences
    label = []   # language labels
    with open(data_path, "r", encoding="utf-8") as fp:
        for line in fp:
            line = line.strip()
            if not line:
                continue  # skip blank lines
            # Split on the LAST comma only: text may contain commas.
            text, _, lang = line.rpartition(",")
            data.append(text)
            label.append(lang.strip())
    # Default split: 75% train / 25% test, deterministic via random_state.
    x_train, x_test, y_train, y_test = train_test_split(
        data, label, random_state=1
    )
    return x_train, x_test, y_train, y_test
然後是過濾掉一些噪聲:
\w 是匹配包括下劃線在內的任意單詞字元(字母、數字、下劃線),\S 是匹配任何非空白字元,+ 號表示匹配前一項一次或多次
# Compiled once at import time: URLs, @mentions and #hashtags.
# Raw strings are used because "\S", "\@" and "\#" are invalid escape
# sequences in ordinary string literals (DeprecationWarning); the
# original also recompiled the pattern on every call.
_NOISE_PATTERN = re.compile(r"http\S+|@\w+|#\w+")


def remove_noise(document):
    """Strip URLs, @mentions and #hashtags from *document*.

    Returns the cleaned text with leading/trailing whitespace removed.
    """
    return _NOISE_PATTERN.sub("", document).strip()
下一步,在降噪後的資料上抽取出有用的特徵:抽取 1-gram 和 2-gram 的統計特徵
# Character n-gram bag-of-words features, limited to the 1000 most
# frequent n-grams. 'char_wb' builds n-grams only inside word
# boundaries, which works well for language identification.
vec = CountVectorizer(
    lowercase=True,            # lowercase the text before counting
    analyzer='char_wb',        # tokenise by character n-grams
    ngram_range=(1, 2),        # use n-grams of size 1 and 2
    max_features=1000,         # keep the most common 1000 n-grams
    preprocessor=remove_noise, # strip URLs/mentions/hashtags first
)
vec.fit(x_train)


def get_features(x):
    """Transform an iterable of documents into the fitted n-gram matrix."""
    # BUG FIX: the original dropped the transform result (implicit None);
    # it must be returned to be usable.
    return vec.transform(x)
最後就是進行分類:
# Train a multinomial naive Bayes classifier on the n-gram counts and
# report held-out accuracy. The original discarded the score value; in
# a script (unlike a REPL) it must be captured and printed to be seen.
classifier = MultinomialNB()
classifier.fit(vec.transform(x_train), y_train)
accuracy = classifier.score(vec.transform(x_test), y_test)
print(accuracy)
將以上程式碼整合成一個類:
import os
import re

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB


def get_train_test_data():
    """Load Database/data.csv and split it into train/test sets.

    Each line is "<sentence>,<label>"; the label is taken after the
    LAST comma so sentences containing commas stay intact.
    """
    path_dir = os.path.dirname(os.path.abspath(__file__))
    # os.path.join is portable; the original "\\" concatenation only
    # worked on Windows.
    data_path = os.path.join(path_dir, "Database", "data.csv")
    data = []    # sentences
    label = []   # language labels
    with open(data_path, "r", encoding="utf-8") as fp:
        for line in fp:
            line = line.strip()
            if not line:
                continue  # skip blank lines
            text, _, lang = line.rpartition(",")
            data.append(text)
            label.append(lang.strip())
    return train_test_split(data, label, random_state=1)


class LanguageDetector:
    """Naive-Bayes language detector over character 1/2-gram counts."""

    # Compiled once: URLs, @mentions and #hashtags. Raw strings avoid
    # invalid-escape warnings, and compiling here avoids recompiling on
    # every _remove_noise call.
    _NOISE_PATTERN = re.compile(r"http\S+|@\w+|#\w+")

    def __init__(self, classifier=None):
        # BUG FIX: the original default `classifier=MultinomialNB()` is a
        # mutable default argument evaluated once at def time -- every
        # detector created without an explicit classifier shared ONE
        # model instance.
        self.classifier = MultinomialNB() if classifier is None else classifier
        self.vectorizer = CountVectorizer(
            lowercase=True,            # lowercase before counting
            analyzer='char_wb',        # character n-grams within words
            ngram_range=(1, 2),        # 1-grams and 2-grams
            max_features=1000,         # keep the 1000 most common
            preprocessor=self._remove_noise,
        )

    def _remove_noise(self, document):
        """Strip URLs, @mentions and #hashtags; trim surrounding space."""
        # .strip() added for consistency with the standalone
        # remove_noise() version shown earlier in the article.
        return self._NOISE_PATTERN.sub("", document).strip()

    def features(self, X):
        """Vectorize an iterable of documents with the fitted vectorizer."""
        return self.vectorizer.transform(X)

    def fit(self, X, y):
        """Fit the vectorizer, then the classifier, on documents X / labels y."""
        self.vectorizer.fit(X)
        self.classifier.fit(self.features(X), y)

    def predict(self, x):
        """Predict the language of a single document *x* (a string)."""
        return self.classifier.predict(self.features([x]))

    def score(self, X, y):
        """Mean accuracy on documents X against true labels y."""
        return self.classifier.score(self.features(X), y)


if __name__ == "__main__":
    # Guarded so importing this module does not trigger file I/O and
    # training as a side effect.
    language_detector = LanguageDetector()
    x_train, x_test, y_train, y_test = get_train_test_data()
    language_detector.fit(x_train, y_train)
    print(language_detector.predict('This is an English sentence'))
    print(language_detector.score(x_test, y_test))
最終結果:
相關資料及程式碼:連結: https://pan.baidu.com/s/1tjHcnZuEdGpDb9vtCHYRWA 提取碼: aqfs