Learning PyTorch: implementing textCNN
阿新 · Published: 2019-01-29
I have been learning PyTorch recently, so I tried to implement textCNN with it (note: other textCNN implementations are available on GitHub). One advantage PyTorch has over TensorFlow is that it is easy to learn, which makes it well suited to beginners.
First, pay attention to the data preprocessing in this example. The data I use is THUCNews, a Chinese text-classification dataset. THUCNews was generated by filtering the historical data of the Sina News RSS feeds from 2005 to 2011 and contains 740,000 news documents (2.19 GB), all in plain UTF-8 text. On top of the original Sina News category scheme, its authors re-integrated the documents into 14 candidate categories: finance, lottery, real estate, stocks, home, education, technology, society, fashion, politics, sports, horoscope, games, and entertainment. Evaluated with the THUCTC toolkit, this dataset yields an accuracy of 88.6%.
The first part of preprocessing is to extract the Chinese text and strip out the non-Chinese characters. The actual function can be found on GitHub, so its code is not pasted here.
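The idea is simple, though; as a rough illustration, extract_chinese could be a regex filter like the sketch below (an assumption for illustration, not the repository's actual code):

import re

def extract_chinese(line):
    # Keep only characters in the common CJK Unified Ideographs range
    return ''.join(re.findall(r'[\u4e00-\u9fa5]+', line))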
Preprocessing converts the raw text into training data.
Step 1: data preprocessing
import os
import jieba

def datahelper(dir):
    # Returns the texts and their corresponding labels
    labels_index = {}
    index_lables = {}
    num_recs = 0
    fs = os.listdir(dir)
    # Hyperparameters kept from the original (not all are used in this function)
    MAX_SEQUENCE_LENGTH = 200
    MAX_NB_WORDS = 50000
    EMBEDDING_DIM = 20
    VALIDATION_SPLIT = 0.2
    # Each subdirectory name is a category; assign it an integer id
    i = 0
    for f in fs:
        labels_index[f] = i
        index_lables[i] = f
        i = i + 1
    print(labels_index)
    texts = []
    labels = []  # list of label ids
    for la in labels_index.keys():
        print(la + " " + index_lables[labels_index[la]])
        la_dir = dir + "/" + la
        fs = os.listdir(la_dir)
        for f in fs:
            file = open(la_dir + "/" + f, encoding='utf-8')
            lines = file.readlines()
            text = []
            for line in lines:
                if len(line) > 5:
                    line = extract_chinese(line)
                    words = jieba.lcut(line, cut_all=False, HMM=True)
                    text = text + words  # accumulate the words of every line
            texts.append(text)
            labels.append(labels_index[la])
            num_recs = num_recs + 1
    return texts, labels, labels_index, index_lables
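For illustration, assuming the THUCNews categories have been unpacked so that each category is a subdirectory of a single data directory (the path below is hypothetical), the helper is called like this:

# Hypothetical layout: data/THUCNews/<category>/<article>.txt
texts, labels, labels_index, index_lables = datahelper('data/THUCNews')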
The returned texts is a list whose entries are lists of words; these word strings must be replaced by integer indices. First, build the vocabulary.
# Vocabulary
word_vocb = []
word_vocb.append('')  # the empty string will serve as the padding token
for text in texts:
    for word in text:
        word_vocb.append(word)
word_vocb = set(word_vocb)
vocb_size = len(word_vocb)
Once the vocabulary is built, construct the mapping from words to indices.
# Maps between words and indices
word_to_idx = {word: i for i, word in enumerate(word_vocb)}
idx_to_word = {word_to_idx[word]: word for word in word_to_idx}
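One caveat: iterating over a Python set has no guaranteed order, so the indices assigned above can differ from run to run, which breaks any saved model or index file. A deterministic variant (a sketch, not part of the original code) sorts the vocabulary first:

# Sort the vocabulary so that word indices are reproducible across runs
word_vocb = sorted(word_vocb)
word_to_idx = {word: i for i, word in enumerate(word_vocb)}
idx_to_word = {i: word for word, i in word_to_idx.items()}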
Now the training data can be built.
# Build the training data: replace each word with its index.
# (max_len and the texts_with_id array are not defined in the original
# snippet; a reasonable setup is shown here)
import numpy as np
max_len = 200  # maximum text length, matching MAX_SEQUENCE_LENGTH above
texts_with_id = np.zeros([len(texts), max_len], dtype=int)
for i in range(0, len(texts)):
    if len(texts[i]) < max_len:
        for j in range(0, len(texts[i])):
            texts_with_id[i][j] = word_to_idx[texts[i][j]]
        for j in range(len(texts[i]), max_len):
            texts_with_id[i][j] = word_to_idx['']
    else:
        for j in range(0, max_len):
            texts_with_id[i][j] = word_to_idx[texts[i][j]]
(Note: each training text must be capped at max_len words; shorter texts are padded, here with the empty-string token.)
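A quick sanity check (illustrative, not in the original code) confirms that every text is now a fixed-length row of indices:

assert texts_with_id.shape == (len(texts), max_len)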
Step 2: build the textCNN model
# The textCNN model
class textCNN(nn.Module):
    def __init__(self, args):
        super(textCNN, self).__init__()
        vocb_size = args['vocb_size']
        dim = args['dim']
        n_class = args['n_class']
        max_len = args['max_len']
        embedding_matrix = args['embedding_matrix']
        self.max_len = max_len
        self.dim = dim
        # Load the pretrained word vectors into the embedding layer
        self.embeding = nn.Embedding(vocb_size, dim, _weight=embedding_matrix)
        self.conv1 = nn.Sequential(
            nn.Conv2d(in_channels=1, out_channels=16, kernel_size=5,
                      stride=1, padding=2),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2)
        )
        self.conv2 = nn.Sequential(
            nn.Conv2d(in_channels=16, out_channels=32, kernel_size=5, stride=1, padding=2),
            nn.ReLU(),
            nn.MaxPool2d(2)
        )
        self.conv3 = nn.Sequential(
            nn.Conv2d(in_channels=32, out_channels=64, kernel_size=5, stride=1, padding=2),
            nn.ReLU(),
            nn.MaxPool2d(2)
        )
        self.conv4 = nn.Sequential(
            nn.Conv2d(in_channels=64, out_channels=128, kernel_size=5, stride=1, padding=2),
            nn.ReLU(),
            nn.MaxPool2d(2)
        )
        # The input size 512 must equal 128 * (max_len // 16) * (dim // 16),
        # the flattened size after four stride-2 poolings; adjust it if
        # max_len or dim change
        self.out = nn.Linear(512, n_class)

    def forward(self, x):
        x = self.embeding(x)
        # Reshape (batch, max_len, dim) to (batch, 1, max_len, dim) so the
        # embedded text can be convolved like a one-channel image
        x = x.view(x.size(0), 1, self.max_len, self.dim)
        x = self.conv1(x)
        x = self.conv2(x)
        x = self.conv3(x)
        x = self.conv4(x)
        # Flatten (batch, channels, w, h) to (batch, channels * w * h)
        x = x.view(x.size(0), -1)
        output = self.out(x)
        return output
The embedding layer used here has a weight matrix of size vocb_size × dim, i.e. the vocabulary size times the word-vector dimension. Note that it is initialized with the pretrained word vectors rather than random ones.
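As an aside, _weight is a semi-private constructor argument; since PyTorch 0.4 the same initialization can be done through the public from_pretrained API. An equivalent sketch (freeze=False keeps the vectors trainable, matching the behavior above):

# Equivalent initialization via the public API
self.embeding = nn.Embedding.from_pretrained(embedding_matrix, freeze=False)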
Loading the pretrained word vectors:
# Mapping from each word to its pretrained vector
embeddings_index = getw2v()
# Build the pretrained embedding matrix
# (nb_words is the vocabulary size, word_dim the vector dimension)
embedding_matrix = np.zeros((nb_words, word_dim))
for word, i in word_to_idx.items():
    if i >= nb_words:
        continue
    if word in embeddings_index:
        embedding_vector = embeddings_index[word]
        if embedding_vector is not None:
            # Words not found in the embedding index stay all-zeros
            embedding_matrix[i] = embedding_vector
args['embedding_matrix'] = torch.Tensor(embedding_matrix)
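With the embedding matrix stored, the remaining entries of args can be filled in and the network instantiated. This wiring is implied rather than shown in the original; args is assumed to start as an empty dict, and word_dim matches EMBEDDING_DIM from earlier:

args['vocb_size'] = vocb_size
args['dim'] = word_dim
args['n_class'] = len(labels_index)  # 14 THUCNews categories
args['max_len'] = max_len
cnn = textCNN(args)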
Step 3: training
The learning rate is set to LR = 0.001, the optimizer is Adam, and the loss function is nn.CrossEntropyLoss().
import torch
import torch.nn as nn
from torch.autograd import Variable  # legacy wrapper; plain tensors also work in modern PyTorch
from sklearn.model_selection import train_test_split

LR = 0.001
optimizer = torch.optim.Adam(cnn.parameters(), lr=LR)
# Loss function
loss_function = nn.CrossEntropyLoss()
# Mini-batch size (named epoch_size in the original code)
epoch_size = 1000
texts_len = len(texts_with_id)
print(texts_len)
# Split into training and test sets
x_train, x_test, y_train, y_test = train_test_split(texts_with_id, labels, test_size=0.2, random_state=42)
test_x = torch.LongTensor(x_test)
test_y = torch.LongTensor(y_test)
train_x = x_train
train_y = y_train
test_epoch_size = 300
EPOCH = 5  # number of passes over the training data (choose as needed)
for epoch in range(EPOCH):
    for i in range(0, len(train_x) // epoch_size):
        b_x = Variable(torch.LongTensor(train_x[i * epoch_size:i * epoch_size + epoch_size]))
        b_y = Variable(torch.LongTensor(train_y[i * epoch_size:i * epoch_size + epoch_size]))
        output = cnn(b_x)
        loss = loss_function(output, b_y)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        print(str(i))
        print(loss)
        # Training accuracy on the current batch
        pred_y = torch.max(output, 1)[1].data.squeeze()
        acc = (b_y == pred_y)
        acc = acc.numpy().sum()
        accuracy = acc / b_y.size(0)
    # Evaluate on the test set after each epoch
    acc_all = 0
    for j in range(0, len(test_x) // test_epoch_size):
        b_x = Variable(torch.LongTensor(test_x[j * test_epoch_size:j * test_epoch_size + test_epoch_size]))
        b_y = Variable(torch.LongTensor(test_y[j * test_epoch_size:j * test_epoch_size + test_epoch_size]))
        test_output = cnn(b_x)
        pred_y = torch.max(test_output, 1)[1].data.squeeze()
        acc = (pred_y == b_y)
        acc = acc.numpy().sum()
        print("acc " + str(acc / b_y.size(0)))
        acc_all = acc_all + acc
    accuracy = acc_all / test_y.size(0)
    print("epoch " + str(epoch) + " step " + str(i) + " " + "acc " + str(accuracy))
The full code is at
https://github.com/13061051/PytorchLeran