
PyTorch LSTM + Attention for Text Classification

Corpus link: https://pan.baidu.com/s/1a1J_LigAig-80W6IenCyZg
Extraction code: hbx1

train.txt: 500 pos + 500 neg samples, 1000 in total (used to train the model)
dev.txt: 100 pos + 100 neg samples, 200 in total (used to tune hyperparameters)
test.txt: 150 pos + 150 neg samples, 300 in total (used for testing)

For example, here is a positive sample:
<Polarity>1</Polarity>
<text>sit back in one of those comfortable chairs.</text>
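
Each sample occupies two lines and samples are separated by a blank line. For illustration, a minimal sketch of how one sample is split into a label and a text, using the same slicing that get_dataset applies below:

# Minimal parsing sketch for one two-line sample (same slicing as in get_dataset below).
sample = ["<Polarity>1</Polarity>",
          "<text>sit back in one of those comfortable chairs.</text>"]

label = sample[0][10]      # the character right after "<Polarity>" -> '1'
text  = sample[1][6:-7]    # strip "<text>" (6 chars) and "</text>" (7 chars)

print(label)   # 1
print(text)    # sit back in one of those comfortable chairs.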

1. Data Preprocessing

Load the data, build the vocabulary, and create the iterators. Similar code has appeared in earlier posts, so I won't walk through it again; the code is given directly below.

import numpy as np
import torch
from torch import nn, optim
import torch.nn.functional as F
from torchtext import data

import math
import time


SEED = 123
BATCH_SIZE = 128
LEARNING_RATE = 1e-3      # learning rate
EMBEDDING_DIM = 100       # word embedding dimension

# set the random seed for the CPU
torch.manual_seed(SEED)

TEXT = data.Field(tokenize=lambda x: x.split(), lower=True)
LABEL = data.LabelField(dtype=torch.float)

# get_dataset returns the examples and fields needed to build a Dataset
def get_dataset(corpur_path, text_field, label_field):
    fields = [('text', text_field), ('label', label_field)]   # torchtext field mapping
    examples = []

    with open(corpur_path) as f:
        li = []
        while True:
            content = f.readline().replace('\n', '')
            if not content:          # blank line: one sample has been fully read (its lines are stored in li)
                if not li:           # if li is also empty, the file is exhausted; stop the loop
                    break
                label = li[0][10]
                text = li[1][6:-7]
                examples.append(data.Example.fromlist([text, label], fields))
                li = []
            else:
                li.append(content)   # ["<Polarity>label</Polarity>", "<text>sentence</text>"]

    return examples, fields

# get the examples and fields needed to build each Dataset
train_examples, train_fields = get_dataset("corpurs/trains.txt", TEXT, LABEL)
dev_examples, dev_fields = get_dataset("corpurs/dev.txt", TEXT, LABEL)
test_examples, test_fields = get_dataset("corpurs/tests.txt", TEXT, LABEL)


# build the Dataset objects
train_data = data.Dataset(train_examples, train_fields)
dev_data = data.Dataset(dev_examples, dev_fields)
test_data = data.Dataset(test_examples, test_fields)

print('len of train data:', len(train_data))   # 1000
print('len of dev data:', len(dev_data))       # 200
print('len of test data:', len(test_data))     # 300

print(train_data.examples[15].text)
print(train_data.examples[15].label)


# build the vocabulary
TEXT.build_vocab(train_data, max_size=5000, vectors='glove.6B.100d')
LABEL.build_vocab(train_data)
print(len(TEXT.vocab))            # 3287
print(TEXT.vocab.itos[:12])       # ['<unk>', '<pad>', 'the', 'and', 'a', 'to', 'is', 'was', 'i', 'of', 'for', 'in']
print(TEXT.vocab.stoi['like'])    # 43
print(LABEL.vocab.stoi)           # defaultdict(None, {'0': 0, '1': 1})


# build the iterators; each iteration returns one batch of examples
train_iterator, dev_iterator, test_iterator = data.BucketIterator.splits(
    (train_data, dev_data, test_data),
    batch_size=BATCH_SIZE,
    sort=False)
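
As an optional sanity check (a small sketch added here for illustration), one batch from the iterator can be inspected; with the legacy torchtext Field defaults used above (batch_first=False), batch.text comes out as [seq_len, batch_size]:

# Optional sanity check on one batch (legacy torchtext 0.x data API, as used above).
batch = next(iter(train_iterator))
print(batch.text.shape)    # [seq_len, batch_size], since Field defaults to batch_first=False
print(batch.label.shape)   # [batch_size], float labels (LabelField(dtype=torch.float))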

2. Defining the Model

2.1 Variant 1: computing attention from its definition

class BiLSTM_Attention(nn.Module):

    def __init__(self, vocab_size, embedding_dim, hidden_dim, n_layers):

        super(BiLSTM_Attention, self).__init__()

        self.hidden_dim = hidden_dim
        self.n_layers = n_layers
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.rnn = nn.LSTM(embedding_dim, hidden_dim, num_layers=n_layers, bidirectional=True, dropout=0.5)
        self.fc = nn.Linear(hidden_dim * 2, 1)
        self.dropout = nn.Dropout(0.5)

    # x, query: [batch, seq_len, hidden_dim*2]
    def attention_net(self, x, query, mask=None):      # soft attention (key = value = x)

        d_k = query.size(-1)                                              # d_k: dimension of the query
        scores = torch.matmul(query, x.transpose(1, 2)) / math.sqrt(d_k)  # scoring: [batch, seq_len, seq_len]

        p_attn = F.softmax(scores, dim=-1)              # normalize the scores over the last dimension
        context = torch.matmul(p_attn, x).sum(1)        # sum the weighted x: [batch, seq_len, hidden_dim*2] -> [batch, hidden_dim*2]
        return context, p_attn


    def forward(self, x):
        embedding = self.dropout(self.embedding(x))       # [seq_len, batch, embedding_dim]

        # output: [seq_len, batch, hidden_dim*2]     hidden/cell: [n_layers*2, batch, hidden_dim]
        output, (final_hidden_state, final_cell_state) = self.rnn(embedding)
        output = output.permute(1, 0, 2)                  # [batch, seq_len, hidden_dim*2]

        query = self.dropout(output)
        attn_output, attention = self.attention_net(output, query)       # the only difference from a plain LSTM is this line
        logit = self.fc(attn_output)
        return logit
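
To make the shapes concrete, here is a short check of attention_net on dummy tensors (added for illustration; the sizes match the comments in the code above):

# Shape check for the scaled dot-product attention above, using dummy tensors.
model = BiLSTM_Attention(vocab_size=100, embedding_dim=100, hidden_dim=64, n_layers=2)
x = torch.randn(4, 10, 128)                     # [batch=4, seq_len=10, hidden_dim*2=128]
context, p_attn = model.attention_net(x, x)     # here key = value = query = x
print(context.shape)                            # torch.Size([4, 128])
print(p_attn.shape)                             # torch.Size([4, 10, 10])
print(p_attn.sum(dim=-1))                       # every row of attention weights sums to 1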

2.2 Variant 2 (reference: https://blog.csdn.net/qsmx666/article/details/107118550)

The attention formula (written here to match the code below):

    u_t = tanh(h_t W_w)
    a_t = softmax_t(u_t^T u_w)
    c   = sum_t a_t h_t

W_w and u_w in the formula correspond to w_omega and u_omega in the code below, both randomly initialized, and h_t corresponds to x (the LSTM output at time step t). The softmax runs over the time steps, and c is the context vector fed to the classifier.

class BiLSTM_Attention(nn.Module):

    def __init__(self, vocab_size, embedding_dim, hidden_dim, n_layers):

        super(BiLSTM_Attention, self).__init__()

        self.hidden_dim = hidden_dim
        self.n_layers = n_layers
        self.embedding = nn.Embedding(vocab_size, embedding_dim)        # vocabulary size, embedding dimension
        self.rnn = nn.LSTM(embedding_dim, hidden_dim, num_layers=n_layers, bidirectional=True, dropout=0.5)
        self.fc = nn.Linear(hidden_dim * 2, 1)
        self.dropout = nn.Dropout(0.5)

        # attention parameters, randomly initialized
        self.w_omega = nn.Parameter(torch.Tensor(hidden_dim * 2, hidden_dim * 2))
        self.u_omega = nn.Parameter(torch.Tensor(hidden_dim * 2, 1))

        nn.init.uniform_(self.w_omega, -0.1, 0.1)
        nn.init.uniform_(self.u_omega, -0.1, 0.1)


    def attention_net(self, x):       # x: [batch, seq_len, hidden_dim*2]

        u = torch.tanh(torch.matmul(x, self.w_omega))         # [batch, seq_len, hidden_dim*2]
        att = torch.matmul(u, self.u_omega)                   # [batch, seq_len, 1]
        att_score = F.softmax(att, dim=1)                     # normalize over the seq_len dimension

        scored_x = x * att_score                              # [batch, seq_len, hidden_dim*2]

        context = torch.sum(scored_x, dim=1)                  # [batch, hidden_dim*2]
        return context


    def forward(self, x):
        embedding = self.dropout(self.embedding(x))       # [seq_len, batch, embedding_dim]

        # output: [seq_len, batch, hidden_dim*2]     hidden/cell: [n_layers*2, batch, hidden_dim]
        output, (final_hidden_state, final_cell_state) = self.rnn(embedding)
        output = output.permute(1, 0, 2)                  # [batch, seq_len, hidden_dim*2]

        attn_output = self.attention_net(output)
        logit = self.fc(attn_output)
        return logit
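
A quick forward-pass check for this variant with dummy token ids (added for illustration); the input follows torchtext's default [seq_len, batch] layout:

# Forward-pass shape check with dummy token ids ([seq_len, batch], as torchtext delivers them).
model = BiLSTM_Attention(vocab_size=100, embedding_dim=100, hidden_dim=64, n_layers=2)
dummy = torch.randint(0, 100, (10, 4))   # seq_len=10, batch=4
print(model(dummy).shape)                # torch.Size([4, 1]) -- one logit per example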

Instantiate the model, replace the randomly initialized embedding with the pretrained embedding, and define the optimizer and the loss function.

rnn = BiLSTM_Attention(len(TEXT.vocab), EMBEDDING_DIM, hidden_dim=64, n_layers=2)   # 81.49%; a plain LSTM alone gets 78.08%

pretrained_embedding = TEXT.vocab.vectors
print('pretrained_embedding:', pretrained_embedding.shape)      # torch.Size([3287, 100])
rnn.embedding.weight.data.copy_(pretrained_embedding)
print('embedding layer inited.')

optimizer = optim.Adam(rnn.parameters(), lr=LEARNING_RATE)
criteon = nn.BCEWithLogitsLoss()
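
An optional check (a small sketch added for illustration) that the GloVe vectors were actually copied into the embedding layer:

# The embedding row for a known word should now equal its pretrained GloVe vector.
idx = TEXT.vocab.stoi['like']
print(torch.equal(rnn.embedding.weight.data[idx], TEXT.vocab.vectors[idx]))   # True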

3. Training and Evaluating the Model

The usual routine: compute accuracy, define the training and evaluation functions, print the model's performance, and use the saved model parameters to predict on the test data.

# compute accuracy
def binary_acc(preds, y):
    preds = torch.round(torch.sigmoid(preds))
    correct = torch.eq(preds, y).float()
    acc = correct.sum() / len(correct)
    return acc


# training function
def train(rnn, iterator, optimizer, criteon):

    avg_loss = []
    avg_acc = []
    rnn.train()        # switch to training mode

    for i, batch in enumerate(iterator):

        pred = rnn(batch.text).squeeze()             # [batch, 1] -> [batch]

        loss = criteon(pred, batch.label)
        acc = binary_acc(pred, batch.label).item()   # accuracy of this batch

        avg_loss.append(loss.item())
        avg_acc.append(acc)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    avg_acc = np.array(avg_acc).mean()
    avg_loss = np.array(avg_loss).mean()
    return avg_loss, avg_acc


# evaluation function
def evaluate(rnn, iterator, criteon):

    avg_loss = []
    avg_acc = []
    rnn.eval()         # switch to evaluation mode

    with torch.no_grad():
        for batch in iterator:

            pred = rnn(batch.text).squeeze()        # [batch, 1] -> [batch]

            loss = criteon(pred, batch.label)
            acc = binary_acc(pred, batch.label).item()

            avg_loss.append(loss.item())
            avg_acc.append(acc)

    avg_loss = np.array(avg_loss).mean()
    avg_acc = np.array(avg_acc).mean()
    return avg_loss, avg_acc


# train the model and print its performance
best_valid_acc = float('-inf')

for epoch in range(30):

    start_time = time.time()

    train_loss, train_acc = train(rnn, train_iterator, optimizer, criteon)
    dev_loss, dev_acc = evaluate(rnn, dev_iterator, criteon)

    end_time = time.time()

    epoch_mins, epoch_secs = divmod(end_time - start_time, 60)

    if dev_acc > best_valid_acc:          # save whenever the model improves on the dev set
        best_valid_acc = dev_acc
        torch.save(rnn.state_dict(), 'wordavg-model.pt')

    print(f'Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs:.2f}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
    print(f'\t Val. Loss: {dev_loss:.3f} |  Val. Acc: {dev_acc*100:.2f}%')


# predict the test data with the saved model parameters
rnn.load_state_dict(torch.load("wordavg-model.pt"))
test_loss, test_acc = evaluate(rnn, test_iterator, criteon)
print(f'Test. Loss: {test_loss:.3f} |  Test. Acc: {test_acc*100:.2f}%')

With hidden_dim=64 and n_layers=2:

When the model contains only the LSTM (no attention), accuracy: 78.08%

With the attention of 2.1, accuracy: 82.46%

With the attention of 2.2, accuracy: 81.49%

Adding the attention mechanism gives a modest improvement.
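
Finally, a minimal inference sketch for classifying a single new sentence with the trained model and the vocabulary built above; the helper predict_sentiment is introduced here only for illustration, and unknown words fall back to <unk>:

# Classify one raw sentence with the trained model (sigmoid output close to 1 -> positive).
# predict_sentiment is an illustrative helper, not part of the original training script.
def predict_sentiment(model, sentence):
    model.eval()
    tokens = sentence.lower().split()                    # same tokenization as the TEXT field
    indexed = [TEXT.vocab.stoi[t] for t in tokens]       # out-of-vocabulary words map to <unk>
    tensor = torch.LongTensor(indexed).unsqueeze(1)      # [seq_len, batch=1]
    with torch.no_grad():
        prob = torch.sigmoid(model(tensor)).item()
    return prob

print(predict_sentiment(rnn, "sit back in one of those comfortable chairs."))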