基於Huggingface使用BERT進行文字分類的fine-tuning
阿新 • 發佈:2020-10-07
隨著BERT大火之後,很多BERT的變種,這裡借用Huggingface工具來簡單實現一個文字分類,從而進一步通過Huggingface來認識BERT的工程上的實現方法。
1、load data
# 1. Load the TSV training data into a DataFrame with explicit column names.
train_df = pd.read_csv('../data/train.tsv', sep='\t', names=['text', 'label'])
print(train_df.shape)
train_df.head()

# Raw sentences and their integer labels for the tokenizer / loss below.
sentences = train_df['text'].tolist()
targets = train_df['label'].values
2、token encoding
#如果token要封裝到自定義model類中的話,則需要指定max_len tokenizer=BertTokenizer.from_pretrained('bert-base-uncased') max_length=32 sentences_tokened=tokenizer(sentences,padding=True,truncation=True,max_length=max_length,return_tensors='pt') targets=torch.tensor(targets)
3、encoding data
# from torchvision import transforms,datasets from torch.utils.data import Dataset,DataLoader,random_split class DataToDataset(Dataset): def __init__(self,encoding,labels): self.encoding=encoding self.labels=labels def __len__(self): return len(self.labels) def __getitem__(self,index): return self.encoding['input_ids'][index],self.encoding['attention_mask'][index],self.labels[index] #封裝資料 datasets=DataToDataset(sentences_tokened,targets) train_size=int(len(datasets)*0.8) test_size=len(datasets)-train_size print([train_size,test_size]) train_dataset,val_dataset=random_split(dataset=datasets,lengths=[train_size,test_size]) BATCH_SIZE=64 #這裡的num_workers要大於0 train_loader=DataLoader(dataset=train_dataset,batch_size=BATCH_SIZE,shuffle=True,num_workers=5) val_loader=DataLoader(dataset=val_dataset,batch_size=BATCH_SIZE,shuffle=True,num_workers=5)#
4、create model
class BertTextClassficationModel(nn.Module): def __init__(self): super(BertTextClassficationModel,self).__init__() self.bert=BertModel.from_pretrained('bert-base-uncased') self.dense=nn.Linear(768,2) #768 input, 2 output def forward(self,ids,mask): out,_=self.bert(input_ids=ids,attention_mask=mask) out=self.dense(out[:,0,:]) return out mymodel=BertTextClassficationModel() #獲取gpu和cpu的裝置資訊 device=torch.device("cuda" if torch.cuda.is_available() else "cpu") print("device=",device) if torch.cuda.device_count()>1: print("Let's use ",torch.cuda.device_count(),"GPUs!") mymodel=nn.DataParallel(mymodel) mymodel.to(device)
5、train model
loss_func=nn.CrossEntropyLoss() optimizer=optim.Adam(mymodel.parameters(),lr=0.0001) from sklearn.metrics import accuracy_score def flat_accuracy(preds,labels): pred_flat=np.argmax(preds,axis=1).flatten() labels_flat=labels.flatten() return accuracy_score(labels_flat,pred_flat) epochs=3 for epoch in range(epochs): train_loss = 0.0 train_acc=0.0 for i,data in enumerate(train_loader): input_ids,attention_mask,labels=[elem.to(device) for elem in data] #優化器置零 optimizer.zero_grad() #得到模型的結果 out=mymodel(input_ids,attention_mask) #計算誤差 loss=loss_func(out,labels) train_loss += loss.item() #誤差反向傳播 loss.backward() #更新模型引數 optimizer.step() #計算acc out=out.detach().numpy() labels=labels.detach().numpy() train_acc+=flat_accuracy(out,labels) print("train %d/%d epochs Loss:%f, Acc:%f" %(epoch,epochs,train_loss/(i+1),train_acc/(i+1)))
6、evaluate
print("evaluate...") val_loss=0 val_acc=0 mymodel.eval() for j,batch in enumerate(val_loader): val_input_ids,val_attention_mask,val_labels=[elem.to(device) for elem in batch] with torch.no_grad(): pred=mymodel(val_input_ids,val_attention_mask) val_loss+=loss_func(pred,val_labels) pred=pred.detach().cpu().numpy() val_labels=val_labels.detach().cpu().numpy() val_acc+=flat_accuracy(pred,val_labels) print("evaluate loss:%d, Acc:%d" %(val_loss/len(val_loader),val_acc/len(val_loader)))