sklearn 劃分資料集
阿新 • • 發佈:2018-11-27
# -*- coding: UTF-8 -*-
"""Split a line-oriented text dataset into stratified train/test files."""
import os


def _with_newline(line):
    """Return *line* unchanged if it ends with a newline, else with one appended."""
    return line if line.endswith('\n') else line + '\n'


def split(dataset, labelset, test_size, train_savefile, test_savefile):
    """Split *dataset* into train and test parts and write each part to a file.

    The split is stratified on *labelset* and uses a fixed ``random_state``
    of 42, so repeated runs on the same input give the same partition.

    Args:
        dataset: Sequence of text lines (one sample per item).
        labelset: Sequence of labels, parallel to *dataset*.
        test_size: Passed straight through to ``train_test_split``.
        train_savefile: Path the training samples are written to.
        test_savefile: Path the test samples are written to.

    Returns:
        The tuple ``(x_train, x_test)``. The label halves of the split are
        computed but neither saved nor returned.
    """
    # Imported lazily so reader_data/savetxt stay importable (and testable)
    # on machines that do not have scikit-learn installed.
    from sklearn.model_selection import train_test_split

    # split into training set and test set
    x_train, x_test, _y_train, _y_test = train_test_split(
        dataset,
        labelset,
        test_size=test_size,
        random_state=42,
        stratify=labelset,
    )
    savetxt(train_savefile, x_train)
    savetxt(test_savefile, x_test)
    return x_train, x_test


def savetxt(path, np_array):
    """Write the strings in *np_array* to *path*, one per line, as UTF-8.

    Missing parent directories are created. Every item is guaranteed to end
    with a newline: ``writelines`` adds no separators itself, so an item
    lacking one would otherwise be glued to the item that follows it.

    Args:
        path: Destination file path; an existing file is overwritten.
        np_array: Iterable of ``str`` items (despite the name, the items
            must be strings because they are written verbatim).
    """
    parent = os.path.dirname(path)
    if parent:
        os.makedirs(parent, exist_ok=True)
    with open(file=path, mode='w', encoding='utf-8') as fw:
        fw.writelines(_with_newline(line) for line in np_array)


def reader_data(datafile):
    """Read *datafile* (UTF-8) and return its lines as a list of strings.

    Each returned line ends with ``'\\n'``. A final line that has no
    terminator in the file gets one appended, so that the last sample or
    label compares equal to identical earlier ones and can be reordered
    safely before being written back out.
    """
    with open(datafile, mode='r', encoding='utf-8') as f:
        return [_with_newline(line) for line in f]


def main():
    """Run the split on the hard-coded 0.9 data set paths."""
    datafile = 'data/output/tra-set0603_0.9'
    dataset = reader_data(datafile)
    label_file = 'data/output/tra-set0603_0.9_label'
    labelset = reader_data(label_file)
    test_size = 0.2
    train_savefile = 'data/output/raw_0.9/raw_train.txt'
    test_savefile = 'data/output/raw_0.9/raw_test.txt'
    split(dataset, labelset, test_size, train_savefile, test_savefile)


if __name__ == '__main__':
    main()