kaggle練手題目Digit Recognizer
阿新 • • 發佈:2018-12-15
- 安裝kaggle工具獲取資料來源(linux 環境)
- 採用sklearn的KNeighborsClassifier訓練資料
- 通過K折交叉驗證來選取K值使正確率更高
1.安裝kaggle,獲取資料來源
pip install kaggle
將資料下載到目錄/data/data-test/digit_recognize/下
cd /data/data-test/digit_recognize/
kaggle competitions download -c digit-recognizer
2.安裝anaconda3作為python3環境,自帶sklearn,pandas,numpy等常用工具包
3.程式碼實現
import pandas as pd
from sklearn.model_selection import cross_val_score
from sklearn.neighbors import KNeighborsClassifier
import pickle

# File paths
project_path = '/data/data-test/digit_recognize/'
clf_file = project_path + 'knn.pickle'


def get_data_chunk(file_name):
    """Read a potentially huge CSV in 100k-row chunks and return one DataFrame."""
    reader = pd.read_csv(file_name, iterator=True)
    chunk_size = 100000
    chunks = []
    while True:
        try:
            chunks.append(reader.get_chunk(chunk_size))
            print(len(chunks))
        except StopIteration:
            # Reader exhausted: every row has been consumed.
            print("Iteration is stopped.")
            break
    return pd.concat(chunks, ignore_index=True)


def save_clf(clf_s):
    """Persist the trained classifier to clf_file (with-block closes the file)."""
    with open(clf_file, 'wb') as clf_f:
        pickle.dump(clf_s, clf_f)


def get_clf():
    """Load the pickled classifier from clf_file.

    BUGFIX: the original opened the file and never closed it; the context
    manager guarantees the handle is released.
    NOTE(review): pickle.load on an untrusted file executes arbitrary code;
    acceptable here only because we read our own model file.
    """
    with open(clf_file, 'rb') as clf_f:
        return pickle.load(clf_f)


def predict():
    """Predict labels for test.csv and write a Kaggle submission file res.csv."""
    knn_clf = get_clf()
    test_data = get_data_chunk(project_path + "test.csv")
    res_data = knn_clf.predict(test_data)
    df = pd.DataFrame()
    # BUGFIX: test.csv contains only pixel columns, so the original
    # `test_data["imageId"]` raised KeyError. The Kaggle submission format
    # expects ImageId to be the 1-based row number.
    df["ImageId"] = range(1, len(test_data) + 1)
    df["Label"] = res_data
    df.to_csv(project_path + 'res.csv', index=False)


def train():
    """Select k for KNN by 2-fold cross-validation, fit on all data, save model."""
    train_data = get_data_chunk(project_path + "train.csv")
    print(train_data.info())
    print(train_data)
    train_label = train_data['label']
    x = train_data.drop(columns=['label'])
    best_score = 0
    max_k = 5
    # Score k = 5..14 (range upper bound is exclusive) with cross-validation.
    for k in range(5, 15):
        clf = KNeighborsClassifier(n_neighbors=k)
        # cv=2: 2-fold cross-validation, scored by accuracy.
        scores = cross_val_score(clf, x, train_label, cv=2, scoring='accuracy')
        mean = scores.mean()
        print(k, mean)
        # BUGFIX: the original never updated its running maximum (named `max`,
        # shadowing the builtin), so `max_k` ended up as the last k whose score
        # beat the initial 0 — i.e. simply the last k tried. Track the best
        # score so max_k is the true argmax.
        if mean > best_score:
            best_score = mean
            max_k = k
    print("maxK=", max_k)
    # Retrain on the full training set using the best k, then persist it.
    clf = KNeighborsClassifier(n_neighbors=max_k)
    clf.fit(x, train_label)
    save_clf(clf)


if __name__ == '__main__':
    train()
    predict()