1. 程式人生 > kaggle練手題目Digit Recognizer

kaggle練手題目Digit Recognizer

  • 安裝kaggle工具獲取資料來源(linux 環境)
  • 採用sklearn的KNeighborsClassifier訓練資料
  • 通過K折交叉驗證來選取K值,使正確率更高

1.安裝kaggle,獲取資料來源

pip install kaggle

將資料下載到目錄/data/data-test/digit_recognize/下

cd /data/data-test/digit_recognize/
kaggle competitions download -c digit-recognizer

2.安裝anaconda3作為python3環境,自帶sklearn,pandas,numpy等常用工具包

3.程式碼實現

import pandas as pd
from sklearn.model_selection import cross_val_score
from sklearn.neighbors import KNeighborsClassifier
import pickle


# File paths: working directory for the competition data and the
# pickle file where the trained KNN model is persisted.
project_path = '/data/data-test/digit_recognize/'
clf_file = project_path + 'knn.pickle'


def get_data_chunk(file_name):
    """Read a (potentially huge) CSV in chunks and return one DataFrame.

    The original code drove the reader by hand with ``get_chunk`` and a
    ``StopIteration`` guard; pandas' ``chunksize`` parameter yields the
    same chunks via plain iteration, and using the reader as a context
    manager guarantees the underlying file handle is closed.

    :param file_name: path of the CSV file to load
    :return: a single DataFrame with all rows, index reset
    """
    chunk_size = 100000  # rows per chunk; bounds peak memory while reading
    chunks = []
    with pd.read_csv(file_name, chunksize=chunk_size) as reader:
        for chunk in reader:
            chunks.append(chunk)
            print(len(chunks))  # progress indicator: number of chunks read so far
    print("Iteration is stopped.")
    return pd.concat(chunks, ignore_index=True)


def save_clf(clf_s):
    """Serialize the trained classifier to ``clf_file`` via pickle.

    Uses a ``with`` block so the file handle is closed even if
    ``pickle.dump`` raises (the original closed it only on success).

    :param clf_s: any picklable object (here, the fitted classifier)
    """
    with open(clf_file, 'wb') as clf_f:
        pickle.dump(clf_s, clf_f)


def get_clf():
    """Load the pickled classifier from ``clf_file``.

    Bug fix: the original opened the file and never closed it, leaking
    the handle; ``with`` closes it deterministically.

    NOTE: ``pickle.load`` must only be used on trusted files — here the
    file is one this program wrote itself.

    :return: the unpickled object (the fitted classifier)
    """
    with open(clf_file, 'rb') as clf_f:
        return pickle.load(clf_f)

# 對測試資料集預測結果
def predict():
    knn_clf = get_clf()
    test_data = get_data_chunk(project_path + "test.csv")
    res_data = knn_clf.predict(test_data)
    df = pd.DataFrame()
    df["imageId"] = test_data["imageId"]
    df["Label"] = res_data
    df.to_csv(project_path + 'res.csv', index=False)


def train():
    """Select k for KNN via cross-validation, fit, and persist the model.

    Reads train.csv, scores k = 5..14 with 2-fold cross-validation,
    refits on the full data with the best k, and pickles the model.

    Bug fix: the original compared against a running maximum (``max``)
    that was never updated, so every k with a positive score overwrote
    the choice and the loop always ended with k = 14 regardless of
    accuracy. The maximum is now tracked properly (and the name no
    longer shadows the ``max`` builtin).
    """
    train_data = get_data_chunk(project_path + "train.csv")
    print(train_data.info())
    print(train_data)
    train_label = train_data['label']
    x = train_data.drop(columns=['label'])

    best_score = 0.0
    best_k = 5

    # Score each k in 5..14 with K-fold cross-validation.
    for k in range(5, 15):
        clf = KNeighborsClassifier(n_neighbors=k)
        # cv=2: 2-fold cross-validation (cheap; the dataset is large)
        scores = cross_val_score(clf, x, train_label, cv=2, scoring='accuracy')
        mean = scores.mean()
        print(k, mean)
        if mean > best_score:
            best_score = mean  # was missing in the original — see docstring
            best_k = k
    print("maxK=", best_k)
    # Retrain on the full training set with the chosen k.
    clf = KNeighborsClassifier(n_neighbors=best_k)
    clf.fit(x, train_label)
    # Persist the fitted model to the pickle file.
    save_clf(clf)
    
if __name__ == '__main__':
    # Train (and persist) the model, then write predictions for test.csv.
    train()
    predict()