Kaggle 泰坦尼克預測案例
阿新 • 發佈:2018-12-03
Kaggle 泰坦尼克預測,準確率 79%。
#!/usr/bin/python
# -*- coding: utf-8 -*-
# Kaggle Titanic survival prediction (~79% leaderboard accuracy).
# This environment is defined by the kaggle/python docker image:
# https://github.com/kaggle/docker-python

import os
import random
import warnings

import numpy as np   # linear algebra
import pandas as pd  # data processing, CSV file I/O (e.g. pd.read_csv)
# BUGFIX: the original re-imported pandas as `np` ("import pandas as np"),
# silently shadowing numpy from that point on.

import matplotlib.pyplot as plt
import seaborn as sns

warnings.filterwarnings('ignore')

# Input data files are available in the "../input/" directory.
print(os.listdir("../input"))

# Read train and test data and concatenate them: every engineered feature
# must be built identically on both sets, because the model can only
# predict with the same features it was trained on.
train_df = pd.read_csv('../input/train.csv')
test_df = pd.read_csv('../input/test.csv')
# pd.concat replaces the deprecated DataFrame.append with identical output.
titanic_df = pd.concat([train_df, test_df], ignore_index=True, sort=False)

# PassengerId carries no predictive information, so drop it.
titanic_df.drop('PassengerId', axis=1, inplace=True)


def find_title(name):
    """Return the title token (e.g. 'Mr', 'Miss') extracted from a full name.

    The title is taken as the whitespace-separated token containing a '.'
    with its final character stripped; if no such token exists the full
    name is returned unchanged.
    """
    for token in name.split():
        if '.' in token:
            name = token[:-1]
    return name


# Create a Title column from the raw names.
titanic_df['Title'] = titanic_df['Name'].apply(find_title)
print(titanic_df.head())


def title_class(title):
    """Map a raw title string onto a small ordinal category (0-6).

    NOTE(review): class 4 is skipped in the original numbering; it is
    preserved here so downstream results stay comparable.
    """
    if title in ['L', 'Lady', 'Sir', 'Countess', 'Mme', 'Mlle', 'Ms']:
        return 0
    elif title in ['Don', 'Rev', 'Capt', 'Jonkheer']:
        return 1
    elif title in ['Mrs', 'Miss']:
        return 2
    elif title in ['Master']:
        return 3
    elif title in ['Mr']:
        return 5
    else:
        return 6


titanic_df['Title'] = titanic_df['Title'].apply(title_class)

# The raw name is no longer needed once the title is extracted.
titanic_df.drop('Name', axis=1, inplace=True)


def sex_column(sex):
    """Encode sex: 0 for 'male', 1 for anything else."""
    return 0 if sex == 'male' else 1


titanic_df['Sex'] = titanic_df['Sex'].apply(sex_column)


def fare_class(fare):
    """Bucket a fare into 200-unit bands (NaN fares stay NaN)."""
    return fare // 200


titanic_df['FareClass'] = titanic_df['Fare'].apply(fare_class)


def fill_age(columns):
    """Fill a missing age with a random draw informed by Pclass/FareClass.

    For a null age, draw uniformly between the averaged (mean - std) and
    (mean + std) of the passenger's Pclass and FareClass age
    distributions; a non-null age is returned untouched.
    """
    age, pclass, fareclass = columns[0], columns[1], columns[2]
    if not pd.isnull(age):
        return age
    pclass_mean = int(round(titanic_df[titanic_df['Pclass'] == pclass]['Age'].mean()))
    fareclass_mean = int(round(titanic_df[titanic_df['FareClass'] == fareclass]['Age'].mean()))
    pclass_std = int(round(titanic_df[titanic_df['Pclass'] == pclass]['Age'].std()))
    fareclass_std = int(round(titanic_df[titanic_df['FareClass'] == fareclass]['Age'].std()))
    age_max = int(round(((pclass_mean + fareclass_mean) + (pclass_std + fareclass_std)) / 2))
    age_min = int(round(((pclass_mean + fareclass_mean) - (pclass_std + fareclass_std)) / 2))
    return random.randint(age_min, age_max)


titanic_df['Age'] = titanic_df[['Age', 'Pclass', 'FareClass']].apply(fill_age, axis=1)
titanic_df['Age'] = titanic_df['Age'].apply(int)


def age_class(age):
    """Bucket an integer age into 20-year bands: 0..3."""
    if 0 <= age <= 20:
        return 0
    elif 20 < age <= 40:
        return 1
    elif 40 < age <= 60:
        return 2
    else:
        return 3


titanic_df['AgeClass'] = titanic_df['Age'].apply(age_class)

# The next step adds 1 so the passenger themself counts toward family size.
# FamilySize = siblings/spouses + parents/children + the passenger themself.
titanic_df['FamilySize'] = titanic_df['SibSp'] + titanic_df['Parch'] + 1


def alone(familysize):
    """Return 1 if the passenger travels alone, else 0."""
    return 1 if familysize == 1 else 0


titanic_df['Alone'] = titanic_df['FamilySize'].apply(alone)


def family_class(familysize):
    """Bucket family size: small (<=3) -> 0, medium (4-7) -> 1, large -> 2."""
    if familysize <= 3:
        return 0
    elif familysize <= 7:
        return 1
    return 2


titanic_df['FamilyClass'] = titanic_df['FamilySize'].apply(family_class)

# NOTE: the original also built `tickets_int` / `tickets_str` lists from
# titanic_df['Ticket'].unique() here; they were never read afterwards, so
# that dead code has been removed.


def ticket_class(ticket):
    """Return 0 for a purely numeric ticket string, 1 otherwise."""
    try:
        int(ticket)
        return 0
    except (TypeError, ValueError):  # narrowed from a bare `except:`
        return 1


titanic_df['TicketClass'] = titanic_df['Ticket'].apply(ticket_class)
titanic_df.drop('Ticket', axis=1, inplace=True)

# Impute missing fares with a random draw from mean +/- std of 3rd-class
# fares (presumably the missing-fare row is 3rd class — TODO confirm).
pclass_mean = int(round(titanic_df[titanic_df['Pclass'] == 3]['Fare'].mean()))
pclass_std = int(round(titanic_df[titanic_df['Pclass'] == 3]['Fare'].std()))
fare_min = pclass_mean - pclass_std
fare_max = pclass_mean + pclass_std
random_fare = random.randint(fare_min, fare_max)
titanic_df.loc[titanic_df['Fare'].isnull(), 'Fare'] = random_fare
titanic_df.loc[titanic_df['FareClass'].isnull(), 'FareClass'] = random_fare // 200
titanic_df['FareClass'] = titanic_df['FareClass'].apply(int)

# Cabin is dropped entirely.
titanic_df.drop('Cabin', axis=1, inplace=True)

# Missing embarkation ports default to 'S'.
titanic_df.loc[titanic_df['Embarked'].isnull(), 'Embarked'] = 'S'


def embarked(embarked):
    """Encode port of embarkation: S -> 0, C -> 1, Q -> 2."""
    embarked_dict = {'S': 0, 'C': 1, 'Q': 2}
    return embarked_dict[embarked]


titanic_df['Embarked'] = titanic_df['Embarked'].apply(embarked)

# Split back into the original train (first 891 rows) and test partitions.
train_featured = titanic_df.iloc[:891]
test_featued = titanic_df.iloc[891:]
# NOTE(review): these "_copy" names are aliases, not copies — mutating one
# mutates the underlying frame.
train_featured_copy = train_featured
test_featued_copy = test_featued

# Next: split the train data into train/test folds to measure accuracy.
# sklearn's train_test_split shuffles and divides the dataset.
from sklearn.model_selection import train_test_split

# Hold out 30% of the featured training data to estimate accuracy.
train_df = train_featured_copy
X = train_df.drop(['Survived'], axis=1)
y = train_df['Survived']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Several sklearn classifiers are compared. Only a few are used because on
# a well-engineered feature set most algorithms give similar results —
# the feature engineering matters more than the model choice.
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)

# Baseline: logistic regression.
logmodel = LogisticRegression()
logmodel.fit(X_train, y_train)
log_predictions = logmodel.predict(X_test)
print(classification_report(y_test, log_predictions))

# Support vector machine with default hyperparameters.
svm_model = SVC()
svm_model.fit(X_train, y_train)
svm_predictions = svm_model.predict(X_test)
print(classification_report(y_test, svm_predictions))

# Random forest with default hyperparameters.
rdm = RandomForestClassifier()
rdm.fit(X_train, y_train)
rdm_predictions = rdm.predict(X_test)
print(classification_report(y_test, rdm_predictions))

# Grid-searched SVM over C and gamma.
param_grid = {'C' : [1, 10, 100, 1000, 10000], 'gamma' : [1, 0.1, 0.01, 0.001, 0.0001]}
grid = GridSearchCV(SVC(), param_grid, refit=True)
grid.fit(X_train, y_train)
grid_predictions = grid.predict(X_test)
print(classification_report(y_test, grid_predictions))

# Gradient boosting — the model ultimately used for the submission below.
gbc = GradientBoostingClassifier()
gbc.fit(X_train, y_train)
gbc_pred = gbc.predict(X_test)
print(classification_report(y_test, gbc_pred))

import itertools
import time

# Brute-force feature-subset search: for each k, try every combination of k
# columns as "extra features" to DROP, retrain GradientBoosting on the rest,
# and track the best holdout accuracy and the index of the winning combo.
start = time.time()
y = train_df['Survived']
columns = list(train_df.columns)
columns.remove('Survived')
print(len(columns) - 8)
for k in range(0, len(columns) - 8):
    features = list(itertools.combinations(columns, k))
    score_max = 0
    score_index = list()
    for i in range(len(features)):
        # Drop this combination of columns (plus the label) from X.
        features_extra = list(features[i])
        features_extra.append('Survived')
        X = train_df.drop(features_extra, axis=1)
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
        # logmodel = LogisticRegression()
        # logmodel.fit(X_train, y_train)
        # log_predictions = logmodel.predict(X_test)
        # svm_model = SVC()
        # svm_model.fit(X_train, y_train)
        # svm_predictions = svm_model.predict(X_test)
        # rdm = RandomForestClassifier()
        # rdm.fit(X_train, y_train)
        # rdm_predictions = rdm.predict(X_test)
        gbc = GradientBoostingClassifier()
        gbc.fit(X_train, y_train)
        gbc_predictions = gbc.predict(X_test)
        # NOTE(review): scores is keyed BY the accuracy value (score -> model
        # name); with several models enabled, equal scores would overwrite
        # each other. Only one model is active here, so it is harmless.
        scores = dict()
        # scores[accuracy_score(y_test, log_predictions)] = "Logistic Regression: "
        # scores[accuracy_score(y_test, svm_predictions)] = "SVM: "
        # scores[accuracy_score(y_test, rdm_predictions)] = "Random Forest Classifier: "
        scores[accuracy_score(y_test, gbc_predictions)] = "GradientBoosting Classifier: "
        sorted_scores = sorted(scores, reverse=True)
        # Remember the best score and the combination index that produced it.
        if score_max < max(sorted_scores):
            score_index = list()
            score_index.append(max(score_max, max(sorted_scores)))
            score_index.append(i)
            score_max = max(score_max, max(sorted_scores))
        # print("------------------------------------Test", i, '---------------------------------')
        # print()
        # for j in sorted_scores:
        #     print(scores[j], j)
        # print()
    print("------------------------------------ Extra Feature", k, '---------------------------------')
    print("Extra Feature Count: ", k, "\nMax Score:", score_index[0], "\nFeatue Index: ", score_index[1])
    print()
end = time.time()
time_comb = end - start
print("Time for Combinations of Features: ", time_comb)

# Retrain with the winning combination found above.
# NOTE(review): index 1369 is hard-coded from a previous run of the search
# (k=5); it is only valid while the column order is unchanged — verify
# against the printed "Featue Index" before reuse.
features = list(itertools.combinations(columns, 5))
extra_features = list(features[1369])
extra_features.append('Survived')
train_df = train_df  # NOTE(review): no-op self-assignment, kept as-is.
X = train_df.drop(extra_features, axis=1)
y = train_df['Survived']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Re-evaluate all classifiers on the reduced feature set.
logmodel = LogisticRegression()
logmodel.fit(X_train, y_train)
log_predictions = logmodel.predict(X_test)
print(classification_report(y_test, log_predictions))
svm_model = SVC()
svm_model.fit(X_train, y_train)
svm_predictions = svm_model.predict(X_test)
print(classification_report(y_test, svm_predictions))
rdm = RandomForestClassifier()
rdm.fit(X_train, y_train)
rdm_predictions = rdm.predict(X_test)
print(classification_report(y_test, rdm_predictions))
param_grid = {'C' : [1, 10, 100, 1000, 10000], 'gamma' : [1, 0.1, 0.01, 0.001, 0.0001]}
grid = GridSearchCV(SVC(), param_grid, refit=True)
grid.fit(X_train, y_train)
grid_predictions = grid.predict(X_test)
print(classification_report(y_test, grid_predictions))
gbc = GradientBoostingClassifier()
gbc.fit(X_train, y_train)
gbc_pred = gbc.predict(X_test)
print(classification_report(y_test, gbc_pred))

# Final model: fit GradientBoosting on ALL training rows, predict the
# competition test set, and write the submission file.
train_df = train_df  # NOTE(review): no-op self-assignment, kept as-is.
X = train_df.drop(extra_features, axis=1)
y = train_df['Survived']
gbc.fit(X, y)
X_test = test_featued.drop(extra_features, axis=1)
gbc_predictions = gbc.predict(X_test)
gbc_predictions = pd.DataFrame(gbc_predictions, columns=['Survived'])
gbc_predictions['Survived'] = gbc_predictions['Survived'].apply(int)
# Index by the original PassengerId so the CSV matches Kaggle's format.
gbc_predictions.set_index(test_df['PassengerId'], inplace=True)
gbc_predictions.to_csv('submission.csv')