Python資料分析與機器學習-Python庫分析科比生涯資料
阿新 • • 發佈:2019-01-03
原始碼下載:http://download.csdn.net/download/adam_zs/10222492
import time

import matplotlib.cm as cm
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor  # not used below; kept from the original script
from sklearn.metrics import confusion_matrix, log_loss  # confusion_matrix is not used below
from sklearn.model_selection import KFold

# Kobe Bryant career shot data.
#
# The script loads data.csv, keeps a copy of the labelled shots for
# exploration, one-hot encodes the categorical columns, and then picks
# n_estimators and max_depth for a RandomForestClassifier by 10-fold
# cross-validated log loss before fitting the final model.

# Print wide frames without truncation. The former 'display.height' option
# no longer exists in pandas (setting it raises OptionError), so it is not set.
pd.set_option('display.max_rows', 9999)
pd.set_option('display.max_columns', 9999)
pd.set_option('display.width', 9999)

N_FOLDS = 10

raw = pd.read_csv("data.csv")
# print(raw.shape)   # (30697, 25)
# print(raw.head())
# Columns: action_type, combined_shot_type, game_event_id, game_id, lat,
# loc_x, loc_y, lon, minutes_remaining, period, playoffs, season,
# seconds_remaining, shot_distance, shot_made_flag, shot_type,
# shot_zone_area, shot_zone_basic, shot_zone_range, team_id, team_name,
# game_date, matchup, opponent, shot_id

# shot_made_flag says whether the shot went in; rows where it is missing
# are the ones to predict, so exploration uses only the labelled rows.
kobe = raw[pd.notnull(raw["shot_made_flag"])]
# print(kobe.shape)  # (25697, 25)

# Reference: court coordinates vs. longitude/latitude (uncomment to plot).
# alpha = 0.02  # transparency
# plt.figure(figsize=(10, 10))
# plt.subplot(121)
# plt.scatter(kobe["loc_x"], kobe["loc_y"], color="r", alpha=alpha)
# plt.title("loc_x and loc_y")
# plt.subplot(122)
# plt.scatter(kobe["lon"], kobe["lat"], color="b", alpha=alpha)
# plt.title("lon and lat")
# plt.show()

# Reference: polar coordinates (distance from the origin plus angle to the
# x axis) and total remaining time in seconds.
# raw['dist'] = np.sqrt(raw['loc_x'] ** 2 + raw['loc_y'] ** 2)
# loc_x_zero = raw['loc_x'] == 0
# raw['angle'] = np.array([0] * len(raw))
# raw['angle'][~loc_x_zero] = np.arctan(raw['loc_y'][~loc_x_zero] / raw['loc_x'][~loc_x_zero])
# raw['angle'][loc_x_zero] = np.pi / 2
# raw['remaining_time'] = raw['minutes_remaining'] * 60 + raw['seconds_remaining']

# print(kobe["action_type"].unique())
# print(kobe["combined_shot_type"].unique())
# print(kobe["shot_type"].unique())
# print(kobe["shot_type"].value_counts())

# Season strings look like "2000-01"; keep only the part after the dash as
# an integer.
# print(kobe['season'].unique())
raw['season'] = raw['season'].apply(lambda x: int(x.split("-")[1]))
# print(raw['season'].unique())

# print(kobe['team_id'].unique())
# print(kobe['team_name'].unique())

# print(kobe["shot_zone_area"].value_counts())
# print(kobe["shot_zone_area"].unique())
#   ['Left Side(L)' 'Left Side Center(LC)' 'Right Side Center(RC)'
#    'Center(C)' 'Right Side(R)' 'Back Court(BC)']
# print(kobe["shot_zone_basic"].unique())
#   ['Mid-Range' 'Restricted Area' 'In The Paint (Non-RA)'
#    'Above the Break 3' 'Right Corner 3' 'Backcourt' 'Left Corner 3']
# print(kobe["shot_zone_range"].unique())
#   ['8-16 ft.' '16-24 ft.' 'Less Than 8 ft.' '24+ ft.' 'Back Court Shot']


def scatter_plot_by_category(feat):
    """Scatter the labelled shots on the current axes, one colour per group.

    Args:
        feat: Name of the column of the module-level ``kobe`` frame to
            group by. Each group is drawn at (loc_x, loc_y) in its own
            colour taken from the rainbow colormap.
    """
    alpha = 0.1
    groups = kobe.groupby(feat)
    colours = cm.rainbow(np.linspace(0, 1, len(groups)))
    for (_, group), colour in zip(groups, colours):
        plt.scatter(group["loc_x"], group["loc_y"], color=colour, alpha=alpha)


# Reference: compare the three zone columns side by side (uncomment to plot).
# plt.figure(figsize=(20, 10))
# plt.subplot(131)
# scatter_plot_by_category('shot_zone_area')
# plt.title('shot_zone_area')
# plt.subplot(132)
# scatter_plot_by_category('shot_zone_basic')
# plt.title('shot_zone_basic')
# plt.subplot(133)
# scatter_plot_by_category('shot_zone_range')
# plt.title('shot_zone_range')
# plt.show()

# Columns left out of the model.
drops = ['shot_id', 'team_id', 'team_name', 'shot_zone_area',
         'shot_zone_range', 'shot_zone_basic', 'matchup', 'lon', 'lat',
         'seconds_remaining', 'minutes_remaining', 'shot_distance',
         'loc_x', 'loc_y', 'game_event_id', 'game_id', 'game_date']
raw = raw.drop(drops, axis=1)
# print(raw.head())
# Remaining columns: action_type, combined_shot_type, period, playoffs,
# season, shot_made_flag, shot_type, opponent

# One-hot encode every categorical column and drop the original column.
categorical_vars = ['action_type', 'combined_shot_type', 'shot_type',
                    'opponent', 'period', 'season']
for var in categorical_vars:
    raw = pd.concat([raw, pd.get_dummies(raw[var], prefix=var)], axis=1)
    raw = raw.drop(var, axis=1)
print(raw.head())

# Labelled rows form the training set; rows without a label are the test set.
train_kobe = raw[pd.notnull(raw['shot_made_flag'])]
train_label = train_kobe['shot_made_flag']
train_kobe = train_kobe.drop('shot_made_flag', axis=1)
test_kobe = raw[pd.isnull(raw['shot_made_flag'])]
test_kobe = test_kobe.drop('shot_made_flag', axis=1)


def _mean_cv_log_loss(model, features, labels, n_splits=N_FOLDS):
    """Return the mean log loss of ``model`` over shuffled K-fold splits.

    The model is refit on each training fold and scored on the held-out
    fold. Log loss is computed from predicted class probabilities; feeding
    it hard 0/1 predictions would only measure clipped extremes.

    Args:
        model: Classifier exposing ``fit``, ``predict_proba`` and ``classes_``.
        features: DataFrame of training features.
        labels: Series of training labels aligned with ``features``.
        n_splits: Number of folds.

    Returns:
        The log loss averaged over the folds.
    """
    folds = KFold(n_splits=n_splits, shuffle=True)
    total = 0.0
    for train_idx, test_idx in folds.split(features):
        model.fit(features.iloc[train_idx], labels.iloc[train_idx])
        proba = model.predict_proba(features.iloc[test_idx])
        total += log_loss(labels.iloc[test_idx], proba, labels=model.classes_)
    return total / n_splits


# Find the best n_estimators for RandomForestClassifier.
print('Finding best n_estimators for RandomForestClassifier...')
min_score = float("inf")
best_n = 0
scores_n = []
range_n = np.logspace(0, 2, num=3).astype(int)  # 1, 10, 100
for n in range_n:  # number of trees
    n = int(n)
    print("the number of trees : {0}".format(n))
    t1 = time.time()
    rfc_score = _mean_cv_log_loss(
        RandomForestClassifier(n_estimators=n), train_kobe, train_label)
    scores_n.append(rfc_score)
    if rfc_score < min_score:
        min_score = rfc_score
        best_n = n
    t2 = time.time()
    print('Done processing {0} trees ({1:.3f}sec)'.format(n, t2 - t1))
print(best_n, min_score)

# Find the best max_depth for RandomForestClassifier, using best_n trees.
print('Finding best max_depth for RandomForestClassifier...')
min_score = float("inf")
best_m = 0
scores_m = []
range_m = np.logspace(0, 2, num=3).astype(int)  # 1, 10, 100
for m in range_m:  # depth of each tree
    m = int(m)
    print("the max depth : {0}".format(m))
    t1 = time.time()
    rfc_score = _mean_cv_log_loss(
        RandomForestClassifier(max_depth=m, n_estimators=best_n),
        train_kobe, train_label)
    scores_m.append(rfc_score)
    if rfc_score < min_score:
        min_score = rfc_score
        best_m = m
    t2 = time.time()
    print('Done processing max depth {0} ({1:.3f}sec)'.format(m, t2 - t1))
print(best_m, min_score)

# Plot the cross-validated log loss against each tuned parameter.
plt.figure(figsize=(10, 5))
plt.subplot(121)
plt.plot(range_n, scores_n)
plt.ylabel('score')
plt.xlabel('number of trees')

plt.subplot(122)
plt.plot(range_m, scores_m)
plt.ylabel('score')
plt.xlabel('max depth')
plt.show()

# Fit the final model on all labelled rows with the selected parameters.
model = RandomForestClassifier(n_estimators=best_n, max_depth=best_m)
model.fit(train_kobe, train_label)