《Python機器學習及實踐》----模型實用技巧
本篇部落格是根據《Python機器學習及實踐》一書中的例項,所有程式碼均在本地編譯通過。資料為從該書指定的百度網盤上下載的,或者是sklearn自帶資料下載到本地使用的。
程式碼片段:
# Vectorize a small list of feature dicts with DictVectorizer and show the
# resulting dense matrix together with the generated feature names.
from sklearn.feature_extraction import DictVectorizer

measurements = [
    {'city': 'Dubai', 'temperature': 33},
    {'city': 'London', 'temperature': 12.},
    {'city': 'San Fransisco', 'temperature': 18.},
]
vec = DictVectorizer()
# A parenthesised single-argument print is valid on both Python 2 and 3.
print(vec.fit_transform(measurements).toarray())
# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# when the installed version provides it and fall back otherwise.
if hasattr(vec, 'get_feature_names_out'):
    print(vec.get_feature_names_out().tolist())
else:
    print(vec.get_feature_names())
# 20 newsgroups baseline: CountVectorizer features fed to MultinomialNB,
# with default vectorizer settings (no stop-word filtering).
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import classification_report
from sklearn.naive_bayes import MultinomialNB
try:
    # sklearn.cross_validation was removed in scikit-learn 0.20.
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split

news = fetch_20newsgroups(subset='all')
# Hold out 25% of the documents for testing; fixed seed for repeatability.
X_train, X_test, y_train, y_test = train_test_split(
    news.data, news.target, test_size=0.25, random_state=33)

count_vec = CountVectorizer()
# Fit the vocabulary on the training documents only, then reuse it for test.
X_count_train = count_vec.fit_transform(X_train)
X_count_test = count_vec.transform(X_test)

mnb_count = MultinomialNB()
mnb_count.fit(X_count_train, y_train)
# %-formatting keeps the output identical to the old `print a, b` form while
# remaining valid syntax on Python 3.
print('The accuracy of classifying 20newsgroups using Navie Bayes (CountVectorizer without filtering stopwords): %s'
      % mnb_count.score(X_count_test, y_test))
y_count_predict = mnb_count.predict(X_count_test)
print(classification_report(y_test, y_count_predict, target_names=news.target_names))
# Same experiment as the CountVectorizer baseline, but with TfidfVectorizer
# features (default settings, no stop-word filtering).
# Reads X_train / X_test / y_train / y_test, news, MultinomialNB and
# classification_report from the preceding section of the script.
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf_vec = TfidfVectorizer()
# Fit on training documents only; the test set is transformed with that fit.
X_tfidf_train = tfidf_vec.fit_transform(X_train)
X_tfidf_test = tfidf_vec.transform(X_test)

mnb_tfidf = MultinomialNB()
mnb_tfidf.fit(X_tfidf_train, y_train)
# Single-argument print with %-formatting works on Python 2 and 3 alike.
print('The accuracy of classifying 20newsgroups using Navie Bayes (TfidfVectorizer without filtering stopwords): %s'
      % mnb_tfidf.score(X_tfidf_test, y_test))
y_tfidf_predict = mnb_tfidf.predict(X_tfidf_test)
print(classification_report(y_test, y_tfidf_predict, target_names=news.target_names))
# Repeat both text experiments with English stop words removed
# (stop_words='english') and report accuracy plus per-class metrics.
# Reads X_train / X_test / y_train / y_test, news, CountVectorizer,
# TfidfVectorizer and MultinomialNB from earlier sections of the script.
from sklearn.metrics import classification_report

count_filter_vec = CountVectorizer(analyzer='word', stop_words='english')
tfidf_filter_vec = TfidfVectorizer(analyzer='word', stop_words='english')

# Each vectorizer is fitted on the training documents only.
X_count_filter_train = count_filter_vec.fit_transform(X_train)
X_count_filter_test = count_filter_vec.transform(X_test)
X_tfidf_filter_train = tfidf_filter_vec.fit_transform(X_train)
X_tfidf_filter_test = tfidf_filter_vec.transform(X_test)

mnb_count_filter = MultinomialNB()
mnb_count_filter.fit(X_count_filter_train, y_train)
# %-formatted single-argument prints run unchanged on Python 2 and 3.
print('The accuracy of classifying 20newsgroups using Navie Bayes (CountVectorizer by filtering stopwords): %s'
      % mnb_count_filter.score(X_count_filter_test, y_test))
y_count_filter_predict = mnb_count_filter.predict(X_count_filter_test)

mnb_tfidf_filter = MultinomialNB()
mnb_tfidf_filter.fit(X_tfidf_filter_train, y_train)
print('The accuracy of classifying 20newsgroups using Navie Bayes (TfidfVectorizer by filtering stopwords): %s'
      % mnb_tfidf_filter.score(X_tfidf_filter_test, y_test))
y_tfidf_filter_predict = mnb_tfidf_filter.predict(X_tfidf_filter_test)

print(classification_report(y_test, y_count_filter_predict, target_names=news.target_names))
print(classification_report(y_test, y_tfidf_filter_predict, target_names=news.target_names))
# Titanic survival: vectorize every remaining column with DictVectorizer and
# fit an entropy-criterion decision tree on all resulting features.
import pandas as pd
from sklearn.feature_extraction import DictVectorizer
from sklearn.tree import DecisionTreeClassifier
try:
    # sklearn.cross_validation was removed in scikit-learn 0.20.
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split

# Raw string: the old literal relied on '\S' and '\m' being left alone, which
# Python 3 flags as invalid escape sequences. The resulting path is the same.
titanic = pd.read_csv(r'D:\Source Code\machinelearn\titanic.txt')

y = titanic['survived']
X = titanic.drop(['row.names', 'name', 'survived'], axis=1)

# Assign the filled column back instead of calling fillna(inplace=True) on a
# column selection, which is not guaranteed to modify X in newer pandas.
X['age'] = X['age'].fillna(X['age'].mean())
# Every other missing value becomes the literal category 'UNKNOWN'.
X.fillna('UNKNOWN', inplace=True)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=33)

vec = DictVectorizer()
# 'records' is the canonical orient name; the 'record' abbreviation is
# rejected by pandas 2.x.
X_train = vec.fit_transform(X_train.to_dict(orient='records'))
X_test = vec.transform(X_test.to_dict(orient='records'))
print(len(vec.feature_names_))

dt = DecisionTreeClassifier(criterion='entropy')
dt.fit(X_train, y_train)
# The score was previously a bare expression, so a script run showed nothing.
print(dt.score(X_test, y_test))
# Keep the top 20% of features ranked by the chi2 score, refit the decision
# tree on that subset and evaluate it on the identically reduced test set.
# Reads dt, X_train, X_test, y_train and y_test from the preceding section.
from sklearn import feature_selection

fs = feature_selection.SelectPercentile(feature_selection.chi2, percentile=20)
X_train_fs = fs.fit_transform(X_train, y_train)
dt.fit(X_train_fs, y_train)
# The selector fitted on the training data is reused for the test data.
X_test_fs = fs.transform(X_test)
# Previously a bare expression whose value was discarded outside a REPL.
print(dt.score(X_test_fs, y_test))
# Sweep the SelectPercentile threshold over 1, 3, ..., 99 and record the mean
# 5-fold cross-validation score of the decision tree at each setting, then
# report and plot the best-scoring percentile.
# Reads dt, feature_selection, X_train and y_train from earlier sections.
import numpy as np
import pylab as pl
try:
    # sklearn.cross_validation was removed in scikit-learn 0.20.
    from sklearn.model_selection import cross_val_score
except ImportError:
    from sklearn.cross_validation import cross_val_score

# list(...) so the sequence can be indexed and plotted on Python 2 and 3.
percentiles = list(range(1, 100, 2))
mean_scores = []
for i in percentiles:
    fs = feature_selection.SelectPercentile(feature_selection.chi2, percentile=i)
    X_train_fs = fs.fit_transform(X_train, y_train)
    scores = cross_val_score(dt, X_train_fs, y_train, cv=5)
    mean_scores.append(scores.mean())
# Build the array once rather than re-allocating with np.append per step.
results = np.array(mean_scores)
print(results)

# argmax yields one plain integer (the first maximum). The earlier
# np.where(...)[0] produced an array, which cannot index a list reliably and
# breaks the %d format when several percentiles tie.
opt = int(np.argmax(results))
print('Optimal number of features %d' % percentiles[opt])

pl.plot(percentiles, results)
pl.xlabel('percentiles of features')
pl.ylabel('accuracy')
pl.show()
# Refit the decision tree on the top 7% of chi2-ranked features and score it
# on the test set.
# NOTE(review): 7 is hard-coded; presumably it is the optimum reported by the
# percentile sweep on the author's run - confirm before reusing.
# Reads dt, X_train, X_test, y_train and y_test from earlier sections.
from sklearn import feature_selection

fs = feature_selection.SelectPercentile(feature_selection.chi2, percentile=7)
X_train_fs = fs.fit_transform(X_train, y_train)
dt.fit(X_train_fs, y_train)
X_test_fs = fs.transform(X_test)
# Previously a bare expression whose value was discarded outside a REPL.
print(dt.score(X_test_fs, y_test))
# Pizza example: fit a straight line to five (diameter, price) samples, plot
# the fit over the samples and report the training-set score.
import matplotlib.pyplot as plt
import numpy as np
from sklearn.linear_model import LinearRegression

X_train = [[6], [8], [10], [14], [18]]
y_train = [[7], [9], [13], [17.5], [18]]

regressor = LinearRegression()
regressor.fit(X_train, y_train)

# 100 evenly spaced x values in [0, 26], reshaped into a single column so
# they can be passed to predict().
xx = np.linspace(0, 26, 100)
xx = xx.reshape(xx.shape[0], 1)
yy = regressor.predict(xx)

plt.scatter(X_train, y_train)
plt1, = plt.plot(xx, yy, label="Degree=1")
plt.axis([0, 25, 0, 25])
plt.xlabel('Diameter of Pizza')
plt.ylabel('Price of Pizza')
plt.legend(handles=[plt1])
plt.show()

# %-formatted single-argument print runs unchanged on Python 2 and 3.
print('The R-squared value of Linear Regressor performing on the training data is %s'
      % regressor.score(X_train, y_train))
# Degree-2 polynomial fit on the pizza data, plotted next to the linear fit.
# Reads X_train, y_train, xx, yy, plt and LinearRegression from the linear
# regression section.
from sklearn.preprocessing import PolynomialFeatures

poly2 = PolynomialFeatures(degree=2)
X_train_poly2 = poly2.fit_transform(X_train)

regressor_poly2 = LinearRegression()
regressor_poly2.fit(X_train_poly2, y_train)

# The plotting grid must go through the same feature expansion as the
# training inputs before prediction.
xx_poly2 = poly2.transform(xx)
yy_poly2 = regressor_poly2.predict(xx_poly2)

plt.scatter(X_train, y_train)
plt1, = plt.plot(xx, yy, label='Degree=1')
plt2, = plt.plot(xx, yy_poly2, label='Degree=2')
plt.axis([0, 25, 0, 25])
plt.xlabel('Diameter of Pizza')
plt.ylabel('Price of Pizza')
plt.legend(handles=[plt1, plt2])
plt.show()

# %-formatted single-argument print runs unchanged on Python 2 and 3.
print('The R-squared value of Polynominal Regressor (Degree=2) performing on the training data is %s'
      % regressor_poly2.score(X_train_poly2, y_train))
# Degree-4 polynomial fit on the pizza data, plotted next to the degree-1 and
# degree-2 fits.
# Reads X_train, y_train, xx, yy, yy_poly2, plt and LinearRegression from the
# earlier regression sections.
from sklearn.preprocessing import PolynomialFeatures

poly4 = PolynomialFeatures(degree=4)
X_train_poly4 = poly4.fit_transform(X_train)

regressor_poly4 = LinearRegression()
regressor_poly4.fit(X_train_poly4, y_train)

# Expand the plotting grid with the same fitted transformer before predicting.
xx_poly4 = poly4.transform(xx)
yy_poly4 = regressor_poly4.predict(xx_poly4)

plt.scatter(X_train, y_train)
plt1, = plt.plot(xx, yy, label='Degree=1')
plt2, = plt.plot(xx, yy_poly2, label='Degree=2')
plt4, = plt.plot(xx, yy_poly4, label='Degree=4')
plt.axis([0, 25, 0, 25])
plt.xlabel('Diameter of Pizza')
plt.ylabel('Price of Pizza')
plt.legend(handles=[plt1, plt2, plt4])
plt.show()

# %-formatted single-argument print runs unchanged on Python 2 and 3.
print('The R-squared value of Polynominal Regressor (Degree=4) performing on the training data is %s'
      % regressor_poly4.score(X_train_poly4, y_train))
# Score the three pizza regressors on four held-out samples.
# Reads regressor, poly2, regressor_poly2, poly4 and regressor_poly4 from the
# earlier regression sections.
X_test = [[6], [8], [11], [16]]
y_test = [[8], [12], [15], [18]]

# These scores used to be bare expressions, so running the file as a script
# computed them and threw them away; print them instead.
print('Degree=1 test R-squared: %s' % regressor.score(X_test, y_test))

X_test_poly2 = poly2.transform(X_test)
print('Degree=2 test R-squared: %s' % regressor_poly2.score(X_test_poly2, y_test))

X_test_poly4 = poly4.transform(X_test)
print('Degree=4 test R-squared: %s' % regressor_poly4.score(X_test_poly4, y_test))
# Compare the unregularised degree-4 fit with Lasso and Ridge fits (default
# hyper-parameters) on the same degree-4 features: test score, coefficients
# and, for the plain and Ridge models, the sum of squared coefficients.
# Reads X_train_poly4, X_test_poly4, y_train, y_test, regressor_poly4 and np
# from earlier sections.
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge

lasso_poly4 = Lasso()
lasso_poly4.fit(X_train_poly4, y_train)
# Parenthesised single-argument prints are valid on Python 2 and 3.
print(lasso_poly4.score(X_test_poly4, y_test))
print(lasso_poly4.coef_)

# Baseline for comparison. The coefficient vector is printed once; the
# script previously printed it twice in a row.
print(regressor_poly4.score(X_test_poly4, y_test))
print(regressor_poly4.coef_)
print(np.sum(regressor_poly4.coef_ ** 2))

ridge_poly4 = Ridge()
ridge_poly4.fit(X_train_poly4, y_train)
print(ridge_poly4.score(X_test_poly4, y_test))
print(ridge_poly4.coef_)
print(np.sum(ridge_poly4.coef_ ** 2))
# Grid search over SVC gamma and C inside a TF-IDF + SVC pipeline, using the
# first 3000 newsgroup documents and 3-fold cross-validation (default n_jobs).
import time

import numpy as np
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
try:
    # sklearn.cross_validation / sklearn.grid_search were removed in 0.20.
    from sklearn.model_selection import GridSearchCV, train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split
    from sklearn.grid_search import GridSearchCV

news = fetch_20newsgroups(subset='all')
X_train, X_test, y_train, y_test = train_test_split(
    news.data[:3000], news.target[:3000], test_size=0.25, random_state=33)

clf = Pipeline([
    ('vect', TfidfVectorizer(stop_words='english', analyzer='word')),
    ('svc', SVC()),
])
# 4 gamma values x 3 C values = 12 candidate settings; the 'svc__' prefix
# routes each parameter to the pipeline step named 'svc'.
parameters = {'svc__gamma': np.logspace(-2, 1, 4), 'svc__C': np.logspace(-1, 1, 3)}

gs = GridSearchCV(clf, parameters, verbose=2, refit=True, cv=3)

# NOTE(review): `time_ = gs.fit(...)` looks like a flattened IPython
# `%time _ = gs.fit(...)`; the name is kept for compatibility and the wall
# time is now measured explicitly.
fit_started = time.time()
time_ = gs.fit(X_train, y_train)
print('Grid search wall time: %.2f s' % (time.time() - fit_started))

# Previously a bare tuple expression whose value was discarded in a script.
print(gs.best_params_)
print(gs.best_score_)
print(gs.score(X_test, y_test))
# Same grid search as the serial variant (TF-IDF + SVC pipeline, 3000
# documents, 3-fold CV) but with n_jobs=-1 passed to GridSearchCV.
# NOTE(review): this section runs at module top level; on Windows,
# process-based parallelism may require an `if __name__ == '__main__':`
# guard depending on the joblib version - confirm before running as a script.
import time

import numpy as np
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
try:
    # sklearn.cross_validation / sklearn.grid_search were removed in 0.20.
    from sklearn.model_selection import GridSearchCV, train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split
    from sklearn.grid_search import GridSearchCV

news = fetch_20newsgroups(subset='all')
X_train, X_test, y_train, y_test = train_test_split(
    news.data[:3000], news.target[:3000], test_size=0.25, random_state=33)

clf = Pipeline([
    ('vect', TfidfVectorizer(stop_words='english', analyzer='word')),
    ('svc', SVC()),
])
# 4 gamma values x 3 C values = 12 candidate settings.
parameters = {'svc__gamma': np.logspace(-2, 1, 4), 'svc__C': np.logspace(-1, 1, 3)}

gs = GridSearchCV(clf, parameters, verbose=2, refit=True, cv=3, n_jobs=-1)

# NOTE(review): `time_ = gs.fit(...)` looks like a flattened IPython
# `%time _ = gs.fit(...)`; the name is kept for compatibility and the wall
# time is now measured explicitly so it can be compared with the serial run.
fit_started = time.time()
time_ = gs.fit(X_train, y_train)
print('Grid search wall time: %.2f s' % (time.time() - fit_started))

# Previously a bare tuple expression whose value was discarded in a script.
print(gs.best_params_)
print(gs.best_score_)
print(gs.score(X_test, y_test))
相關推薦
《Python機器學習及實踐》----模型實用技巧
本篇部落格是根據《Python機器學習及實踐》一書中的例項,所有程式碼均在本地編譯通過。資料為從該書指定的百度網盤上下載的,或者是sklearn自帶資料下載到本地使用的。 程式碼片段: measurements = [{'city': 'Dubai',
Python機器學習及實踐——基礎篇7(分類整合模型)
常言道:“一個籬笆三個樁,一個好漢三個幫”。整合分類模型便是綜合考量多個分類器的預測結果,從而做出決策。只是這種“綜合考量”的方式大體上分為兩種: 一種是利用相同的訓練資料同時搭建多個獨立的分類模型,然後通過投票的方式,以少數服從多數的原則作出最終的分類決策。比
《Python機器學習及實踐》----監督學習經典模型
本篇部落格是根據《Python機器學習及實踐》一書中的例項,所有程式碼均在本地編譯通過。資料為從該書指定的百度網盤上下載的,或者是sklearn自帶資料下載到本地使用的。 程式碼片段: import pandas as pd import numpy as
[python機器學習及實踐(6)]Sklearn實現主成分分析(PCA)
相關性 hit 變量 gray tran total 空間 mach show 1.PCA原理 主成分分析(Principal Component Analysis,PCA), 是一種統計方法。通過正交變換將一組可能存在相關性的變量轉換為一組線性不相關的變量,轉換後的這組
重回機器學習-《python機器學習及實踐》讀書筆記二
一.三個率 機器學習模型訓練好之後,會在樣本外進行測試,然後我們可以得到三個“率”: 準確率 召回率 精確率 其實這些也沒有什麼大不了的,大家如果學習
PYTHON機器學習及實踐_從零開始通往KAGGLE競賽之路pdf
【下載地址】 本書面向所有對機器學習與資料探勘的實踐及競賽感興趣的讀者,從零開始,以Python程式語言為基礎,在不涉及大量數學模型與複雜程式設計知識的前提下,逐步帶領讀者熟悉並且掌握當下最流行的機器學習、數learn作為基礎機器學習工具;第3章進階篇,涉及怎樣藉助高階技術或者模型進一步提升既有機器學習系統的
python機器學習及實踐學習筆記1-如何開啟ipynb字尾檔案
python機器學習及實踐學習筆記1-如何開啟ipynb字尾檔案 2017年02月22日 14:58:08 hustzhoutian 閱讀數:45365更多 個人分類: 深度學習 需要安裝ipython notebook,如果你已經安裝Anaconda
Python機器學習及實踐——基礎篇11(迴歸樹)
迴歸樹在選擇不同特徵作為分裂節點的策略上,與基礎篇6的決策樹的思路類似。不同之處在於,迴歸樹葉節點的資料型別不是離散型,而是連續型。決策樹每個葉節點依照訓練資料表現的概率傾向決定了其最終的預測類;而迴歸樹的葉節點卻是一個個具體的值,從預測值連續這個意義上嚴格地講,迴歸樹不能成
《Python機器學習及實踐》----無監督學習之資料聚類
本篇部落格是根據《Python機器學習及實踐》一書中的例項,所有程式碼均在本地編譯通過。資料為從該書指定的百度網盤上下載的,或者是sklearn自帶資料下載到本地使用的。 程式碼片段: # coding: utf-8 # 分別匯入numpy、matplot
Python機器學習及實踐——基礎篇10(K近鄰迴歸)
在基礎篇5中提到了這類模型不需要訓練引數的特點。在迴歸任務中,k近鄰(迴歸)模型同樣只是藉助周圍K個最近訓練樣本的目標數值,對待測樣本的迴歸值進行決策。自然,也衍生出衡量待測樣本迴歸值的不同方式,即到底是對K個近鄰目標數值使用普通的算術平均演算法,還是同時考慮距離的差
python機器學習及實踐 第二章的2.1.2.1線性迴歸器程式報錯Reshape your data either using array.reshape(-1,1)的原因及解決方法
最近在看Python機器學習及實踐(從零開始kaggle競賽之路)這本書,到了第二章的線性迴歸器的GradientBoostingRegressor模型照著敲程式碼的時候 出現了以下的錯誤 出錯的問題在於標準化函式這裡。 可見fit_tran
《python機器學習及實踐-從零開始通往kaggle競賽之路(程式碼Python 3.6 版)》chapter1.1
import pandas as pd #匯入pandas 庫 df_train = pd.read_csv('../Datasets/Breast-Cancer/breast-cancer-train.csv') #讀取目錄下的資料,如果程式碼與檔案路徑不在一起,則需要另行設定 df_test = pd
Python機器學習及實踐+從零開始通往Kaggle競賽之路
提升 google 技巧 pythonh href site 幫助 target panda 內容簡介 本書面向所有對機器學習與數據挖掘的實踐及競賽感興趣的讀者,從零開始,以Python編程語言為基礎,在不涉及大量數學模型與復雜編程知識的前提下,逐
Python機器學習演算法實踐——k均值聚類(k-means)
一開始的目的是學習十大挖掘演算法(機器學習演算法),並用編碼實現一遍,但越往後學習,越往後實現編碼,越發現自己的編碼水平低下,學習能力低。這一個k-means演算法用Python實現竟用了三天時間,可見編碼水平之低,而且在編碼的過程中看了別人的編碼,才發現自己對
Python機器學習演算法實踐——梯度上升演算法
一:理論部分 給定一個樣本集,每個樣本點有兩個維度值(X1,X2)和一個類別值,類別只有兩類,我們以0和1代表。資料如下所示: 樣本 X1 X2 類別 1
Python機器學習演算法實踐——二分k-均值演算法
二分k-均值演算法步驟: 首先將所有點作為一個簇,然後將該簇一分為二,之後選擇其中一個簇繼續進行劃分,選擇哪一個簇進行劃分取決於對其劃分是否可以最大程度降低SSE的值,上述基於SSE的劃分過程不斷重複,直到得到使用者指定的簇數目為止, 將所有點看成一個簇 當簇數目小於k
Python機器學習及NLP庫
機器學習方面: Scikit-Learn 可用於分類、特徵選擇、特徵提取和聚集。還擁有自然語言處理特徵提取的能力、詞袋、tf-idf演算法、預處理等。 Matplotlib 可以用於快速視覺化。 Statsmodels 主要用於預測性和探索性分析。可以擬合線性模型,進行統計
Python-機器學習 入門及技巧總結
隨著這兩年人工智慧的快速發展,機器學習與深度學習行業炙手可熱,對於那些想進入這個行業的同學們,小編在這裡給大家介紹一下自己的心得體會以及利用Python的一些小技巧,希望對大家有所幫助。 在機器學習方面,對於想入門的新手,首先不得不提就是斯坦福大學的Andrew Ng-
Python機器學習庫scikit-learn實踐
.get new 安裝 gis 支持 兩個 clas mod 神經網絡 一、概述 機器學習算法在近幾年大數據點燃的熱火熏陶下已經變得被人所“熟知”,就算不懂得其中各算法理論,叫你喊上一兩個著名算法的名字,你也能昂首挺胸脫口而出。當然了,算法之林雖大,但能者還是
Python機器學習實踐指南pdf
height 異常 算法 pad point spa 個性化 2.4 機票 下載地址:網盤下載 內容簡介 · · · · · ·機器學習是近年來漸趨熱門的一個領域,同時Python 語言經過一段時間的發展也已逐漸成為主流的編程語言之一。本書結合了機器學習和Python