9.邏輯迴歸-下采樣、過取樣、交叉驗證

阿新 • • 發佈：2018-12-16

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold, cross_val_score
from sklearn.metrics import confusion_matrix, recall_score, classification_report
from imblearn.over_sampling import SMOTE


# Load the credit-card transactions and print a quick overview.
data = pd.read_csv('creditcard.csv')
print(data.shape)
print(data.columns)

# Bar chart of how many rows carry each value of the 'Class' column.
# Series.value_counts() is used because the top-level pd.value_counts()
# helper is deprecated in recent pandas releases.
count_classes = data['Class'].value_counts(sort=True)
count_classes.plot(kind='bar')
plt.title('Fraud class histogram')
plt.xlabel('Class')
plt.ylabel('Frequency')
plt.show()

# Standardize 'Amount' (zero mean, unit variance) into a new column.
# StandardScaler expects a 2-D input, hence the reshape(-1, 1).
data['new_Amount'] = StandardScaler().fit_transform(data['Amount'].values.reshape(-1, 1))
# Drop the columns that are not used as features: the raw 'Amount' (replaced
# by 'new_Amount') and 'Time'.
data = data.drop(['Time', 'Amount'], axis=1)

# Features are every column except 'Class'; the label is the 'Class' column.
# y is deliberately kept as a one-column DataFrame because later code slices
# it with .iloc[rows, :].
X = data.loc[:, data.columns != 'Class']
y = data.loc[:, data.columns == 'Class']

# Count and index labels of the fraud (Class == 1) rows.
number_records_fraud = len(data[data.Class == 1])
fraud_index = np.array(data[data.Class == 1].index)

# Count and index labels of the normal (Class == 0) rows.
number_records_normal = len(data[data.Class == 0])
normal_index = data[data.Class == 0].index

# Under-sampling: randomly draw, without replacement, as many normal rows as
# there are fraud rows so that both classes are equally represented.
# NOTE: no random seed is set, so the drawn subset differs between runs.
random_normal_index = np.random.choice(normal_index, number_records_fraud, replace=False)
random_normal_index = np.array(random_normal_index)

# Combine the sampled normal labels with all fraud labels
# (2 * number_records_fraud entries in total).
under_sample_index = np.concatenate([random_normal_index, fraud_index])

# Select the under-sampled rows. The values in under_sample_index are index
# *labels* (taken from data.index), so label-based .loc is the correct
# accessor; positional .iloc only gives the same rows while the index is the
# default 0..n-1 RangeIndex.
under_sample_data = data.loc[under_sample_index]
X_under_sample_data = under_sample_data.loc[:, under_sample_data.columns != 'Class']
y_under_sample_data = under_sample_data.loc[:, under_sample_data.columns == 'Class']


# Train/test split of the whole dataset (70% / 30%, fixed seed).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

# Train/test split of the under-sampled dataset (70% / 30%, fixed seed).
X_train_undersample, X_test_undersample, y_train_undersample, y_test_undersample = train_test_split(
    X_under_sample_data, y_under_sample_data, test_size=0.3, random_state=0)



def printing_Kfold_scores(x_train_data, y_train_data):
    """Print 5-fold cross-validated recall for several regularization strengths.

    For every C in [0.01, 0.1, 1, 10, 100] an L1-penalized logistic
    regression is trained on four folds and scored (recall) on the held-out
    fold; the per-fold recall and the mean recall are printed.

    Args:
        x_train_data: pandas DataFrame of features (sliced with ``.iloc``).
        y_train_data: one-column pandas DataFrame of labels, row-aligned
            with ``x_train_data``.

    Returns:
        The C value with the highest mean recall (the first one wins on ties).
    """
    # Folds are taken in order (no shuffling), so the split is deterministic.
    kfold = KFold(n_splits=5, shuffle=False)
    # Candidate inverse regularization strengths (smaller C = stronger penalty).
    c_param_range = [0.01, 0.1, 1, 10, 100]

    best_c = None
    best_mean_recall = None
    for c_param in c_param_range:
        print('-----------------------------------')
        print('C Parameter:', c_param)
        print('-----------------------------------')
        print('')
        recall_accs = []
        for iteration, (train_idx, test_idx) in enumerate(kfold.split(x_train_data), start=1):
            # The L1 penalty is not supported by the default 'lbfgs' solver of
            # current scikit-learn releases; 'liblinear' supports it, so it is
            # requested explicitly.
            lr = LogisticRegression(C=c_param, penalty='l1', solver='liblinear')
            # Fit on the training folds; ravel() turns the (n, 1) label
            # column into the 1-D array that fit() expects.
            lr.fit(x_train_data.iloc[train_idx, :].values,
                   y_train_data.iloc[train_idx, :].values.ravel())
            # Predict the held-out fold and score it.
            y_predicted_undersample = lr.predict(x_train_data.iloc[test_idx, :].values)

            recall_acc = recall_score(y_train_data.iloc[test_idx, :].values, y_predicted_undersample)
            recall_accs.append(recall_acc)
            print('Iteration:', iteration, ': Recall Score = ', recall_acc)

        mean_recall = np.mean(recall_accs)
        print('Mean Recall Score:', mean_recall)
        if best_mean_recall is None or mean_recall > best_mean_recall:
            best_mean_recall = mean_recall
            best_c = c_param

    return best_c


# Optional: run the C-parameter search on the under-sampled training data.
# y_predicted_undersample = printing_Kfold_scores(X_train_undersample, y_train_undersample)

# Train on each 5-fold training subset of the under-sampled training data and
# measure recall on the test split of the WHOLE dataset (X_test / y_test).
# The held-out fold of each split is not used here.
kfold = KFold(n_splits=5, shuffle=False)
recall_accs = []
for iteration, indexs in enumerate(kfold.split(X_train_undersample), start=1):
    # The L1 penalty needs a solver that supports it; the default 'lbfgs'
    # solver of current scikit-learn releases does not, 'liblinear' does.
    lr = LogisticRegression(C=0.01, penalty='l1', solver='liblinear')
    lr.fit(X_train_undersample.iloc[indexs[0], :].values, y_train_undersample.iloc[indexs[0], :].values.ravel())
    # Alternative: evaluate on the under-sampled test split instead.
    # y_predicted_labels = lr.predict(X_test_undersample.values)
    # recall_acc = recall_score(y_test_undersample, y_predicted_labels)
    # Evaluate on the test split of the whole dataset.
    y_predicted_labels = lr.predict(X_test.values)
    recall_acc = recall_score(y_test, y_predicted_labels)
    # Alternative: over-sample the training data with SMOTE before evaluating.
    # X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)
    # over_sampler = SMOTE(random_state=0)
    # os_X, os_y = over_sampler.fit_sample(X_train, y_train)
    # y_predicted_labels = lr.predict(X_test.values)
    # recall_acc = recall_score(y_test, y_predicted_labels)

    print('Recall:',recall_acc)
    recall_accs.append(recall_acc)
print('Recall Means:', np.mean(recall_accs))

9.邏輯迴歸-下采樣、過取樣、交叉驗證

import numpy as np import pandas as pd import matplotlib.pyplot as plt from sklearn.preprocessing import StandardScaler from sklearn.model

10.邏輯迴歸-下采樣、過取樣、交叉驗證

import numpy as np import pandas as pd import matplotlib.pyplot as plt from sklearn.preprocessing import StandardScaler from sklearn.model_selection

降取樣，過取樣，欠取樣，子取樣，下采樣，上取樣，你學會了嗎？【總結】

降取樣：2048HZ對訊號來說是過取樣了，事實上只要訊號不混疊就好（滿足尼奎斯特取樣定理），所以可以對過取樣的訊號作抽取，即是所謂的“降取樣”。在現場中取樣往往受具體條件的限制，或者不存在300HZ的取樣率，或除錯非常困難等等。若R>>1，則Rfs/2就遠大於音

降取樣，過取樣，欠取樣，子取樣，下采樣，上取樣

取樣： 2048HZ對訊號來說是過取樣了，事實上只要訊號不混疊就好（滿足尼奎斯特取樣定理），所以可以對過取樣的訊號作抽取，即是所謂的“降取樣”。在現場中取樣往往受具體條件的限止，或者不存在300HZ的取樣率，或除錯非常困難等等。若 R>>1，則Rfs/2就

Imblearn package study（不平衡資料處理之過取樣、下采樣、綜合取樣）

Imblearn package study 1. 準備知識 Sparse input For sparse input the data is converted to the Compressed Sparse Rows r

時間序列--上取樣、下采樣

在上取樣的情況下，可能需要注意如何使用插值來計算細粒度的觀測值在向下取樣的情況下，在選擇用於計算新聚合值的彙總統計資訊時可能需要小心。也許有兩個主要原因讓你對重新取樣你的時間序列資料感興趣: 1.問題框架:如果您的資料與您希望進行預測的頻率相同，則可能需要重新取樣。 2.特徵工程

OpenCV-Python——上取樣、下采樣與拉普拉斯金字塔

影象金字塔（也叫高斯金字塔）：同一影象不同分辨率的子圖集合。向下取樣的過程：從Gi得到Gi+1的過程: 1.對影象Gi進行高斯卷積。 2.刪除所有偶數行和偶數列。向上取樣的過程：從Gi得到Gi-1的過程： 1.行和列擴充套件為原來的兩倍，用0填充。 2.使用

金字塔向上、下采樣(圖片的大小轉換)

程式碼： OpenFileDialog of = new OpenFileDialog(); if (of.ShowDialog() != DialogResult.OK) { return; }

資料不平衡：下采樣、上取樣python程式碼實現

一、下采樣所有資料存在DataFrame物件df中。資料分為兩類：多數類別和少數類別，資料量相差大。資料預處理已將多數類別的Label標記為1，少數類別的Label標記為0。 import numpy as np import pandas as pd def lo

0021-用OpenCV的pyrUp和pyrDown函式計算影象金字塔(向上/下采樣)

影象金字塔是一個影象集合，集合中所有的影象都源於同一個原始影象，通過對原始影象進行連續取樣得到影象集合。常見的有兩種影象金字塔，即高斯金字塔和拉普拉斯金字塔。高斯金字塔：向下降取樣影象。金字塔從i層生成第i+1層，先用高斯核對Gi進行卷積，然後，刪除所有偶數行和偶數列。這樣，新得到的影象面積會變為源

python_bicubic_下采樣獲得LR

如何獲得LR影象 image = imread(path, is_grayscale=True) label_ = modcrop(image, scale) # Must be normalized image = image / 255. label_ = label_ / 25

下采樣（處理資料不平衡問題）

import pandas as pd import matplotlib.pyplot as plt import numpy as np from sklearn.preprocessing import StandardScaler#去均值，方差歸一化，類似於特徵縮放 from sklearn

解決U-net上取樣過程後，結合下采樣資訊時特徵圖大小不匹配問題

在U-net下采樣時，通過pooling層，可能會出現這種情況，37*37feature maps 壓縮成18*18大小，但在上取樣過程中，利用 nn.ConvTranspose2d()通常變為36*36大小的feature maps,不同大小的feature maps在進行concat時會報

下采樣方法

.loc[],中括號裡面是先行後列，以逗號分割，行和列分別是行標籤和列標籤(label) .iloc[]與loc一樣，中括號裡面也是先行後列，行列標籤用逗號分割，與loc不同的之處是，.iloc 是根據行數與列數來索引的 .ix上面兩種用法都可以 X=

影象的上取樣（upsampling）與下采樣（subsampled）

參考： http://blog.csdn.net/majinlei121/article/details/46742339 http://blog.csdn.net/augusdi/article/details/9028365 縮小影象（或稱為下

opencv013-影象上取樣和下采樣（+高斯不同）

影象金字塔概念： 1. 我們在影象處理中常常會調整影象大小，最常見的就是放大(zoom in)和縮小（zoom out），儘管幾何變換也可以實現影象放大和縮小，但是這裡我們介紹影象金字塔 2. 一個影象金字塔是由一系列的影象組成，最底下一張影象尺寸最大，最上方的影象尺寸最

上取樣與下采樣

通常所說的取樣指的是下采樣，也就是對訊號的抽取。其實，上取樣和下采樣都是對數字訊號進行重採，重採的取樣率與原來獲得該數字訊號（比如從模擬訊號取樣而來）的取樣率比較，大於原訊號的稱為上取樣，小於的則稱為下采樣。上取樣的實質也就是內插或插值。下采樣的定義：對於一個樣值序列間隔幾個樣值取樣一次，這樣

影象的上取樣和下采樣

影象的上取樣（upsampling）與下采樣（subsampled）縮小影象（或稱為下采樣（subsampled）或降取樣（downsampled））的主要目的有兩個： 1、使得影象符合顯示區域的大小；2、生成對應影象的縮圖。放大影象（或稱為上取樣（upsampling）或影象插值（

PCL使用VoxelGrid filter對點雲進行下采樣

#include <iostream> #include <pcl/io/pcd_io.h> #include <pcl/point_types.h> #includ

Glide4.8原始碼拆解（四）Bitmap解析之"下采樣"淺析

前言 Glide歸根結底是一個圖片載入框架，它一定會涉及到BitmapFactory相關API把Bitmap讀取到記憶體；可能大家已經很熟悉如何高效的載入Bitmap(比如使用inSample等)，這一章還是要看一看Glide是如何玩轉的；本文主要分析這兩個類： DownsampleStrate

9.邏輯迴歸-下采樣、過取樣、交叉驗證

相關推薦