《統計學習方法》1——邏輯斯蒂迴歸
阿新 • • 發佈:2019-01-27
1. 模型
二項邏輯斯蒂迴歸模型是由邏輯斯蒂分佈的條件概率分佈表示的分類模型。
邏輯斯蒂分佈函式為 $F(x)=\dfrac{1}{1+e^{-(x-\mu)/\gamma}}$
其中,$\mu$ 是位置引數,是中心點(對稱點)的位置;$\gamma>0$ 是形狀引數。
其分佈圖形是一條S形曲線,也就是sigmoid曲線,S形曲線的範圍是(0,1)。
二項邏輯斯蒂迴歸模型的輸出只有0和1兩個值,即一件事情發生或者不發生。定義基於x,事件發生的概率分佈如下:$P(Y=1|x)=\dfrac{\exp(w\cdot x+b)}{1+\exp(w\cdot x+b)}$
由於輸出是二值的,當事件發生的概率為p,那麼不發生的概率為1-p,因此 $P(Y=0|x)=\dfrac{1}{1+\exp(w\cdot x+b)}$
上面兩個概率分佈為二項邏輯斯蒂迴歸模型的分佈。X是輸入,Y是輸出,只有0和1兩個取值, w.x是w和x的內積,即對應分量相乘後相加。其中,P(Y=1|x)的分子和分母同除
為了方便,將w和x做擴充,將偏置b統一併入w和x中,擴充後 $w=(w^{(1)},\dots,w^{(n)},b)$,$x=(x^{(1)},\dots,x^{(n)},1)$。得到 $P(Y=1|x)=\dfrac{\exp(w\cdot x)}{1+\exp(w\cdot x)}$。
其中,w.x近似於一個線性迴歸。
2. 模型引數估計
對於訓練資料集 $T=\{(x_1,y_1),(x_2,y_2),\dots,(x_N,y_N)\}$,其中 $x_i\in\mathbf{R}^{n+1}$,$y_i$ 的取值為0或者1。令 $\pi(x)=P(Y=1|x)$。
概率的對數似然函式為:$L(w)=\sum_{i=1}^{N}\left[y_i(w\cdot x_i)-\log\left(1+e^{w\cdot x_i}\right)\right]$
將對數似然函式作為目標函式,其梯度為:$\dfrac{\partial L}{\partial w_j}=\sum_{i=1}^{N}x_{ij}\left(y_i-\pi(x_i)\right)$
對於批梯度下降,以上梯度可以直接用於梯度下降;對於隨機梯度下降,以上N可為1,且樣本隨機抽取。
根據以上理論,程式實現如下:
class logistic_regression(object):
def __init__(self, init_w_b,
input_size,
input_data_path,
input_label_path,
output_path,
init_learningRate = 1e-3,
k = 8,
iter = 20000,
batch_size=8):
self.w_b = init_w_b
self.input_size = input_size
self.init_learningRate = init_learningRate
self.batch_size = batch_size
self.one_vector = np.array([1 ] * batch_size)
self.input_data_path = input_data_path
self.input_label_path = input_label_path
self.K = k #每k份資料抽取一份驗證資料。
self.output_path = output_path
self.iter = iter
"""***************************
exp(w.x+b)
p(Y=1|X) = ---------------
1+exp(w.x+b)
w.x的sigmoid函式
***************************"""
def sigmoid(self, x):
w_x = np.dot(self.w_b, x.transpose())
exp_w_x = np.exp(w_x)
p = exp_w_x / (self.one_vector + exp_w_x)
return p
"""*****************************************
L = y*(w.x)-log(1+exp(w.x)
*****************************************"""
def Log_likelihood(self, p, y):
one_y = np.array([1] * self.batch_size, dtype=float) - y
one_p = np.array([1] * self.batch_size) - p
log_p = np.log(p)
log_one_p = np.log(one_p)
L_array = np.multiply(y, log_p) + np.multiply(one_y, log_one_p)
L = 0.0
for l in L_array:
L = L + l
L = L / self.batch_size
return L
"""****************************************
dL xj exp(w.x)
-- = y*xj - ---------------- = xj(y-p)
dwj 1 + exp(w.x)
其中,w.x表示w和x的內積。
***************************************"""
def Jaccobian(self, x, y, p):
delta_w = np.dot(x.transpose(), (y - p))
delta_w /= self.batch_size
"""
print p
print y
print delta_w
"""
return delta_w
def optimizer(self,delta_w, learning_rate):
self.w_b -= learning_rate * delta_w
#print self.w_b
def accurancy(self, p, y):
sum_correct = 0.0
for i in range(self.batch_size):
y_ = 0
if p[i] > 0.6:
y_ = 1
if y_ == y[i]:
sum_correct += 1.0
sum_correct /= self.batch_size
return sum_correct
def split_data(self):
train_x = []
test_x = []
train_y = []
test_y = []
x_lines = []
y_lines = []
with open(self.input_data_path, 'r') as f1:
x_lines = f1.readlines()
with open(self.input_label_path, 'r') as f1:
y_lines = f1.readlines()
counter = 0
test_num = 0
length = min(len(x_lines), len(y_lines))
for i in range(length):
Xis = x_lines[i].split(', ')
yis = y_lines[i].strip()
if counter % self.K == 0:
test_num = random.randint(0, self.K)
counter = 0
if counter == test_num:
test_x.append([int(Xis[0][1:]), float(Xis[1][1:]), float(Xis[2][:-2]), 1.0])
test_y.append(int(yis))
else:
train_x.append([int(Xis[0][1:]), float(Xis[1][1:]), float(Xis[2][:-2]), 1.0])
train_y.append(int(yis))
counter += 1
"""資料歸一化"""
train_x = np.array(train_x)
test_x = np.array(test_x)
for i in range(self.input_size - 1):
divided_max_value = 1 / max(train_x[:,i])
train_x[:,i] *= divided_max_value
test_x[:, i] *= divided_max_value
train_x = list(train_x)
return train_x, train_y, test_x, test_y
def train(self, train_x, train_y):
length = len(train_x)
learning_rate = self.init_learningRate
acc_sum = 0
for j in range(self.iter):
rand_index_list = random.sample(range(length), self.batch_size)
train_x_batch = []
train_y_batch = []
for i in rand_index_list:
train_x_batch.append(train_x[i])
train_y_batch.append(train_y[i])
train_x_array = np.array(train_x_batch, dtype=float)
train_y_array = np.array(train_y_batch, dtype=float)
#train_x_array = np.array([train_x_batch.append(train_x[i]) for i in rand_index_list])
#train_y_array = np.array([train_y_batch.append(train_y[i]) for i in rand_index_list])
p = self.sigmoid(train_x_array)
delta_w = self.Jaccobian(train_x_array, train_y_array, p)
self.optimizer(delta_w, learning_rate)
L = self.Log_likelihood(p, train_y_array)
acc = self.accurancy(p, train_y_array)
acc_sum += acc * acc
if j % 1000 == 0:
if learning_rate > 1e-5:
learning_rate -= (0.1 * (j/1000))
acc_mean = math.sqrt(acc_sum) / 1000
acc_sum = 0
print self.w_b
print "第%d次迭代,對數似然值為%.3f, 準確度為%.3f"%(j, L, acc)
#plt.plot(L, i, '', marker='o', ms=20, c='r', label="對數似然估計的變化曲線")
def offline_test(self, test_x, test_y):
length = len(test_x)
rand_index_list = random.sample(range(length), self.batch_size)
test_x_batch = []
test_y_batch = []
for i in rand_index_list:
test_x_batch.append(train_x[i])
test_y_batch.append(train_y[i])
test_x_array = np.array(test_x_batch, dtype=float)
test_y_array = np.array(test_y_batch, dtype=float)
p = self.sigmoid(test_x_array)
print test_x_array
print test_y_array
print p
sum_correct = 0
for i in range(self.batch_size):
y_ = 0
if p[i] > 0.6:
y_ = 1
if y_ == test_y_array[i]:
sum_correct += 1
sum_correct /= self.batch_size
print sum_correct
return sum_correct
3. 多項式擴充套件
在實踐過程中,發現用特徵的線性函式(w.x)沒有辦法很好的做邏輯擬合。在沒有更好的特徵可以替換的時候,可以考慮把特徵做多項式擴充套件。Sklearn庫中的PolynomialFeatures類,可以做多項式擴充套件,可將如(a,b,c)這樣的3維特徵,做2階擴充套件,得到 $(1,\,a,\,b,\,c,\,a^2,\,ab,\,ac,\,b^2,\,bc,\,c^2)$ 這樣的10維特徵。
# Expand the raw features to all polynomial terms of degree <= 2
# (3 features -> 10 columns, including the constant term).
poly = PolynomialFeatures(2)
train_x_extend = poly.fit_transform(train_x)
# NOTE(review): the usual pattern applies transform() (not fit_transform)
# to held-out data with the already-fitted transformer — confirm this is
# intentional here.
test_x_extend = poly.fit_transform(test_x)
4. 正則化
多特徵進行多項式擴充套件後,得到更多維的衍生特徵,這樣,過擬合的風險升高,因此,通常需要引入正則化。
# L2-regularized logistic regression fitted with the SAG solver,
# capped at 2000 optimization iterations.
clf = LogisticRegression(penalty='l2', solver='sag', max_iter= 2000)
clf.fit(train_x, train_y)
以上程式中,LogisticRegression的第一個引數penalty為正則方法的選擇,solver為優化方法的選擇。