使用者房源推薦—基於內容的推薦演算法(CB)
阿新 • • 發佈:2019-02-17
CB推薦演算法根據使用者過去喜歡的產品,為使用者推薦和他過去喜歡的產品相似的產品。採用基於特徵的空間向量模型,並用最近鄰方法進行推薦。
演算法步驟:
抽取房源的基本特徵,考慮到租房的實際情況,確定的基本特徵有價格 (house_price) 面積 (house_area),房屋型別 (house_type),地區 (district)
利用一個使用者過去喜歡(及不喜歡)的房源特徵資料,來學習出此使用者的喜好特徵。將各個特徵進行分類,其中價格分為10類,面積分10類,房屋型別6類,地區9類。利用歷史資料統計出每個使用者的每個特徵中各類別的次數,之後相加取平均,表示某使用者的某特徵的喜好向量。
- 價格 t1 : (house_price),
- 面積 t2 : (house_area),
- 型別 t3 : (house_type),
- 地區 t4 : (district)。
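使用者在實際租房時,更多的是考慮房屋的價格和地區,因此主觀確定4個特徵的權重為 $w_1 = 0.35$(價格)、$w_2 = 0.15$(面積)、$w_3 = 0.15$(型別)、$w_4 = 0.35$(地區)。
則每個房源與使用者喜好的加權相似度為 $S = w_1 s_1 + w_2 s_2 + w_3 s_3 + w_4 s_4$,其中 $s_i$ 為房源在特徵 $t_i$ 上與使用者喜好向量的餘弦相似度。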
#coding:utf-8
import pyodbc
import time
import numpy as np
class recommend_house:
    '''Content-based house recommender.

    Each house is described by four categorical features derived from its
    price, area, room type and district.  A user's taste is the normalised
    histogram of those categories over the houses the user has favourited;
    a candidate house is scored by the weighted cosine similarity between
    its one-hot category vectors and the user's histograms.
    '''

    # Inclusive upper bounds of the first nine buckets; anything larger than
    # the last bound falls into a tenth "overflow" bucket.
    PRICE_UPPER_BOUNDS = (1000, 1500, 2000, 2500, 3000, 3500, 4000, 4500, 5000)
    AREA_UPPER_BOUNDS = (20, 30, 40, 50, 60, 70, 80, 90, 100)

    # Substrings tested in order; a value matching none of them goes to the
    # trailing "other" bucket.
    # NOTE(review): these literals must use the same script (traditional vs
    # simplified Chinese) as the text stored in the database -- confirm.
    ROOM_TYPE_KEYWORDS = ('1室', '2室', '3室', '4室', '5室')
    DISTRICT_KEYWORDS = ('濱江', '西湖', '上城', '下城',
                         '江干', '拱墅', '蕭山', '餘杭')

    # Values regarded as "no data" for the two text features.
    MISSING_TYPE_VALUES = (None, '', 'null')
    MISSING_DISTRICT_VALUES = (None, 'null')

    # Sentinel score: large enough in magnitude to push a house to the
    # bottom of any ranking.
    EXCLUDED_SCORE = -9999

    def house_data(self):
        '''Load listings and the favourites of every collecting user.

        Returns:
            A 3-tuple ``(rent_house_info, user_house_info, collect_user_id)``:
            all listing rows ``(id, price, area, type, district, status)``;
            for each user a list of ``(price, area, type, district, status)``
            tuples for the favourited listings that exist in the listing
            table; and the user ids, index-aligned with ``user_house_info``.
        '''
        cnxn = pyodbc.connect('DSN=zjx;UID=root')
        try:
            cursor = cnxn.cursor()
            sql = "select DISTINCT uid,h_id from test.shoucang where h_type = 1 order by uid"
            cursor.execute(sql)
            user_info = cursor.fetchall()
            sql = "select id, house_price, house_area, house_type, district,status from test.house_rent_info_geren"
            cursor.execute(sql)
            rent_house_info = cursor.fetchall()
        finally:
            # Release the connection even when a query fails.
            cnxn.close()
        user_house_info, collect_user_id = self._group_favourites(
            user_info, rent_house_info)
        return rent_house_info, user_house_info, collect_user_id

    def _group_favourites(self, user_info, rent_house_info):
        '''Group ``(uid, house_id)`` rows (sorted by uid) into per-user lists.'''
        houses_by_id = {}
        for house in rent_house_info:
            # Keep the first row for an id, like a first-match linear scan.
            houses_by_id.setdefault(house[0], house)

        collect_user_id = []
        user_house_info = []
        for row in user_info:
            uid, house_id = row[0], row[1]
            if not collect_user_id or collect_user_id[-1] != uid:
                # Every user gets an entry, even if none of the favourited
                # ids resolve, so both lists stay index-aligned.
                collect_user_id.append(uid)
                user_house_info.append([])
            house = houses_by_id.get(house_id)
            if house is not None:
                user_house_info[-1].append(house[1:])
        return user_house_info, collect_user_id

    def cosine_similarity(self, vector_A, vector_B, len_vector_A):
        '''Cosine similarity between a preference vector and a one-hot vector.

        Because ``vector_B`` is one-hot (norm 1), the cosine reduces to the
        component of ``vector_A`` at the hot position divided by the norm of
        ``vector_A``.

        Args:
            vector_A: preference vector.
            vector_B: one-hot vector of the same length.
            len_vector_A: Euclidean norm of ``vector_A``.

        Returns:
            The similarity, or ``0.0`` when the norm is zero or ``vector_B``
            contains no 1 (no information means no similarity).
        '''
        if not len_vector_A:
            return 0.0
        for index, flag in enumerate(vector_B):
            if flag == 1:
                return vector_A[index] / len_vector_A
        return 0.0

    def _bucket_index(self, value, upper_bounds):
        '''Index of the first bound that ``value`` does not exceed.'''
        for index, bound in enumerate(upper_bounds):
            if value <= bound:
                return index
        return len(upper_bounds)

    def _keyword_index(self, text, keywords):
        '''Index of the first keyword contained in ``text``.'''
        for index, keyword in enumerate(keywords):
            if text.find(keyword) > -1:
                return index
        return len(keywords)

    def _normalise(self, counts, total):
        '''Turn counts into frequencies; all zeros when ``total`` is 0.'''
        if total <= 0:
            return [0.0] * len(counts)
        total = float(total)
        return [count / total for count in counts]

    def price_classify(self, count1, price):
        '''Increment the price bucket of ``price`` in ``count1``.

        Ten buckets: ``<=1000``, then 500-wide steps up to 5000, then
        ``>5000``.  ``count1`` is modified in place and returned.
        '''
        count1[self._bucket_index(price, self.PRICE_UPPER_BOUNDS)] += 1
        return count1

    def area_classify(self, count2, area):
        '''Increment the area bucket of ``area`` in ``count2``.

        Ten buckets: ``<=20``, then 10-wide steps up to 100, then ``>100``.
        ``count2`` is modified in place and returned.
        '''
        count2[self._bucket_index(area, self.AREA_UPPER_BOUNDS)] += 1
        return count2

    def type_classify(self, count3, room_type):
        '''Increment the room-type bucket of ``room_type`` in ``count3``.

        Six buckets: one to five rooms (matched by substring) and "other".
        ``count3`` is modified in place and returned.
        '''
        count3[self._keyword_index(room_type, self.ROOM_TYPE_KEYWORDS)] += 1
        return count3

    def district_classify(self, count4, room_district):
        '''Increment the district bucket of ``room_district`` in ``count4``.

        Nine buckets: eight named districts (matched by substring) and
        "other".  ``count4`` is modified in place and returned.
        '''
        count4[self._keyword_index(room_district, self.DISTRICT_KEYWORDS)] += 1
        return count4

    def count_price(self, collect_house_info):
        '''Price preference vector of one user.

        Args:
            collect_house_info: the user's favourited houses as
                ``(price, area, type, district, ...)`` sequences.

        Returns:
            A list of 10 frequencies over the price buckets.  Houses whose
            price is ``None`` are ignored; with no usable house the vector is
            all zeros.
        '''
        counts = [0] * 10
        valid = 0
        for item in collect_house_info:
            if item[0] is None:
                continue
            self.price_classify(counts, item[0])
            valid += 1
        return self._normalise(counts, valid)

    def count_area(self, collect_house_info):
        '''Area preference vector of one user.

        Returns:
            A list of 10 frequencies over the area buckets.  Houses whose
            area cannot be converted with ``int`` are ignored; with no usable
            house the vector is all zeros.
        '''
        counts = [0] * 10
        valid = 0
        for item in collect_house_info:
            try:
                area = int(item[1])
            except (TypeError, ValueError):
                continue
            self.area_classify(counts, area)
            valid += 1
        return self._normalise(counts, valid)

    def count_type(self, collect_house_info):
        '''Room-type preference vector of one user.

        Returns:
            A list of 6 frequencies over the room-type buckets.  Houses whose
            type is ``None``, ``''`` or ``'null'`` are ignored; with no usable
            house the vector is all zeros.
        '''
        counts = [0] * 6
        valid = 0
        for item in collect_house_info:
            if item[2] in self.MISSING_TYPE_VALUES:
                continue
            self.type_classify(counts, item[2])
            valid += 1
        return self._normalise(counts, valid)

    def count_district(self, collect_house_info):
        '''District preference vector of one user.

        Returns:
            A list of 9 frequencies over the district buckets.  Houses whose
            district is ``None`` or ``'null'`` are ignored; with no usable
            house the vector is all zeros.
        '''
        counts = [0] * 9
        valid = 0
        for item in collect_house_info:
            if item[3] in self.MISSING_DISTRICT_VALUES:
                continue
            self.district_classify(counts, item[3])
            valid += 1
        return self._normalise(counts, valid)

    def _module_global(self, name):
        '''Fetch a module-level variable, raising ``NameError`` if absent.'''
        try:
            return globals()[name]
        except KeyError:
            raise NameError("name '%s' is not defined" % name)

    def _vector_norm(self, vector):
        '''Euclidean norm of ``vector``.'''
        return np.sqrt(np.inner(vector, vector))

    def CB_recommend(self, similar_weight, collect_house_info=None,
                     rent_house_info=None):
        '''Score every listing against one user's preferences.

        Args:
            similar_weight: list that receives one score per listing, in
                listing order; it is extended in place.
            collect_house_info: the user's favourited houses as
                ``(price, area, type, district, ...)`` sequences.  When
                omitted, the module-level variable of the same name is used,
                which is what single-argument calls rely on.
            rent_house_info: all listings as
                ``(id, price, area, type, district, status)`` sequences.
                When omitted, the module-level variable of the same name is
                used.

        Returns:
            ``similar_weight``.  A listing whose status equals 1 scores
            ``-9999`` (presumably it is no longer available -- confirm
            against the schema).  A feature with missing data contributes
            ``-9999`` times its weight, which effectively excludes the
            listing from the top of the ranking.

        Raises:
            NameError: if an argument is omitted and the matching
                module-level variable does not exist.
        '''
        if collect_house_info is None:
            collect_house_info = self._module_global('collect_house_info')
        if rent_house_info is None:
            rent_house_info = self._module_global('rent_house_info')

        # Preference vectors and their norms do not depend on the candidate
        # house, so compute them once.
        vector_price = self.count_price(collect_house_info)
        vector_area = self.count_area(collect_house_info)
        vector_type = self.count_type(collect_house_info)
        vector_district = self.count_district(collect_house_info)
        norm_price = self._vector_norm(vector_price)
        norm_area = self._vector_norm(vector_area)
        norm_type = self._vector_norm(vector_type)
        norm_district = self._vector_norm(vector_district)

        for item in rent_house_info:
            if item[5] == 1:
                similar_weight.append(self.EXCLUDED_SCORE)
                continue

            if item[1] is None:
                sim_price = self.EXCLUDED_SCORE
            else:
                one_hot = self.price_classify([0] * 10, item[1])
                sim_price = self.cosine_similarity(
                    vector_price, one_hot, norm_price)

            try:
                area = int(item[2])
            except (TypeError, ValueError):
                sim_area = self.EXCLUDED_SCORE
            else:
                one_hot = self.area_classify([0] * 10, area)
                sim_area = self.cosine_similarity(
                    vector_area, one_hot, norm_area)

            if item[3] in self.MISSING_TYPE_VALUES:
                sim_type = self.EXCLUDED_SCORE
            else:
                one_hot = self.type_classify([0] * 6, item[3])
                sim_type = self.cosine_similarity(
                    vector_type, one_hot, norm_type)

            if item[4] in self.MISSING_DISTRICT_VALUES:
                sim_district = self.EXCLUDED_SCORE
            else:
                one_hot = self.district_classify([0] * 9, item[4])
                sim_district = self.cosine_similarity(
                    vector_district, one_hot, norm_district)

            # Price and district are weighted highest: they are assumed to
            # matter most to someone looking for a rental.
            weight_cos = 0.35 * sim_price + 0.15 * sim_area + 0.15 * sim_type + 0.35 * sim_district
            similar_weight.append(weight_cos)
        return similar_weight
if __name__ == '__main__':
t1 = time.time()
test = recommend_house()
rent_house_info, user_house_info, collect_user_id = test.house_data()
id_num = 0
print "共有使用者數:",len(collect_user_id)
print "輸出格