我愛我家房源資訊爬取
阿新 • • 發佈:2019-01-31
我愛我家房源資訊獲取
- 無特殊爬取需求
import requests
from lxml import etree
from mysql_link import mysql_connect
# Seconds to wait for the site before giving up on a page request.
_REQUEST_TIMEOUT = 10

# Characters that must be backslash-escaped inside a quoted MySQL string
# literal (assumes the server is not running in NO_BACKSLASH_ESCAPES mode).
_SQL_ESCAPES = str.maketrans({
    "\\": "\\\\",
    "\0": "\\0",
    "\n": "\\n",
    "\r": "\\r",
    "\x1a": "\\Z",
    '"': '\\"',
    "'": "\\'",
})


def _sql_escape(value):
    """Return ``str(value)`` made safe for embedding in a quoted SQL literal."""
    return str(value).translate(_SQL_ESCAPES)


def _build_location(link_texts, plain_texts):
    """Combine the location fragments of one listing into a single string.

    ``link_texts`` are the texts of the ``<a>`` children of the location
    paragraph and ``plain_texts`` are the paragraph's own text nodes.  When
    plain text exists, the first two fragments (links first) are concatenated;
    otherwise only the first link text is used.  Missing fragments yield a
    shorter (possibly empty) string instead of raising ``IndexError``.
    """
    if plain_texts:
        return "".join((link_texts + plain_texts)[:2])
    return link_texts[0] if link_texts else ""


def get_5i5j(count):
    """Scrape listing pages 1..count from bj.5i5j.com and store each listing.

    For every ``<li>`` found under ``div.list-con-box`` on
    ``https://bj.5i5j.com/zufang/huilongguan/n<page>/`` the title, area,
    location and price are printed and inserted into the ``5i5j`` table
    (columns ``title``, ``area``, ``descr``, ``price``) through the object
    returned by ``mysql_connect()``.

    Args:
        count: Number of pages to fetch, starting from page 1.

    List items that lack a title, area or price node are skipped.
    """
    mysql_ = mysql_connect()
    headers = {
        'Cookie': '_Jo0OQK=6B2EFBECBAB6D76BCDB834644B1B2D3BC2FFE7FE5ECC9F67E588A57175B2C4A553BB1B99580083D10FBE3107B2235A474021805425FF6DC8C7E536BB944BCFF6EB1DE8682CA7D10E3B498FB9E3C853EFEE298FB9E3C853EFEE215D8BEE34E43E5C0GJ1Z1Jw==; PHPSESSID=u518ep3lfv9sig9rt3jfdtrf2j; _ga=GA1.2.656332641.1534582894; _gid=GA1.2.1467514563.1534582894; yfx_c_g_u_id_10000001=_ck18081817013612438566341316835; yfx_f_l_v_t_10000001=f_t_1534582896243__r_t_1534582896243__v_t_1534582896243__r_c_0; Hm_lvt_94ed3d23572054a86ed341d64b267ec6=1534582902; Hm_lpvt_94ed3d23572054a86ed341d64b267ec6=1534583124',
        'Host': 'bj.5i5j.com',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.62 Safari/537.36',
    }
    url = 'https://bj.5i5j.com/zufang/huilongguan/n%d/'
    insert_template = 'insert into 5i5j (title,area,descr,price)values("{}","{}","{}","{}")'

    for num in range(1, count + 1):
        full_url = url % num
        # A timeout keeps one stalled connection from hanging the whole run.
        response = requests.get(full_url, headers=headers, timeout=_REQUEST_TIMEOUT)
        html_ele = etree.HTML(response.text)
        li_list = html_ele.xpath('//div[@class="list-con-box"]/ul/li')
        for li in li_list:
            title = li.xpath('./div[2]/h3/a/text()')
            # Area
            area = li.xpath('./div[2]/div[1]/p[1]/text()')
            # Price
            price = li.xpath('./div[2]/div[1]/div/p/strong')
            if not (title and area and price):
                # Not every <li> carries all fields; skip it rather than
                # crash on [0].
                continue

            # Location: text of the paragraph itself plus its link children.
            distance = li.xpath('./div[2]/div[1]/p[2]/text()')
            distance1 = li.xpath('./div[2]/div[1]/p[2]/a/text()')
            dd = _build_location(distance1, distance)
            price_text = price[0].text

            print(title[0])
            print(area[0])
            print(dd)
            print(price_text)

            # Scraped text is untrusted: escape it before splicing into SQL.
            # NOTE(review): if mysql_do supports bound parameters, prefer
            # those over manual escaping.
            sql = insert_template.format(
                _sql_escape(title[0]),
                _sql_escape(area[0]),
                _sql_escape(dd),
                _sql_escape(price_text),
            )
            print(sql)
            mysql_.mysql_do(sql)
if __name__ == "__main__":
    # When run as a script, scrape the first three listing pages.
    get_5i5j(count=3)