1. 程式人生 > >xpath匹配 爬取房源資訊(我愛我家)

xpath匹配 爬取房源資訊(我愛我家)

# requests包
import requests
# xpath包
from lxml import etree
# 本地mysql包(mysql_def包下mysql_conn函式)
from mysql_def import mysql_conn

# Listing URL template; the %d placeholder is filled with the page number.
base_url='https://bj.5i5j.com/zufang/changpingqu/n%d/'
# HTTP headers sent with every page request: a hard-coded Cookie value and a
# Chrome-on-Windows User-Agent string.
# NOTE(review): the Cookie embeds session identifiers (e.g. PHPSESSID) and is
# presumably copied from a browser session - confirm it is still valid and
# consider loading it from configuration instead of source.
headers ={
    "Cookie": "_Jo0OQK=27E3AE8F401F48377EC641A97E866EA9401E4BF430D59F325019A8A1C06A982D5A716B0F691F396132740C3E5383A69B7EF67E84EC402AE9D0D7E4FE54D996F94C4DE8682CA7D10E3B498FB9E3C853EFEE298FB9E3C853EFEE215D8BEE34E43E5C0GJ1Z1SA==; yfx_c_g_u_id_10000001=_ck18081814101818670131507573338; yfx_mr_f_n_10000001=baidu%3A%3Amarket_type_ppzq%3A%3A%3A%3A%3A%3A%3A%3A%3A%3A%25E6%25A0%2587%25E9%25A2%2598%3A%3Abj.5i5j.com%3A%3A%3A%3A%3A%3A%25E5%25B7%25A6%25E4%25BE%25A7%25E6%25A0%2587%25E9%25A2%2598%3A%3A%25E6%25A0%2587%25E9%25A2%2598%3A%3A160%3A%3Apmf_from_adv%3A%3Abj.5i5j.com%2F; _ga=GA1.2.510886705.1534572619; _gid=GA1.2.416216966.1534572619; domain=bj; yfx_mr_n_10000001=baidu%3A%3Amarket_type_ppzq%3A%3A%3A%3Abaidu_ppc%3A%3A%25e6%2588%2591%25e7%2588%25b1%25e6%2588%2591%25e5%25ae%25b6%3A%3A%3A%3A%25E6%25A0%2587%25E9%25A2%2598%3A%3Awww.baidu.com%3A%3A%3A%3A%3A%3A%25E5%25B7%25A6%25E4%25BE%25A7%25E6%25A0%2587%25E9%25A2%2598%3A%3A%25E6%25A0%2587%25E9%25A2%2598%3A%3A160%3A%3Apmf_from_adv%3A%3Abj.5i5j.com%2F; yfx_key_10000001=%25e6%2588%2591%25e7%2588%25b1%25e6%2588%2591%25e5%25ae%25b6; PHPSESSID=m46frphieprtkvuci1tpmhnqke; Hm_lvt_94ed3d23572054a86ed341d64b267ec6=1534572623,1534577980,1534579440; _gat=1; yfx_f_l_v_t_10000001=f_t_1534572618547__r_t_1534572618547__v_t_1534579696185__r_c_0; Hm_lpvt_94ed3d23572054a86ed341d64b267ec6=1534579697",
    "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36",
}

def a(start_page=1, end_page=3, timeout=10):
    """Fetch each listing page in turn and pass its listing nodes to ``b``.

    With the default arguments this crawls pages 1 to 3, exactly as the
    original hard-coded loop did.

    Args:
        start_page: First page number to fetch (inclusive).
        end_page: Last page number to fetch (inclusive). A value below
            ``start_page`` fetches nothing.
        timeout: Seconds ``requests`` waits on the server before giving up
            on a page.

    Raises:
        requests.exceptions.Timeout: If a page does not respond in time.
    """
    for page in range(start_page, end_page + 1):
        # Substitute the page number into the URL template.
        url = base_url % page
        # A timeout is required: without one requests waits indefinitely on
        # a stalled connection and the crawl never finishes.
        response = requests.get(url, headers=headers, timeout=timeout)
        # Parse the page and select the <li> nodes under the "pList" list,
        # one node per listing.
        html = etree.HTML(response.text)
        html_res = html.xpath('//div[@class="list-con-box"]/ul[@class="pList"]/li')
        # Extract and store the fields of every listing on this page.
        b(html_res)
        # Progress message; the page number is converted with str() so it
        # can be concatenated.
        print('爬取第' + str(page) + '頁中')

def _first_text(node, path):
    """Return the first result of ``node.xpath(path)``, or '' if there is none."""
    matches = node.xpath(path)
    return matches[0] if matches else ''


def b(html_res):
    """Extract the fields of each listing node and insert them into MySQL.

    For every node, four values are read with relative XPath queries and
    written as one row of the ``woaiwojia`` table (columns ``listTit``,
    ``listX``, ``dizhi``, ``redC``). A field that is absent from the node is
    stored as an empty string instead of raising ``IndexError``; a node that
    yields no fields at all is skipped.

    Args:
        html_res: Iterable of listing nodes; each must support ``.xpath()``.
    """
    for item in html_res:
        # Text of the link inside the <h3> heading.
        title = _first_text(item, './div[2]/h3/a/text()')
        # Text of the first <p> in the details block.
        summary = _first_text(item, './div[2]/div[1]/p[1]/text()')
        # The second <p> holds a link followed by loose text; the two parts
        # are concatenated into one value. ''.join() always returns a str
        # (possibly empty), so no None check is needed.
        address_link = _first_text(item, './div[2]/div[1]/p[2]/a/text()')
        address_rest = ''.join(item.xpath('./div[2]/div[1]/p[2]/text()'))
        address = address_link + address_rest
        # Text of the <strong> element (presumably the price - confirm).
        price = _first_text(item, './div[2]/div[1]/div/p/strong/text()')

        # A node with none of the expected fields is not a listing.
        if not (title or summary or address or price):
            continue

        # NOTE(review): repr() applies Python quoting, not SQL quoting, so
        # scraped text can break or inject into this statement. Switch to a
        # parameterized query if execute_modify_mysql supports one.
        sql = 'insert into woaiwojia(listTit,listX,dizhi,redC) values ({},{},{},{})'.format(repr(title), repr(summary), repr(address), repr(price))
        # A new connection object is created per row, as in the original;
        # NOTE(review): hoist it out of the loop if the helper allows reuse.
        mc = mysql_conn()
        mc.execute_modify_mysql(sql)


# Start the crawl only when this file is executed as a script, not on import.
if __name__ == '__main__':
    a()