XPath matching: scraping rental listings from 5i5j.com (我愛我家)
阿新 • Published: 2019-02-17
# requests: HTTP library
import requests
# lxml.etree: XPath support
from lxml import etree
# local MySQL helper (mysql_conn from the mysql_def module)
from mysql_def import mysql_conn

# Base URL with a %d placeholder for the page number
base_url = 'https://bj.5i5j.com/zufang/changpingqu/n%d/'
# Request headers
headers = {
    "Cookie": "_Jo0OQK=27E3AE8F401F48377EC641A97E866EA9401E4BF430D59F325019A8A1C06A982D5A716B0F691F396132740C3E5383A69B7EF67E84EC402AE9D0D7E4FE54D996F94C4DE8682CA7D10E3B498FB9E3C853EFEE298FB9E3C853EFEE215D8BEE34E43E5C0GJ1Z1SA==; yfx_c_g_u_id_10000001=_ck18081814101818670131507573338; yfx_mr_f_n_10000001=baidu%3A%3Amarket_type_ppzq%3A%3A%3A%3A%3A%3A%3A%3A%3A%3A%25E6%25A0%2587%25E9%25A2%2598%3A%3Abj.5i5j.com%3A%3A%3A%3A%3A%3A%25E5%25B7%25A6%25E4%25BE%25A7%25E6%25A0%2587%25E9%25A2%2598%3A%3A%25E6%25A0%2587%25E9%25A2%2598%3A%3A160%3A%3Apmf_from_adv%3A%3Abj.5i5j.com%2F; _ga=GA1.2.510886705.1534572619; _gid=GA1.2.416216966.1534572619; domain=bj; yfx_mr_n_10000001=baidu%3A%3Amarket_type_ppzq%3A%3A%3A%3Abaidu_ppc%3A%3A%25e6%2588%2591%25e7%2588%25b1%25e6%2588%2591%25e5%25ae%25b6%3A%3A%3A%3A%25E6%25A0%2587%25E9%25A2%2598%3A%3Awww.baidu.com%3A%3A%3A%3A%3A%3A%25E5%25B7%25A6%25E4%25BE%25A7%25E6%25A0%2587%25E9%25A2%2598%3A%3A%25E6%25A0%2587%25E9%25A2%2598%3A%3A160%3A%3Apmf_from_adv%3A%3Abj.5i5j.com%2F; yfx_key_10000001=%25e6%2588%2591%25e7%2588%25b1%25e6%2588%2591%25e5%25ae%25b6; PHPSESSID=m46frphieprtkvuci1tpmhnqke; Hm_lvt_94ed3d23572054a86ed341d64b267ec6=1534572623,1534577980,1534579440; _gat=1; yfx_f_l_v_t_10000001=f_t_1534572618547__r_t_1534572618547__v_t_1534579696185__r_c_0; Hm_lpvt_94ed3d23572054a86ed341d64b267ec6=1534579697",
    "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36",
}


def a():
    # Loop over the page numbers and substitute each one into the URL
    for i in range(1, 4):
        url = base_url % i
        # Request the listing page
        response = requests.get(url, headers=headers)
        # Parse the HTML and match the <li> nodes that all listings share
        html = etree.HTML(response.text)
        html_res = html.xpath('//div[@class="list-con-box"]/ul[@class="pList"]/li')
        # Pass the matched nodes to b() for field extraction
        b(html_res)
        # Report progress; str() converts the page number for concatenation
        print('Crawling page ' + str(i))


def b(html_res):
    # Iterate over the matched listing nodes
    for i in html_res:
        # Extract the wanted fields, relative to each <li>
        listTit = i.xpath('./div[2]/h3/a/text()')[0]
        listX = i.xpath('./div[2]/div[1]/p[1]/text()')[0]
        dizhi1 = i.xpath('./div[2]/div[1]/p[2]/a/text()')[0]
        # ''.join() turns the result list into a str; when the page has no text
        # here it is already '', so no extra None check is needed
        dizhi2 = ''.join(i.xpath('./div[2]/div[1]/p[2]/text()'))
        # Concatenate the two address parts
        dizhi = dizhi1 + dizhi2
        # Price field
        redC = i.xpath('./div[2]/div[1]/div/p/strong/text()')[0]
        # Build the INSERT statement; repr() wraps each value in quotes
        sql = 'insert into woaiwojia(listTit,listX,dizhi,redC) values ({},{},{},{})'.format(
            repr(listTit), repr(listX), repr(dizhi), repr(redC))
        # Open the MySQL connection and run the statement
        mc = mysql_conn()
        mc.execute_modify_mysql(sql)


if __name__ == '__main__':
    a()
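The expressions that start with ./ are evaluated relative to each matched <li> node, not against the whole document. The following is a small illustrative sketch of that pattern using a made-up HTML fragment (the real 5i5j markup is different and more complex):

# Illustrative only: a made-up fragment, not the real 5i5j page structure.
from lxml import etree

snippet = '''
<div class="list-con-box">
  <ul class="pList">
    <li>
      <div>thumbnail</div>
      <div>
        <h3><a>Two-bedroom flat near Line 13</a></h3>
        <div>
          <p>89 m2, south-facing</p>
          <p><a>Changping</a> Huilongguan</p>
          <div><p><strong>4500</strong></p></div>
        </div>
      </div>
    </li>
  </ul>
</div>
'''

html = etree.HTML(snippet)
# Outer query: one node per listing
for li in html.xpath('//div[@class="list-con-box"]/ul[@class="pList"]/li'):
    # Inner queries are relative to the current <li>
    title = li.xpath('./div[2]/h3/a/text()')[0]
    price = li.xpath('./div[2]/div[1]/div/p/strong/text()')[0]
    print(title, price)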
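The import from mysql_def is a local helper module that the post does not show. Below is a minimal sketch of what such a module might look like, assuming pymysql; the class name and the execute_modify_mysql method mirror the calls in the script, while the host, credentials, and database name are placeholders, not from the original:

# mysql_def.py -- hypothetical reconstruction, not from the original post.
# Assumes a local MySQL server and a database named 'spider'; adjust as needed.
import pymysql

class mysql_conn(object):
    def __init__(self):
        # Connection parameters are placeholders (assumptions).
        self.db = pymysql.connect(host='127.0.0.1', port=3306,
                                  user='root', password='root',
                                  database='spider', charset='utf8mb4')
        self.cursor = self.db.cursor()

    def execute_modify_mysql(self, sql):
        # Run an INSERT/UPDATE/DELETE statement and commit it.
        self.cursor.execute(sql)
        self.db.commit()

    def __del__(self):
        # Close the cursor and the connection when the object goes away.
        self.cursor.close()
        self.db.close()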
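The INSERT statement above builds its VALUES clause with repr(), which quotes each string but is not a safe escaping mechanism. A hedged sketch of a matching table definition and a parameterized insert follows, again assuming pymysql; the column types and sizes are guesses based only on the column names used in the post:

# Hypothetical: table definition and a parameterized INSERT for the four
# scraped fields. Column names come from the post; types/sizes are assumptions.
import pymysql

def save_listing(listTit, listX, dizhi, redC):
    db = pymysql.connect(host='127.0.0.1', user='root', password='root',
                         database='spider', charset='utf8mb4')
    try:
        with db.cursor() as cursor:
            cursor.execute("""
                CREATE TABLE IF NOT EXISTS woaiwojia (
                    id      INT AUTO_INCREMENT PRIMARY KEY,
                    listTit VARCHAR(255),
                    listX   VARCHAR(255),
                    dizhi   VARCHAR(255),
                    redC    VARCHAR(64)
                ) DEFAULT CHARSET=utf8mb4
            """)
            # %s placeholders let the driver escape the values itself.
            cursor.execute(
                'INSERT INTO woaiwojia (listTit, listX, dizhi, redC) '
                'VALUES (%s, %s, %s, %s)',
                (listTit, listX, dizhi, redC))
        db.commit()
    finally:
        db.close()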