爬取知乎熱榜標題和連接 (python,requests,xpath)
阿新 • • 發佈:2019-01-21
app 分享圖片 dea -a mar margin 瀏覽器 判斷 agen
用python爬取知乎的熱榜,獲取標題和鏈接。
環境和方法:Ubuntu 16.04、python3、requests、xpath
1.用瀏覽器打開知乎,並登錄
2.獲取cookie和User-Agent
3.上代碼
import requests
from lxml import etree


def get_html(url):
    """Fetch *url* and, on HTTP 200, hand the page body to deal_content().

    Zhihu requires a logged-in session, so a Cookie header copied from the
    browser must be filled in below before this returns real data.
    Non-200 responses are silently ignored (best-effort scrape).
    """
    headers = {
        # NOTE(review): paste a fresh browser Cookie here before running.
        # Never commit a real session cookie to source control -- the
        # original post leaked live z_c0 / capsion_ticket tokens here.
        'Cookie': '',
        # 'Host': 'www.zhihu.com',
        'User-Agent': ('Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 '
                       '(KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36'),
    }

    # timeout added so a hung connection cannot block the script forever
    r = requests.get(url, headers=headers, timeout=10)

    if r.status_code == 200:
        deal_content(r.text)


def deal_content(r):
    """Parse the HTML string *r*, extract hot-list titles and links,
    and append them to zhihu.txt.

    r: page source of https://www.zhihu.com/hot as text.
    """
    html = etree.HTML(r)
    title_list = html.xpath('//*[@id="TopstoryContent"]/div/section/div[2]/a/h2')
    link_list = html.xpath('//*[@id="TopstoryContent"]/div/section/div[2]/a/@href')
    # Open the output file once, not once per item as the original did.
    # zip() also pairs titles with links safely: the original indexed
    # link_list with title_list's range and could raise IndexError when
    # the two XPath result lists differ in length.
    with open("zhihu.txt", 'a') as f:
        for title, link in zip(title_list, link_list):
            f.write(title.text + '\n')
            f.write('\t鏈接為:' + link + '\n')
            f.write('*' * 50 + '\n')


def main():
    """Entry point: scrape the Zhihu hot list."""
    url = 'https://www.zhihu.com/hot'
    get_html(url)


if __name__ == "__main__":
    # Guarded so importing this module does not trigger a network request
    # (the original called main() unconditionally at import time).
    main()
4.爬取結果
爬取知乎熱榜標題和連接 (python,requests,xpath)