python爬蟲 爬取大眾點評中所有行政區內的商戶 將獲取資訊存於excel中
阿新 • 發佈:2019-01-03
# -*- coding: utf-8 -*-
"""Scrape every shop in every administrative region of a Dianping city
category listing and save the results (name, scores, price, review count,
address) into an Excel (.xls) workbook.

NOTE: requesting pages without browser-like headers may yield
HTTP 403 (Forbidden); a Cookie/User-Agent is sent for the detail pages.
"""
import xlwt
import requests
from bs4 import BeautifulSoup

# Entry page: the city/category listing we start crawling from.
start_url = 'https://www.dianping.com/search/category/344/10'

# Site root, used to turn relative hrefs into absolute URLs.
BASE_URL = 'https://www.dianping.com'


def get_content(url, headers=None):
    """Fetch *url* and return the raw response body as bytes.

    headers: optional dict of HTTP headers (User-Agent/Cookie) used to
    avoid 403 responses on shop detail pages.
    """
    response = requests.get(url, headers=headers)  # one HTTP request
    return response.content


def region_url(html):
    """Return absolute URLs for every administrative-region listing page.

    Region links live under ``<div id="region-nav">``, e.g.::

        <a href="/search/category/344/10/r299"><span>芙蓉區</span></a>
    """
    soup = BeautifulSoup(html, 'lxml')  # lxml parser
    nav = soup.find('div', id="region-nav")
    return [BASE_URL + a['href'] for a in nav.find_all('a')]


def get_shop_url(html):
    """Return the detail-page URL of every shop on one listing page.

    find() returns the first match (or None); find_all() returns a list
    (possibly empty). Shop links sit inside ``<div class="tit">`` blocks.
    """
    soup = BeautifulSoup(html, 'lxml')
    return [BASE_URL + tit.find('a')['href']
            for tit in soup.find_all('div', class_='tit')]


def get_detail(html):
    """Parse one shop detail page.

    Returns a 7-tuple:
    (name, taste score, environment score, service score,
     average price, review count, address).

    Raises AttributeError/IndexError when the expected markup is missing;
    the caller treats that as a page to skip.
    """
    soup = BeautifulSoup(html, 'lxml')
    # Shop name, e.g. <h1 class="shop-name">1911牛肉烤串</h1> — taken from
    # the first breadcrumb span.
    title = soup.find('div', class_='breadcrumb').find('span').text
    # <span id="avgPriceTitle" class="item">人均:-</span>
    price = soup.find('span', id="avgPriceTitle").text
    # <span id="comment_score"><span class="item">口味:7.6</span>...</span>
    # -> list of the three score spans (taste, environment, service).
    evaluation = soup.find('span', id="comment_score").find_all('span', class_="item")
    # <span id="reviewCount" class="item">3條評論</span>
    comments = soup.find('span', id="reviewCount").text
    # Street address span, stripped of surrounding whitespace.
    address = soup.find('span', class_="item", itemprop="street-address").text.strip()
    return (title, evaluation[0].text, evaluation[1].text, evaluation[2].text,
            price, comments, address)


# Runs only when executed as a script, not when imported.
if __name__ == '__main__':
    items = []
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.101 Safari/537.36',
        'Cookie': '_hc.v=dd67ff67-20d0-6e83-7f61-ce93e4d46539.1503387665; _lx_utm=utm_source%3Dbaidu%26utm_medium%3Dorganic; _lxsdk_cuid=15e08e4c108c8-01758fac19fbe5-3f63440c-100200-15e08e4c108c8; _lxsdk=15e08e4c108c8-01758fac19fbe5-3f63440c-100200-15e08e4c108c8; __utma=205923334.211352043.1503391484.1503391484.1503391484.1; __utmz=205923334.1503391484.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none); looyu_id=29bc50ef1530ab64cbaa69b29cad64f39a_51868%3A1; s_ViewType=10; JSESSIONID=A49EED22A236962EA3506BA888799402; aburl=1; cy=344; cye=changsha; PHOENIX_ID=0a010918-15e0a223263-d4c1a92; __mta=146625163.1503391361571.1503401588676.1503408592089.10; _lxsdk_s=15e0a219034-38-9d5-acb%7C%7C37'
    }
    html = get_content(start_url)
    region_url_list = region_url(html)
    # Walk every shop of every administrative region.
    for url in region_url_list:
        # Best-effort error handling: if anything in a region fails,
        # skip the rest of that region. `except Exception` (not a bare
        # except) so Ctrl-C / SystemExit still terminate the crawl.
        try:
            for n in range(1, 51):  # each region paginates up to 50 pages
                page_html = get_content(url + 'p' + str(n))
                shop_url_list = get_shop_url(page_html)
                for shop_url in shop_url_list:
                    # Detail pages 403 without browser headers; a proxy
                    # pool would be needed if the site starts blocking
                    # mid-crawl (Referer/Host/Cookie anti-scraping).
                    detail_html = get_content(shop_url, headers)
                    items.append(get_detail(detail_html))
        except Exception:
            continue

    # Dump everything collected into an .xls workbook.
    new_table = r'F:\reptile_Python\daZhongDianPin_spiders\dzdp.xls'
    wb = xlwt.Workbook(encoding='utf-8')
    ws = wb.add_sheet('test1')
    headData = ['商戶名字', '口味評分', '環境評分', '服務評分', '人均價格', '評論數量', '地址']
    # Bold header row.
    for colnum, head in enumerate(headData):
        ws.write(0, colnum, head, xlwt.easyxf('font:bold on'))
    # One data row per shop, starting below the header.
    for rownum, item in enumerate(items, start=1):
        for colnum, value in enumerate(item):
            ws.write(rownum, colnum, value)
    wb.save(new_table)