Scraping Restaurant Information with Selenium
阿新 · Published: 2018-12-16
I. Tools and Platform
The scraper is written in Python and uses BeautifulSoup for parsing. The target is the restaurant listing for one district on 大眾點評 (Dianping); the results are exported to a CSV file. Google Chrome is the browser being driven.
II. Key Parts of the Code
1. Libraries used:
from selenium import webdriver
from bs4 import BeautifulSoup
import time
import csv
2. A function that fetches the page for a given URL

def get_page(myurl):
    caps = webdriver.DesiredCapabilities().CHROME
    caps["marionette"] = False
    # Configure Chrome not to load images or run JavaScript (2 = block)
    options = webdriver.ChromeOptions()
    prefers = {
        'profile.default_content_setting_values': {
            'images': 2,
            'javascript': 2
        }
    }
    options.add_experimental_option('prefs', prefers)
    browser = webdriver.Chrome(chrome_options=options, desired_capabilities=caps)
    browser.get(myurl)
    soup = BeautifulSoup(browser.page_source, "lxml")
    browser.quit()  # close this Chrome instance so windows don't pile up across pages
    return soup
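Note that the chrome_options and desired_capabilities keyword arguments are Selenium 3-era APIs and were removed in Selenium 4. If you are running a current Selenium (4.6+, which locates chromedriver automatically via Selenium Manager), an equivalent setup would look roughly like this sketch; the name get_page_v4 is hypothetical:

from selenium import webdriver
from bs4 import BeautifulSoup

def get_page_v4(myurl):
    options = webdriver.ChromeOptions()
    # Same prefs as above: 2 means "block" for images and JavaScript
    options.add_experimental_option('prefs', {
        'profile.default_content_setting_values': {
            'images': 2,
            'javascript': 2
        }
    })
    browser = webdriver.Chrome(options=options)
    try:
        browser.get(myurl)
        return BeautifulSoup(browser.page_source, "lxml")
    finally:
        browser.quit()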
3. Parsing the page content and saving it to a CSV file

def get_infors(soup):
    title_list = soup.find_all('div', class_='tit')
    for item in title_list:
        title = item.find('h4').text
        link = item.find('a')['href']
        name_list.append(title)
        link_list.append(link)
    infor_list = soup.find_all('div', class_='comment')
    for it in infor_list:
        # try is used here because some restaurants have no reviews
        try:
            comment = it.find('a', class_='review-num').b.text
        except AttributeError:
            comment = ''
        comment_list.append(comment)
        # try is used here because some restaurants do not show an average price
        try:
            price = it.find('a', class_='mean-price').b.text
        except AttributeError:
            price = ''
        price_list.append(price)
    for addr in soup.find_all('div', class_='tag-addr'):
        addr_list.append(addr.find('span', class_='addr').text)
    # Build and return only this page's rows; the global lists keep
    # accumulating across pages, so this page's entries start at an offset
    page_rows = []
    start = len(name_list) - len(title_list)
    for i in range(start, len(name_list)):
        page_rows.append([name_list[i], link_list[i], comment_list[i],
                          price_list[i], addr_list[i]])
    return page_rows
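Before crawling every page, the two functions can be sanity-checked on a single listing page. A quick sketch (it assumes the global lists from the full script below, name_list, link_list, and so on, have already been defined):

soup = get_page("http://www.dianping.com/shenzhen/ch10")
rows = get_infors(soup)
for row in rows[:3]:
    print(row)  # [name, link, review count, average price, address]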
III. Complete Code

from selenium import webdriver
from bs4 import BeautifulSoup
import time
import csv

url = "http://www.dianping.com/shenzhen/ch10"
link_list = []     # links to each restaurant's detail page
name_list = []     # restaurant names
addr_list = []     # restaurant addresses
comment_list = []  # number of reviews
price_list = []    # average price per person
output_list = []   # all rows collected so far

# Fetch a restaurant listing page
def get_page(myurl):
    caps = webdriver.DesiredCapabilities().CHROME
    caps["marionette"] = False
    options = webdriver.ChromeOptions()
    prefers = {
        'profile.default_content_setting_values': {
            'images': 2,
            'javascript': 2
        }
    }
    options.add_experimental_option('prefs', prefers)
    browser = webdriver.Chrome(chrome_options=options, desired_capabilities=caps)
    browser.get(myurl)
    soup = BeautifulSoup(browser.page_source, "lxml")
    browser.quit()
    return soup

# Extract the restaurant information from one listing page
def get_infors(soup):
    title_list = soup.find_all('div', class_='tit')
    for item in title_list:
        title = item.find('h4').text
        link = item.find('a')['href']
        name_list.append(title)
        link_list.append(link)
    infor_list = soup.find_all('div', class_='comment')
    for it in infor_list:
        try:
            comment = it.find('a', class_='review-num').b.text
        except AttributeError:
            comment = ''
        comment_list.append(comment)
        try:
            price = it.find('a', class_='mean-price').b.text
        except AttributeError:
            price = ''
        price_list.append(price)
    for addr in soup.find_all('div', class_='tag-addr'):
        addr_list.append(addr.find('span', class_='addr').text)
    # Return only this page's rows (the global lists accumulate across pages)
    page_rows = []
    start = len(name_list) - len(title_list)
    for i in range(start, len(name_list)):
        page_rows.append([name_list[i], link_list[i], comment_list[i],
                          price_list[i], addr_list[i]])
    return page_rows

# Crawl pages 1 through 30 of the restaurant listings
for i in range(1, 31):
    if i == 1:
        url_link = url
    else:
        url_link = url + '/p' + str(i)
    mysoup = get_page(url_link)
    page_rows = get_infors(mysoup)
    output_list.extend(page_rows)
    print(page_rows[0])  # progress check: first restaurant on this page
    # Append this page's rows to the CSV file; the path can be changed as needed
    with open('F://restaurant.csv', 'a+', newline='') as csvfile:
        spamwriter = csv.writer(csvfile, dialect='excel')
        spamwriter.writerows(page_rows)
    print("successfully")
    time.sleep(3)
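One optional tweak: since nothing here requires watching the browser, Chrome can run without opening a window. Adding the standard headless flag to the options above should be enough (a sketch; Dianping may behave differently toward headless browsers):

# Optional: run Chrome headless (no visible window)
options.add_argument('--headless')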
Results
The table header was added by hand after scraping, to make the output easier to read.
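If you would rather have the script write that header itself, a minimal sketch (the column names are my own labels, matched to the row layout above):

import csv

# Write the header row once, before the crawl loop starts appending data
with open('F://restaurant.csv', 'w', newline='') as csvfile:
    csv.writer(csvfile, dialect='excel').writerow(
        ['name', 'link', 'review count', 'average price', 'address'])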