Scraping Large-Scale Data (1)
This article uses the classifieds site 58.com (58同城) as its example.
The general workflow:
1. Find all the category links on 58.com's category index page
2. Set up the database (MongoDB is used here)
3. Write two spiders: one crawls all the item links under a category, the other parses each detail page and stores the results in the database
First, get the links of all the categories:
# channel_extract.py
from bs4 import BeautifulSoup
import requests

start_url = 'http://bj.58.com/sale.shtml'  # the category index page
url_host = 'http://bj.58.com'

def getIndexURL(url):
    wb_data = requests.get(url)
    soup = BeautifulSoup(wb_data.text, 'html.parser')
    # each category entry is an <a> inside a <b> in the submenu list
    links = soup.select('ul.ym-submnu > li > b > a')
    for link in links:
        page_url = url_host + link.get('href')
        print(page_url)

if __name__ == '__main__':
    getIndexURL(start_url)
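A small variant (the name collect_channels is my own, not from the original) could return the links as a list instead of printing them, which makes them easier to feed into the spiders later:

# collect_channels.py -- hypothetical variant that returns the links
from bs4 import BeautifulSoup
import requests

def collect_channels(url, url_host='http://bj.58.com'):
    wb_data = requests.get(url)
    soup = BeautifulSoup(wb_data.text, 'html.parser')
    return [url_host + link.get('href')
            for link in soup.select('ul.ym-submnu > li > b > a')]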
# For now, store the collected links in the variable channel_list
channel_list = '''
http://bj.58.com/shouji/
http://bj.58.com/tongxunyw/
http://bj.58.com/danche/
http://bj.58.com/diandongche/
http://bj.58.com/fzixingche/
http://bj.58.com/sanlunche/
http://bj.58.com/peijianzhuangbei/
http://bj.58.com/diannao/
http://bj.58.com/bijiben/
http://bj.58.com/pbdn/
http://bj.58.com/diannaopeijian/
http://bj.58.com/zhoubianshebei/
http://bj.58.com/shuma/
http://bj.58.com/shumaxiangji/
http://bj.58.com/mpsanmpsi/
http://bj.58.com/youxiji/
http://bj.58.com/ershoukongtiao/
http://bj.58.com/dianshiji/
http://bj.58.com/xiyiji/
http://bj.58.com/bingxiang/
http://bj.58.com/jiadian/
http://bj.58.com/binggui/
http://bj.58.com/chuang/
http://bj.58.com/ershoujiaju/
http://bj.58.com/yingyou/
http://bj.58.com/yingeryongpin/
http://bj.58.com/muyingweiyang/
http://bj.58.com/muyingtongchuang/
http://bj.58.com/yunfuyongpin/
http://bj.58.com/fushi/
http://bj.58.com/nanzhuang/
http://bj.58.com/fsxiemao/
http://bj.58.com/xiangbao/
http://bj.58.com/meirong/
http://bj.58.com/yishu/
http://bj.58.com/shufahuihua/
http://bj.58.com/zhubaoshipin/
http://bj.58.com/yuqi/
http://bj.58.com/tushu/
http://bj.58.com/tushubook/
http://bj.58.com/wenti/
http://bj.58.com/yundongfushi/
http://bj.58.com/jianshenqixie/
http://bj.58.com/huju/
http://bj.58.com/qiulei/
http://bj.58.com/yueqi/
http://bj.58.com/kaquan/
http://bj.58.com/bangongshebei/
http://bj.58.com/diannaohaocai/
http://bj.58.com/bangongjiaju/
http://bj.58.com/ershoushebei/
http://bj.58.com/chengren/
http://bj.58.com/nvyongpin/
http://bj.58.com/qinglvqingqu/
http://bj.58.com/qingquneiyi/
http://bj.58.com/chengren/
http://bj.58.com/xiaoyuan/
http://bj.58.com/ershouqiugou/
http://bj.58.com/tiaozao/
http://bj.58.com/tiaozao/
http://bj.58.com/tiaozao/
'''
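Since channel_list is a single multi-line string, split it on whitespace to get a real Python list before looping over it:

channels = channel_list.split()   # one URL per element; blank lines are discarded
print(len(channels))              # how many category links we collected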
# Parse the item data and store it in the database
# page_parsing.py
from bs4 import BeautifulSoup
import requests
import pymongo
import time

# connect to MongoDB (a mongod server must already be running locally)
client = pymongo.MongoClient('localhost', 27017)
# name the database
ceshi = client['ceshi']
# create two collections: one for item links, one for item details
url_list = ceshi['url_list']
item_info = ceshi['item_info']
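# Optional hardening (my addition, not part of the original): a unique index on
# 'url' makes MongoDB reject duplicate item links on repeated runs (insert_one
# would then raise DuplicateKeyError for a repeat).
# url_list.create_index('url', unique=True)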
# spider1
def getLinkFrom(channel, pages, who_sell=0):
    '''The three arguments are the category link, the page number, and the
    seller type (default 0 = individual seller, 1 = merchant).'''
    # build the full listing-page URL, e.g. http://bj.58.com/shouji/0/pn2
    full_link = '{}{}/pn{}'.format(channel, str(who_sell), str(pages))
    wb_data = requests.get(full_link)
    soup = BeautifulSoup(wb_data.text, 'html.parser')
    # only genuine listing pages contain <td class="t"> cells
    if soup.find('td', 't'):
        for link in soup.select('td.t a.t'):
            # drop the tracking query string from each item link
            item_link = link.get('href').split('?')[0]
            url_list.insert_one({'url': item_link})
    else:
        # asked for a page past the category's last page -- nothing to store
        pass
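# Usage example (the page count here is a guess for illustration; each
# category has its own number of listing pages):
# for page in range(1, 11):
#     getLinkFrom('http://bj.58.com/shouji/', page)
#     time.sleep(1)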
# spider2
def getItemInfo(url):
    wb_data = requests.get(url)
    soup = BeautifulSoup(wb_data.text, 'html.parser')
    # deleted listings redirect to a 404 page; its first <script> src contains '404'
    no_longer_exist = '404' in soup.find('script', type='text/javascript').get('src').split('/')
    if no_longer_exist:
        pass
    else:
        title = soup.title.text
        price = soup.select('span.price')[0].text
        post_data = soup.select('div.detail-title__info__text')[0].text
        area = soup.select('div.su_con > a')[0].text
        item_info.insert_one({'title': title, 'price': price, 'post_data': post_data,
                              'area': area, 'url': url})

if __name__ == '__main__':
    # quick smoke test on a single detail page
    getItemInfo('http://bj.58.com/shouji/34326399783213x.shtml')
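Putting it together: a minimal single-threaded driver can split channel_list, harvest every listing page with getLinkFrom, and then walk the stored links with getItemInfo. This is only a sketch; the 10-page cap and the one-second delay are assumptions, not values from the original:

# main.py -- hypothetical driver; page cap and delay are guesses
import time
from channel_extract import channel_list
from page_parsing import getLinkFrom, getItemInfo, url_list

for channel in channel_list.split():
    for page in range(1, 11):   # assume at most 10 listing pages per category
        getLinkFrom(channel, page)
        time.sleep(1)           # be polite to the server

for record in url_list.find():
    getItemInfo(record['url'])
    time.sleep(1)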
And that's it for part 1.