python3 爬蟲 爬取智聯招聘崗位資訊
阿新 • • 發佈:2019-02-16
這套程式基於python3 ,使用requests和re正則表示式,只需要將程式儲存為.py檔案後,即可將抓取到的資料儲存到指定路徑的Excel檔案中。程式在終端中啟動,啟動命令:
#python3 檔名.py 關鍵字 城市
python3 zhilian.py python 杭州
程式碼如下:
# coding:utf-8
import requests
import re
import xlwt
import sys,os
# Module-level Excel workbook shared by ZhiLian.save_content; encoding='utf-8'
# so the Chinese job-field strings serialize correctly.
workbook = xlwt.Workbook(encoding='utf-8')
# cell_overwrite_ok=True allows save_content to rewrite the same cells on
# every page pass without xlwt raising an overwrite error.
booksheet = workbook.add_sheet('Sheet 1' , cell_overwrite_ok=True)
class ZhiLian(object):
    """Scraper for job listings on the Zhaopin mobile site (m.zhaopin.com).

    Resolves a city name to the site's city code, pages through search
    results for a keyword, extracts rows with regexes, and writes them to
    the module-level xlwt workbook/booksheet.
    """

    def __init__(self):
        # Search URL template: {city code}/{keyword}/{page index}.
        self.start_url = 'https://m.zhaopin.com/{}/?keyword={}&pageindex={}&maprange=3&islocation=0&order=4'
        # Mobile User-Agent so the server returns the mobile HTML that the
        # regexes below are written against.
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Linux; Android 5.0; SM-G900P Build/LRX21T) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Mobile Safari/537.36"
        }
        # Captures (job name, salary, company, location, publish time).
        self.test_url = '<section class="job-list.*?".*?>.*?<div class="job-name fl ">(.*?)</div>.*?<div class="fl">(.*?)</div>.*?<div class="comp-name fl">(.*?)</div>.*?<span class="ads">(.*?)</span>.*?<div class="time fr">(.*?)</div>'
        # Page listing all selectable cities.
        self.select_city_url = 'https://m.zhaopin.com/searchjob/selectcity'
        # Captures (city code, url slug, city display name).
        self.test_city = ' <a data-code="(.*?)" href="/(.*?)/">(.*?)</a>'

    def parse_url(self, url):
        """Send a GET request and return the decoded response body."""
        # Fix: timeout added so a stalled connection cannot hang the
        # scraper indefinitely (requests has no default timeout).
        response = requests.get(url, headers=self.headers, timeout=10)
        return response.content.decode()

    def get_data(self, test_url, content):
        """Return all regex matches of *test_url* in *content*.

        re.S lets ``.*?`` span newlines in the fetched HTML.
        """
        content_list = re.findall(test_url, content, re.S)
        return content_list

    def get_content(self, content_list, DATA):
        """Append each 5-field match tuple from *content_list* to *DATA*."""
        for content in content_list:
            DATA.append((content[0], content[1], content[2], content[3], content[4]))

    def save_content(self, DATA, city, key_words):
        """Write every row of *DATA* to the shared sheet and save the file.

        Rewrites the whole sheet on each call (relies on the module-level
        sheet being created with cell_overwrite_ok=True).
        """
        for i, row in enumerate(DATA):
            for j, col in enumerate(row):
                booksheet.write(i, j, col)
        # Save to the hard-coded desktop path if it exists, otherwise fall
        # back to the current working directory.
        if os.path.isdir('/home/itcast/Desktop/'):
            workbook.save('/home/itcast/Desktop/{}_{}.xls'.format(city, key_words))
        else:
            workbook.save('{}_{}.xls'.format(city, key_words))

    def select_city(self, url, search_city):
        """Fetch the city list and return '<slug>-<code>' for *search_city*.

        Returns None when the city name is not found.
        """
        city_code = None
        r = requests.get(url, headers=self.headers, timeout=10)
        content = r.content.decode()
        city_content = re.findall(self.test_city, content, re.S)
        # Map code -> [slug, display name], e.g. '566': ['tangshan', '唐山'].
        city_dict = {}
        for code, slug, name in city_content:
            city_dict[code] = [slug, name]
        for code, (slug, name) in city_dict.items():
            if search_city == name:
                city_code = slug + '-' + code
                # Fix: stop at the first match instead of scanning the
                # remaining entries pointlessly.
                break
        return city_code

    def deal_city(self, city):
        """Resolve *city* to its code; exit with a message when unknown."""
        city_code = self.select_city(self.select_city_url, city)
        # Fix: identity comparison with None (was ``== None``).
        if city_code is None:
            print("查詢城市不存在,請重試")
            sys.exit()
        return city_code

    def run(self, city, key_words):
        """Scrape result pages until one is empty or the 100-page cap hits.

        The accumulated rows (with a header row) are re-saved to Excel
        after every page so partial progress survives an interruption.
        """
        page = 1
        DATA = [('崗位', '月薪', '公司', '城市', '釋出時間')]
        city_code = self.deal_city(city)
        while True:
            url = self.start_url.format(city_code, key_words, page)
            content = self.parse_url(url)
            content_list = self.get_data(self.test_url, content)
            self.get_content(content_list, DATA)
            # Persist everything collected so far.
            self.save_content(DATA, city, key_words)
            # Stop when a page yields no rows or the page cap is exceeded.
            if not content_list or page > 100:
                print("儲存完成,共{}頁資料".format(page - 1))
                break
            print("正在儲存第{}頁資料".format(page))
            page += 1
if __name__ == '__main__':
    # Usage: python3 zhilian.py <keyword> <city>
    # Fix: validate argv length up front instead of crashing with
    # IndexError when the keyword or city argument is missing.
    if len(sys.argv) < 3:
        print("用法: python3 檔名.py 關鍵字 城市")
        sys.exit(1)
    key_words = sys.argv[1]
    city = sys.argv[2]
    zhilian = ZhiLian()
    zhilian.run(city, key_words)
爬取結果如下: