51job多執行緒爬取指定職業資訊資料
阿新 • • 發佈:2020-10-07
51job多執行緒爬取指定職業資訊資料
# datetime:2020/10/7 14:02
# Multithreaded scraper: fetches 51job search-result pages for a keyword
# and appends the extracted job rows to a CSV file.
import requests
import chardet  # NOTE(review): appears unused in this script -- confirm before removing
from bs4 import BeautifulSoup
import csv
import json
from openpyxl import Workbook
import random
import time
import threading

# Serializes CSV appends: four worker threads share one output file.
CSV_LOCK = threading.Lock()


def getOnePageInfo(url):
    """Fetch one 51job search-result page and return its job rows.

    Returns a list of rows, each row being:
    [job_name, company_name, providesalary_text, workarea_text,
     updatedate, companytype_text, attribute_text, companyind_text].
    """
    res = requests.get(
        url,
        headers={
            'User-Agent': 'Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_8; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50'
        },
    )
    soup = BeautifulSoup(res.text, 'html.parser')
    # The result data is a JSON object embedded in the 4th-from-last
    # <script> tag, assigned to a JS variable; everything from the first
    # '{' onward is the payload.
    allstring = soup.find_all('script')[-4].string
    payload = allstring[allstring.find('{'):].strip().rstrip(';')
    # json.loads instead of eval(): never eval() text fetched from the
    # network -- the page could execute arbitrary code in this process.
    dict_data = json.loads(payload)

    bigdata = []
    for each in dict_data['engine_search_result']:
        bigdata.append([
            each.get('job_name'),             # job title
            each.get('company_name'),         # company name
            each.get('providesalary_text'),   # salary text
            each.get('workarea_text'),        # work location
            each.get('updatedate'),           # posting date
            each.get('companytype_text'),     # company type
            str(each.get('attribute_text')),  # extra attributes (list -> str)
            each.get('companyind_text'),      # industry
        ])
    return bigdata


class MySave:
    """Persist a 2-D list (rows of cells) to CSV or Excel."""

    def saveToCsv(self, data, fileName: str, mode='w'):
        """Append/write rows of *data* to a CSV file.

        mode='a' appends; writes are serialized with a module-level lock
        because multiple scraper threads share one output file.
        """
        with CSV_LOCK:
            with open(fileName, mode=mode, encoding='utf-8', newline='') as f:
                csv.writer(f).writerows(data)
        print(fileName, '儲存完成')

    def saveToExcel(self, data, fileName):
        """Write rows of *data* to a fresh .xlsx workbook."""
        wb = Workbook()
        sheet = wb.active
        for row in data:
            sheet.append(row)
        wb.save(fileName)
        print(fileName, '儲存完成')


def getJobInfo(jobName, startNum, endNum):
    """Scrape result pages [startNum, endNum) for *jobName*, appending to CSV."""
    save = MySave()
    for i in range(startNum, endNum):
        # Polite random delay so we don't hammer the server.
        time.sleep(random.randint(1, 3))
        # Original had a stray f-prefix with no placeholders; plain
        # concatenation builds the same URL.
        url = ('http://search.51job.com/list/000000,000000,0000,00,9,99,'
               + jobName + ',2,' + str(i) + '.html')
        print(f'正在抓取第{i}頁')
        data = getOnePageInfo(url)
        save.saveToCsv(data, '51job資料.csv', 'a')


if __name__ == '__main__':
    jobName = input("請輸入搜尋關鍵詞:")
    # Four threads split pages 1-99 into contiguous, non-overlapping
    # half-open ranges. BUGFIX: the original used (25,50) then (51,75),
    # which skipped page 50 entirely (range(25,50) stops at 49).
    threads = [
        threading.Thread(target=getJobInfo, args=(jobName, 1, 25)),
        threading.Thread(target=getJobInfo, args=(jobName, 25, 50)),
        threading.Thread(target=getJobInfo, args=(jobName, 50, 75)),
        threading.Thread(target=getJobInfo, args=(jobName, 75, 100)),
    ]
    for t in threads:
        t.start()