根據搜尋內容爬取招聘網的職位招聘資訊
阿新 • • 發佈:2019-02-17
程式碼:
"""Scrape job postings from 51job search results and save them to a CSV file."""
import time

import requests
from bs4 import BeautifulSoup


def getHtml(url, code='gbk'):
    """Download *url* and return the decoded page text.

    Parameters
    ----------
    url : str
        Page address to fetch.
    code : str
        Character encoding of the site (51job serves gbk).

    Returns the HTML text, or an empty string on any network/HTTP error
    so callers can treat a failed page as "no results".
    """
    try:
        # timeout keeps a dead connection from hanging the whole crawl
        r = requests.get(url, timeout=10)
        r.raise_for_status()
        r.encoding = code
        return r.text
    except requests.RequestException:
        # Was a bare `except:` — keep the "empty page on failure"
        # contract, but stop swallowing unrelated errors such as
        # KeyboardInterrupt or programming mistakes.
        return ""


def ParserHtml(i, htmlText):
    """Parse one search-result page into rows of recruiting info.

    Parameters
    ----------
    i : int
        Page number, used only for progress messages.
    htmlText : str
        Raw HTML of the search-result page (may be empty on fetch failure).

    Returns a list of [position, company, place, salary, publish-time] rows.
    """
    print("正在解析第{0}頁".format(i))
    RecuitInfos = []
    soup = BeautifulSoup(htmlText, 'lxml')
    # The five result-table columns: 職位 / 公司名 / 工作地點 / 薪資 / 釋出時間.
    InfoPositions = soup.find_all('p', attrs={'class': 't1 '})
    InfoNames = soup.find_all('span', attrs={'class': 't2'})
    InfoPlaces = soup.find_all('span', attrs={'class': 't3'})
    InfoSalarys = soup.find_all('span', attrs={'class': 't4'})
    InfoTimes = soup.find_all('span', attrs={'class': 't5'})
    # The span columns include a header row, so positions are offset by one
    # relative to the other four lists (index m-1 vs m) — preserved from the
    # original.  Bound the loop by the shortest list to avoid IndexError on
    # pages with an irregular layout.
    stop = min(len(InfoPositions) + 1, len(InfoNames), len(InfoPlaces),
               len(InfoSalarys), len(InfoTimes))
    for m in range(1, stop):
        position = InfoPositions[m - 1].text.split()
        name = InfoNames[m].text.split()
        place = InfoPlaces[m].text.split()
        salary = InfoSalarys[m].text.split()
        when = InfoTimes[m].text.split()
        # Skip rows with any empty cell.  The original compared the
        # InfoTimes *list* itself to 0 (always False, so empty-time rows
        # slipped through) and chained the checks with bitwise `|`; both
        # are fixed here with plain truthiness tests.
        if position and name and place and salary and when:
            RecuitInfos.append(
                [position[0], name[0], place[0], salary[0], when[0]])
    return RecuitInfos


def writeCSV(i, fw, Recruit_info):
    """Append one page's rows to the already-open CSV file object *fw*."""
    # Print once per page — the original printed this inside the row loop,
    # once per record.
    print("正在寫入第{0}頁".format(i))
    for Info in Recruit_info:
        fw.write(",".join(Info) + '\n')
    # Fixed message: original read "第{0}資料抓取完畢" (missing 頁).
    print("第{0}頁資料抓取完畢".format(i))


def main():
    """Prompt for a job keyword and page count, then crawl and save the data."""
    path = 'F:'
    posttion = input("請輸入要抓取的職位名稱:")
    star_url = 'http://search.51job.com/list/000000,000000,0000,00,9,99,'
    mid_url = ',2,'
    end_url = '.html?'
    # Renamed from `max`, which shadowed the builtin.
    max_pages = input("請輸入最大抓取頁數:")
    # `with` guarantees the file is flushed and closed even on error —
    # the original never closed it.  Explicit utf-8 avoids a
    # UnicodeEncodeError when writing gbk-decoded Chinese text under a
    # non-CJK default locale.  ('\\zhaopin_' is byte-identical to the
    # original '\zhaopin_', which had no escape sequence.)
    with open(path + '\\zhaopin_' + posttion + '.csv', 'a+',
              encoding='utf-8') as fw:
        row = ["職位名", "公司名", "工作地點", "薪資", "釋出時間"]
        fw.write(",".join(row) + "\n")
        # +1 so the user actually gets `max_pages` pages — the original
        # range(1, max) silently dropped the last one.
        for i in range(1, int(max_pages) + 1):
            time.sleep(3)  # be polite to the server between requests
            url = star_url + posttion + mid_url + str(i) + end_url
            htmlText = getHtml(url)
            Recruit_info = ParserHtml(i, htmlText)
            writeCSV(i, fw, Recruit_info)


if __name__ == '__main__':
    main()