1. 程式人生 > 其它 >前程無憂網站爬取

前程無憂網站爬取

爬取前程無憂網站，並將結果寫入 Excel。

from urllib.request import urlopen,Request
from bs4 import BeautifulSoup
import json
import xlwt
# Column buffers filled by the scraping loop further down and later written
# to the spreadsheet, one list per output column.
lentitles = []            # 'job_name' of each search result
job_href = []             # 'job_href' (detail-page URL) of each search result
company_name = []         # 'company_name' of each search result
providesalary_text = []   # 'providesalary_text' of each search result
workarea_text = []        # 'workarea_text' of each search result
jobwelf = []              # 'jobwelf' of each search result
companyind_text = []      # 'companyind_text' of each search result
careerinfo = []           # text taken from the job detail pages

# Workbook and sheet that receive the scraped rows; saved to disk at the end.
workbook = xlwt.Workbook(encoding='utf-8')
worksheet = workbook.add_sheet('hzh zhaping wb')

# Browser-like User-Agent sent with every request.
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3947.100 Safari/537.36'}

# Search-result URL; ``{}`` is replaced by the 1-based page number.
SEARCH_URL = "https://search.51job.com/list/080200,000000,0000,00,9,99,%25E5%25A4%25A7%25E6%2595%25B0%25E6%258D%25AE,2,{}.html?lang=c&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=99&ord_field=0&dibiaoid=0&line=&welfare="

# JavaScript assignment that precedes the JSON payload in the inline script.
SEARCH_RESULT_PREFIX = "window.__SEARCH_RESULT__ = "

# Header row of the spreadsheet, in column order.
COLUMN_LABELS = ('職位名稱', '職業網址', '公司名字', '薪資福利',
                 '工作地址', '工作福利', '公司前景', '職業資訊')


def updateurl(url, headers):
    """Fetch *url* and return the page parsed with BeautifulSoup.

    Args:
        url: Address of the page to download.
        headers: Mapping of HTTP request headers to send.

    Returns:
        A ``BeautifulSoup`` object built with the ``html.parser`` backend.

    Raises:
        ValueError: If ``Request`` rejects *url* (e.g. no URL scheme).
        OSError: On network/HTTP failures (``urllib.error.URLError`` is an
            ``OSError`` subclass).
    """
    ret = Request(url, headers=headers)
    # The context manager closes the HTTP response once it has been parsed;
    # the original left the connection open.
    with urlopen(ret) as html:
        return BeautifulSoup(html, "html.parser")


def get_V(dct, key):
    """Return ``dct[key]``, or the string ``'none'`` when *key* is absent."""
    return dct.get(key, 'none')


def _fetch_career_info(job_url):
    """Return the description text of one posting, or ``''`` if unavailable.

    Exactly one string is returned per posting so that ``careerinfo`` stays
    index-aligned with the other column lists.
    """
    try:
        page = updateurl(job_url, headers)
    except (OSError, ValueError):
        # OSError covers urllib's URLError/HTTPError; ValueError is raised by
        # Request for a malformed URL such as the 'none' placeholder that
        # get_V returns when a result has no 'job_href'.
        return ''
    blocks = page.find_all('div', class_='bmsg job_msg inbox')
    # Zero matches -> '', several matches -> concatenated into one cell.
    return ''.join(block.get_text() for block in blocks)


# Only result page 1 is scraped; widen the range to fetch more pages.
for link in range(1, 2):
    bs = updateurl(SEARCH_URL.format(link), headers)
    for script in bs.find_all('script', {"type": "text/javascript"}):
        text = script.get_text()
        if not text:
            continue
        try:
            jsonobj = json.loads(text.replace(SEARCH_RESULT_PREFIX, ""))
        except ValueError:
            # Inline scripts other than the search-result payload are plain
            # JavaScript, not JSON; skip them instead of crashing.
            continue
        if not isinstance(jsonobj, dict):
            continue
        for item in jsonobj.get('engine_search_result', ()):
            detail_url = get_V(item, 'job_href')
            lentitles.append(get_V(item, 'job_name'))
            job_href.append(detail_url)
            careerinfo.append(_fetch_career_info(detail_url))
            company_name.append(get_V(item, 'company_name'))
            providesalary_text.append(get_V(item, 'providesalary_text'))
            workarea_text.append(get_V(item, 'workarea_text'))
            jobwelf.append(get_V(item, 'jobwelf'))
            companyind_text.append(get_V(item, 'companyind_text'))

# Header row.
for col, label in enumerate(COLUMN_LABELS):
    worksheet.write(0, col, label=label)

# Data rows start at spreadsheet row 1 but at list index 0; the original
# reused the row number as the list index and so dropped the first posting.
columns = (lentitles, job_href, company_name, providesalary_text,
           workarea_text, jobwelf, companyind_text, careerinfo)
for row, record in enumerate(zip(*columns), start=1):
    for col, value in enumerate(record):
        worksheet.write(row, col, label=value)

workbook.save('hzh_sevse.xls')