前程無憂網站爬取
阿新 • • 發佈:2021-01-10
爬取前程無憂(51job)網站的職位資料,並將結果寫入 Excel。
from urllib.request import urlopen,Request
from bs4 import BeautifulSoup
import json
import xlwt
# Accumulators for the scraped fields. The scraping loop appends to them and
# the export step later reads them back by index.
lentitles = list()            # values of 'job_name'
job_href = list()             # values of 'job_href' (detail-page links)
company_name = list()         # values of 'company_name'
providesalary_text = list()   # values of 'providesalary_text'
workarea_text = list()        # values of 'workarea_text'
jobwelf = list()              # values of 'jobwelf'
companyind_text = list()      # values of 'companyind_text'
careerinfo = list()           # text extracted from each detail page
# Workbook that collects the results; everything is written to one sheet.
workbook = xlwt.Workbook(encoding='utf-8')
worksheet = workbook.add_sheet(sheetname='hzh zhaping wb')
# Request headers passed to every call of updateurl(); only a browser-style
# User-Agent is set.
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; WOW64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/69.0.3947.100 Safari/537.36'
    ),
}
def updateurl(url, headers):
    """Download *url* and return it parsed as a BeautifulSoup document.

    Args:
        url: Address of the page to fetch.
        headers: Mapping of HTTP headers to send with the request.

    Returns:
        The page parsed by ``BeautifulSoup`` with the "html.parser" backend.

    Errors raised while building the request or opening the URL are not
    handled here and propagate to the caller.
    """
    ret = Request(url, headers=headers)
    # Parse inside the ``with`` block: the body must still be readable while
    # BeautifulSoup consumes it, and the connection is released right after
    # instead of staying open until garbage collection.
    with urlopen(ret) as html:
        return BeautifulSoup(html, "html.parser")
def get_V(dct, key):
    """Return ``dct[key]``, or the string 'none' when *key* is missing."""
    return dct.get(key, 'none')


# Search-result URL with a ``{}`` placeholder for the page number.  The query
# string uses ``&degreefrom=99``; the pasted source had ``°reefrom`` because
# ``&deg`` had been decoded as an HTML entity.
search_url = (
    "https://search.51job.com/list/080200,000000,0000,00,9,99,"
    "%25E5%25A4%25A7%25E6%2595%25B0%25E6%258D%25AE,2,{}.html"
    "?lang=c&postchannel=0000&workyear=99&cotype=99&degreefrom=99"
    "&jobterm=99&companysize=99&ord_field=0&dibiaoid=0&line=&welfare="
)

# The result list is embedded in an inline script as a JavaScript assignment;
# removing this prefix leaves the text that is handed to json.loads().
result_marker = "window.__SEARCH_RESULT__ = "

for link in range(1, 2):
    url = search_url.format(link)
    bs = updateurl(url, headers)
    info = bs.find_all('script', {"type": "text/javascript"})
    for x in info:
        t = x.get_text()
        # Only the script carrying the marker is parsed; feeding any other
        # non-empty inline script to json.loads() would raise.
        if result_marker not in t:
            continue
        jsonobj = json.loads(t.replace(result_marker, ""))
        for i in jsonobj.get('engine_search_result', []):
            lentitles.append(get_V(i, 'job_name'))
            job_href1 = get_V(i, 'job_href')
            job_href.append(job_href1)
            company_name.append(get_V(i, 'company_name'))
            providesalary_text.append(get_V(i, 'providesalary_text'))
            workarea_text.append(get_V(i, 'workarea_text'))
            jobwelf.append(get_V(i, 'jobwelf'))
            companyind_text.append(get_V(i, 'companyind_text'))

            # Fetch the detail page for the job description.  This stays
            # best-effort: urllib's URLError/HTTPError are OSError subclasses,
            # and ValueError covers an unusable link such as the 'none'
            # placeholder returned by get_V().
            try:
                take = updateurl(job_href1, headers)
            except (OSError, ValueError):
                description = ''
            else:
                job_href2 = take.find_all('div', class_='bmsg job_msg inbox')
                description = ''.join(kk.get_text() for kk in job_href2)
            # Exactly one entry per posting, even when the page has no
            # matching <div> or several of them, so careerinfo keeps the same
            # length and order as the other result lists.
            careerinfo.append(description)
# Row 0 holds the column titles.
column_titles = (
    '職位名稱',
    '職業網址',
    '公司名字',
    '薪資福利',
    '工作地址',
    '工作福利',
    '公司前景',
    '職業資訊',
)
for col, title in enumerate(column_titles):
    worksheet.write(0, col, label=title)

# Result lists in the same left-to-right order as the titles above.
columns = (
    lentitles,
    job_href,
    company_name,
    providesalary_text,
    workarea_text,
    jobwelf,
    companyind_text,
    careerinfo,
)

# Data rows start at sheet row 1, directly below the titles, while the lists
# are indexed from 0.  Using one index for both (as range(1, len(...)) did)
# silently dropped the first scraped posting.
for idx in range(len(lentitles)):
    for col, values in enumerate(columns):
        # A list shorter than lentitles yields an empty cell instead of an
        # IndexError that would abort the export before the file is saved.
        cell = values[idx] if idx < len(values) else ''
        worksheet.write(idx + 1, col, label=cell)

workbook.save('hzh_sevse.xls')