# Scrape the 51job recruitment site (爬取51job招聘網)
import urllib.request
from bs4 import BeautifulSoup
import time
import pymongo
import pymysql
#https://search.51job.com/list/170200,000000,0000,00,9,99,python,2,2.html
def handle_request(keyword, page, url):
    """Build an urllib Request for one search-result page.

    Args:
        keyword: search keyword inserted into the first ``{}`` slot of *url*.
        page: page number inserted into the second ``{}`` slot of *url*.
        url: format string with two ``{}`` placeholders (keyword, page).

    Returns:
        urllib.request.Request with a browser-like User-Agent header,
        so the site does not reject the scraper outright.
    """
    headers = {
        # Fixed desktop-Chrome UA string; smart quotes from the original
        # paste replaced with ASCII quotes (they were syntax errors).
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.67 Safari/537.36',
    }
    url = url.format(keyword, page)
    request = urllib.request.Request(url=url, headers=headers)
    return request
#用bs4解析
def parse_content(content, db):
    """Parse one 51job search-result page and persist each listing.

    Args:
        content: decoded HTML of a search-result page.
        db: open pymysql connection, passed through to save_to_mysql().

    The first ``.el`` row under ``#resultList`` is the table header,
    hence the ``[1:]`` slice.  (Reconstructed from a whitespace-mangled
    one-line paste; ``os`` renamed to ``row`` to avoid shadowing the
    conventional module name.)
    """
    soup = BeautifulSoup(content, 'lxml')
    div_list = soup.select('#resultList > .el')[1:]
    for row in div_list:
        # Job title and company name live in the anchors' title attributes.
        jobname = row.select('.t1 > span > a')[0]['title']
        company = row.select('.t2 > a ')[0]['title']
        # Location, salary and publish date are plain text cells.
        area = row.select('.t3')[0].string
        salary = row.select('.t4')[0].string
        publish_time = row.select('.t5')[0].string
        items = {
            '公司職業': jobname,
            '公司名稱': company,
            '工作地點': area,
            '薪資': salary,
            '釋出時間': publish_time,
        }
        save_to_mysql(db, items)
#第一種是儲存到mysql中
def connect_db():
    """Open and return a pymysql connection to the local ``51job`` database.

    Credentials are hard-coded for this script; the database and the
    ``job`` table must already exist (see the note above save_to_mysql).
    Smart quotes from the original paste replaced with ASCII quotes.
    """
    db = pymysql.Connect(host='localhost', port=3306, user='root',
                         password='123456', database='51job', charset='utf8')
    # Engine choice (InnoDB vs MyISAM) is decided when the table is created.
    return db
#第二種是儲存到mongodb中
def connect_mongodb():
    """Return a pymongo client for the local MongoDB instance.

    Alternative storage backend; currently unused by main() but kept
    available.  Smart quotes replaced with ASCII quotes.
    """
    client = pymongo.MongoClient(host='localhost', port=27017)
    return client
#如果用mysql需要自己建立資料庫,再建立對應的表格
def save_to_mysql(db, items):
    """Insert one scraped job listing into the ``job`` table.

    Args:
        db: open pymysql connection (caller owns and closes it).
        items: dict with the five Chinese keys produced by parse_content().

    Uses a parameterized query instead of the original ``%``-interpolated
    string: the old form was SQL-injection-prone and broke whenever a
    scraped value contained a quote character.  Commits on success,
    rolls back and logs the error on failure; the cursor is always closed.
    """
    cursor = db.cursor()
    sql = ('insert into job(jobname, company, area, salary, publish_time) '
           'values(%s, %s, %s, %s, %s)')
    args = (items['公司職業'], items['公司名稱'], items['工作地點'],
            items['薪資'], items['釋出時間'])
    try:
        cursor.execute(sql, args)
        db.commit()
    except Exception as e:
        print(e)
        db.rollback()
    finally:
        cursor.close()
def main():
    """Interactively scrape a page range of 51job results into MySQL.

    Prompts for a keyword and a start/end page, fetches each result page
    (51job serves GBK-encoded HTML, hence ``decode('gbk')``), parses it,
    and stores every listing via the MySQL connection.  The connection is
    closed even if a page fails mid-run.  A 2-second pause between pages
    keeps the request rate polite.
    """
    keyword = input('請輸入要搜尋的關鍵字-')
    start_page = int(input('請輸入起始頁碼-'))
    end_page = int(input('請輸入結束頁碼-'))
    url = 'https://search.51job.com/list/010000,000000,0000,00,9,99,{},2,{}.html'
    # MySQL backend; to use MongoDB instead, swap in connect_mongodb()
    # and a collection-based writer.
    db = connect_db()
    try:
        for page in range(start_page, end_page + 1):
            print('正在爬取--第%s頁--....' % page)
            request = handle_request(keyword, page, url)
            content = urllib.request.urlopen(request).read().decode('gbk')
            parse_content(content, db)
            print('結束爬取--第%s頁--...' % page)
            time.sleep(2)
    finally:
        db.close()
# Script entry point; the original `if name == 'main'` lost its dunder
# underscores in the paste and would raise NameError.
if __name__ == '__main__':
    main()