爬取拉勾熱門城市“資料分析”崗位,並進行視覺化分析
阿新 • 發佈:2019-01-04
首先,寫一個爬取崗位的爬蟲,如下:
# -*- coding:utf-8 -*- from json import JSONDecodeError import requests import time import pandas as pd # 獲取儲存職位資訊的json物件,遍歷獲得公司名、福利待遇、工作地點、學歷要求、工作型別、釋出時間、職位名稱、薪資、工作年限 companyFullName = [] job_city = [] companySize = [] positionId = [] companyId = [] positionName = [] secondType = [] positionLables = [] industryField = [] industryLables = [] salary = [] positionAdvantage = [] workYear = [] stationname = [] education = [] createTime = [] longitude = [] latitude = [] info_dict = dict() def get_json(url, datas): my_headers = { 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36', 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8', 'Host': 'www.lagou.com', 'Origin': 'https://www.lagou.com', 'Referer': 'https://www.lagou.com/jobs/list_%E6%95%B0%E6%8D%AE%E5%88%86%E6%9E%90?labelWords=&fromSearch=true&suginput=', } cookies = { 'Cookie': '你的cookie' } for x in range(3): content = requests.post(url=url, cookies=cookies, headers=my_headers, data=datas) # content.encoding = 'utf-8' try: result = content.json() except JSONDecodeError: print('=====================解析失敗==============================\n',content) if 'content' not in result: print("=====================沒有資料==============================") time.sleep(60) continue info = result['content']['positionResult']['result'] print(info) if len(info) < 1: return False for job in info: job_city.append(job['city']) # print(job['city']) companyId.append(job['companyId']) companyFullName.append(job['companyFullName']) companySize.append(job['companySize']) positionId.append(job['positionId']) positionName.append(job['positionName']) secondType.append(job['secondType']) positionLables.append(job['positionLables']) industryField.append(job['industryField']) industryLables.append(job['industryLables']) salary.append(job['salary']) positionAdvantage.append(job['positionAdvantage']) workYear.append(job['workYear']) 
stationname.append(job['stationname']) education.append(job['education']) createTime.append(job['createTime']) longitude.append(job['longitude']) latitude.append(job['latitude']) # break return True def main(): global citys for city in citys: for x in range(1, 30): url = 'https://www.lagou.com/jobs/positionAjax.json?&needAddtionalResult=false' datas = { 'first': True, 'pn': x, 'kd': '資料分析', 'city': city } isContinue = get_json(url, datas) if not isContinue: break time.sleep(20) time.sleep(10) info_dict['city'] = job_city info_dict['companyId'] = companyId info_dict['companyFullName'] = companyFullName info_dict['companySize'] = companySize info_dict['positionId'] = positionId info_dict['positionName'] = positionName info_dict['secondType'] = secondType info_dict['positionLables'] = positionLables info_dict['industryField'] = industryField info_dict['industryLables'] = industryLables info_dict['salary'] = salary info_dict['positionAdvantage'] = positionAdvantage info_dict['workYear'] = workYear info_dict['stationname'] = stationname info_dict['education'] = education info_dict['longitude'] = longitude info_dict['latitude'] = latitude frame = pd.DataFrame(info_dict) frame.to_csv("LGTotal.csv") if __name__ == '__main__': citys = ['北京', '上海', '廣州', '深圳', '杭州', '廈門','成都','南京','武漢','西安','長沙','南京','天津','蘇州'] main()
開啟儲存的csv檔案,部分資料如下:
通過清洗一些空資料和拆分薪資上下限等,進一步進行分析,並且視覺化,視覺化的工具為power bi。視覺化結果如下