爬取拉勾網資訊,翻頁爬取
阿新 • • 發佈:2019-01-22
import json  # serialises the collected postings (JSON is a data-interchange format)
import time

import bs4  # HTML parsing library; not used by the code below, kept as in the original
import requests  # HTTP client (higher-level alternative to urllib / urllib2)

# Ajax endpoint queried for each result page. The "city" query parameter is the
# URL-encoded form of "北京"; "needAddtionalResult" is spelled exactly as in the
# original request.
SEARCH_URL = (
    "https://www.lagou.com/jobs/positionAjax.json"
    "?city=%E5%8C%97%E4%BA%AC&needAddtionalResult=false"
)

# Headers sent with every request, reproduced from the original script.
# NOTE(review): the "X-Anit-Forge-*" names are kept verbatim; presumably they
# mirror what the site's own front end sends - confirm before "fixing" them.
REQUEST_HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36",
    "Host": "www.lagou.com",
    "Referer": "https://www.lagou.com/jobs/list_python%20?labelWords=&fromSearch=true&suginput=",
    "X-Anit-Forge-Code": "0",
    "X-Anit-Forge-Token": "None",
    "X-Requested-With": "XMLHttpRequest",
}

# Seconds to wait between two consecutive page requests.
REQUEST_DELAY_SECONDS = 3

# Seconds after which a stalled request is abandoned instead of hanging forever.
REQUEST_TIMEOUT_SECONDS = 10


def main(max_pages: int = 2,
         output_path: str = 'C:/Users/dell/Desktop/python1.txt') -> None:
    """Fetch job-listing pages for the keyword "python" and save them as JSON.

    Pages 1..max_pages are requested one after another from SEARCH_URL with a
    pause of REQUEST_DELAY_SECONDS between requests. The entries found under
    ``content -> positionResult -> result`` of every response are concatenated
    and written to *output_path* as a single UTF-8 encoded JSON array.

    If a response does not contain the expected structure, paging stops early
    and whatever has been collected so far is still written out.

    Args:
        max_pages: Number of result pages to request. The default of 2 matches
            the original script, which broke out of its loop after page 2.
            The original author noted that only about 7 pages could be
            fetched in one run before the site reported too many visits.
        output_path: File that receives the JSON output; it is overwritten.

    Raises:
        requests.RequestException: If a request fails, times out, or returns
            an HTTP error status.
        OSError: If *output_path* cannot be opened for writing.
    """
    positions = []

    for page in range(1, max_pages + 1):
        if page > 1:
            # Pause only between requests; waiting after the last page (as the
            # original did) just delays writing the file.
            time.sleep(REQUEST_DELAY_SECONDS)

        form_data = {
            # NOTE(review): "ture" looks like a typo for "true". It is left
            # unchanged because the server-side effect of this field is not
            # visible from this script - confirm before changing.
            "first": "ture",
            "pn": page,
            "kd": "python",
        }
        response = requests.post(
            SEARCH_URL,
            headers=REQUEST_HEADERS,
            data=form_data,
            timeout=REQUEST_TIMEOUT_SECONDS,
        )
        response.raise_for_status()

        try:
            page_positions = response.json()["content"]["positionResult"]["result"]
        except (ValueError, KeyError, TypeError):
            # Body was not JSON or lacked the expected keys. Stop paging but
            # keep the pages already collected instead of crashing and losing
            # them.
            print("Unexpected response for page", page, "- stopping early.")
            break

        positions.extend(page_positions)

    with open(output_path, 'w', encoding="utf-8") as f:
        # ensure_ascii=False keeps non-ASCII text readable in the output file.
        json.dump(positions, f, ensure_ascii=False)


if __name__ == '__main__':
    main()