7.1 python拉勾網實戰並儲存到mongodb
阿新 • • 發佈:2019-01-09
拉勾網實戰
爬取拉勾網有關“爬蟲”的職位資訊,並把爬取的資料儲存在MongoDB資料庫中
百度搜索requests
post與get不同的是,post多傳一個data引數,如第一個圖中紅框中下部分form data資料。js格式和get一樣傳入headers引數。
微博爬取難點,js載入,請求不好找
當pc端不好爬時可以考慮移動端,移動端反爬比較少。即m.網址 移動端。
- 確定網頁的載入方式是JavaScript載入
- 通過谷歌瀏覽器開發者工具分析和尋找網頁的真實請求,確定真實資料在positionAjax開頭的連結裡,請求方式是POST
- 使用requests的post方法獲取資料,發現並沒有返回想要的資料,說明需要加上headers
- 加上headers的'Cookie'、'User-Agent'、'Referer'等資訊,成功返回資料
- 再把返回的對應資料儲存到MongoDB
程式碼執行結果如下: 搜尋關鍵字的方法為以上的程式碼。 而抓取某個類目下的所有頁面職位,可以直接修改上面位址列連結中的數字即可 補充知識:類,函式,方法的區別與聯絡: 函式,就是return一個物件。方法,主要在class中提到較多,建立類時,可以定義一個特定的方法,名為__init__(),只要建立這個類的一個例項就會執行這個方法。可以向__init__()方法傳遞引數,這樣建立物件時就可以把屬性設定為你希望的值 在類地內部,使用def關鍵字可以為類定義一個方法,與一般函式定義不同,類方法必須包含引數self,且為第一個引數 沒太大區別,只是方法的話,前面要加一個 self引數 共同點是,都要return一個物件#! /usr/bin/env python # -*- coding:utf-8 -*- import requests from pymongo import MongoClient client = MongoClient() db = client.lagou #連線test資料庫,沒有會自動建立 my_set = db.job #使用set集合,沒有回自動建立 url ='https://www.lagou.com/jobs/positionAjax.json?city=%E5%8C%97%E4%BA%AC&needAddtionalResult=false&isSchoolJob=0' data ={ 'first':'false', 'pn':'2', 'kd':'資料分析' } #引數部分 headers = { 'Cookie':'_ga=GA1.2.637061448.1517466112; _gid=GA1.2.1676212693.1517466112; user_trace_token=20180201142155-337058a4-0718-11e8-abe5-5254005c3644; LGSID=20180201142155-33706056-0718-11e8-abe5-5254005c3644; PRE_UTM=m_cf_cpc_baidu_pc; PRE_HOST=www.baidu.com; PRE_SITE=https%3A%2F%2Fwww.baidu.com%2Fbaidu.php%3Fsc.0f0000aPftK-mFJ6dmD9WqYAqP4eFPlMwr50wN63SmmOfpGbUN06xZzlY-1Fy5zprFS2gaarx36xE2VDwuytXzmBrVLvsIw2jSLujeJByF5rndRHZMee5bfJkoDTZAFQgW1AIWaUr-zwYy_O-ZWFhqtBl6ASsCrLpEneH2AZLGJaGEuXds.7R_NR2Ar5Od663rj6tJQrGvKD7ZZKNfYYmcgpIQC8xxKfYt_U_DY2yP5Qjo4mTT5QX1BsT8rZoG4XL6mEukmryZZjzL4XNPIIhExz4rMThEgz3x5Gse5gj_L3x5x9L4n5VL3x5ksSEzselt5M33xUl3ISqi_nYQAl1u3qB6.U1Yk0ZDqs2v4_tL3dU30mywkXHL0oUh11xWNE6K9uZ7Y5Hc0TA-W5HD0IjL0oUh11xWNE6KGUHYznWR0u1dsThc0Iybqmh7GuZR0TA-b5HD0mv-b5Hn4n0KVIjYknjD4g1DsnHIxn10kPNt1PW0k0AVG5H00TMfqPWcY0ANGujYzPHmYn7tkPjnzg1cknH0dg1DLnj0kg1csP1D30AFG5HcsP0KVm1YLnHT1PjbdPHwxP1DLrjmYrHn3g1Dsn-ts0Z7spyfqn0Kkmv-b5H00ThIYmyTqn0K9mWYsg100ugFM5H00TZ0qnHmznjTLPjD1P6K8IM0qna3snj0snj0sn0KVIZ0qn0KbuAqs5H00ThCqn0KbugmqTAn0uMfqn0KspjYs0Aq15H00mMTqnH00UMfqn0K1XWY0IZN15Hm4n164rjmLnWTkrjR3nWfYn0Kzug7Y5HDdnHTYPWmknjTsnHc0Tv-b5HbkuHN9PH9bnj0sPyf1mHf0mLPV5HfknDPanRPjnjmsn1bdP1R0mynqnfKsUWYs0Z7VIjYs0Z7VT1Ys0ZGY5H00UyPxuMFEUHYsg1Kxn7ts0Aw9UMNBuNqsUA78pyw15HKxn7t1nHb4nWKxn0Ksmgwxuhk9u1Ys0AwWpyfqn0K-IA-b5iYk0A71TA
PW5H00IgKGUhPW5H00Tydh5HDv0AuWIgfqn0KhXh6qn0Khmgfqn0KlTAkdT1Ys0A7buhk9u1Yk0Akhm1Ys0APzm1Yzn1nz%26ck%3D8487.8.215.364.184.249.178.422%26shh%3Dwww.baidu.com%26us%3D1.0.1.0.0.0.0%26ie%3DUTF-8%26f%3D8%26tn%3Dbaidu%26wd%3D%25E6%258B%2589%25E9%2592%25A9%25E7%25BD%2591%25E7%25AB%2599%26oq%3D%25E6%258B%2589%25E9%2592%25A9%25E7%25BD%2591%25E7%25AB%2599%26rqlang%3Dcn%26bc%3D110101; PRE_LAND=https%3A%2F%2Fwww.lagou.com%2Flp%2Fhtml%2Fcommon.html%3Futm_source%3Dm_cf_cpc_baidu_pc%26m_kw%3Dbaidu_cpc_bj_e110f9_265e1f_%25E6%258B%2589%25E9%2592%25A9%25E7%25BD%2591%25E7%25AB%2599; LGUID=20180201142155-33706401-0718-11e8-abe5-5254005c3644; JSESSIONID=ABAAABAAADEAAFICFF624F371A5FEB17B2849796CE3958D; Hm_lvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1517466112,1517466121; index_location_city=%E5%8C%97%E4%BA%AC; TG-TRACK-CODE=index_checkmore; X_HTTP_TOKEN=c539dccaf5258dc27cc566506a38daeb; _gat=1; SEARCH_ID=c0dda518081e4d1abbc6983e559569b7; Hm_lpvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1517467162; LGRID=20180201143925-a4dc17a1-071a-11e8-a3eb-525400f775ce', 'Referer':'https://www.lagou.com/jobs/list_%E6%95%B0%E6%8D%AE%E5%88%86%E6%9E%90?city=%E5%8C%97%E4%BA%AC&cl=false&fromSearch=true&labelWords=&suginput=', 'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3278.0 Safari/537.36' } #經過多次驗證為這幾個headers內容 r1= requests.post(url,data = data,headers = headers) #post請求,傳入url和data引數和headers my_set.insert(r1.json()['content']['positionResult']['result']) #插入mongo中,字典格式 爬取單頁模板程式碼如下: #! 
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""Single-page template: fetch one page of Lagou '爬蟲' jobs into MongoDB."""
from pymongo import MongoClient
import requests

client = MongoClient()
db = client.lagou   # create/use the 'lagou' database
my_set = db.job     # create/use the 'job' collection

url = 'https://www.lagou.com/jobs/positionAjax.json?needAddtionalResult=false&isSchoolJob=0'
payload = {
    'first': 'true',
    'pn': '1',
    'kd': '爬蟲',
}
# Fill in your own logged-in headers before running.
headers = {
    'Cookie': '',
    'User-Agent': '',
    'Referer': '',
}
response = requests.post(url, data=payload, headers=headers)  # POST with payload and headers
# The payload is a list of job dicts; Collection.insert() was removed in
# PyMongo 4 — use insert_many() for a list of documents.
my_set.insert_many(response.json()['content']['positionResult']['result'])
爬取多頁職位程式碼模板

#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""Multi-page scraper: fetch the first N pages of a keyword's jobs into MongoDB."""
import requests
from pymongo import MongoClient
import time
from fake_useragent import UserAgent  # generates random User-Agent header values

client = MongoClient()
db = client.lagou
lagou = db.PHP   # 'PHP' collection

headers = {
    'Cookie': '',
    'Referer': '',
}  # fill in your own logged-in header values


def get_job_info(page, kd):
    """Scrape `page` pages of jobs for keyword `kd` and store them in MongoDB."""
    ua = UserAgent()  # build once — instantiating per iteration reloads the UA database
    for i in range(page):
        url = 'https://www.lagou.com/jobs/positionAjax.json?needAddtionalResult=false&isSchoolJob=0'
        payload = {
            'first': 'true',
            'pn': i + 1,  # Lagou pages are 1-based; the original sent pn=0 for the first page
            'kd': kd,
        }
        headers['User-Agent'] = ua.random  # randomize the UA per request
        response = requests.post(url, data=payload, headers=headers)
        if response.status_code == 200:
            job_json = response.json()['content']['positionResult']['result']
            lagou.insert_many(job_json)  # insert() is removed in PyMongo 4
        else:
            print('Something Wrong!')
        print('正在爬取' + str(i+1) + '頁的資料...')
        time.sleep(3)  # throttle requests to avoid the anti-scraping ban


if __name__ == '__main__':
    get_job_info(3, 'PHP')  # scrape the first 3 pages of PHP job listings
爬取多頁職位資訊程式碼
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""Multi-page scraper with real headers: store 資料分析 jobs in MongoDB."""
import requests
from pymongo import MongoClient
import time
from fake_useragent import UserAgent

client = MongoClient()
db = client.lagou   # database created automatically on first write
my_set = db.job     # 'job' collection created automatically as well

url = 'https://www.lagou.com/jobs/positionAjax.json?city=%E5%8C%97%E4%BA%AC&needAddtionalResult=false&isSchoolJob=0'
# NOTE(review): the hard-coded session cookie originally pasted here had long
# expired and was a leaked credential — supply a fresh Cookie copied from a
# logged-in browser session before running.
headers = {
    'Cookie': '',
    'Referer': 'https://www.lagou.com/jobs/list_%E6%95%B0%E6%8D%AE%E5%88%86%E6%9E%90?city=%E5%8C%97%E4%BA%AC&cl=false&fromSearch=true&labelWords=&suginput=',
}  # these header fields were confirmed necessary by repeated testing


def get_job_info(page, kd):
    """Scrape `page` pages of jobs for keyword `kd` into the 'job' collection."""
    ua = UserAgent()  # build once — instantiating per iteration reloads the UA database
    for i in range(page):
        data = {
            'first': 'false',
            'pn': i + 1,  # Lagou pages are 1-based; the original sent pn=0 for the first page
            'kd': kd,
        }
        headers['User-Agent'] = ua.random  # randomize the UA per request
        r1 = requests.post(url, data=data, headers=headers)  # POST with form data and headers
        if r1.status_code == 200:  # only store on a successful response
            # 'result' is a list of job dicts; Collection.insert() was removed
            # in PyMongo 4 — use insert_many() for a list of documents.
            my_set.insert_many(r1.json()['content']['positionResult']['result'])
        else:
            print('Something Wrong!')
        print('正在爬取第%s頁'%str(i+1))
        time.sleep(3)  # throttle requests to avoid the anti-scraping ban


if __name__ == '__main__':
    get_job_info(26, '資料分析')  # scrape the first 26 pages of 資料分析 jobs