程式人生 > 7.1 python拉勾網實戰並儲存到mongodb

7.1 python拉勾網實戰並儲存到mongodb

拉勾網實戰 爬取拉勾網有關“爬蟲”的職位資訊,並把爬取的資料儲存在MongoDB資料庫中
  1. 確定網頁的載入方式是JavaScript載入
  2. 通過谷歌瀏覽器開發者工具分析和尋找網頁的真實請求,確定真實資料在position.Ajax開頭的連結裡,請求方式是POST
  3. 使用requests的post方法獲取資料,發現並沒有返回想要的資料,說明需要加上headers
  4. 加上headers的'Cookie'、'User-Agent'、'Referer'等資訊,成功返回資料
  5. 再把返回的對應資料儲存到MongoDB
提示:一般js載入的都在xhr和js中 1 首先經過分析,是post請求,攜帶引數如下 2 post請求如何使用
百度搜索requests post與get不同的是,post多傳一個data引數,如第一個圖中紅框中下部分form data資料。js格式和get一樣傳入headers引數。
#! /usr/bin/env python
# -*- coding:utf-8 -*-
"""Fetch one page of Lagou 'data analysis' job postings (Beijing) via the
positionAjax endpoint and store the result list in MongoDB."""

import requests
from pymongo import MongoClient

client = MongoClient()
db = client.lagou  # 'lagou' database — created automatically on first write
my_set = db.job    # 'job' collection — created automatically on first write

# Real data endpoint found with browser dev tools; the listing page itself is
# rendered by JavaScript, so we POST directly to the JSON API.
url = 'https://www.lagou.com/jobs/positionAjax.json?city=%E5%8C%97%E4%BA%AC&needAddtionalResult=false&isSchoolJob=0'

# POST form data: search keyword ('kd') and page number ('pn').
data = {
    'first': 'false',
    'pn': '2',
    'kd': '資料分析'
}

# Verified by trial: these three headers are required for the server to return
# real data. The Cookie value previously spanned two physical lines as one
# broken string literal (a SyntaxError); it is now joined with implicit
# string concatenation.
headers = {
    'Cookie': ('_ga=GA1.2.637061448.1517466112; _gid=GA1.2.1676212693.1517466112; user_trace_token=20180201142155-337058a4-0718-11e8-abe5-5254005c3644; LGSID=20180201142155-33706056-0718-11e8-abe5-5254005c3644; PRE_UTM=m_cf_cpc_baidu_pc; PRE_HOST=www.baidu.com; PRE_SITE=https%3A%2F%2Fwww.baidu.com%2Fbaidu.php%3Fsc.0f0000aPftK-mFJ6dmD9WqYAqP4eFPlMwr50wN63SmmOfpGbUN06xZzlY-1Fy5zprFS2gaarx36xE2VDwuytXzmBrVLvsIw2jSLujeJByF5rndRHZMee5bfJkoDTZAFQgW1AIWaUr-zwYy_O-ZWFhqtBl6ASsCrLpEneH2AZLGJaGEuXds.7R_NR2Ar5Od663rj6tJQrGvKD7ZZKNfYYmcgpIQC8xxKfYt_U_DY2yP5Qjo4mTT5QX1BsT8rZoG4XL6mEukmryZZjzL4XNPIIhExz4rMThEgz3x5Gse5gj_L3x5x9L4n5VL3x5ksSEzselt5M33xUl3ISqi_nYQAl1u3qB6.U1Yk0ZDqs2v4_tL3dU30mywkXHL0oUh11xWNE6K9uZ7Y5Hc0TA-W5HD0IjL0oUh11xWNE6KGUHYznWR0u1dsThc0Iybqmh7GuZR0TA-b5HD0mv-b5Hn4n0KVIjYknjD4g1DsnHIxn10kPNt1PW0k0AVG5H00TMfqPWcY0ANGujYzPHmYn7tkPjnzg1cknH0dg1DLnj0kg1csP1D30AFG5HcsP0KVm1YLnHT1PjbdPHwxP1DLrjmYrHn3g1Dsn-ts0Z7spyfqn0Kkmv-b5H00ThIYmyTqn0K9mWYsg100ugFM5H00TZ0qnHmznjTLPjD1P6K8IM0qna3snj0snj0sn0KVIZ0qn0KbuAqs5H00ThCqn0KbugmqTAn0uMfqn0KspjYs0Aq15H00mMTqnH00UMfqn0K1XWY0IZN15Hm4n164rjmLnWTkrjR3nWfYn0Kzug7Y5HDdnHTYPWmknjTsnHc0Tv-b5HbkuHN9PH9bnj0sPyf1mHf0mLPV5HfknDPanRPjnjmsn1bdP1R0mynqnfKsUWYs0Z7VIjYs0Z7VT1Ys0ZGY5H00UyPxuMFEUHYsg1Kxn7ts0Aw9UMNBuNqsUA78pyw15HKxn7t1nHb4nWKxn0Ksmgwxuhk9u1Ys0AwWpyfqn0K-IA-b5iYk0A71TAPW5H00IgKGUhPW5H00Tydh5HDv0AuWIgfqn0KhXh6qn0Khmgfqn0KlTAkdT1Ys0A7buhk9u1Yk0Akhm1Ys0APzm1Yzn1nz%26ck%3D8487.8.215.364.184.249.178.422%26shh%3Dwww.baidu.com%26us%3D1.0.1.0.0.0.0%26ie%3DUTF-8%26f%3D8%26tn%3Dbaidu%26wd%3D%25E6%258B%2589%25E9%2592%25A9%25E7%25BD%2591%25E7%25AB%2599%26oq%3D%25E6%258B%2589%25E9%2592%25A9%25E7%25BD%2591%25E7%25AB%2599%26rqlang%3Dcn%26bc%3D110101; PRE_LAND=https%3A%2F%2Fwww.lagou.com%2Flp%2Fhtml%2Fcommon.html%3Futm_source%3Dm_cf_cpc_baidu_pc%26m_kw%3Dbaidu_cpc_bj_e110f9_265e1f_%25E6%258B%2589%25E9%2592%25A9%25E7%25BD%2591%25E7%25AB%2599; LGUID=20180201142155-33706401-0718-11e8-abe5-5254005c3644; '
               'JSESSIONID=ABAAABAAADEAAFICFF624F371A5FEB17B2849796CE3958D; Hm_lvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1517466112,1517466121; index_location_city=%E5%8C%97%E4%BA%AC; TG-TRACK-CODE=index_checkmore; X_HTTP_TOKEN=c539dccaf5258dc27cc566506a38daeb; _gat=1; SEARCH_ID=c0dda518081e4d1abbc6983e559569b7; Hm_lpvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1517467162; LGRID=20180201143925-a4dc17a1-071a-11e8-a3eb-525400f775ce'),
    'Referer': 'https://www.lagou.com/jobs/list_%E6%95%B0%E6%8D%AE%E5%88%86%E6%9E%90?city=%E5%8C%97%E4%BA%AC&cl=false&fromSearch=true&labelWords=&suginput=',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3278.0 Safari/537.36'
}

r1 = requests.post(url, data=data, headers=headers)  # POST with form data and headers
r1.raise_for_status()  # fail fast on an HTTP error instead of parsing garbage

# The payload is a list of job dicts. Collection.insert was deprecated in
# PyMongo 3 and removed in PyMongo 4; insert_many takes a list of documents.
my_set.insert_many(r1.json()['content']['positionResult']['result'])
爬取單頁的模板程式碼如下:
#! /usr/bin/env python
# -*- coding:utf-8 -*-
"""Single-page template: fetch one page of 'crawler' job postings from Lagou
and store the result list in MongoDB. Fill in the headers before running."""
from pymongo import MongoClient
import requests

client = MongoClient()
db = client.lagou  # 'lagou' database — created automatically on first write
my_set = db.job    # 'job' collection — created automatically on first write

url = 'https://www.lagou.com/jobs/positionAjax.json?needAddtionalResult=false&isSchoolJob=0'

# POST form data: 'first' marks the first page, 'pn' is the page number,
# 'kd' is the search keyword.
payload = {
    'first':'true',
    'pn':'1',
    'kd':'爬蟲',
}

# Fill in your own values — all three headers are required for the server
# to return real data.
headers = {
    'Cookie':'',
    'User-Agent':'',
    'Referer':'',
}

response = requests.post(url, data=payload, headers=headers)  # POST with payload and headers
# Collection.insert was removed in PyMongo 4; insert_many takes the list of
# job dicts returned by the API.
my_set.insert_many(response.json()['content']['positionResult']['result'])
爬取多頁職位程式碼模板
#! /usr/bin/env python
# -*- coding:utf-8 -*-
"""Multi-page template: crawl several pages of Lagou job postings for one
keyword and store them in MongoDB. Fill in the headers before running."""
import requests
from pymongo import MongoClient
import time
from fake_useragent import UserAgent  # generates random User-Agent strings

client = MongoClient()
db = client.lagou
lagou = db.PHP  # 'PHP' collection — created automatically on first write

# Fill in your own values — both are required for the server to return
# real data (User-Agent is added per request below).
headers = {
            'Cookie':'',
            'Referer':'',
        }

def get_job_info(page, kd):
    """Crawl the first `page` result pages for keyword `kd` into MongoDB.

    page -- number of pages to fetch (Lagou pages are 1-based)
    kd   -- search keyword, e.g. 'PHP'
    """
    ua = UserAgent()  # build once; the original rebuilt it on every iteration
    for i in range(page):
        url = 'https://www.lagou.com/jobs/positionAjax.json?needAddtionalResult=false&isSchoolJob=0'
        page_no = i + 1  # Lagou's 'pn' starts at 1; plain `i` asked for page 0
        payload = {
            # the browser sends first=true only for the first page
            'first': 'true' if page_no == 1 else 'false',
            'pn': page_no,
            'kd': kd,
        }
        headers['User-Agent'] = ua.random  # rotate the User-Agent each request
        response = requests.post(url, data=payload, headers=headers)
        if response.status_code == 200:
            job_json = response.json()['content']['positionResult']['result']
            # Collection.insert was removed in PyMongo 4; the result is a
            # list of job dicts, so use insert_many.
            lagou.insert_many(job_json)
        else:
            print('Something Wrong!')
        print('正在爬取' + str(i+1) + '頁的資料...')
        time.sleep(3)  # throttle to reduce the chance of being blocked

if __name__ == '__main__':
    get_job_info(3, 'PHP')  # crawl the first 3 pages of PHP jobs
爬取多頁職位資訊程式碼
#! /usr/bin/env python
# -*- coding:utf-8 -*-
"""Crawl multiple pages of Lagou 'data analysis' job postings (Beijing) and
store them in MongoDB, rotating the User-Agent per request."""

import requests
from pymongo import MongoClient
import time
from fake_useragent import UserAgent

client = MongoClient()
db = client.lagou  # 'lagou' database — created automatically on first write
my_set = db.job    # 'job' collection — created automatically on first write

url = 'https://www.lagou.com/jobs/positionAjax.json?city=%E5%8C%97%E4%BA%AC&needAddtionalResult=false&isSchoolJob=0'

# Verified by trial: Cookie and Referer are required for real data
# (User-Agent is added per request below). The Cookie value previously
# spanned two physical lines as one broken string literal (a SyntaxError);
# it is now joined with implicit string concatenation.
headers = {
    'Cookie': ('_ga=GA1.2.637061448.1517466112; _gid=GA1.2.1676212693.1517466112; user_trace_token=20180201142155-337058a4-0718-11e8-abe5-5254005c3644; LGSID=20180201142155-33706056-0718-11e8-abe5-5254005c3644; PRE_UTM=m_cf_cpc_baidu_pc; PRE_HOST=www.baidu.com; PRE_SITE=https%3A%2F%2Fwww.baidu.com%2Fbaidu.php%3Fsc.0f0000aPftK-mFJ6dmD9WqYAqP4eFPlMwr50wN63SmmOfpGbUN06xZzlY-1Fy5zprFS2gaarx36xE2VDwuytXzmBrVLvsIw2jSLujeJByF5rndRHZMee5bfJkoDTZAFQgW1AIWaUr-zwYy_O-ZWFhqtBl6ASsCrLpEneH2AZLGJaGEuXds.7R_NR2Ar5Od663rj6tJQrGvKD7ZZKNfYYmcgpIQC8xxKfYt_U_DY2yP5Qjo4mTT5QX1BsT8rZoG4XL6mEukmryZZjzL4XNPIIhExz4rMThEgz3x5Gse5gj_L3x5x9L4n5VL3x5ksSEzselt5M33xUl3ISqi_nYQAl1u3qB6.U1Yk0ZDqs2v4_tL3dU30mywkXHL0oUh11xWNE6K9uZ7Y5Hc0TA-W5HD0IjL0oUh11xWNE6KGUHYznWR0u1dsThc0Iybqmh7GuZR0TA-b5HD0mv-b5Hn4n0KVIjYknjD4g1DsnHIxn10kPNt1PW0k0AVG5H00TMfqPWcY0ANGujYzPHmYn7tkPjnzg1cknH0dg1DLnj0kg1csP1D30AFG5HcsP0KVm1YLnHT1PjbdPHwxP1DLrjmYrHn3g1Dsn-ts0Z7spyfqn0Kkmv-b5H00ThIYmyTqn0K9mWYsg100ugFM5H00TZ0qnHmznjTLPjD1P6K8IM0qna3snj0snj0sn0KVIZ0qn0KbuAqs5H00ThCqn0KbugmqTAn0uMfqn0KspjYs0Aq15H00mMTqnH00UMfqn0K1XWY0IZN15Hm4n164rjmLnWTkrjR3nWfYn0Kzug7Y5HDdnHTYPWmknjTsnHc0Tv-b5HbkuHN9PH9bnj0sPyf1mHf0mLPV5HfknDPanRPjnjmsn1bdP1R0mynqnfKsUWYs0Z7VIjYs0Z7VT1Ys0ZGY5H00UyPxuMFEUHYsg1Kxn7ts0Aw9UMNBuNqsUA78pyw15HKxn7t1nHb4nWKxn0Ksmgwxuhk9u1Ys0AwWpyfqn0K-IA-b5iYk0A71TAPW5H00IgKGUhPW5H00Tydh5HDv0AuWIgfqn0KhXh6qn0Khmgfqn0KlTAkdT1Ys0A7buhk9u1Yk0Akhm1Ys0APzm1Yzn1nz%26ck%3D8487.8.215.364.184.249.178.422%26shh%3Dwww.baidu.com%26us%3D1.0.1.0.0.0.0%26ie%3DUTF-8%26f%3D8%26tn%3Dbaidu%26wd%3D%25E6%258B%2589%25E9%2592%25A9%25E7%25BD%2591%25E7%25AB%2599%26oq%3D%25E6%258B%2589%25E9%2592%25A9%25E7%25BD%2591%25E7%25AB%2599%26rqlang%3Dcn%26bc%3D110101; PRE_LAND=https%3A%2F%2Fwww.lagou.com%2Flp%2Fhtml%2Fcommon.html%3Futm_source%3Dm_cf_cpc_baidu_pc%26m_kw%3Dbaidu_cpc_bj_e110f9_265e1f_%25E6%258B%2589%25E9%2592%25A9%25E7%25BD%2591%25E7%25AB%2599; LGUID=20180201142155-33706401-0718-11e8-abe5-5254005c3644; '
               'JSESSIONID=ABAAABAAADEAAFICFF624F371A5FEB17B2849796CE3958D; Hm_lvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1517466112,1517466121; index_location_city=%E5%8C%97%E4%BA%AC; TG-TRACK-CODE=index_checkmore; X_HTTP_TOKEN=c539dccaf5258dc27cc566506a38daeb; _gat=1; SEARCH_ID=c0dda518081e4d1abbc6983e559569b7; Hm_lpvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1517467162; LGRID=20180201143925-a4dc17a1-071a-11e8-a3eb-525400f775ce'),
    'Referer': 'https://www.lagou.com/jobs/list_%E6%95%B0%E6%8D%AE%E5%88%86%E6%9E%90?city=%E5%8C%97%E4%BA%AC&cl=false&fromSearch=true&labelWords=&suginput='
}

def get_job_info(page, kd):
    """Crawl the first `page` result pages for keyword `kd` into MongoDB.

    page -- number of pages to fetch (Lagou pages are 1-based)
    kd   -- search keyword, e.g. '資料分析'
    """
    ua = UserAgent()  # build once; the original rebuilt it on every iteration
    for i in range(page):
        page_no = i + 1  # Lagou's 'pn' starts at 1; plain `i` asked for page 0
        data = {
            # the browser sends first=true only for the first page
            'first': 'true' if page_no == 1 else 'false',
            'pn': page_no,
            'kd': kd
        }
        headers['User-Agent'] = ua.random  # rotate the User-Agent each request

        r1 = requests.post(url, data=data, headers=headers)  # POST with form data and headers
        if r1.status_code == 200:  # only insert when the request succeeded
            # Collection.insert was removed in PyMongo 4; the result is a
            # list of job dicts, so use insert_many.
            my_set.insert_many(r1.json()['content']['positionResult']['result'])
        else:
            print('Something Wrong!')
        print('正在爬取第%s頁'%str(i+1))
        time.sleep(3)  # throttle to reduce the chance of being blocked

if __name__ == '__main__':
    get_job_info(26,'資料分析')  #爬取前26頁的資料分析職位資訊
程式碼執行結果如下: 搜尋關鍵字的方法為以上的程式碼。 而抓取某個類目下的所有頁面職位,可以直接修改上面位址列連結中的數字即可。 補充知識:類、函式、方法的區別與聯絡: 函式,就是return一個物件。方法,主要在class中提到較多:建立類時,可以定義一個特定的方法,名為__init__(),只要建立這個類的一個例項就會執行這個方法。可以向__init__()方法傳遞引數,這樣建立物件時就可以把屬性設定為你希望的值。 在類的內部,使用def關鍵字可以為類定義一個方法;與一般函式定義不同,類方法必須包含引數self,且為第一個引數。 兩者沒太大區別,只是方法的話,前面要加一個self引數;共同點是,都要return一個物件。
微博爬取難點,js載入,請求不好找 當pc端不好爬時可以考慮移動端,移動端反爬比較少。即m.網址 移動端。