1. 程式人生 > 其它 >爬取51job招聘資訊(一)

爬取51job招聘資訊(一)

目標:將 51job 搜尋結果頁面上的職位資訊爬取下來,實現翻頁,並儲存為 CSV 檔。

import csv
import os
import time
from concurrent.futures.thread import ThreadPoolExecutor
from json import loads
from multiprocessing import Queue
from re import findall
from threading import Thread

import pandas as pd
import pymysql
import requests

def get_one_page(page, city_code='000000'):
    """Fetch one page of 51job search results for the keyword "資料分析".

    Args:
        page: Page number substituted into the search URL.
        city_code: Value placed in the first field of the list URL path.
            The default '000000' reproduces the URL that was previously
            hard-coded. NOTE(review): presumably this field is 51job's
            job-area code -- confirm against the site before relying on it.

    Returns:
        The value stored under 'engine_search_result' in the JSON object the
        page assigns to ``window.__SEARCH_RESULT__``, or None when the
        response status is not 200 or the page carries no such payload.

    Raises:
        requests.exceptions.RequestException: propagated from
            ``requests.get`` (connection errors, timeout).
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit / 537.36(KHTML, like Gecko) Chrome / 90.0.4430.212 Safari / 537.36'
    }
    url = (
        f'https://search.51job.com/list/{city_code},000000,0000,00,9,99,資料分析,2,{page}.html'
        '?lang=c&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99'
        '&companysize=99&ord_field=0&dibiaoid=0&line=&welfare='
    )
    # A timeout keeps one stalled connection from hanging the whole crawl.
    response = requests.get(url, headers=headers, timeout=10)
    if response.status_code != 200:
        print('請求失敗!')
        return None

    # The search data is embedded in the HTML as a JSON object literal
    # assigned to window.__SEARCH_RESULT__ inside a <script> tag.
    matches = findall(
        r'window.__SEARCH_RESULT__\s*=\s*(\{.+?\})</script>', response.text
    )
    if not matches:
        # A 200 response without the embedded payload is treated like a
        # failed request instead of raising IndexError on matches[0].
        print('請求失敗!')
        return None
    return loads(matches[0]).get('engine_search_result')
# How many result pages to request, beginning with page 1.
MAX_PAGES = 10
start_page = 1
ts = []  # one entry per fetched page: the value returned by get_one_page
for _ in range(MAX_PAGES):
    result = get_one_page(start_page)
    if not result:
        # A falsy result (None or an empty list) ends the crawl early.
        print('沒有更多資料')
        break
    ts.append(result)
    start_page += 1
# Flatten the per-page results collected in `ts` into one list of postings.
data_1 = []
for page_jobs in ts:
    # Take whatever the page actually holds: indexing a fixed range(50)
    # raises IndexError as soon as a page has fewer than 50 postings.
    data_1.extend(page_jobs)
# Fields kept from every posting, one row per posting, in CSV column order.
jobs = [
    [
        posting.get('job_name'),
        posting.get('providesalary_text'),
        posting.get('company_name'),
        posting.get('companytype_text'),
        posting.get('workarea_text'),
        # attribute_text is joined with '-' into a single string; a posting
        # without the key falls back to five '-' placeholders.
        '-'.join(posting.get('attribute_text', ['-', '-', '-', '-', '-'])),
        posting.get('jobwelf'),
    ]
    for posting in data_1
]
# Column labels, in the same order as the fields of each row in `jobs`.
# NOTE(review): 'workarea_tex' looks like a typo for 'workarea_text'; it is
# left unchanged because it is the header that ends up in the CSV file.
name = [
    'job_name',
    'providesalary_text',
    'company_name',
    'companytype_text',
    'workarea_tex',
    'attribute_text',
    'jobwelf',
]
test = pd.DataFrame(data=jobs, columns=name)
test.to_csv("testcsv.csv")  # save as CSV; the row index becomes the first column
test.info()  # print column names, non-null counts and dtypes
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 500 entries, 0 to 499
Data columns (total 7 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   job_name            500 non-null    object
 1   providesalary_text  500 non-null    object
 2   company_name        500 non-null    object
 3   companytype_text    500 non-null    object
 4   workarea_tex        500 non-null    object
 5   attribute_text      500 non-null    object
 6   jobwelf             500 non-null    object
dtypes: object(7)
memory usage: 27.5+ KB


重要參考:https://gitee.com/wenhaha8/job51_analysis