1. 程式人生 > 實用技巧 >04爬取拉勾網Python崗位分析報告

04爬取拉勾網Python崗位分析報告

# 匯入需要的包
import requests
import time,random
from openpyxl import Workbook
import pymysql.cursors
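# Database connection helpers (get_conn / insert) are defined further down.
# The function below queries a proxy-pool program running on the local machine to obtain a proxy server.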
def get_proxy():
    """Fetch one proxy address from the local proxy-pool service.

    Sends a GET request to ``http://localhost:5555/random`` and prints the
    raw reply body.

    Returns:
        The reply body with surrounding whitespace removed when the service
        answers with HTTP 200 and a non-empty body; otherwise ``None``
        (service unreachable, timed out, non-200 status, or empty body).
    """
    PROXY_POOL_URL = 'http://localhost:5555/random'
    try:
        # A timeout keeps the scraper from hanging forever on a stuck pool.
        response = requests.get(PROXY_POOL_URL, timeout=5)
    except requests.exceptions.RequestException:
        # requests raises its own exception hierarchy; the builtin
        # ConnectionError is not a base class of it, so catching the builtin
        # would let a "pool is down" failure escape.
        return None
    print(response.text)
    if response.status_code != 200:
        return None
    # Strip so a trailing newline cannot end up inside the proxy URL.
    proxy = response.text.strip()
    return proxy or None
# 用來連線本地mysql,可以不連線,直接寫入Excel中
def get_conn():
    """Open and return a connection to the local MySQL database.

    The host, account, password, database name and character set are fixed
    in the settings below; rows are returned through ``DictCursor``.

    Returns:
        The connection object produced by ``pymysql.connect``.
    """
    settings = {
        'host': 'localhost',
        'user': 'root',
        'password': '123456',
        'db': 'python',
        'charset': 'utf8mb4',
        'cursorclass': pymysql.cursors.DictCursor,
    }
    return pymysql.connect(**settings)
# 將資料寫入到資料庫中
def insert(conn, info):
    """Insert one job-posting row into the ``python`` table and commit it.

    Args:
        conn: Open database connection; must provide ``cursor()`` (usable as
            a context manager), ``commit()`` and ``rollback()``.
        info: Sequence of seven values bound, in order, to companyShortName,
            companyFullName, industryField, companySize, salary, city and
            education.

    Raises:
        Exception: Whatever the driver raises from ``execute`` or ``commit``
            is re-raised unchanged after the transaction is rolled back.
    """
    sql = (
        "INSERT INTO `python` "
        "(`companyShortName`, `companyFullName`, `industryField`, "
        "`companySize`, `salary`, `city`, `education`) "
        "VALUES (%s, %s, %s, %s, %s, %s, %s)"
    )
    try:
        with conn.cursor() as cursor:
            # Values are passed separately so the driver does the escaping.
            cursor.execute(sql, info)
        conn.commit()
    except Exception:
        # Do not leave a half-finished transaction open on the connection.
        conn.rollback()
        raise
# 獲取當前網址的資訊
def get_json(url, page, lang_name):
    """Return the job postings listed on one search-result page.

    Posts the search form to ``url`` through the module-level session
    ``ses`` (created in the ``__main__`` guard) and flattens every posting in
    the JSON reply into a list of seven fields.

    Args:
        url: Endpoint the search form is POSTed to.
        page: Page number, sent as the ``pn`` form field.
        lang_name: Search keyword, sent as the ``kd`` form field.

    Returns:
        A list with one entry per posting. Each entry is a list holding
        companyShortName, companyFullName, industryField, companySize,
        salary, city and education, in that order, with '無' substituted for
        any key the posting lacks. Empty when the reply carries no postings.

    Raises:
        KeyError: If the reply JSON does not contain
            ``content -> positionResult -> result``; the message includes the
            reply so the cause is visible.
    """
    data = {'first': 'false', 'pn': page, 'kd': lang_name}

    # get_proxy() returns None when no proxy is available; concatenating
    # None onto "http://" would raise TypeError, so fall back to a direct
    # connection in that case.
    proxy = get_proxy()
    proxies = {"http": "http://" + proxy} if proxy else None
    # NOTE(review): only the "http" scheme is mapped, while main() builds an
    # https:// URL, so requests will presumably not route that call through
    # the proxy. Confirm whether an "https" entry is intended.

    payload = ses.post(url, data, proxies=proxies, timeout=10).json()
    try:
        postings = payload['content']['positionResult']['result']
    except KeyError as err:
        raise KeyError(
            'unexpected response from {}: {!r}'.format(url, payload)
        ) from err

    fields = (
        'companyShortName',  # short company name
        'companyFullName',
        'industryField',
        'companySize',
        'salary',
        'city',
        'education',
    )
    # `or []` tolerates a null "result" instead of failing in the loop.
    return [
        [posting.get(name, '無') for name in fields]
        for posting in postings or []
    ]

def main(lang_name='python',
         cities=('北京', '上海', '廣州', '深圳', '杭州'),
         pages_per_city=1,
         delay_range=None):
    """Scrape job postings per city and store them in MySQL and an Excel file.

    For every city, result pages 1..``pages_per_city`` are fetched with
    ``get_json``; each posting is inserted into the database via ``insert``
    and appended to the active worksheet. The workbook is saved as
    ``'<lang_name>職位資訊.xlsx'`` once all cities are done.

    Args:
        lang_name: Search keyword; also used as the sheet title and in the
            output file name.
        cities: City names substituted into the request URL.
        pages_per_city: Number of result pages fetched per city. The default
            of 1 matches the previous hard-coded loop bound (the old comment
            mentioned 30 pages, but only one was ever requested).
        delay_range: Optional ``(low, high)`` pair of whole seconds; when
            given, a random pause in that range follows every page fetch.
            ``None`` (default) disables the pause, as before.
    """
    wb = Workbook()  # new Excel workbook
    # The active sheet and its title do not change per city, so set them once.
    wsl = wb.active
    wsl.title = lang_name

    conn = get_conn()  # database connection used for every insert
    try:
        for city in cities:
            url = 'https://www.lagou.com/jobs/positionAjax.json?city={}&needAddtionalResult=false'.format(city)
            for page in range(1, pages_per_city + 1):
                rows = get_json(url, page, lang_name)
                if delay_range is not None:
                    time.sleep(random.randint(*delay_range))
                for row in rows:
                    insert(conn, tuple(row))
                    wsl.append(row)
    finally:
        # Close the connection even when a request or insert fails.
        conn.close()
    wb.save('{}職位資訊.xlsx'.format(lang_name))

if __name__ == "__main__":
    # Script entry point: build the shared HTTP session, load the search
    # page once with that session, then run the scraper.
    ses = requests.Session()  # module-level session that get_json() posts with

    # Headers sent with every request made through the session.
    my_headers = {
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.119 Safari/537.36",
        "Referer": "https://www.lagou.com/jobs/list_Python?city=%E5%85%A8%E5%9B%BD&cl=false&fromSearch=true&labelWords=&suginput=",
        "Content-Type": "application/x-www-form-urlencoded;charset = UTF-8"
    }
    ses.headers.update(my_headers)

    # Request the listing page before any AJAX call is made.
    # NOTE(review): presumably this is what obtains the site's cookies for
    # the later POSTs. Confirm.
    search_page_url = "https://www.lagou.com/jobs/list_python?city=%E5%85%A8%E5%9B%BD&cl=false&fromSearch=true&labelWords=&suginput="
    ses.get(search_page_url)

    main()