簡易爬蟲爬取51job招聘資訊
阿新 • • 發佈:2019-02-20
import os
import re
import urllib.request
class Grab(object):
    """Simple crawler that saves the text of 51job postings to local files.

    ``openurl`` downloads the search results page, extracts every job-detail
    link from it and passes each link to ``deal``.  ``deal`` downloads that
    page and stores its ``<p>...</p>`` fragments in ``zhaopin/<n>.txt``.
    """

    # Class attribute: index used to name the next output file.  It lives on
    # the class, so the numbering is shared by every instance.
    num = 0

    # Directory that receives the numbered output files.
    _OUTPUT_DIR = "zhaopin"

    # Encoding used to decode the downloaded pages.
    _PAGE_ENCODING = "gbk"

    # Job-detail links.  The match is non-greedy and may not cross quotes,
    # whitespace or angle brackets, so two links on the same line are returned
    # as two URLs instead of one span running from the first to the last
    # ".html" on that line.
    _LINK_RE = re.compile(r'http://jobs\.51job\.com[^\s"<>]+?\.html')

    # Paragraph fragments; "." does not match newlines, so each match stays
    # within a single line of the page.
    _PARAGRAPH_RE = re.compile(r"<p>.*</p>")

    def __init__(self):
        # URL of the search results page to crawl
        self.url = "http://search.51job.com/list/010000,000000,0000,00,9,99,python,2,1.html?"
        # Request headers sent with the search request
        self.headers = {
            "Host": "search.51job.com",
            "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36",
        }

    @staticmethod
    def _find_job_links(html):
        """Return the job-detail URLs found in ``html``, in document order."""
        return Grab._LINK_RE.findall(html)

    @staticmethod
    def _find_paragraphs(html):
        """Return every ``<p>...</p>`` fragment found in ``html``."""
        return Grab._PARAGRAPH_RE.findall(html)

    def openurl(self):
        """Fetch the search page and process every job link found on it.

        Raises:
            urllib.error.URLError: if a request fails.
            UnicodeDecodeError: if a page is not valid GBK.
        """
        # Build the request object so the custom headers are sent
        request = urllib.request.Request(url=self.url, headers=self.headers)
        # The context manager closes the response even if reading or decoding
        # fails.
        with urllib.request.urlopen(request) as response:
            html = response.read().decode(self._PAGE_ENCODING)
        # Visit every extracted link
        for link in self._find_job_links(html):
            self.deal(link)

    def deal(self, url):
        """Download ``url`` and save its paragraph fragments to a new file.

        The list of fragments is written in its ``str()`` form to
        ``zhaopin/<Grab.num>.txt``; ``Grab.num`` is then incremented.

        Args:
            url: Address of the page to download.

        Raises:
            urllib.error.URLError: if the request fails.
            UnicodeDecodeError: if the page is not valid GBK.
        """
        with urllib.request.urlopen(url) as response:
            html = response.read().decode(self._PAGE_ENCODING)
        paragraphs = self._find_paragraphs(html)

        # Create the output directory on demand instead of assuming it exists.
        os.makedirs(self._OUTPUT_DIR, exist_ok=True)
        path = os.path.join(self._OUTPUT_DIR, str(Grab.num) + ".txt")
        # Explicit UTF-8 so the Chinese text can be written regardless of the
        # platform's default encoding; "with" closes the file on any error.
        with open(path, "w", encoding="utf-8") as out_file:
            out_file.write(str(paragraphs))
        Grab.num += 1
def main():
    """Run one crawl of the configured search page."""
    # Build the crawler and start it right away
    crawler = Grab()
    crawler.openurl()


# Run the crawl only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()