爬取必應首頁大圖 - Python
阿新 • • 發佈:2020-12-16
不廢話,直接上程式碼
# -*- coding: utf-8 -*-
# @Author: Wang Hongbin
# @Email: [email protected]
# @Date: 2018-03-16 14:19:27
# @Last Modified by: Wang Hongbin
# @Last Modified time: 2018-03-28 16:26:07
"""Download the image referenced on the Bing home page into a local folder."""

import os
import re
import time  # time module

import requests

# Date prefix (e.g. "2018-03-28_") put in front of every saved file name.
local = time.strftime("%Y-%m-%d_")
baseUrl = "https://cn.bing.com"
headers = {'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.186 Safari/537.36'}

# Seconds to wait for the server before giving up instead of hanging forever.
REQUEST_TIMEOUT = 10


def getImgUrl(url):
    """Return the site-relative path of the first wallpaper image in a page.

    Args:
        url: Address of the page to scan.

    Returns:
        The first substring of the page text that starts with
        ``/az/hprichbg/rb/`` and ends with ``.jpg``.

    Raises:
        requests.RequestException: The page could not be fetched.
        IndexError: The page contains no matching image path.
    """
    # The dot before "jpg" is escaped so that only a literal ".jpg" matches.
    reg1 = r"(/az/hprichbg/rb/.*?\.jpg)"
    con = requests.get(url, headers=headers, timeout=REQUEST_TIMEOUT)
    con.raise_for_status()
    found = re.search(reg1, con.text, re.S)
    if found is None:
        # IndexError is what the earlier `findall(...)[0]` raised on no match;
        # the type is kept and a message is added.
        raise IndexError("no image path matching %r found at %s" % (reg1, url))
    return found.group(1)


def getFilePath():
    """Return the directory images are saved to, creating it if missing.

    Returns:
        The target directory path, ending with a slash.
    """
    # Alternative location that was used for a Linux deployment:
    # filePath = '/var/www/html/biYinPic/images/' + time.strftime("%Y%m%d") + '/'
    filePath = 'C:/Users/Administrator/Pictures/MyDesktop/'
    if not os.path.isdir(filePath):
        # makedirs also creates missing parent directories.
        os.makedirs(filePath)
    return filePath


def getImgName(url):
    """Build the local file name for an image path.

    Args:
        url: Image path as returned by ``getImgUrl``.

    Returns:
        The module-level date prefix ``local``, followed by the text between
        ``/az/hprichbg/rb/`` and the first underscore, followed by ``.jpg``.

    Raises:
        IndexError: ``url`` does not contain the expected path prefix
            followed by an underscore.
    """
    reg2 = r"/az/hprichbg/rb/(.*?)_"
    found = re.search(reg2, url, re.S)
    if found is None:
        raise IndexError("cannot derive an image name from %r" % (url,))
    return local + found.group(1) + '.jpg'


def downloadByPic(url):
    """Fetch the image referenced by the page at ``url`` and save it to disk.

    Args:
        url: Address of the page to scan; the image path found there is
            appended to ``baseUrl`` to form the download address.

    Raises:
        requests.RequestException: The page or the image could not be fetched.
        IndexError: No image path was found in the page.
    """
    imgUrl = getImgUrl(url)
    imgName = getImgName(imgUrl)
    fileName = os.path.join(getFilePath(), imgName)
    picUrl = baseUrl + imgUrl
    read = requests.get(picUrl, headers=headers, timeout=REQUEST_TIMEOUT)
    # Fail loudly rather than saving an HTTP error body as a ".jpg" file.
    read.raise_for_status()
    with open(fileName, 'wb') as f:
        f.write(read.content)


# Unused pattern kept for reference:
# reg3 = r'<div class=\"hplaCata\"><div class=\"hplatt\">(.*)</div><div class=\"hplats\">(.*)</div><div id=\"hplaSnippet\">(.*)</div><div class=\"hplaPvd\">(.*)</div>'

if __name__ == "__main__":
    downloadByPic(baseUrl)
    print('is ok!')
爬取結果
下圖是七月份至今爬取的圖片。因為是在 Windows 上執行的,電腦不開機的時候就不會執行;程式碼放在 Linux 上執行也沒問題,使用 crontab 設定一個定時任務就行了。