網路爬蟲的簡易實現(1)
阿新 • • 發佈:2018-12-25
這個爬蟲主要用來爬取 http://pic.yesky.com 這個網站上的圖片(程式碼基於 Python 2,使用 urllib2 與 BeautifulSoup):
"""Download the images referenced by <img> tags on the pic.yesky.com front page.

Python 2 script: it relies on ``urllib2``, ``urlparse`` and
``urllib.urlretrieve``.  The page is fetched with browser-like request
headers, parsed with BeautifulSoup, and every ``<img>`` whose ``src`` is an
absolute http(s) URL is saved under ``SAVE_DIR`` as ``<counter><extension>``.
"""
import contextlib
import os
import re
import time
import urllib
import urllib2
import urlparse

from bs4 import BeautifulSoup

PAGE_URL = "http://pic.yesky.com"
SAVE_DIR = "/home/lxt/Desktop/pach/4399Pic/"
DEFAULT_SUFFIX = ".jpg"  # used when the image URL carries no file extension
DOWNLOAD_DELAY = 1  # seconds to wait between two image requests

# Browser-like headers sent with the page request.
send_headers = {}
# The Host header must be a bare host name; including the scheme is invalid.
send_headers["Host"] = "pic.yesky.com"
send_headers["User-Agent"] = "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 UBrowser/6.1.3228.1 Safari/537.36"
send_headers["Accept"] = "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8"
send_headers["Connection"] = "keep-alive"


def fetch_page(url, headers):
    """Return the raw body of *url*, requested with the given *headers*."""
    # The headers have to be handed to Request explicitly; otherwise urllib2
    # sends its own defaults and the dictionary above has no effect.
    request = urllib2.Request(url, headers=headers)
    # closing() guarantees the response is released even if read() fails.
    with contextlib.closing(urllib2.urlopen(request)) as response:
        return response.read()


def image_suffix(url):
    """Return the file extension (with leading dot) of the path part of *url*.

    Only the URL path is inspected, so dots in the host name or in a query
    string are ignored.  ``DEFAULT_SUFFIX`` is returned when the path has no
    extension.
    """
    path = urlparse.urlparse(url).path
    extension = os.path.splitext(path)[1]
    return extension or DEFAULT_SUFFIX


def main():
    """Fetch the page and save each absolutely-addressed image to SAVE_DIR."""
    html = fetch_page(PAGE_URL, send_headers)
    # Name the parser explicitly so the result does not depend on which
    # optional parsers happen to be installed.
    soup = BeautifulSoup(html, "html.parser")

    if not os.path.isdir(SAVE_DIR):
        os.makedirs(SAVE_DIR)

    index = 1
    for tag in soup.select("img"):  # look the images up by tag name
        src = tag.get("src")  # None for an <img> without a src attribute
        if not src or not src.startswith(("http://", "https://")):
            continue

        suffix = image_suffix(src)
        target = os.path.join(SAVE_DIR, "%d%s" % (index, suffix))
        print('-' * 50)
        print(src)
        try:
            urllib.urlretrieve(src, filename=target)  # save the image locally
        except IOError as err:
            # One unreachable image should not abort the remaining downloads.
            print("failed to download %s: %s" % (src, err))
        else:
            index += 1
        # NOTE(review): the original layout did not show whether the pause
        # belonged to the download branch; here it follows every attempt.
        time.sleep(DOWNLOAD_DELAY)


if __name__ == "__main__":
    main()