1. 程式人生 > 網路爬蟲的簡易實現(1)

網路爬蟲的簡易實現(1)

這個爬蟲主要實現對 http://pic.yesky.com 這個網站圖片的爬取:

import urllib
import urllib2
import re
import time
from bs4 import BeautifulSoup
# Simple image crawler for http://pic.yesky.com (Python 2: urllib2 / urllib).
#
# Steps: fetch the front page with browser-like headers, collect every <img>
# tag, and download each image that has an absolute http(s) URL into a local
# directory, numbering the files sequentially.

# Request headers that make the crawler look like a desktop browser.
send_headers = {
    # Host must be a bare host name; a "http://" prefix is not a valid value.
    "Host": "pic.yesky.com",
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 UBrowser/6.1.3228.1 Safari/537.36",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
    "Connection": "keep-alive",
}

# Destination pattern: sequential index followed by the image's suffix.
# The target directory must already exist.
SAVE_PATTERN = "/home/lxt/Desktop/pach/4399Pic/%d%s"
# Suffix used when the URL path carries no file extension.
DEFAULT_SUFFIX = ".jpg"

# Actually attach the disguised headers to the request; previously they were
# built but never sent.
req = urllib2.Request("http://pic.yesky.com", headers=send_headers)
f = urllib2.urlopen(req)
try:
    html = f.read()
finally:
    # Release the connection even if reading the body fails.
    f.close()
print(html)

# Name the parser explicitly so the result does not depend on which optional
# parsers happen to be installed (and to avoid the bs4 warning).
soup = BeautifulSoup(html, "html.parser")
yy = soup.select("img")    # look up elements by tag name
print(yy)

i = 1
for temp in yy:
    print('-' * 50)
    print(temp.prettify())

    # Some <img> tags have no src attribute (e.g. lazy-loaded images).
    src = temp.get('src')
    if not src:
        continue
    # Only absolute http(s) URLs can be fetched directly; checking the prefix
    # avoids matching "htt" somewhere in the middle of a relative path.
    if not src.startswith(("http://", "https://")):
        continue

    # Derive the file suffix from the last path segment only, ignoring the
    # scheme/host and any query string or fragment, so that "a.jpeg?v=1"
    # yields ".jpeg" and an extension-less URL falls back to the default.
    after_scheme = src.split("://", 1)[1]
    path = after_scheme.split('?', 1)[0].split('#', 1)[0]
    basename = path.rsplit('/', 1)[1] if '/' in path else ''
    dot = basename.rfind('.')
    ruffix = basename[dot:] if dot != -1 else DEFAULT_SUFFIX
    print(ruffix)

    print(src)
    try:
        # Save the image to the local disk.
        urllib.urlretrieve(src, filename=SAVE_PATTERN % (i, ruffix))
    except IOError as err:
        # One broken link should not abort the whole crawl.
        print("download failed: %s (%s)" % (src, err))
    else:
        i += 1
    # Be polite to the server: at most one request per second.
    time.sleep(1)