【整理】【轉載】爬蟲相關
(1) 抓取小說--轉
import requests
import re
from bs4 import BeautifulSoup
if __name__ == '__main__':
    # Local import: only this script section needs URL joining.
    from urllib.parse import urljoin

    # Browser-like User-Agent; sent with every request through the session.
    headers = {"User-Agent": 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:63.0) Gecko/20100101 Firefox/63.0'}
    index_url = 'http://www.shushu8.com/shaolinbajue/'
    # Seconds to wait on a request before giving up instead of hanging forever.
    timeout = 30

    # One session reuses the connection and applies the same headers to the
    # index page and to every chapter page.
    with requests.Session() as session:
        session.headers.update(headers)

        index_resp = session.get(index_url, timeout=timeout)
        # Let requests guess the charset from the body rather than trusting
        # the HTTP header.
        index_resp.encoding = index_resp.apparent_encoding
        index_page = BeautifulSoup(index_resp.text, 'lxml')

        # Each matched anchor is treated as a link to one chapter page.
        for link in index_page.select('.clearfix > ul > li > a'):
            href = link.get('href')
            if not href:
                # Anchor without a target: nothing to fetch.
                continue
            # urljoin copes with root-relative, relative and absolute hrefs.
            chapter_url = urljoin(index_url, href)

            chapter_resp = session.get(chapter_url, timeout=timeout)
            chapter_resp.encoding = chapter_resp.apparent_encoding
            chapter_page = BeautifulSoup(chapter_resp.text, 'lxml')

            # Print the text of every content container on the chapter page.
            for content in chapter_page.select('div.page-content'):
                print(content.get_text())
(2) 抓取網頁圖片
https://blog.csdn.net/caozewei/article/details/82497388
1、根據給定的網址獲取網頁原始碼
2、利用正則表示式把原始碼中的圖片地址過濾出來
3、根據過濾出來的圖片地址下載網路圖片
import re
import urllib.request
def gethtml(url):
    """Download *url* and return the raw response body.

    Args:
        url: Address to fetch; passed unchanged to ``urllib.request.urlopen``.

    Returns:
        The undecoded response body as ``bytes``.
    """
    # The with-block closes the response (and its connection) even if
    # read() raises.
    with urllib.request.urlopen(url) as page:
        return page.read()
def getimg(html):
    """Download every ``.jpg`` image referenced by a ``src="..."`` attribute.

    Images are saved in the current working directory as ``0.jpg``,
    ``1.jpg``, ... in the order they appear in the page.

    Args:
        html: Page source as UTF-8 encoded ``bytes`` (an already decoded
            ``str`` is accepted too).

    Returns:
        List of the file names written, in download order.
    """
    # [^"\n] instead of "." keeps each match inside a single attribute value,
    # so a non-JPEG src cannot swallow markup up to a later ".jpg".
    img = re.compile(r'src="([^"\n]*?\.jpg)"')
    if isinstance(html, bytes):
        # Tolerate stray non-UTF-8 bytes instead of aborting the whole page.
        html = html.decode('utf-8', errors='replace')

    saved = []
    for x, imgurl in enumerate(img.findall(html)):
        filename = '%s.jpg' % x
        urllib.request.urlretrieve(imgurl, filename)
        saved.append(filename)
    return saved
# Runs on import: fetch the example article, download the JPEGs it
# references, and print whatever getimg() returns.
PAGE_URL = "http://news.ifeng.com/a/20161115/50258273_0.shtml"
html = gethtml(PAGE_URL)
print(getimg(html))
把程式碼直接匯入直譯器,可直接執行抓取圖片
(3)其他
https://blog.csdn.net/sinat_37390744/article/details/55533360
https://blog.csdn.net/sinat_37390744/article/details/55670553
https://blog.csdn.net/qq_32252957/article/details/78997021
https://blog.csdn.net/qq_32252957/article/details/78441293
https://blog.csdn.net/qq_32252957/article/details/78961867