python百度貼吧圖片下載指令碼例項
阿新 • 發佈:2018-12-15
功能介紹: 對百度貼吧內的圖片進行下載; python版本: python2.7 用到的庫: urllib,requests
核心原理
使用urllib庫爬取貼吧頁面的圖片連結,將其進行下載;requests用於獲取當前訪問頁面返回狀態碼;
urllib.urlopen(url).read() urllib.urlretrieve(pictures,Path_img) requests.get(url).status_code
原理簡單不用多說直接上code
code
#!/usr/bin/Python
# -*- coding: utf-8 -*-
__author__ = "Man_ge"
import urllib
import requests
import time,re,os,sys,random
import datetime
reload(sys)
sys.setdefaultencoding('utf-8')
#儲存路徑
LOCAL_PATH = "C:\\Users\\Administrator\\Desktop\\meinv4\\"
#basic function
#basic function: fetch a Tieba thread page and pull metadata out of its HTML
class TB_get:
    """Helpers that download a Tieba page and scrape simple metadata from it.

    Every method performs a fresh HTTP request for the given url.
    """

    def __init__(self):
        pass

    def get_html(self, url):
        """Return the raw HTML of *url* as a (byte) string."""
        return urllib.urlopen(url).read()

    def get_state(self, url):
        """Return the HTTP status code for *url* (e.g. 200)."""
        return requests.get(url).status_code

    def get_title(self, url):
        """Return the page <title> text, re-encoded to GBK.

        The page is served as UTF-8; the result is encoded GBK so it
        prints correctly on a Chinese Windows console.
        """
        data = re.findall(r'<title>(.*?)</title>', self.get_html(url))
        return data[0].decode('UTF-8').encode('GBK')

    def _reply_spans(self, url):
        # Both the reply count and the page count are the <span> contents
        # inside the element carrying the l_reply_num class; fetch the page
        # and return those span texts in document order.
        block = re.findall(r'l_reply_num.*?</li>', self.get_html(url))
        return re.findall(r'<span .*?>(.*?)</span>', str(block))

    def get_Replypost(self, url):
        """Return the number of replies in the thread at *url*."""
        return int(self._reply_spans(url)[0])

    def get_pagenumber(self, url):
        """Return the number of pages in the thread at *url*."""
        return int(self._reply_spans(url)[1])
class TB_filter:
    """Regex extractors over one fetched HTML page.

    The original recompiled each pattern on every call; the patterns are
    constant, so they are compiled once at class level instead.
    """

    _HREF_RE = re.compile(r"(?<=href=\").+?(?=\")|(?<=href=\').+?(?=\')")
    _A_RE = re.compile(r'<a .*?>(.*?)</a>')
    _SRC_RE = re.compile(r"(?<=src=\").+?(?=\")|(?<=src=\').+?(?=\')")

    def __init__(self, html_page):
        # html_page: HTML text of one thread page, as returned by TB_get.get_html.
        self.data = html_page

    def filter_href(self):
        """Return every href="..." / href='...' attribute value in the page."""
        return self._HREF_RE.findall(self.data)

    def filter_a(self):
        """Return the inner text of every <a ...>...</a> element."""
        return self._A_RE.findall(self.data)

    def filter_src(self):
        """Return every src="..." / src='...' attribute value in the page."""
        return self._SRC_RE.findall(self.data)
#下載功能; 下載 png,jpg
def download_img(path_html):
tb = TB_get()
print "Title : ",tb.get_title(path_html)
if 'page404' in tb.get_html(path_html):
print u"很抱歉,該貼已被刪除。"
else:
print "state : ",tb.get_state(path_html)
save_path=LOCAL_PATH+tb.get_title(path_html)+"\\"
isExists=os.path.exists(save_path)
if not isExists:
os.makedirs(save_path)
page_number = tb.get_pagenumber(path_html)#獲取當前貼吧的頁數
print u"頁數 : ",page_number
print u"回覆貼 : ",tb.get_Replypost(path_html)
download_page = 0
while download_page < page_number:
download_html=path_html+'?pn='+str(download_page+1)#對每頁進行下載
print "\n\nstart access : ",download_html
state_code=tb.get_state(download_html)
print "state : ",state_code
if tb.get_state(download_html) == 200:#如果狀態是200就可以下載 否則不能下載
page_data = tb.get_html(download_html)
fl = TB_filter(page_data)
data = fl.filter_src()
pictures_number=0
for pictures in data:
pictures_number+=1
if pictures.split(".")[-1] in ["png","jpg"]:#篩選出 png,jpg為字尾的圖片格式進行下載
http_1=str(pictures.split("/")[0])
if http_1=="https:":
name= str(pictures.split("/")[-1])
tt= int(time.time())
newname=str(tt)+".jpg"
Path_img=save_path+newname
imgname=str(name.split("_")[0])
if imgname != "image" and '?' not in name:
print "\nstart download ====> "+name
print "loading......."
urllib.urlretrieve(pictures,Path_img)
print "download succees ====> "+newname
time.sleep(1)
else:
print "access failed!! state : ",state_code
download_page+=1
#下載器 只需要給定帖子路徑,和帖子頁數
def downloader(tb_path,tb_pg):
tb_path='https://tieba.baidu.com/f?kw='+tb_path+'&ie=utf-8&pn='+str((tb_pg-1)*50)
#print tb_path
tb = TB_get()
get_all_tb=tb.get_html(tb_path)
if tb.get_state(tb_path) == 200:
print "\n\nAccess : ",tb_path
reg = r"(?<=href=\").+?(?=\")|(?<=href=\').+?(?=\')"
reger = re.compile(reg)
data = re.findall(reger, get_all_tb)
for tb_link in data:
reg1 = r'//tieba.baidu.com/p/.{0,}|/p/.{0,}'
reger1 = re.compile(reg1)
all_tb_link = re.findall(reger1, tb_link)
if all_tb_link != []:#獲取當前頁數的貼吧的所有帖子
assign_link=str(all_tb_link).split("/p")[-1]
assign_link=str(assign_link)[0:-2]
donwload_link= "https://tieba.baidu.com/p"+assign_link
print donwload_link
download_img(donwload_link)
else:
print "access failed!! state : ",state_code
if __name__ == '__main__':
    # Grab listing pages 1 through 10 of the '美女' forum — 50 threads per
    # listing page, so roughly 500 threads of pictures in total.
    for page in range(1, 11):
        downloader('美女', page)