1. 程式人生 > >百度貼吧獲取某個吧的全部圖片

百度貼吧獲取某個吧的全部圖片

完成 device dde emp bdc type 遍歷 4.0 感謝

"""
搜索百度貼吧單個貼吧內的所有帖子
使用xpath定位
完成翻頁功能
下載詳情頁中的所有圖片
"""
import re

import os
import requests
import time
from lxml import etree

# xpath for the "next page" link: //*[@id="frs_list_pager"]/a[contains(text(),"下一頁")]/@href

# 正則改寫:<a rel="noreferrer" href="/p/5564366573" title="新人求指導,這三家店正規麽?萬分感謝!" target="_blank" class="j_th_tit ">新人求指導,這三家店正規麽?萬分感謝!</a>
# <a rel="noreferrer" href="/p/5532141331" title="都說布偶貓性格好 大家有被布偶伸爪子抓過嗎?" target="_blank" class="j_th_tit ">都說布偶貓性格好 大家有被布偶伸爪子抓過嗎?</a>

class CatBa(object):
    """Crawler that downloads every image posted in one Baidu Tieba forum.

    Workflow:
      1. Fetch a forum list page and collect all thread-detail URLs via xpath.
      2. Follow the "next page" link until no such link exists.
      3. For each thread, collect the post-content image URLs and save the
         images into a local directory named after the forum.
    """

    def __init__(self, name):
        # Forum name; doubles as the local download directory name.
        self.name = name
        self.url = 'http://tieba.baidu.com/f?kw={}'.format(name)
        # NOTE(review): the old-IE user agent presumably makes Tieba serve the
        # simpler HTML layout that the xpath expressions below expect — confirm.
        self.headers = {
            "User-Agent": "Mozilla/4.0 (compatible; MSIE 5.01; Windows NT 5.0) "
        }

    def get_page(self, url):
        """Fetch *url* and return the raw response body as bytes."""
        response = requests.get(url, headers=self.headers)
        return response.content

    def parse_data(self, data):
        """Parse one forum list page.

        :param data: raw HTML bytes of a forum list page
        :return: ``(next_url, url_list)`` — *next_url* is the relative URL of
                 the next list page (``None`` on the last page); *url_list*
                 holds the relative URLs of every thread on this page.
        """
        html = etree.HTML(data)
        url_list = list(html.xpath(
            '//*[@id="thread_list"]/li[@class=" j_thread_list clearfix"]'
            '/div/div[2]/div[1]/div[1]/a/@href'))
        time.sleep(5)  # be polite: throttle between list-page requests
        # The "next page" anchor is absent on the last page; an empty xpath
        # result (rather than an IndexError) signals the end of pagination.
        next_links = html.xpath(
            '//*[@id="frs_list_pager"]/a[contains(text(),"下一頁")]/@href')
        next_url = next_links[0] if next_links else None
        return next_url, url_list

    def get_pic_url(self, data):
        """Return the post-content image URLs found in a thread detail page."""
        html = etree.HTML(data)
        return html.xpath('//div[contains(@id,"post_content_")]/img/@src')

    def open_pic(self, url):
        """Download one image; return its bytes, or ``None`` on any error."""
        try:
            response = requests.get(url, headers=self.headers)
        except Exception:
            print('打開圖片出錯')
            return None
        return response.content

    def download_pic(self, pic_url_list):
        """Save every non-emoticon image in *pic_url_list* under ``self.name/``.

        Best-effort: a failed download or unwritable file skips that image
        instead of aborting the crawl.
        """
        if not os.path.exists(self.name):
            os.mkdir(self.name)

        for each_url in pic_url_list:
            if 'emoticon' in each_url:
                continue  # skip forum emoticon images
            pic_content = self.open_pic(each_url)
            if pic_content is None:
                # Fix: the original passed None to f.write(), raising a
                # TypeError that a broad except silently swallowed.
                continue
            name = self.name + os.sep + each_url.split('/')[-1]
            try:
                with open(name, 'wb') as f:
                    f.write(pic_content)
            except OSError:
                pass  # deliberate best-effort: skip files we cannot write

    def run(self):
        """Crawl list pages until exhausted, downloading images from each thread."""
        while True:
            data = self.get_page(self.url)
            self.url, url_list = self.parse_data(data)
            print(self.url)
            print(url_list)
            for each_url in url_list:
                # Thread URLs come back relative, e.g. "/p/5564366573".
                real_url = 'http://tieba.baidu.com' + each_url
                html = self.get_page(real_url)
                pic_url_list = self.get_pic_url(html)
                self.download_pic(pic_url_list)

            if not self.url:
                break  # parse_data returned None: no next page
            # Next-page links are protocol-relative ("//tieba.baidu.com/...").
            self.url = 'http:' + self.url


if __name__ == '__main__':
    # Change the forum name here to crawl a different Tieba forum.
    cat = CatBa('李毅')
    cat.run()

百度貼吧獲取某個吧的全部圖片