豆瓣小組爬蟲
阿新 • • 發佈:2017-10-28
.html 不存在 lin rom 是否 inpu clas main safari
豆瓣上有很多小組，裏面的圖片質量非常好，
所以我們寫一個爬蟲把這些圖片抓取下來。
if __name__ == "__main__":
    # Script entry point: ask for a group slug and a page range, then start
    # the crawl. The slug is the path segment that identifies the group in
    # its URL, e.g. "haixiuzu".
    # Surrounding whitespace is stripped so a stray space typed at the prompt
    # does not end up inside the URL.
    url = raw_input('請輸入小組字符串代碼如:haixiuzu').strip()
    beginPage = int(raw_input('請輸入起始頁碼'))
    endPage = int(raw_input('請輸入結束頁碼'))

    # Build the discussion-list URL prefix; getPageLink appends the numeric
    # "start" offset for each page.
    url = 'https://www.douban.com/group/' + url + '/discussion?start='

    # Collect the topic links of every requested page and process them.
    getPageLink(url, beginPage, endPage)
先構建主頁url
接著用xpath獲取需要爬的帖子鏈接
def getPageLink(url, begin, end):
    """Collect topic links from a group's discussion pages.

    Builds one listing URL per page, downloads each listing, extracts the
    topic links with XPath and passes the combined list to ``getImgLink``.

    :param url: listing URL prefix; the numeric offset is appended to it.
    :param begin: first page number to fetch (page 1 maps to offset 0).
    :param end: last page number to fetch (inclusive).
    :return: None
    """
    # The offset advances by 25 per page: page N starts at (N - 1) * 25.
    urlList = [str(url) + str((page - 1) * 25)
               for page in range(begin, end + 1)]

    # Request headers sent with every listing request.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                      'AppleWebKit/537.36 (KHTML, like Gecko) '
                      'Chrome/61.0.3163.100 Safari/537.36'
    }

    # Accumulates the topic links found on all listing pages.
    linkList = []
    for pageUrl in urlList:
        request = urllib2.Request(pageUrl, headers=headers)
        response = urllib2.urlopen(request)
        try:
            html = response.read()
        finally:
            # Always release the connection, even if reading fails.
            response.close()

        content = etree.HTML(html)
        # Topic links are the hrefs of the anchors inside the "title" cells.
        linkList.extend(content.xpath('//td[@class="title"]/a/@href'))

    getImgLink(linkList)
接著獲取所有圖片的鏈接
def getImgLink(url):
    """Collect the image links of every topic page and save the images.

    Downloads each topic page, extracts the image ``src`` attributes with
    XPath and passes the combined list to ``savaImg``.

    :param url: iterable of topic page URLs (a collection, despite the
        singular parameter name).
    :return: None
    """
    headers = {
        'Connection': 'keep-alive',
        'Upgrade-Insecure-Requests': '1',
        # NOTE(review): the Referer is hard-coded to the "haixiuzu" group
        # regardless of which group is being crawled -- confirm this is
        # intended.
        'Referer': 'https://www.douban.com/group/haixiuzu/discussion?start=0',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                      'AppleWebKit/537.36 (KHTML, like Gecko) '
                      'Chrome/61.0.3163.100 Safari/537.36'
    }

    # Accumulates the image links found on all topic pages.
    imgUrl = []
    for topicUrl in url:
        request = urllib2.Request(topicUrl, headers=headers)
        response = urllib2.urlopen(request)
        try:
            html = response.read()
        finally:
            # Always release the connection, even if reading fails.
            response.close()

        content = etree.HTML(html)
        # Images sit in "topic-figure cc" divs inside the topic content.
        # extend() avoids the inner loop that used to reuse (and shadow)
        # the outer loop variable.
        imgUrl.extend(content.xpath(
            '//div[@class="topic-content"]'
            '/div[@class="topic-figure cc"]/img/@src'))

    savaImg(imgUrl)
然後當然是保存所有圖片了
def savaImg(imgList):
    """Download every image in ``imgList`` into the local ``img`` directory.

    :param imgList: iterable of image URLs.
    :return: None
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                      'AppleWebKit/537.36 (KHTML, like Gecko) '
                      'Chrome/61.0.3163.100 Safari/537.36'
    }

    # Create the target directory on first use.
    if not os.path.exists('img'):
        os.makedirs('img')

    for imgSrc in imgList:
        request = urllib2.Request(imgSrc, headers=headers)
        response = urllib2.urlopen(request)
        try:
            img = response.read()
        finally:
            # Always release the connection, even if reading fails.
            response.close()

        # The file name is the last ten characters of the URL.
        # NOTE(review): two URLs sharing the same last ten characters map to
        # the same file, and the later download overwrites the earlier one.
        fileName = imgSrc[-10:]

        # os.path.join picks the platform's separator; the previous
        # hard-coded backslash only produced a path inside "img" on Windows.
        with open(os.path.join('img', fileName), "wb") as writ:
            writ.write(img)
以上僅供學習交流使用
豆瓣小組爬蟲