爬圖交互界面及翻頁初嘗式
阿新 • • 發佈:2017-09-14
語法 aso raw star write conn aid zip pen
# -*- coding:utf-8 -*-
"""Download every ``.jpg`` image posted in a Baidu Tieba thread.

The script asks for a thread id, reads the thread's page count from its
first page, then walks every page and saves each image matching the
``BDE_Image`` markup into the current working directory.

Written for Python 2; the small fallbacks below also let it run on Python 3.
"""
import os
import re
import socket

try:
    import urllib2  # Python 2
except ImportError:
    # Python 3: urllib.request exposes the same Request/urlopen/URLError names.
    import urllib.request as urllib2

# Headers sent with every image download request.
_IMAGE_HEADERS = {
    'Accept': '*/*',
    'Accept-Encoding': 'gzip,deflate,sdch',
    'Accept-Language': 'zh-CN,zh;q=0.8',
    'Connection': 'keep-alive',
}

# Captures the text of the second <span> inside the "l_reply_num" list item,
# which is where the thread page is expected to show its page count.
_PAGE_COUNT_RE = re.compile(
    r'<li class="l_reply_num" .*?</span>.*?<span.*?>(.*?)</span>', re.S)

# Captures the full image URL. The dot before "jpg" is escaped and the match
# cannot run past the closing quote of the src attribute.
_IMAGE_RE = re.compile(r'<img class="BDE_Image" src="([^"]+?\.jpg)"')

# Seconds to wait on any single HTTP request before giving up.
_REQUEST_TIMEOUT = 30


class TP:
    """Image scraper for a single Tieba thread."""

    def __init__(self, baseUrl):
        """Remember the thread address.

        baseUrl -- thread URL without a query string,
                   e.g. 'http://tieba.baidu.com/p/5307547413'.
        """
        self.baseURL = baseUrl

    def getPage(self, pageNum):
        """Fetch page ``pageNum`` of the thread.

        Returns the page HTML decoded as UTF-8, or None when the request
        fails (the error is printed, not raised).
        """
        url = self.baseURL + '?pn=' + str(pageNum)
        try:
            response = urllib2.urlopen(urllib2.Request(url),
                                       timeout=_REQUEST_TIMEOUT)
            try:
                return response.read().decode('UTF-8')
            finally:
                # Python 2 responses are not context managers.
                response.close()
        except (urllib2.URLError, socket.timeout) as e:
            # URLError/HTTPError carry ``reason``; a read timeout does not.
            print(u'錯誤 %s' % (getattr(e, 'reason', e),))
            return None

    def getPageNum(self, page):
        """Extract the page count from the HTML of a thread page.

        page -- HTML text of a thread page; when None, page 1 is fetched.

        Returns the captured text with surrounding whitespace stripped, or
        None when the page is unavailable or the markup is not found.
        """
        if page is None:
            page = self.getPage(1)
        if not page:
            return None
        match = _PAGE_COUNT_RE.search(page)
        if match is None:
            return None
        return match.group(1).strip()

    def _findImageUrls(self, html):
        """Return the ``.jpg`` image URLs found in ``html``, in page order."""
        return _IMAGE_RE.findall(html)

    def _saveImage(self, img_url, filename):
        """Download ``img_url`` and write its bytes to ``filename``.

        Raises an IOError subclass on network, HTTP-status or file errors.
        """
        # Imported here so the parsing helpers stay importable when the
        # third-party ``requests`` package is not installed.
        import requests

        response = requests.get(img_url, headers=_IMAGE_HEADERS,
                                timeout=_REQUEST_TIMEOUT)
        # Do not save an HTTP error page under an image file name.
        response.raise_for_status()
        with open(filename, 'wb') as f:
            f.write(response.content)

    def getContent(self, html):
        """Save every image referenced in ``html`` to the working directory.

        html -- HTML text of one thread page (None or empty is tolerated).

        Each file is named after the last path segment of its URL.
        Returns the list of file names written.
        """
        if not html:
            return []
        saved = []
        for img_url in self._findImageUrls(html):
            name = img_url.rsplit('/', 1)[-1]
            self._saveImage(img_url, name)
            saved.append(name)
        return saved

    def start(self):
        """Download the images of every page of the thread, printing progress."""
        indexPage = self.getPage(1)
        pageN = None
        if indexPage is not None:
            pageN = self.getPageNum(indexPage)
        try:
            total = int(pageN)
        except (TypeError, ValueError):
            # No page count found, or it is not a number.
            print("URL error")
            return

        print(u'該帖子有%s頁!' % pageN)
        current = 0
        try:
            for current in range(1, total + 1):
                print(u'正在讀入第%d頁數據' % current)
                # Page 1 was already fetched to read the page count.
                if current == 1:
                    page = indexPage
                else:
                    page = self.getPage(current)
                self.getContent(page)
        except IOError as e:
            print(u'第%d頁數據寫入失敗: %s' % (current, e))
        finally:
            print(u'爬取任務完成^_^')


def main():
    """Prompt for a thread id on stdin and scrape that thread."""
    try:
        read_line = raw_input  # Python 2
    except NameError:
        read_line = input  # Python 3
    print(u'請寫入帖子號碼')
    thread_id = str(read_line(u'http://tieba.baidu.com/p/')).strip()
    baseUrl = 'http://tieba.baidu.com/p/' + thread_id
    pt = TP(baseUrl)
    pt.start()


if __name__ == '__main__':
    main()
問題尚未解決:目前無法翻頁,且圖片下載不出來;明天再檢查語法,仔細梳理邏輯。
爬圖交互界面及翻頁初嘗式