同步爬取天虹商城圖片連結
阿新 • 發佈:2018-11-09
import re
import time

import requests
from bs4 import BeautifulSoup

# Site root, prepended to the relative links found in the pages.
BASE_URL = 'http://www.tianhong.cn'
# Category landing page whose pager is read to get the number of list pages.
CATEGORY_URL = 'http://www.tianhong.cn/list-5835.html'
# Paged product list; the 1-based page number is appended to this prefix.
PRODUCT_LIST_URL = (
    'http://www.tianhong.cn/catalog/product_list.html'
    '?categoryId=5835&districtCode=100005&orderType=1'
    '&justDisplayInventory=0&justDisplayBySelfSupport=0'
    '&minSalePrice=0&maxSalePrice=0&pager.pageNumber='
)
# Seconds to wait for a response; without a timeout a single stalled
# connection would block the whole run indefinitely.
REQUEST_TIMEOUT = 10

# Compiled once instead of on every loop iteration.
# Matches the href of an anchor whose next attribute is ``tag``.
_COMMODITY_HREF_RE = re.compile(r'a href="([^"]*)" tag=')
# Absolute URLs embedded in double- / single-quoted strings of an anchor.
_DOUBLE_QUOTED_URL_RE = re.compile(r'"(http.*?)"')
_SINGLE_QUOTED_URL_RE = re.compile(r"'(http.*?)'")


def _get_soup(url):
    """Fetch *url* and return its body parsed with ``html.parser``."""
    response = requests.get(url, timeout=REQUEST_TIMEOUT)
    return BeautifulSoup(response.text, 'html.parser')


def get_url(url):
    """Check whether a link opens normally and print the outcome.

    Prints the URL followed by ``success`` when the response status is 200
    and ``fail`` otherwise. A request that cannot be completed (connection
    error, timeout, malformed URL) is also reported as ``fail`` so that one
    broken link does not abort the whole check.
    """
    print('%s' % url)
    try:
        response = requests.get(url, timeout=REQUEST_TIMEOUT)
    except requests.RequestException:
        print('fail')
        return
    if response.status_code == 200:
        print('success')
    else:
        print('fail')


def get_page_max():
    """Return the highest page number of the category, as a string.

    The value is the text of the third-from-last anchor inside
    ``div.thPages`` (presumably the last numbered link before the
    navigation links -- confirm against the live page).

    Raises:
        AttributeError: if the page has no ``div.thPages``.
        IndexError: if the pager holds fewer than three anchors.
    """
    page_soup = _get_soup(CATEGORY_URL)
    pager_links = page_soup.find('div', class_='thPages').find_all('a')
    return pager_links[-3].text


def get_main_html_pageurl(url):
    """Return the banner and logo image links of the page at *url*.

    Returns:
        A two-item list: the ``src`` of the image in ``div.topbanner`` as
        found in the page, and the ``src`` of the image in ``div.logo``
        prefixed with the site root.
    """
    page = _get_soup(url)
    banner_src = page.find('div', class_='topbanner').find('img').get('src')
    logo_src = page.find('div', class_='logo').find('img').get('src')
    return [banner_src, BASE_URL + logo_src]


def get_main_pictureurl(url):
    """Return the product image links listed on the page at *url*.

    Reads the ``src`` attribute of every ``<img>`` inside ``ul.spList``.
    The attribute is taken from the parsed tag rather than matched with a
    greedy regular expression on the tag's text, which over-captured when
    several attributes followed ``src`` and failed when ``src`` was the
    last attribute. Images without a ``src`` attribute are skipped.
    """
    page = _get_soup(url)
    images = page.find('ul', class_='spList').find_all('img')
    picture_urls = []
    for image in images:
        src = image.get('src')
        if src is not None:
            picture_urls.append(src)
    return picture_urls


def get_commodity_url(url):
    """Return the product links listed on the page at *url*.

    Only anchors inside ``ul.spList`` whose ``href`` is immediately
    followed by a ``tag`` attribute are collected; the hrefs are returned
    exactly as they appear in the page.
    """
    page = _get_soup(url)
    anchors = page.find('ul', class_='spList').find_all('a')
    commodity_urls = []
    for anchor in anchors:
        # ``[^"]*`` keeps the capture inside the href's own quotes.
        commodity_urls.extend(_COMMODITY_HREF_RE.findall(str(anchor)))
    return commodity_urls


def get_Details_url(url):
    """Return the image links found on the product detail page at *url*.

    Collects, per anchor inside ``div.m1l``, every double-quoted and then
    every single-quoted string starting with ``http``; afterwards appends
    the ``src`` of every ``<img>`` inside ``div.box`` (``None`` when an
    image has no ``src``).
    """
    page = _get_soup(url)
    detail_urls = []
    for anchor in page.find('div', class_='m1l').find_all('a'):
        markup = str(anchor)
        detail_urls.extend(_DOUBLE_QUOTED_URL_RE.findall(markup))
        detail_urls.extend(_SINGLE_QUOTED_URL_RE.findall(markup))
    for image in page.find('div', class_='box').find_all('img'):
        detail_urls.append(image.get('src'))
    return detail_urls


def run_main():
    """Check every list page, its images, its products and their images.

    For each page of the category: checks the list page itself, the
    banner/logo and product images on it, then every product link and
    every image link on that product's detail page. Prints a progress
    line per page and the total elapsed seconds at the end.
    """
    start = time.time()
    for page_number in range(1, int(get_page_max()) + 1):
        url = PRODUCT_LIST_URL + str(page_number)
        get_url(url)
        # Image links on the list page.
        for picture_url in get_main_html_pageurl(url) + get_main_pictureurl(url):
            get_url(picture_url)
        # Product links.
        for commodity_path in get_commodity_url(url):
            commodity_url = BASE_URL + commodity_path
            get_url(commodity_url)
            # Image links on the product detail page.
            for detail_url in get_Details_url(commodity_url):
                get_url(detail_url)
        print('完成第', page_number, '頁')
    end = time.time()
    print(end - start)


if __name__ == '__main__':
    run_main()