使用Selenium爬取淘寶商品
阿新 • • 發佈:2018-12-13
# Scrape Taobao search-result pages for KEYWORD with Selenium (Firefox),
# parse the product cards with PyQuery and store each product in MySQL.
import time
from urllib.parse import quote

import pymysql
from pyquery import PyQuery
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait

KEYWORD = 'iPad'
max_page = 100

browser = webdriver.Firefox()
# Every explicit wait below gives up after 10 seconds with TimeoutException.
wait = WebDriverWait(browser, 10)

# NOTE(review): credentials are hard-coded; consider moving them to configuration.
# charset is set explicitly because the scraped text is Chinese.
db = pymysql.connect(host='localhost', user='root', password='123456789',
                     port=3306, db='spiders', charset='utf8mb4')
cursor = db.cursor()
# "if not exists" lets the script be re-run without failing on the existing table.
# NOTE(review): title varchar(50) / image varchar(100) may be too short for some
# rows; such inserts are now reported by save_to_mysql instead of silently dropped.
cursor.execute(
    'create table if not exists taobao('
    'image varchar(100),price varchar(20),deal varchar(20),'
    'title varchar(50),shop varchar(20),location varchar(20))'
)


def index_page(page, max_retries=3):
    """
    Load one search-result page and scrape the products on it.

    :param page: 1-based page number to fetch
    :param max_retries: how many additional attempts to make after a timeout
        before the page is skipped
    """
    url = 'https://s.taobao.com/search?q=' + quote(KEYWORD)
    for _ in range(max_retries + 1):
        print('正在爬取第', page, '頁')
        try:
            browser.get(url)
            if page > 1:
                # Jump to the requested page through the pager's "go to page" form.
                input1 = wait.until(EC.presence_of_element_located(
                    (By.CSS_SELECTOR, '#mainsrp-pager div.form > input')))
                submit = wait.until(EC.element_to_be_clickable(
                    (By.CSS_SELECTOR, '#mainsrp-pager div.form > span.btn.J_Submit')))
                input1.clear()
                input1.send_keys(str(page))
                submit.click()
                # The highlighted pager item must show the requested page number.
                wait.until(EC.text_to_be_present_in_element(
                    (By.CSS_SELECTOR, '#mainsrp-pager li.item.active > span'), str(page)))
            wait.until(EC.presence_of_element_located(
                (By.CSS_SELECTOR, '.m-itemlist .items .item')))
        except TimeoutException:
            # Bounded retry: the page is reloaded from scratch on the next attempt.
            continue
        # Scrape outside the try block so a retry can never re-insert products
        # that were already saved.
        get_products()
        time.sleep(2)
        return
    print('Giving up on page', page, 'after', max_retries + 1, 'timed-out attempts')


def _image_url(src):
    """Return an absolute image URL for a data-src value ('' when it is missing)."""
    if not src:
        return ''
    if src.startswith(('http://', 'https://')):
        return src
    if src.startswith('//'):
        # Protocol-relative URL: only the scheme is missing.
        return 'http:' + src
    return 'http://' + src


def get_products():
    """
    Extract every product card from the page currently loaded in the browser,
    print it and save it to MySQL.
    """
    doc = PyQuery(browser.page_source)
    for item in doc('#mainsrp-itemlist .items .item').items():
        product = {
            'image': _image_url(item.find('.pic .img').attr('data-src')),
            'price': item.find('strong').text(),
            'deal': item.find('.deal-cnt').text(),
            'title': item.find('[class="row row-2 title"] a').text(),
            'shop': item.find('.shop a').text(),
            'location': item.find('.location').text(),
        }
        print(product)
        save_to_mysql(product)


def save_to_mysql(product):
    """
    Insert one product into the taobao table.

    :param product: dict with the keys image, price, deal, title, shop, location

    A database error rolls the transaction back and is reported; it does not
    stop the crawl.
    """
    sql = ('insert into taobao(image,price,deal,title,shop,location) '
           'values(%s,%s,%s,%s,%s,%s)')
    params = (product['image'], product['price'], product['deal'],
              product['title'], product['shop'], product['location'])
    try:
        cursor.execute(sql, params)
        db.commit()
    except pymysql.MySQLError as exc:
        db.rollback()
        print('Failed to save product:', exc)


def main():
    """
    Crawl result pages 1 through max_page in order.
    """
    for i in range(1, max_page + 1):
        index_page(i)


if __name__ == '__main__':
    try:
        main()
    finally:
        # Release the database handles and close the browser even if the crawl fails.
        cursor.close()
        db.close()
        browser.quit()