程式人生 > 爬取京東資料

爬取京東資料

import json
import re
import time
from threading import Thread

from pyquery import PyQuery as pq
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

# PhantomJS launch flags: skip image loading, enable disk cache, and
# tolerate the SSL quirks of older sites.
SERVICE_ARGS = ['--load-images=false', '--disk-cache=true',
                '--ignore-ssl-errors=true', '--ssl-protocol=TLSv1']

# Module-level browser used for the search-result pages.
browser = webdriver.PhantomJS(service_args=SERVICE_ARGS)
wait = WebDriverWait(browser, 15)
browser.set_window_size(1400, 3000)

x = 1  # global running rank, incremented across all scraped pages


class Rate:
    """Fetch the "good rate" (positive-review percentage) from a JD
    product's comment page, using its own dedicated browser instance."""

    # Selector of the good-rate percentage widget on the comment page.
    _RATE_SELECTOR = ('#comment > div.mc > div.comment-info.J-comment-info'
                      ' > div.comment-percent > div')

    def __init__(self):
        service_args = ['--load-images=false', '--disk-cache=true',
                        '--ignore-ssl-errors=true', '--ssl-protocol=TLSv1']
        self.browser = webdriver.PhantomJS(service_args=service_args)
        self.wait = WebDriverWait(self.browser, 15)

    def get_good_rate(self, url):
        """Return the good-rate text for *url*, or -1 for over-long URLs.

        URLs longer than 52 characters are presumably sponsored/ad entries
        without a normal comment page, so they are skipped with -1.
        """
        if len(url) > 52:
            return -1
        self.browser.get(url)
        doc = pq(self.browser.page_source)
        if doc(self._RATE_SELECTOR):
            print('yes')
            return doc(self._RATE_SELECTOR).text()
        print('no')
        # The rate widget is lazy-loaded: scroll down to trigger it, then
        # wait for it to appear.  BUGFIX: the original scrolled the
        # module-level `browser` here, but the comment page was loaded in
        # self.browser -- scroll the right instance.
        self.browser.execute_script("window.scrollBy(0,6000)")
        time.sleep(2)
        rate = self.wait.until(
            EC.presence_of_element_located(
                (By.CSS_SELECTOR, self._RATE_SELECTOR))
        )
        return rate.text


r = Rate()


def _scrape_current_page():
    """Parse the search-result page currently loaded in `browser` and
    append its products (as one JSON array) to data.txt.

    Extracted helper: this logic was duplicated verbatim in both
    search() and next_page().
    """
    global x
    doc = pq(browser.page_source)
    items = doc('.gl-item')
    print(len(items))
    data = []
    for item in items:
        item = pq(item)
        print(x)
        product = {
            'rank': x,
            'price': item('.p-price i').text(),
            'title': item('.p-name em').text(),
            'comment_cnt': item('.p-commit>strong a').text(),
            'comment_url': 'https:' + item('.p-commit>strong a').attr.href,
        }
        # The first line of the listing title is the brand name.
        product['brand'] = product['title'].split('\n')[0]
        product['good_rate'] = r.get_good_rate(product['comment_url'])
        data.append(product)
        x += 1
    with open('data.txt', 'a', encoding='utf-8') as f:
        f.write(json.dumps(data, indent=2, ensure_ascii=False))


def search():
    """Open jd.com, search for the keyword, and scrape result page 1.

    Returns:
        The total-page indicator text on success (main() parses the page
        count out of it with a regex), or False on TimeoutException.
        BUGFIX: the original returned None on success, which made main()
        crash on re.search(None).
    """
    print('正在搜尋')
    try:
        browser.get('https://www.jd.com')
        input = wait.until(
            EC.presence_of_element_located((By.CSS_SELECTOR, '#key'))
        )
        print('input')
        submit = wait.until(
            EC.element_to_be_clickable(
                (By.CSS_SELECTOR, '#search > div > div.form > button')))
        input.send_keys('空氣淨化器')
        submit.click()
        # Wait until at least 30 result items have rendered.
        wait.until(
            EC.presence_of_element_located(
                (By.CSS_SELECTOR, '#J_goodsList > ul > li:nth-child(30)'))
        )
        print('..')
        # Total-page count shown in the bottom pager -- TODO confirm this
        # selector against JD's current markup.
        total = wait.until(
            EC.presence_of_element_located(
                (By.CSS_SELECTOR,
                 '#J_bottomPage > span.p-skip > em:nth-child(1) > b'))
        )
        _scrape_current_page()
        return total.text
    except TimeoutException:
        return False


def next_page(page_number, _retries=3):
    """Jump to *page_number* via the bottom page-skip box and scrape it.

    Args:
        page_number: 1-based result-page number to load.
        _retries: internal retry budget; the original recursed without
            bound on any exception, which could overflow the stack.
    """
    global x
    print('正在翻頁', page_number)
    try:
        input = wait.until(
            EC.presence_of_element_located(
                (By.CSS_SELECTOR, '#J_bottomPage > span.p-skip > input'))
        )
        submit = wait.until(EC.element_to_be_clickable(
            (By.CSS_SELECTOR, '#J_bottomPage > span.p-skip > a')))
        input.clear()
        input.send_keys(page_number)
        submit.click()
        wait.until(
            EC.presence_of_element_located(
                (By.CSS_SELECTOR, '#J_goodsList > ul > li:nth-child(30)'))
        )
        print('..')
        _scrape_current_page()
    except Exception as e:
        print(e)
        if _retries > 0:
            next_page(page_number, _retries - 1)


def main():
    """Scrape page 1, parse the total page count, then walk every page."""
    try:
        total = search()
        total = int(re.compile(r'(\d+)').search(total).group(1))
        for i in range(2, total + 1):
            next_page(i)
    except Exception as e:
        print('出錯啦')
        print(e)
    finally:
        browser.close()


if __name__ == '__main__':
    # main()
    search()
    # NOTE(review): start() immediately followed by join() runs the pages
    # strictly sequentially, and all threads would share the single global
    # `browser` anyway -- kept as-is to preserve the original behaviour.
    for i in range(2, 5):
        t = Thread(target=next_page, args=(i,))
        t.start()
        t.join()