Crawling JD.com Data
阿新 · Published: 2018-11-26
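This post shares a small crawler for JD.com search results, built with Selenium (driving PhantomJS) and PyQuery. It searches for 空氣淨化器 (air purifiers), pages through the results, pulls each product's rank, price, title, brand, comment count, and good-review rate, and appends everything to data.txt as JSON. The MongoDB persistence code is left commented out.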
import re
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from pyquery import PyQuery as pq
# from config import *
import pymongo
import json
import time
from multiprocessing import Pool, Process
from threading import Thread
# MONGO_URL = 'localhost'
# MONGO_DB = 'taobao'
# MONGO_TABLE = 'product'
SERVICE_ARGS = ['--load-images=false', '--disk-cache=true', '--ignore-ssl-errors=true', '--ssl-protocol=TLSv1']
# KEYWORD = '美食'
# client = pymongo.MongoClient(MONGO_URL)
# db = client[MONGO_DB]
# PhantomJS works with the Selenium version current in 2018; see the note at the end for a headless-Chrome alternative
browser = webdriver.PhantomJS(service_args=SERVICE_ARGS)
# browser.implicitly_wait(15)
# browser = webdriver.PhantomJS()
wait = WebDriverWait(browser, 15)
browser.set_window_size(1400, 3000)
x = 1  # global rank counter, shared across pages
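# Rate owns its own PhantomJS instance and, given a product's comment-page URL,
# extracts the good-review percentage. The rating block is rendered lazily, so
# if it is missing from the initial page source we scroll down and wait for it.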
class Rate:
    def __init__(self):
        SERVICE_ARGS = ['--load-images=false', '--disk-cache=true', '--ignore-ssl-errors=true', '--ssl-protocol=TLSv1']
        self.browser = webdriver.PhantomJS(service_args=SERVICE_ARGS)
        self.wait = WebDriverWait(self.browser, 15)

    def get_good_rate(self, url):
        # overly long URLs are not standard product links; skip them
        if len(url) > 52:
            return -1
        self.browser.get(url)
        doc = pq(self.browser.page_source)
        selector = '#comment > div.mc > div.comment-info.J-comment-info > div.comment-percent > div'
        if not doc(selector):
            print('rating not rendered yet, scrolling')
            # the rating block is lazy-loaded; scroll down to trigger it
            self.browser.execute_script("window.scrollBy(0,6000)")
            time.sleep(2)
        else:
            print('rating found')
            return doc(selector).text()
        rate = self.wait.until(
            EC.presence_of_element_located((By.CSS_SELECTOR, selector))
        )
        return rate.text
r = Rate()
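# search() opens jd.com, submits the keyword, waits for the first 30 result
# items to render, scrapes them, and returns the total page count from the pager.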
def search():
    global x
    global r
    print('Searching')
    try:
        browser.get('https://www.jd.com')
        input = wait.until(
            EC.presence_of_element_located((By.CSS_SELECTOR, '#key'))
        )
        submit = wait.until(
            EC.element_to_be_clickable((By.CSS_SELECTOR, '#search > div > div.form > button')))
        input.send_keys('空氣淨化器')  # search keyword: "air purifier"
        submit.click()
        # wait until at least 30 result items have rendered
        wait.until(
            EC.presence_of_element_located((By.CSS_SELECTOR, '#J_goodsList > ul > li:nth-child(30)'))
        )
        doc = pq(browser.page_source)
        items = doc('.gl-item')
        print(len(items))
        data = []
        for item in items:
            item = pq(item)
            print(x)
            product = {
                'rank': x,
                'price': item('.p-price i').text(),
                'title': item('.p-name em').text(),
                'comment_cnt': item('.p-commit>strong a').text(),
                'comment_url': 'https:' + item('.p-commit>strong a').attr.href
            }
            product['brand'] = product['title'].split('\n')[0]
            product['good_rate'] = r.get_good_rate(product['comment_url'])
            data.append(product)
            x += 1
        with open('data.txt', 'a', encoding='utf-8') as f:
            f.write(json.dumps(data, indent=2, ensure_ascii=False))
        # return the total page count shown in the pager so main() can iterate;
        # this selector is an assumption about JD's pager markup and may need adjusting
        total = wait.until(
            EC.presence_of_element_located((By.CSS_SELECTOR, '#J_bottomPage > span.p-skip > em:nth-child(1) > b'))
        )
        return total.text
    except TimeoutException:
        return False
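# next_page() navigates by typing the page number into the pager's jump box
# rather than clicking "next", so a failed page can simply be retried.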
def next_page(page_number):
    global x
    global r
    print('Turning to page', page_number)
    try:
        # jump directly to the target page via the pager's input box
        input = wait.until(
            EC.presence_of_element_located((By.CSS_SELECTOR, '#J_bottomPage > span.p-skip > input'))
        )
        submit = wait.until(EC.element_to_be_clickable(
            (By.CSS_SELECTOR, '#J_bottomPage > span.p-skip > a')))
        input.clear()
        input.send_keys(page_number)
        submit.click()
        wait.until(
            EC.presence_of_element_located((By.CSS_SELECTOR, '#J_goodsList > ul > li:nth-child(30)'))
        )
        # only the first 30 items render up front; scrolling further would lazy-load items 31-60
        doc = pq(browser.page_source)
        items = doc('.gl-item')
        print(len(items))
        data = []
        for item in items:
            item = pq(item)
            print(x)
            product = {
                'rank': x,
                'price': item('.p-price i').text(),
                'title': item('.p-name em').text(),
                'comment_cnt': item('.p-commit>strong a').text(),
                'comment_url': 'https:' + item('.p-commit>strong a').attr.href
            }
            product['brand'] = product['title'].split('\n')[0]
            product['good_rate'] = r.get_good_rate(product['comment_url'])
            data.append(product)
            x += 1
        with open('data.txt', 'a', encoding='utf-8') as f:
            f.write(json.dumps(data, indent=2, ensure_ascii=False))
    except Exception as e:
        print(e)
        # retry the same page on any failure (beware: unbounded recursion)
        next_page(page_number)
# def save_to_mongo(result):
#     try:
#         if db[MONGO_TABLE].insert(result):
#             print('Saved to MongoDB', result)
#     except Exception:
#         print('Failed to save to MongoDB', result)
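# main() runs the first search, parses the total page count out of its return
# value, and then crawls every remaining page sequentially.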
def main():
    try:
        total = search()
        total = int(re.compile(r'(\d+)').search(total).group(1))
        for i in range(2, total + 1):
            next_page(i)
    except Exception as e:
        print('Something went wrong')
        print(e)
    finally:
        browser.close()
if __name__ == '__main__':
    # main()
    search()
    for i in range(2, 5):
        # start() followed immediately by join() makes the pages run one at a
        # time; the single shared browser cannot serve concurrent threads anyway
        t = Thread(target=next_page, args=(i,))
        t.start()
        t.join()
        # next_page(i)
        # p = Process(target=next_page, args=(i,))
        # p.start()
        # p.join()
    # pool = Pool()
    # pool.map(next_page, [i for i in range(2, 101)])
    # pool.close()
    # pool.join()
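One closing note: PhantomJS has since been deprecated and removed from newer Selenium releases. Headless Chrome is the usual replacement; the sketch below is a minimal, untested substitution for the webdriver.PhantomJS setup above, assuming Selenium 3.14+ and a chromedriver binary on your PATH.

options = webdriver.ChromeOptions()
options.add_argument('--headless')
options.add_argument('--disable-gpu')
# roughly equivalent to --load-images=false in the PhantomJS service args
options.add_argument('--blink-settings=imagesEnabled=false')
browser = webdriver.Chrome(options=options)  # the options keyword needs Selenium 3.14+
browser.set_window_size(1400, 3000)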