爬蟲03-京東資料採集
阿新 • 發佈:2018-11-01
import time

from selenium import webdriver
from bs4 import BeautifulSoup


def main():
    """Scrape product titles and prices from a JD.com search results page.

    Opens the page in Chrome via Selenium, scrolls to the bottom several
    times so lazily loaded items render, then parses the final DOM with
    BeautifulSoup and prints each item's name and price.
    """
    # JD search results for keyword "手機" (mobile phone), page 1.
    url = "https://search.jd.com/Search?keyword=%E6%89%8B%E6%9C%BA&enc=utf-8&qrst=1&rt=1&stop=1&vt=2&wq=%E6%89%8B%E6%9C%BA&cid2=653&cid3=655&page=1&s=1&click=0"
    driver = webdriver.Chrome()
    driver.implicitly_wait(3)
    try:
        driver.get(url)
        # Simulate scrolling to the bottom so JS-lazy-loaded goods appear.
        for _ in range(1, 5):
            driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            time.sleep(1)
        # Hand the fully rendered page source to bs4 for parsing.
        soup = BeautifulSoup(driver.page_source, "html.parser")
        # Extract product info (name, price) from each ".gl-item" card.
        for info in soup.select(".gl-item"):
            title = info.select(".p-name.p-name-type-2 a")[0].text.strip()
            price = info.select(".p-price")[0].text.strip()
            print(title)
            print(price)
    finally:
        # quit() closes all windows AND terminates the chromedriver process;
        # the original's close() was also skipped entirely on any exception.
        driver.quit()


if __name__ == "__main__":
    main()
import json

import requests
from bs4 import BeautifulSoup


def check(items):
    """Return *items* unchanged, or a placeholder when it is empty.

    Used for the shop/publisher text, which may be an empty string for
    self-operated listings.
    """
    return items if items else "No Public House"


def got_html(url):
    """Fetch *url* and return the decoded HTML text.

    BUG FIX: the original shadowed the ``url`` parameter with a hard-coded
    page-less search URL, so every call fetched the same first page.
    """
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0) AppleWebKit/537.36 (KHTML, like Gecko) '
                             'Chrome/69.0.3497.100 Safari/537.36'}
    response = requests.get(url, headers=headers)
    return response.content.decode()


def parse_html(html):
    """Extract price, name, comment count and shop for each search item.

    Returns a flat list: [price, name, comments, shop, price, name, ...].
    Items missing a node yield an empty string instead of raising
    IndexError (the original crashed on any incomplete listing).
    """
    soup = BeautifulSoup(html, 'lxml')
    item_list = soup.select('ul[class="gl-warp clearfix"] li')
    print(len(item_list))
    result = []
    for item in item_list:
        sku = item.attrs['data-sku']
        # Price node: CSS class ".J_<sku>" wraps an <i> with the number.
        nodes = item.select('.J_' + sku + ' i')
        result.append(nodes[0].get_text() if nodes else '')
        # Book title.
        nodes = item.select('div[class="p-name p-name-type-2"]')
        result.append(nodes[0].get_text().strip() if nodes else '')
        # Comment count, anchored by id "#J_comment_<sku>".
        nodes = item.select('#J_comment_' + sku)
        result.append(nodes[0].get_text().strip() if nodes else '')
        # Publisher/shop name; may be absent for self-operated goods.
        nodes = item.select('div > div.p-shop > span > a')
        public = nodes[0].get_text().strip() if nodes else ''
        result.append(check(public))
    return result


def save_data(data):
    """Append *data* to Java_book.json as one JSON document per line.

    BUG FIX: the original concatenated JSON arrays with no separator,
    producing a file no JSON parser could read back; newline-delimited
    JSON (NDJSON) keeps each page's result independently parseable.
    """
    with open('Java_book.json', 'a', encoding='utf-8') as f:
        f.write(json.dumps(data, ensure_ascii=False) + '\n')


def main():
    """Crawl the first 10 result pages for keyword 'Java' and save them."""
    url_start = 'https://search.jd.com/Search?keyword=Java&enc=utf-8&qrst=1&rt=1&stop=1&vt=2&page='
    url_end = '&s=58&click=0'
    for page in range(1, 11):
        url = url_start + str(page) + url_end
        html = got_html(url)
        details = parse_html(html)
        save_data(details)


if __name__ == '__main__':
    # BUG FIX: the original defined main() but never called it.
    main()