通過分析ajax,使用正則表示式爬取今日頭條
阿新 • • 發佈:2018-12-11
今日頭條是一個動態載入頁面的網站,這一類的網站直接使用requests爬取的話得不到我們想要的內容。所以這類網站一般需要通過分析ajax請求來獲取我們想要的內容。
老規矩,首先列出需要引入的庫:
# Standard library
import json
import os
import re
from hashlib import md5
from json.decoder import JSONDecodeError
from multiprocessing import Pool
from urllib.parse import urlencode

# Third party
import pymongo
import requests
from bs4 import BeautifulSoup
from requests.exceptions import ConnectionError

# Local settings: expected to provide MONGO_URL, MONGO_DB, MONGO_TABLE,
# KEYWORD, GROUP_START, GROUP_END.
# NOTE(review): wildcard import kept from the original tutorial; an explicit
# import list would be safer.
from config import *
通過分析瀏覽器開發者工具的network面板獲取引數資料,然後使用urlencode編碼獲得想要的url,然後對此url發出請求:
# Query parameters observed in the browser's developer tools for the
# search_content endpoint.
data = {
    'autoload': 'true',
    'count': 20,
    'cur_tab': 3,
    'format': 'json',
    'keyword': keyword,
    'offset': offset,
}
params = urlencode(data)
base = 'http://www.toutiao.com/search_content/'
# Glue the endpoint and the encoded query string together.
url = '{0}?{1}'.format(base, params)
然後解析網頁獲取圖片,即是我們想要的內容,這裡使用的是re正則表示式來進行匹配,通過分析網頁程式碼寫出正則表示式:
# Matches the gallery JSON embedded in the page as: gallery: JSON.parse("...")
# Raw string avoids backslash double-escaping; the dot in "JSON.parse" is
# escaped so it matches a literal '.' instead of any character; re.S lets
# '.*' span newlines because the embedded JSON may wrap across lines.
images_pattern = re.compile(r'gallery: JSON\.parse\("(.*)"\)', re.S)
最後把他們放入到一個完整的函式,即可獲得我們想要的內容,具體程式碼如下:
import json
import os
import re
from hashlib import md5
from json.decoder import JSONDecodeError
from multiprocessing import Pool
from urllib.parse import urlencode

import pymongo
import requests
from bs4 import BeautifulSoup
from requests.exceptions import ConnectionError

from config import *

# MongoDB storage configuration. connect=False defers the actual connection
# so the client is safe to share across the worker processes forked by Pool.
client = pymongo.MongoClient(MONGO_URL, connect=False)
db = client[MONGO_DB]


def get_page_index(offset, keyword):
    """Request one page of search results; return the response text or None."""
    # Query parameters observed in the browser's network panel.
    data = {
        'autoload': 'true',
        'count': 20,
        'cur_tab': 3,
        'format': 'json',
        'keyword': keyword,
        'offset': offset,
    }
    params = urlencode(data)
    base = 'http://www.toutiao.com/search_content/'
    url = base + '?' + params
    try:
        response = requests.get(url)
        if response.status_code == 200:
            return response.text
        return None
    except ConnectionError:
        print('Error occurred')
        return None


def download_image(url):
    """Download a single image and pass its bytes to save_image()."""
    print('Downloading', url)
    try:
        response = requests.get(url)
        if response.status_code == 200:
            # Delegate persistence to the image-saving helper.
            save_image(response.content)
        return None
    except ConnectionError:
        return None


def save_image(content):
    """Save image bytes to the current directory, named by content hash."""
    # md5 of the bytes gives a stable, content-derived filename, which also
    # deduplicates identical images.
    file_path = '{0}/{1}.{2}'.format(os.getcwd(), md5(content).hexdigest(), 'jpg')
    print(file_path)
    # Skip writing if an identical image was already saved.
    if not os.path.exists(file_path):
        with open(file_path, 'wb') as f:
            f.write(content)


def parse_page_index(text):
    """Yield article URLs from one JSON page of search results."""
    try:
        data = json.loads(text)
        if data and 'data' in data.keys():
            for item in data.get('data'):
                yield item.get('article_url')
    except JSONDecodeError:
        pass


def get_page_detail(url):
    """Fetch an article detail page; return its HTML text or None."""
    try:
        response = requests.get(url)
        if response.status_code == 200:
            return response.text
        return None
    except ConnectionError:
        print('Error occurred')
        return None


def parse_page_detail(html, url):
    """Extract the title and gallery image URLs from a detail page.

    Downloads each image as a side effect. Returns a dict with keys
    'title', 'url' and 'images', or None when no gallery is found.
    """
    if html is None:
        # Upstream request failed; nothing to parse.
        return None
    soup = BeautifulSoup(html, 'lxml')
    result = soup.select('title')
    title = result[0].get_text() if result else ''
    # The gallery data is embedded as an escaped JSON string inside a
    # JSON.parse(...) call in the page's JavaScript.
    images_pattern = re.compile(r'gallery: JSON\.parse\("(.*)"\)', re.S)
    result = re.search(images_pattern, html)
    if result:
        # Strip the JavaScript backslash escapes before parsing the JSON.
        data = json.loads(result.group(1).replace('\\', ''))
        if data and 'sub_images' in data.keys():
            sub_images = data.get('sub_images')
            images = [item.get('url') for item in sub_images]
            for image in images:
                download_image(image)
            return {
                'title': title,
                'url': url,
                'images': images,
            }


def save_to_mongo(result):
    """Insert one result document into MongoDB; return True on success."""
    # insert_one is the modern pymongo API (collection.insert is deprecated).
    if db[MONGO_TABLE].insert_one(result):
        print('Successfully Saved to Mongo', result)
        return True
    return False


def main(offset):
    """Process one page of search results identified by its offset."""
    text = get_page_index(offset, KEYWORD)
    if text is None:
        # Index request failed; json.loads(None) would raise TypeError.
        return
    for url in parse_page_index(text):
        if not url:
            # Some entries carry no article_url; skip them.
            continue
        html = get_page_detail(url)
        result = parse_page_detail(html, url)
        if result:
            save_to_mongo(result)


if __name__ == '__main__':
    pool = Pool()
    # One offset per results page, 20 results each.
    groups = [x * 20 for x in range(GROUP_START, GROUP_END + 1)]
    pool.map(main, groups)
    pool.close()
    pool.join()