python AjaxSpider 代碼演示
阿新 • • 發佈:2018-10-21
orm art pymongo ces insert numbers == cli script
"""Toutiao AJAX image spider.

Queries the Toutiao search endpoint page by page, follows every article URL
it returns, extracts the image gallery embedded in the article page,
downloads the images into the current working directory and stores the
article metadata in MongoDB.

Settings (MONGO_URL, MONGO_DB, MONGO_TABLE, KEYWORD, GROUP_START, GROUP_END)
come from the local ``config`` module.
"""
import json  # decode the search API response and the embedded gallery JSON
import os  # working directory / file-existence checks when saving images
import re  # locate the gallery JSON inside the article page
from hashlib import md5  # content hash used as the image file name
from json.decoder import JSONDecodeError  # raised on malformed JSON
from multiprocessing import Pool  # process pool: one task per search offset
from urllib.parse import urlencode  # build the query string

import pymongo  # MongoDB client
import requests  # HTTP client
from bs4 import BeautifulSoup  # HTML parsing
from requests.exceptions import RequestException  # base class of requests errors

from config import *  # database and crawl settings

# connect=False defers the connection until first use, so every worker
# process of the pool opens its own connection after the fork.
client = pymongo.MongoClient(MONGO_URL, connect=False)
db = client[MONGO_DB]

# Seconds to wait for any single HTTP request; without a timeout a stalled
# server would block a worker forever.
_REQUEST_TIMEOUT = 10

# The gallery is embedded in the page as ``var gallery = {...};``.  The
# terminating ';' is required: a lazy group with nothing after it always
# matches the empty string.
_GALLERY_PATTERN = re.compile(r'var gallery = (.*?);', re.S)


def get_page_index(offset, keyword):
    """Fetch one page of search results.

    Args:
        offset: Result offset of the page to request.
        keyword: Search keyword.

    Returns:
        The response body as text, or None when the request fails or the
        status code is not 200.
    """
    data = {
        'offset': offset,  # result offset (paging)
        'format': 'json',  # response format
        'keyword': keyword,  # search keyword
        'autoload': 'true',
        'count': '20',
        'cur_tab': 3,
    }
    url = 'http://www.toutiao.com/search_content/?' + urlencode(data)
    try:
        response = requests.get(url, timeout=_REQUEST_TIMEOUT)
    except RequestException:
        print('請求索引出錯')
        return None
    if response.status_code == 200:
        return response.text
    return None


def parse_page_index(html):
    """Yield the article URLs found in a search-result JSON document.

    Args:
        html: JSON text returned by get_page_index, or None.

    Yields:
        Each non-empty ``article_url`` value of the ``data`` list.
    """
    if not html:
        # The index request failed; there is nothing to parse.
        return
    try:
        data = json.loads(html)
    except JSONDecodeError:
        return
    if not isinstance(data, dict):
        return
    # 'data' may be missing or null in the response.
    for item in data.get('data') or []:
        article_url = item.get('article_url')
        if article_url:
            yield article_url


def get_page_detail(url):
    """Fetch an article page.

    Args:
        url: Article URL.

    Returns:
        The page HTML as text, or None when the request fails or the
        status code is not 200.
    """
    try:
        response = requests.get(url, timeout=_REQUEST_TIMEOUT)
    except RequestException:
        print('請求詳情頁出錯', url)
        print(url)
        return None
    if response.status_code == 200:
        return response.text
    return None


def parse_page_detail(html, url):
    """Extract title and gallery images from an article page and download them.

    Args:
        html: Article page HTML.
        url: URL the page was fetched from (stored in the result).

    Returns:
        A dict with keys 'title', 'url' and 'images' when the page embeds
        a gallery with 'sub_images'; otherwise None.
    """
    soup = BeautifulSoup(html, 'lxml')
    title_tags = soup.select('title')
    # Tolerate pages without a <title> element.
    title = title_tags[0].get_text() if title_tags else ''
    print(title)

    match = _GALLERY_PATTERN.search(html)
    if not match:
        return None
    try:
        data = json.loads(match.group(1))
    except JSONDecodeError:
        # The captured text was not valid JSON; treat as "no gallery".
        return None
    if not isinstance(data, dict) or 'sub_images' not in data:
        return None

    sub_images = data.get('sub_images') or []
    images = [item.get('url') for item in sub_images if item.get('url')]
    for image in images:
        download_image(image)
    return {
        'title': title,
        'url': url,
        'images': images,
    }


def save_to_monogo(result):
    """Insert one parsed article into the configured MongoDB collection.

    Args:
        result: Document produced by parse_page_detail.

    Returns:
        True when the document was inserted, False otherwise.
    """
    # Collection.insert was removed in PyMongo 4; insert_one is its
    # single-document replacement.
    if db[MONGO_TABLE].insert_one(result).inserted_id is not None:
        print('存儲到MonogoDB成功', result)
        return True
    return False


def download_image(url):
    """Download one image and save it to disk.

    Args:
        url: Image URL.

    Returns:
        None in every case; failures are reported on stdout.
    """
    print('正在下載', url)
    try:
        response = requests.get(url, timeout=_REQUEST_TIMEOUT)
    except RequestException:
        print('請求圖片出錯出錯', url)
        return None
    if response.status_code == 200:
        save_image(response.content)
    return None


def save_image(content):
    """Write image bytes to ``<cwd>/<md5 of content>.jpg``.

    The file name is the MD5 of the content, so an identical image that
    was already saved is not written again.

    Args:
        content: Raw image bytes.
    """
    file_path = '{0}/{1}.{2}'.format(os.getcwd(), md5(content).hexdigest(), 'jpg')
    if not os.path.exists(file_path):
        with open(file_path, 'wb') as f:
            f.write(content)


def main(offset):
    """Crawl one search-result page: fetch, parse, download and store.

    Args:
        offset: Result offset of the search page to process.
    """
    index_html = get_page_index(offset, KEYWORD)
    for url in parse_page_index(index_html):
        detail_html = get_page_detail(url)
        if not detail_html:
            continue
        result = parse_page_detail(detail_html, url)
        if result:
            save_to_monogo(result)
            print(result)


if __name__ == '__main__':
    # The search API pages in steps of 20 results.
    groups = [x * 20 for x in range(GROUP_START, GROUP_END + 1)]
    pool = Pool()
    pool.map(main, groups)
    pool.close()
    pool.join()
python AjaxSpider 代碼演示