1. 程式人生 > >python AjaxSpider 代碼演示

python AjaxSpider 代碼演示

orm art pymongo ces insert numbers == cli script



import re # 引入正則表達式
import json #  引入 json
import pymongo # 引入mongo數據庫
import requests # 引入HTTP請求協議
from hashlib import md5 # 引入MD5
from bs4 import BeautifulSoup #引入BeautifulSoup 信息查詢框架
from multiprocessing import Pool # 引入 多線程池
from urllib.parse import urlencode #引入網頁解析
from json.decoder import JSONDecodeError #引入json錯誤異常
from requests.exceptions import RequestException #引入 HTTP異常

from config import * #導入數據庫配置信息

# Module-level MongoDB handles used by the storage function further down.
# MONGO_URL and MONGO_DB are not defined in this file; presumably they come
# from the star-import of the config module -- verify against config.py.
# NOTE(review): connect=False looks intended to postpone connecting until the
# first operation, so that worker processes do not share a connection made in
# the parent -- confirm against the PyMongo documentation.
client = pymongo.MongoClient(MONGO_URL,connect=False)
db = client[MONGO_DB]

# Fetch the search index.
def get_page_index(offset, keyword):
    """Request one page of search results and return the raw response body.

    Args:
        offset: Result offset sent as the ``offset`` query parameter.
        keyword: Search term sent as the ``keyword`` query parameter.

    Returns:
        The response text when the server answers with HTTP 200, otherwise
        None (non-200 status, or the request itself failed).
    """
    # Query-string parameters for the search endpoint.
    params = {
        'offset': offset,      # was misspelled 'office', so paging never took effect
        'format': 'json',
        'keyword': keyword,    # pass the caller's term, not the literal string 'keyword'
        'autoload': 'true',
        'count': '20',
        'cur_tab': 3,
    }
    url = 'http://www.toutiao.com/search_content/?' + urlencode(params)
    try:
        # A timeout keeps a stalled connection from hanging the caller forever;
        # a timeout error is a RequestException and is handled below.
        response = requests.get(url, timeout=10)
    except RequestException:
        print('請求索引出錯')
        return None
    if response.status_code == 200:
        return response.text
    return None

def parse_page_index(html):
    """Yield the article URLs listed in a search-index JSON document.

    Args:
        html: JSON text of an index page, or None/empty when the page could
            not be fetched.

    Yields:
        Each non-empty ``article_url`` found in the entries of the top-level
        ``data`` list. Nothing is yielded when the input is missing, is not
        valid JSON, or does not have the expected shape.
    """
    # A failed fetch hands us None; json.loads(None) would raise TypeError.
    if not html:
        return
    try:
        payload = json.loads(html)
    except JSONDecodeError:
        return
    if not isinstance(payload, dict):
        return
    # "data" may be absent or null; treat both as an empty result list.
    for item in payload.get('data') or []:
        if not isinstance(item, dict):
            continue
        article_url = item.get('article_url')
        # Entries without a URL cannot be fetched, so skip them.
        if article_url:
            yield article_url

def get_page_detail(url):
    """Download a single article page.

    Args:
        url: Address of the article page.

    Returns:
        The response text when the server answers with HTTP 200, otherwise
        None (non-200 status, or the request itself failed).
    """
    try:
        # A timeout keeps a stalled connection from hanging the caller forever;
        # a timeout error is a RequestException and is handled below.
        response = requests.get(url, timeout=10)
    except RequestException:
        print('請求詳情頁出錯', url)
        return None
    if response.status_code == 200:
        return response.text
    return None


def parse_page_detail(html, url):
    """Extract the title and gallery image URLs from an article page.

    Looks for an inline ``var gallery = {...}`` assignment in the page source,
    decodes the JSON value that follows it, and downloads every image listed
    under ``sub_images``.

    Args:
        html: Source of the article page.
        url: Address the page was fetched from; copied into the result.

    Returns:
        A dict with keys ``title``, ``url`` and ``images`` when the page has a
        gallery with a ``sub_images`` entry, otherwise None.
    """
    soup = BeautifulSoup(html, 'lxml')
    title_tags = soup.select('title')
    # A page without a <title> used to raise IndexError here.
    title = title_tags[0].get_text() if title_tags else ''
    print(title)

    marker = re.search(r'var gallery = \s*', html)
    if not marker:
        return None
    try:
        # raw_decode reads exactly one JSON value and ignores what follows it.
        # The former pattern '(.*?)' always captured an empty string, so the
        # gallery was never decoded.
        gallery, _ = json.JSONDecoder().raw_decode(html[marker.end():])
    except JSONDecodeError:
        return None
    if not isinstance(gallery, dict) or 'sub_images' not in gallery:
        return None

    # Entries without a url cannot be downloaded, so they are skipped.
    images = [
        item.get('url')
        for item in gallery.get('sub_images') or []
        if isinstance(item, dict) and item.get('url')
    ]
    for image in images:
        download_image(image)
    return {
        'title': title,
        'url': url,
        'images': images,
    }

def save_to_monogo(result):
    """Store one parsed article in the configured MongoDB collection.

    Args:
        result: Document (dict) to insert.

    Returns:
        True when the insert produced a document id, otherwise False.
    """
    # insert_one replaces Collection.insert, which is deprecated and no longer
    # available in PyMongo 4.
    outcome = db[MONGO_TABLE].insert_one(result)
    if outcome.inserted_id is not None:
        print('存儲到MonogoDB成功', result)
        return True
    return False

def download_image(url):
    """Download one image and hand its bytes to save_image.

    Args:
        url: Address of the image.

    Returns:
        Always None; the image is written to disk as a side effect when the
        server answers with HTTP 200.
    """
    print('正在下載', url)
    try:
        # A timeout keeps a stalled connection from hanging the caller forever;
        # a timeout error is a RequestException and is handled below.
        response = requests.get(url, timeout=10)
    except RequestException:
        print('請求圖片出錯出錯', url)
        return None
    if response.status_code == 200:
        # Use the raw bytes (not .text): image data is binary.
        save_image(response.content)
    return None

def save_image(content):
    """Write image bytes to ``<cwd>/<md5 of content>.jpg`` unless it exists.

    Naming the file after the MD5 digest of its content means identical
    images map to the same path and are written only once.

    Args:
        content: Raw image bytes.
    """
    # Local import: the file's import block does not bring in os.
    import os

    file_name = '{0}.{1}'.format(md5(content).hexdigest(), 'jpg')
    file_path = os.path.join(os.getcwd(), file_name)
    if not os.path.exists(file_path):
        # The with-block closes the file; no explicit close() is needed.
        with open(file_path, 'wb') as f:
            f.write(content)


def main(offset):
    """Crawl one index page and store every gallery article found on it.

    Args:
        offset: Result offset of the index page to crawl.
    """
    index_html = get_page_index(offset, KEYWORD)
    # The index request returns None on failure; stop here instead of passing
    # None on to the parser.
    if not index_html:
        return
    for url in parse_page_index(index_html):
        detail_html = get_page_detail(url)
        if not detail_html:
            continue
        result = parse_page_detail(detail_html, url)
        if result:
            save_to_monogo(result)
        print(result)
if __name__ == '__main__':
    # One offset per group, stepping by 20 to match the 'count' of 20 results
    # requested per index page.
    groups = [x * 20 for x in range(GROUP_START, GROUP_END + 1)]
    # map() blocks until every offset is processed; the context manager then
    # shuts the worker processes down.
    with Pool() as pool:
        pool.map(main, groups)


python AjaxSpider 代碼演示