程式人生 > 將csdn的文章爬取,並將圖片儲存到本地

將csdn的文章爬取,並將圖片儲存到本地

#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time    : 2018/11/13 10:20
# @Author  : jia.zhao
# @Desc    : 
# @File    : csdn_demo.py
# @Software: PyCharm


import requests
from lxml import etree
import pymysql
import re
import urllib.request
import urllib.error
import hashlib
from html import unescape
import ssl
from log_content import Logger

# Globally disable SSL certificate verification for urllib downloads;
# needed because urlretrieve() below fetches images over HTTPS and some
# CSDN image hosts fail certificate checks. NOTE(review): this weakens
# TLS security process-wide.
ssl._create_default_https_context = ssl._create_unverified_context
class CSDNSpider():
    """Crawl CSDN blog articles per category.

    For every category link on the CSDN front page, fetch the article
    listing, then each article: download referenced images to disk,
    rewrite their ``src`` URLs to local paths, save the article HTML to
    a text file, and insert the article metadata into MySQL.

    Relies on the module-level ``log`` (a Logger instance created in
    ``__main__``) for all logging.
    """

    def __init__(self):
        """Set up the base URL and request headers, then start the crawl.

        BUGFIX: ``self.headers`` is assigned *before* ``start_requests()``
        runs. The crawl can reach ``replace_content()``, which reads
        ``self.headers`` when retrying a 403 image download; the original
        assigned headers afterwards, raising AttributeError on that path.
        """
        self.url = 'https://blog.csdn.net/'
        # Browser-like headers, used only for the 403-retry image download.
        self.headers = {
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
            'Accept-Encoding': 'gzip, deflate, br',
            'Accept-Language': 'zh-CN,zh;q=0.9',
            'Cache-Control': 'max-age=0',
            'Connection': 'keep-alive',
            'Host': 'blog.csdn.net',
            'Upgrade-Insecure-Requests': '1',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36'
        }
        self.start_requests()

    def start_requests(self):
        """Fetch the CSDN front page and crawl every category nav link."""
        xpath_nav = 'div[@class="container clearfix"]/nav[@class="clearfix"]/div[@class="clearfix"]/div[@class="nav_com"]/ul/li/a'
        page = requests.get(self.url)
        nav_html = etree.HTML(page.text)
        # Category URLs and display names come from the same nav anchors.
        nav_urls = nav_html.xpath('//%s/@href' % xpath_nav)
        nav_titles = nav_html.xpath('//%s//text()' % xpath_nav)
        for nav_href, nav_title in zip(nav_urls, nav_titles):
            # The blockchain category lives on its own host; every other
            # nav entry is a relative path joined onto the base URL.
            if nav_href == 'https://blockchain.csdn.net':
                nav_url = nav_href
            else:
                nav_url = self.url + nav_href
            self.content_article(nav_url, nav_title)

    def content_article(self, nav_url, nav_title):
        """Fetch one category's article list, then crawl each article.

        :param nav_url: absolute URL of the category listing page
        :param nav_title: category display name (stored with each article)
        """
        # BUGFIX: plain locals instead of the original ``global`` variables,
        # which leaked results between calls and raised NameError when the
        # first listing page failed to parse.
        blog_list_urls = []
        blog_list_titles = []
        blog_list_authors = []
        blog_list_read_num = []
        blog_list_common_num = []

        page = requests.get(nav_url)
        blog_list_html = etree.HTML(page.text)
        if blog_list_html is None:
            # Nothing parseable on this listing page; skip the category.
            return

        if nav_url != 'https://blockchain.csdn.net':
            # The listing <ul> CSS class is derived from the last URL
            # segment; the front page ('' segment) uses "home".
            ele = nav_url.split('/')[-1] or 'home'
            param = 'ul[@class="feedlist_mod %s"]/li[@class="clearfix"]/div[@class="list_con"]' % ele
            blog_list_urls = blog_list_html.xpath('//%s/div[@class="title"]/h2/a/@href' % param)
            blog_list_titles = blog_list_html.xpath('//%s/div[@class="title"]/h2/a//text()' % param)
            blog_list_authors = blog_list_html.xpath('//%s/dl[@class="list_userbar"]/dd[@class="name"]/a//text()' % param)
            blog_list_read_num = blog_list_html.xpath('//%s/dl[@class="list_userbar"]/div[@class="interactive floatR"]/dd[@class="read_num"]/a/span[@class="num"]//text()' % param)
            blog_list_common_num = blog_list_html.xpath('//%s/dl[@class="list_userbar"]/div[@class="interactive floatR"]/dd[@class="common_num "]/a/span[@class="num"]//text()' % param)
        else:
            # The blockchain sub-site uses an entirely different layout.
            param = 'div[@id="content"]/ul[@class="list"]/li[@class="zixun_img"]'
            param_2 = 'div[@class="cont"]/div[@class="fr right_cont"]/div[@class="bot_info"]'
            blog_list_urls = blog_list_html.xpath('//%s/a/@href' % param)
            blog_list_titles = blog_list_html.xpath('//%s/a//text()' % param)
            blog_list_authors = blog_list_html.xpath(
                '//%s/%s/a[@class="nick_name fl"]//text()' % (param, param_2))
            blog_list_read_num = blog_list_html.xpath(
                '//%s/%s/span[@class="num fr"]//text()' % (param, param_2))
            blog_list_common_num = blog_list_html.xpath('//%s/%s/span[@class="comment fr"]//text()' % (param, param_2))

        # Some listings omit read/comment counts; pad with '0' so the
        # counter lists line up 1:1 with the article URLs.
        blog_list_common_num += ['0'] * (len(blog_list_urls) - len(blog_list_common_num))
        blog_list_read_num += ['0'] * (len(blog_list_urls) - len(blog_list_read_num))

        for i in range(len(blog_list_titles)):
            page = requests.get(blog_list_urls[i])
            content_html = etree.HTML(page.text)
            # Rich-text-editor articles use this wrapper...
            article_list_contents = content_html.xpath(
                '//div[@class="blog-content-box"]/article/div[@class="article_content clearfix csdn-tracking-statistics"]/div[@class="htmledit_views"]')
            publish_time = content_html.xpath(
                '//div[@class="blog-content-box"]/div[@class="article-header-box"]/div[@class="article-header"]/div[@class="article-info-box"]/div[@class="article-bar-top"]/span[@class="time"]//text()')
            # ...while markdown articles use a different one; fall back.
            if not article_list_contents:
                article_list_contents = content_html.xpath(
                    '//div[@id="article_content"]/div[1]')
            if not article_list_contents:
                log.logger.info('該文章裡沒有找到內容:' + blog_list_urls[i])
                continue
            for content_node in article_list_contents:
                # Persist body + images locally; the returned file path is
                # stored in MySQL alongside the article metadata.
                content_path = self.replace_content(content_node, blog_list_urls[i])
                self.insert_mysql(str(blog_list_titles[i]).strip(), nav_title,
                                  blog_list_urls[i],
                                  str(blog_list_authors[i]).strip(),
                                  str(publish_time[0]).strip(),
                                  blog_list_read_num[i],
                                  blog_list_common_num[i], content_path)
            # Only the first 12 articles of each category are crawled.
            if i == 11:
                break

    def insert_mysql(self, title, nav_title, article_url, author, publish_time, read_num, comment_num, content_path):
        """Insert one article's metadata into the ``csdn_demo`` table.

        BUGFIX: uses a parameterized query instead of interpolating raw
        strings into the SQL (which broke on quotes and allowed SQL
        injection), and always closes the connection.

        :param title: article title
        :param nav_title: category name
        :param article_url: article URL
        :param author: author display name
        :param publish_time: publication timestamp text
        :param read_num: read counter (string)
        :param comment_num: comment counter (string)
        :param content_path: local path of the saved article body
        """
        conn = pymysql.connect(
            host='127.0.0.1',
            db='csdn',
            user='root',
            passwd='********',
            # charset='utf8',  # enable if Chinese text comes back garbled
            use_unicode=False)
        try:
            cursor = conn.cursor()
            sql = ("insert into csdn_demo(title, nav_title, article_url, author, "
                   "publish_time, read_num, comment_num, content_path) "
                   "VALUES(%s, %s, %s, %s, %s, %s, %s, %s)")
            try:
                cursor.execute(sql, (title, nav_title, article_url, author,
                                     publish_time, read_num, comment_num,
                                     content_path))
                conn.commit()
            except Exception as e:
                # Best-effort: log and move on to the next article.
                log.logger.warning('資料庫插入資料出現錯誤 %s', e)
        finally:
            conn.close()

    def replace_content(self, content, article_url):
        """Download an article's images and save its HTML body to disk.

        Every downloadable ``src`` URL is fetched (named by the MD5 of the
        URL so repeats are stored once) and rewritten in the HTML to its
        local path; the body is then written to a text file named by the
        MD5 of the article URL.

        :param content: lxml element holding the article body
        :param article_url: article URL (used for the file name and logs)
        :return: path of the saved article file
        """
        content = unescape(etree.tostring(content).decode('utf8'))
        imglist = re.findall(r'src="(.+?)"', content)
        for img_url in imglist:
            # Skip scripts, non-absolute links and youdao note images.
            if img_url.endswith('.js') or not img_url.startswith('http') \
                    or 'note.youdao.com' in img_url:
                continue
            # MD5 of the URL gives a unique, stable local file name.
            img_name = hashlib.md5(img_url.encode('utf8')).hexdigest()
            ext = 'gif' if img_url.endswith('.gif') else 'jpg'
            path_name = 'csdn/img/%s.%s' % (img_name, ext)
            try:
                urllib.request.urlretrieve(img_url, path_name)
                content = content.replace(
                    img_url, '/home/zhaojia/csdn_demo/%s' % path_name)
            except Exception as e:
                # Original also had `except urllib.error` here, but that
                # names a module (not an exception) and was unreachable.
                log.logger.warning('圖片下載出異常,進入異常處理模組,異常的連結:' + img_url + '\n' + str(e))
                # CSDN sometimes 403s urllib; retry once with browser headers.
                if 'HTTP Error 403: Forbidden' in str(e):
                    log.logger.info('403錯誤,重新訪問')
                    res = requests.get(img_url, headers=self.headers, verify=False)
                    if res.status_code == 200:
                        log.logger.info('重新獲取成功')
                        with open(path_name, 'wb') as f:
                            f.write(res.content)
                        content = content.replace(
                            img_url, '/home/zhaojia/csdn_demo/%s' % path_name)
                    else:
                        log.logger.error('重新獲取失敗,已記錄' + img_url + '文章url是:' + article_url)

        # Article file is named by the MD5 of the article URL (unique).
        article_name_md5 = hashlib.md5(article_url.encode('utf8')).hexdigest()
        file_name = '/home/zhaojia/csdn_demo/csdn/article/' + article_name_md5 + '.txt'
        with open(file_name, 'w', encoding='utf8') as f:
            f.write(content)
        return file_name
if __name__ == '__main__':

    # Module-level `log` is read by CSDNSpider's methods (global lookup).
    log = Logger('csdn_all.log', level='info')
    # Constructing the spider starts the full crawl from __init__.
    CSDNSpider()

程式碼的註釋很詳細,我就不過多廢話

log模組

#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time    : 2018/11/16 11:20
# @Author  : jia.zhao
# @Desc    : 
# @File    : log_content.py
# @Software: PyCharm

import logging
from logging import handlers

class Logger(object):
    """Configure a logger that writes to both the console and a
    time-rotated file; use it through the ``logger`` attribute.
    """

    # Maps the level names accepted by __init__ to logging constants.
    level_relations = {
        'debug': logging.DEBUG,
        'info': logging.INFO,
        'warning': logging.WARNING,
        'error': logging.ERROR,
        'crit': logging.CRITICAL
    }

    def __init__(self, filename, level='info', when='D', backCount=3,
                 fmt='%(asctime)s - %(pathname)s[line:%(lineno)d] - %(levelname)s: %(message)s'):
        """
        :param filename: log file path (also used as the logger name)
        :param level: key of ``level_relations``; unknown names fall back
               to 'info' (BUGFIX: the original passed None to setLevel)
        :param when: rotation interval unit for TimedRotatingFileHandler:
               S seconds, M minutes, H hours, D days,
               W weekday (interval==0 means Monday), midnight daily
        :param backCount: rotated files kept before the oldest is deleted
        :param fmt: logging format string
        """
        self.logger = logging.getLogger(filename)
        format_str = logging.Formatter(fmt)
        self.logger.setLevel(self.level_relations.get(level, logging.INFO))
        # BUGFIX: getLogger() returns the same object for the same name,
        # so constructing Logger twice used to stack duplicate handlers
        # and emit every message multiple times. Only configure once.
        if not self.logger.handlers:
            sh = logging.StreamHandler()          # console output
            sh.setFormatter(format_str)
            # File handler that rotates every `when` interval and keeps
            # `backCount` old files.
            th = handlers.TimedRotatingFileHandler(
                filename=filename, when=when, backupCount=backCount,
                encoding='utf-8')
            th.setFormatter(format_str)
            self.logger.addHandler(sh)
            self.logger.addHandler(th)
# if __name__ == '__main__':
#     log = Logger('all.log',level='info')
#     log.logger.info('info')
#     log.logger.warning('警告')
#     log.logger.error('報錯')
#     log.logger.critical('嚴重')