將 CSDN 的文章爬取,並將圖片儲存到本地
阿新 • • 發佈:2018-12-21
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time    : 2018/11/13 10:20
# @Author  : jia.zhao
# @Desc    :
# @File    : csdn_demo.py
# @Software: PyCharm
import requests
from lxml import etree
import pymysql
import re
import urllib.request
import urllib.error
import hashlib
from html import unescape
import ssl
from log_content import Logger

# Skip HTTPS certificate verification for every request made by this script.
ssl._create_default_https_context = ssl._create_unverified_context

# The blockchain category lives on its own host and uses different markup.
BLOCKCHAIN_URL = 'https://blockchain.csdn.net'


class CSDNSpider():
    """Crawl CSDN blog categories: save each article's HTML (with embedded
    images downloaded locally) to a text file and its metadata to MySQL."""

    def __init__(self):
        """Initialise base URL and request headers, then start the crawl.

        Headers are assigned *before* start_requests() runs, because
        replace_content() reads self.headers on the 403-retry path; the
        original order (start_requests first) raised AttributeError there.
        """
        self.url = 'https://blog.csdn.net/'
        self.headers = {
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
            'Accept-Encoding': 'gzip, deflate, br',
            'Accept-Language': 'zh-CN,zh;q=0.9',
            'Cache-Control': 'max-age=0',
            'Connection': 'keep-alive',
            'Host': 'blog.csdn.net',
            'Upgrade-Insecure-Requests': '1',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
                          '(KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36'
        }
        self.start_requests()

    def start_requests(self):
        """Request the blog home page and crawl every navigation category."""
        xpath_nav = ('div[@class="container clearfix"]/nav[@class="clearfix"]'
                     '/div[@class="clearfix"]/div[@class="nav_com"]/ul/li/a')
        page = requests.get(self.url)
        nav_html = etree.HTML(page.text)
        # Category link hrefs and their display names, in parallel order.
        nav_urls = nav_html.xpath('//%s/@href' % xpath_nav)
        nav_titles = nav_html.xpath('//%s//text()' % xpath_nav)
        for nav_href, nav_title in zip(nav_urls, nav_titles):
            # The blockchain category is already an absolute URL; the other
            # categories are relative paths joined onto the base URL.
            if nav_href == BLOCKCHAIN_URL:
                nav_url = nav_href
            else:
                nav_url = self.url + nav_href
            # Fetch the article list and contents for this category.
            self.content_article(nav_url, nav_title)

    def content_article(self, nav_url, nav_title):
        """Fetch one category's article list, then crawl each article.

        :param nav_url: absolute URL of the category listing page
        :param nav_title: display name of the category
        """
        if nav_url != BLOCKCHAIN_URL:
            # The listing <ul> class embeds the category slug
            # ("home" for the front page).
            ele = nav_url.split('/')[-1] or 'home'
            param = ('ul[@class="feedlist_mod %s"]/li[@class="clearfix"]'
                     '/div[@class="list_con"]' % ele)
            page = requests.get(nav_url)
            blog_list_html = etree.HTML(page.text)
            if blog_list_html is None:
                return
            # Article URLs, titles, authors, read counts, comment counts.
            # (Locals instead of the original module-level globals.)
            blog_list_urls = blog_list_html.xpath(
                '//%s/div[@class="title"]/h2/a/@href' % param)
            blog_list_titles = blog_list_html.xpath(
                '//%s/div[@class="title"]/h2/a//text()' % param)
            blog_list_authors = blog_list_html.xpath(
                '//%s/dl[@class="list_userbar"]/dd[@class="name"]/a//text()' % param)
            blog_list_read_num = blog_list_html.xpath(
                '//%s/dl[@class="list_userbar"]/div[@class="interactive floatR"]'
                '/dd[@class="read_num"]/a/span[@class="num"]//text()' % param)
            blog_list_common_num = blog_list_html.xpath(
                '//%s/dl[@class="list_userbar"]/div[@class="interactive floatR"]'
                '/dd[@class="common_num "]/a/span[@class="num"]//text()' % param)
        else:
            # Blockchain articles use a different page layout.
            page = requests.get(nav_url)
            blog_list_html = etree.HTML(page.text)
            if blog_list_html is None:
                return
            param = 'div[@id="content"]/ul[@class="list"]/li[@class="zixun_img"]'
            param_2 = ('div[@class="cont"]/div[@class="fr right_cont"]'
                       '/div[@class="bot_info"]')
            blog_list_urls = blog_list_html.xpath('//%s/a/@href' % param)
            blog_list_titles = blog_list_html.xpath('//%s/a//text()' % param)
            blog_list_authors = blog_list_html.xpath(
                '//%s/%s/a[@class="nick_name fl"]//text()' % (param, param_2))
            blog_list_read_num = blog_list_html.xpath(
                '//%s/%s/span[@class="num fr"]//text()' % (param, param_2))
            blog_list_common_num = blog_list_html.xpath(
                '//%s/%s/span[@class="comment fr"]//text()' % (param, param_2))

        # Some entries lack a read/comment counter; pad with '0' so the
        # parallel lists line up with the URL list.
        blog_list_common_num += ['0'] * (len(blog_list_urls) - len(blog_list_common_num))
        blog_list_read_num += ['0'] * (len(blog_list_urls) - len(blog_list_read_num))

        for i in range(len(blog_list_titles)):
            page = requests.get(blog_list_urls[i])
            content_html = etree.HTML(page.text)
            # Article body. CSDN has two editors (rich text vs. markdown)
            # which produce different markup, hence the fallback xpath.
            article_list_contents = content_html.xpath(
                '//div[@class="blog-content-box"]/article'
                '/div[@class="article_content clearfix csdn-tracking-statistics"]'
                '/div[@class="htmledit_views"]')
            publish_time = content_html.xpath(
                '//div[@class="blog-content-box"]/div[@class="article-header-box"]'
                '/div[@class="article-header"]/div[@class="article-info-box"]'
                '/div[@class="article-bar-top"]/span[@class="time"]//text()')
            if not article_list_contents:
                article_list_contents = content_html.xpath(
                    '//div[@id="article_content"]/div[1]')
            if not article_list_contents:
                log.logger.info('該文章裡沒有找到內容:' + blog_list_urls[i])
                continue
            # Guard against pages with no visible publish time.
            time_str = str(publish_time[0]).strip() if publish_time else ''
            for content_node in article_list_contents:
                # Save the body (and its images) locally; get the file path.
                content_path = self.replace_content(content_node, blog_list_urls[i])
                # Record the article's metadata in MySQL.
                self.insert_mysql(
                    str(blog_list_titles[i]).strip(), nav_title,
                    blog_list_urls[i], str(blog_list_authors[i]).strip(),
                    time_str, blog_list_read_num[i],
                    blog_list_common_num[i], content_path)
            # Only crawl the first 12 articles of each category.
            if i == 11:
                break

    def insert_mysql(self, title, nav_title, article_url, author, publish_time,
                     read_num, comment_num, content_path):
        """Insert one article's metadata into the csdn_demo table.

        :param title: article title
        :param nav_title: category name
        :param article_url: article URL
        :param author: author display name
        :param publish_time: publish time text from the article page
        :param read_num: read count (string)
        :param comment_num: comment count (string)
        :param content_path: local file path of the saved article body
        """
        conn = pymysql.connect(
            host='127.0.0.1',
            db='csdn',
            user='root',
            passwd='********',
            charset='utf8mb4')  # explicit charset avoids mojibake with Chinese text
        try:
            with conn.cursor() as cursor:
                # Parameterised query. The original interpolated values with
                # "%s" string formatting, which breaks on quotes in titles
                # and is open to SQL injection.
                sql = ("insert into csdn_demo(title, nav_title, article_url, author, "
                       "publish_time, read_num, comment_num, content_path) "
                       "VALUES(%s, %s, %s, %s, %s, %s, %s, %s)")
                cursor.execute(sql, (title, nav_title, article_url, author,
                                     publish_time, read_num, comment_num,
                                     content_path))
            conn.commit()
        except Exception as e:
            log.logger.warning('資料庫插入資料出現錯誤 %s' % e)
        finally:
            conn.close()  # the original leaked one connection per insert

    def replace_content(self, content, article_url):
        """Download every image referenced by the article body, rewrite the
        img src attributes to the local copies, and write the resulting HTML
        to a text file.

        :param content: lxml element holding the article body
        :param article_url: the article's URL (used for logging and as the
            basis of the output file name)
        :return: path of the file the content was written to
        """
        content = unescape(etree.tostring(content).decode('utf8'))
        img_urls = re.findall(r'src="(.+?)"', content)
        for img_url in img_urls:
            # Skip scripts, non-http links and known-bad hosts.
            if (img_url.endswith('.js') or not img_url.startswith('http')
                    or 'note.youdao.com' in img_url):
                continue
            # MD5 of the image URL gives a unique, stable file name.
            img_name = hashlib.md5(img_url.encode('utf8')).hexdigest()
            ext = 'gif' if img_url.endswith('.gif') else 'jpg'
            path_name = 'csdn/img/%s.%s' % (img_name, ext)
            try:
                urllib.request.urlretrieve(img_url, path_name)
                # Point the src at the local copy on the file server.
                content = content.replace(
                    img_url, '/home/zhaojia/csdn_demo/%s' % path_name)
            except Exception as e:
                # NOTE: the original also had "except urllib.error as e" —
                # catching a *module* is a TypeError at raise time and was
                # dead code behind this broad handler anyway; removed.
                log.logger.warning('圖片下載出異常,進入異常處理模組,異常的連結:'
                                   + img_url + '\n' + str(e))
                if 'HTTP Error 403: Forbidden' in str(e):
                    # Retry with full browser headers; some hosts reject
                    # the bare urllib user agent.
                    log.logger.info('403錯誤,重新訪問')
                    res = requests.get(img_url, headers=self.headers, verify=False)
                    if res.status_code == 200:
                        log.logger.info('重新獲取成功')
                        with open(path_name, 'wb') as f:
                            f.write(res.content)
                        content = content.replace(
                            img_url, '/home/zhaojia/csdn_demo/%s' % path_name)
                    else:
                        log.logger.error('重新獲取失敗,已記錄' + img_url
                                         + '文章url是:' + article_url)
        # File name derived from the article URL's MD5, so re-crawling the
        # same article overwrites the same file.
        article_name_md5 = hashlib.md5(article_url.encode('utf8')).hexdigest()
        file_name = '/home/zhaojia/csdn_demo/csdn/article/' + article_name_md5 + '.txt'
        with open(file_name, 'w', encoding='utf8') as f:
            f.write(content)
        return file_name


if __name__ == '__main__':
    log = Logger('csdn_all.log', level='info')
    CSDNSpider()
程式碼的註釋很詳細,我就不過多廢話
log模組
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time    : 2018/11/16 11:20
# @Author  : jia.zhao
# @Desc    :
# @File    : log_content.py
# @Software: PyCharm
import logging
from logging import handlers


class Logger(object):
    """Wrap a logging.Logger that writes both to the console and to a
    time-rotated file.

    Usage::

        log = Logger('all.log', level='info')
        log.logger.info('message')
    """

    # Map the string level names accepted by __init__ to logging levels.
    level_relations = {
        'debug': logging.DEBUG,
        'info': logging.INFO,
        'warning': logging.WARNING,
        'error': logging.ERROR,
        'crit': logging.CRITICAL,
    }

    def __init__(self, filename, level='info', when='D', backCount=3,
                 fmt='%(asctime)s - %(pathname)s[line:%(lineno)d] - %(levelname)s: %(message)s'):
        """
        :param filename: log file path (also used as the logger name)
        :param level: one of the level_relations keys; unknown values fall
            back to INFO instead of crashing setLevel(None) as before
        :param when: rotation interval unit for TimedRotatingFileHandler:
            S (seconds), M (minutes), H (hours), D (days),
            W (weekly; interval==0 means Monday), midnight (daily at 00:00)
        :param backCount: rotated files kept before the oldest is deleted
        :param fmt: log record format string
        """
        self.logger = logging.getLogger(filename)
        format_str = logging.Formatter(fmt)  # record format
        self.logger.setLevel(self.level_relations.get(level, logging.INFO))
        # logging.getLogger() returns the same object for the same name, so
        # guard against attaching duplicate handlers when Logger is
        # constructed repeatedly (the original doubled every log line).
        if not self.logger.handlers:
            sh = logging.StreamHandler()  # console output
            sh.setFormatter(format_str)
            th = handlers.TimedRotatingFileHandler(
                filename=filename, when=when, backupCount=backCount,
                encoding='utf-8')  # file output, rotated every `when` interval
            th.setFormatter(format_str)
            self.logger.addHandler(sh)
            self.logger.addHandler(th)