Crawling All Blog Posts
Published: 2018-11-09
Crawl the content of every blog post and convert it to PDF.
import re
import requests
from bs4 import BeautifulSoup
import pdfkit  # imported for the HTML-to-PDF step; not called in this script


def getPagehtml(url):
    # fetch the raw HTML of a page
    response = requests.get(url)
    return response.text


def createurl(text):
    '''Extract every post URL from the page source. The links look like:
    <a href="https://blog.csdn.net/qq_41911569/article/details/83034422" target="_blank"><span class="article-type type-1">原</span>爬取貓眼電影</a>
    '''
    pattern = r'<a href="(https://blog.csdn.net/qq_41911569/article/.*?)" target="_blank">'
    return re.findall(pattern, text)


def get_blog_content(i, url):
    # download one post and write its title and body to a local HTML file
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'html5lib')
    # the <head> of the page (keeps the stylesheets for the saved copy)
    head = soup.head
    # post title
    title = soup.find_all(class_="title-article")[0].get_text()
    # post body
    content = soup.find_all(class_="article_content")[0]
    # write everything to a local file
    with open('/home/kiosk/Desktop/python筆記/python_stack/day26/bs/westos%d.html' % i, 'w') as f:
        f.write(str(head))
        f.write('<h1>%s</h1>\n\n' % title)
        f.write(str(content))


def main():
    # the article list is paginated: https://blog.csdn.net/qq_41911569/article/list/3
    article_url = []
    for i in range(3):
        url = 'https://blog.csdn.net/qq_41911569/article/list/%d' % (i + 1)
        text = getPagehtml(url)
        article_url.append(createurl(text))
    # flatten the per-page lists into a single list of URLs
    article_url = [j for i in article_url for j in i]
    # deduplicate, then download each post
    for i, v in enumerate(set(article_url)):
        get_blog_content(i, v)


main()
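The title promises PDF output, but the script above only writes HTML files and never actually calls pdfkit. A minimal sketch of the missing conversion step, assuming the wkhtmltopdf binary is installed and that the files are in the directory used above (the glob pattern is an assumption for illustration, not part of the original script):

import glob
import pdfkit  # wrapper around the wkhtmltopdf command-line tool

# assumed location: the directory the crawler wrote the westos<N>.html files into
for path in glob.glob('/home/kiosk/Desktop/python筆記/python_stack/day26/bs/westos*.html'):
    # convert each saved post to a PDF next to its HTML file
    pdfkit.from_file(path, path.replace('.html', '.pdf'))

Alternatively, pdfkit.from_url(url, 'out.pdf') can render the live post URLs directly, skipping the intermediate HTML files at the cost of re-fetching each page.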
Result: [screenshot of the saved files in the original post; not preserved]