抓取csdn部落格的所有文章url
阿新 • • 發佈:2019-02-08
輸入部落格的主域名,就可以抓取這個部落格所有的文章編號。
# -*- coding:utf8 -*-
"""Crawl a CSDN blog and collect the numeric IDs of all its articles.

Given the blog's root URL, the spider first loads the listing view to
discover how many listing pages exist, then walks every listing page,
extracts each article ID from ``/article/details/<id>`` links, and
writes the IDs (one per line) to ``<blog-name>.txt``.

Ported from Python 2 (``urllib2`` / ``print`` statements) to Python 3.
"""
import random
import re
import urllib.request

# Pool of browser User-Agent strings; one is chosen at random per request
# to make the crawler look less like a bot. Hoisted to module level —
# the original duplicated this list verbatim inside two methods.
USER_AGENTS = [
    'Mozilla/5.0 (Windows; U; Windows NT 5.1; it; rv:1.8.1.11) Gecko/20071127 Firefox/2.0.0.11',
    'Opera/9.25 (Windows NT 5.1; U; en)',
    'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)',
    'Mozilla/5.0 (compatible; Konqueror/3.5; Linux) KHTML/3.5.5 (like Gecko) (Kubuntu)',
    'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.8.0.12) Gecko/20070731 Ubuntu/dapper-security Firefox/1.5.0.12',
    'Lynx/2.8.5rel.1 libwww-FM/2.14 SSL-MM/1.4.1 GNUTLS/1.2.9',
    "Mozilla/5.0 (X11; Linux i686) AppleWebKit/535.7 (KHTML, like Gecko) Ubuntu/11.04 Chromium/16.0.912.77 Chrome/16.0.912.77 Safari/535.7",
    "Mozilla/5.0 (X11; Ubuntu; Linux i686; rv:10.0) Gecko/20100101 Firefox/10.0 ",
]


class CSDN_Spider:
    """Spider that downloads all article IDs of one CSDN blog."""

    def __init__(self, url):
        """Remember the blog root URL and prepare the result buffer.

        :param url: blog root, e.g. ``http://blog.csdn.net/djd1234567``
        """
        self.myUrl = url
        self.datas = []  # collected article IDs, one "<id>\n" string each
        print("csdn爬蟲已啟動....")

    def _fetch(self, url, referer):
        """Download *url* with spoofed headers and return it as text.

        Shared by :meth:`csdn` and :meth:`find_data`; the original
        repeated this request-building code in both. The bogus
        ``add_header('GET', url)`` call from the original was dropped —
        "GET" is a method, not an HTTP header.
        """
        req = urllib.request.Request(url)
        req.add_header('User-Agent', random.choice(USER_AGENTS))
        req.add_header('Host', 'blog.csdn.net')
        req.add_header('Accept', '*/*')
        req.add_header('Referer', referer)
        # Close the response promptly instead of leaking the socket.
        with urllib.request.urlopen(req) as resp:
            return resp.read().decode("utf8")

    def csdn(self):
        """Entry point: discover the page count, then crawl every page."""
        url = self.myUrl + "?viewmode=list"
        mypage = self._fetch(
            url, 'http://blog.csdn.net/djd1234567?viewmode=contents')
        Pagenum = self.page_counter(mypage)
        self.find_data(self.myUrl, Pagenum)

    def page_counter(self, mypage):
        """Return the number of listing pages parsed from *mypage*.

        Looks for the "last page" link, e.g.
        ``<a href="/djd1234567/article/list/11">尾頁</a>`` and returns
        the trailing number; returns 0 when no such link is found.
        """
        myMatch = re.search(r'/article/list/(\d+?)">尾頁</a>', mypage, re.S)
        if myMatch:
            Pagenum = int(myMatch.group(1))
            print("爬蟲報告:發現目錄一共%d頁" % Pagenum)
        else:
            Pagenum = 0
            print("爬蟲報告:沒找到頁面的數量")
        return Pagenum

    def find_data(self, myurl, Pagenum):
        """Crawl listing pages 1..Pagenum and save article IDs to a file.

        The output file is named after the last path segment of *myurl*
        (``<name>.txt``) and receives one article ID per line.
        """
        name = myurl.split("/")[-1]
        # ``with`` guarantees the file is closed even if a request fails
        # (the original leaked the handle on any exception).
        with open(name + '.txt', 'w+', encoding="utf8") as f:
            for i in range(1, Pagenum + 1):
                print(i)
                print("爬蟲報告:第%d頁正在載入中......" % i)
                url = myurl + "/article/list/" + str(i)
                mypage = self._fetch(url, url)
                # Each article link looks like:
                # "><a href="/<name>/article/details/<id>" title="
                myItems = re.findall(
                    '"><a href="/' + name + r'/article/details/(\d+?)" title="',
                    mypage, re.S)
                for item in myItems:
                    self.datas.append(item + "\n")
            f.writelines(self.datas)
        print(self.datas)
        print("爬蟲報告:檔案已下載到本地並打包成txt格式檔案")


# Guard the driver so importing this module does not fire network I/O.
if __name__ == "__main__":
    url = "http://blog.csdn.net/djd1234567"
    mySpider = CSDN_Spider(url)
    mySpider.csdn()