python_爬取部落格文章下載到本地
阿新 • • 發佈:2019-02-18
# -*- coding: utf-8 -*-
# Source: http://blog.csdn.net/hpu_a/article/details/51518990
#
# After studying Python for a while, the author crawled a couple of web pages
# for practice. This script walks the article-list pages of Han Han's blog and
# downloads every linked article to the local disk. Downloading the blog posts
# as plain text is covered in another article by the same author.
#
# The script runs unchanged on Python 2 (which the original targeted) and on
# Python 3.
import urllib
from contextlib import closing

try:
    from urllib.request import urlopen  # Python 3
except ImportError:
    urlopen = urllib.urlopen  # Python 2

# Article-list pages are numbered 1..PAGE_COUNT; the page number is inserted
# between the prefix and the suffix.
LISTING_PREFIX = "http://blog.sina.com.cn/s/articlelist_1191258123_0_"
LISTING_SUFFIX = ".html"
PAGE_COUNT = 7

# Upper bound on the total number of articles downloaded across all pages.
MAX_ARTICLES = 320

# Byte markers used to locate article links in the raw listing page.
TITLE_MARK = b'<a title='
HREF_MARK = b'href='
LINK_SUFFIX = b'.html'


def to_text(raw):
    """Return *raw* as the interpreter's native ``str`` type.

    On Python 2 byte strings already are ``str`` and are returned unchanged;
    on Python 3 the bytes are decoded so they can be concatenated with text
    and passed to ``urlopen``.
    """
    if isinstance(raw, str):
        return raw
    return raw.decode('utf-8', 'replace')


def fetch(address):
    """Return the raw body bytes served at *address*.

    The response object is always closed, even if reading fails.
    """
    with closing(urlopen(address)) as response:
        return response.read()


def iter_article_links(listing):
    """Yield, in page order, each article URL found in a listing page.

    *listing* is the raw (bytes) body of an article-list page. A link is
    recognised as the first ``href=`` that follows an ``<a title=`` marker,
    and it runs up to and including the next ``.html``. Scanning stops as
    soon as any of the three markers can no longer be found.
    """
    cursor = 0
    while True:
        title_at = listing.find(TITLE_MARK, cursor)
        if title_at == -1:
            return
        href_at = listing.find(HREF_MARK, title_at)
        if href_at == -1:
            return
        suffix_at = listing.find(LINK_SUFFIX, href_at)
        if suffix_at == -1:
            return
        # Skip ``href=`` plus the opening quote; keep the trailing ``.html``.
        start = href_at + len(HREF_MARK) + 1
        stop = suffix_at + len(LINK_SUFFIX)
        yield to_text(listing[start:stop])
        # Resume the search from the end of the link just reported.
        cursor = suffix_at


def local_filename(link):
    """Return the file name under which the article at *link* is saved.

    This is the last path segment of the URL. For the blog's own article
    URLs (``http://blog.sina.com.cn/s/<name>.html``) it equals what the
    original fixed 26-character slice produced, without depending on the
    exact prefix length.
    """
    return link.rsplit('/', 1)[-1]


def main():
    """Download every article linked from the listing pages.

    Files are written to the current working directory. At most
    ``MAX_ARTICLES`` articles are fetched in total; the remaining listing
    pages are still requested and printed, as in the original script.
    """
    url = []  # addresses of the articles downloaded so far
    for page in range(1, PAGE_COUNT + 1):
        menu = LISTING_PREFIX + str(page) + LISTING_SUFFIX
        print(menu)
        conn = fetch(menu)  # read the article-list page
        for link in iter_article_links(conn):
            if len(url) >= MAX_ARTICLES:
                break
            url.append(link)
            print("第" + str(len(url)) + "篇文章的地址是:" + link)
            content = fetch(link)  # read the article itself
            filename = local_filename(link)
            print(filename + "已下載")
            # Binary mode: the body is raw bytes and must be stored verbatim.
            with open(filename, 'wb') as out:
                out.write(content)
    print("-------THE END!-----------")


if __name__ == '__main__':
    main()