python 網站爬蟲下載線上盜墓筆記小說到本地的指令碼

阿新 • • 發佈：2019-02-20

最近閒著沒事想看小說，找到一個全是南派三叔的小說的網站，決定都下載下來看看，於是動手，在很多QQ群裡高手的幫助下（本人正則表示式很爛，程式複雜的正則都是一些高手指導的），花了三四天寫了一個指令碼

需要 BeautifulSoup 和 requests 兩個庫

（我已經把註釋寫得儘量詳細）

這個程式的執行速度很慢，求高手告訴我優化的方法！！

#-*-coding:utf8-*-

from bs4 import BeautifulSoup
import requests
import re
import os


# Fetch the site's front page and collect the URL of every book index page.
r = requests.get('http://www.nanpaisanshu.org/').content  # raw HTML of the front page
# Keep only anchors whose href is exactly the site root followed by a lowercase slug.
content = BeautifulSoup(r).findAll('a', href=re.compile(r'\Ahttp://www\.nanpaisanshu\.org/[a-z]+\Z'))

# Read each href straight from its tag. Splitting the string form of the result
# list on ',' and '"' raised IndexError when no anchor matched, broke on anchors
# whose text contained a comma, and returned the wrong attribute whenever href
# was not the first attribute of the tag.
lisy = []     # unique index-page URLs, in first-seen order; used by the later steps
seen = set()  # URLs already added, so duplicates are skipped without reordering
for anchor in content:
    href = anchor['href']
    if href not in seen:
        seen.add(href)
        lisy.append(href)


# Prepare the folder (inside the current working directory) that receives
# every file this script writes.

s = os.getcwd()  # current working directory
d = os.sep       # path separator of the running OS
namef = 'aaa'    # name of the output folder

f = os.path.exists(s + d + namef)  # True when the output folder is already there

if f:
    print(u'已經存在' + namef)
else:
    os.mkdir(s + d + namef)  # first run: create the folder

filenm = s + d + namef + d  # folder path with trailing separator; prefix for every file name

# Step 1: download every index page in lisy and save it as neirong<N>.html
# (N starts at 1 and follows the order of lisy).
for i, line in enumerate(lisy, 1):
    r = requests.get(line)
    print(r.content)
    print('\n')
    # The with-block closes each file right after it is written; previously
    # only the last file opened by the loop was ever closed.
    with open(filenm+'neirong'+str(i)+'.html', 'w') as tfile:
        tfile.write(r.content)

# Step 2: pull the chapter URLs out of each saved page and write them,
# one per line, to neirong<N>.txt.
p = re.compile(r'http://www\.nanpaisanshu\.org/.*?\.html')  # compiled once for all pages

for i in range(1, len(lisy)+1):
    with open(filenm+'neirong'+str(i)+'.html', "r") as fp:
        content = fp.read()

    with open(filenm+'neirong'+str(i)+'.txt', 'w') as of:
        for line in p.findall(content):
            of.write(line+'\n')
# No trailing close() calls are needed: every file is closed by its with-block,
# which also avoids a NameError when lisy is empty and no file was ever opened.


# For every book N: read the chapter URLs from neirong<N>.txt, download each
# chapter and append its title, URL and text to quanbu<N>.txt.
for i in range(1, len(lisy)+1):
    # Load and sort the chapter URLs; the with-block closes the file immediately.
    with open(filenm+'neirong'+str(i)+'.txt', 'r') as ot:
        li = sorted(line.replace('\n', '') for line in ot)

    # 'a+' appends, so running the script again adds to an existing quanbu file.
    with open(filenm+'quanbu'+str(i)+'.txt', 'a+') as outfile:
        for line in li:
            print(line)
            r = requests.get(line).content
            soup = BeautifulSoup(r)  # parse the page once and reuse it for title and body

            title_div = soup.find("div", {'class': "post_title"})
            if title_div is None:
                # The URL pattern matches any .html link on the index page, so a
                # page without a title block is skipped instead of crashing the run.
                print('找不到標題，跳過這個頁面：'+line)
                continue
            title = title_div.h2
            content = soup.findAll("div", {'class': "post_entry"})
            sti = str(title).replace('<h2>', '').replace('</h2>', '')  # title text only

            # Reduce the chapter markup to plain text.
            scon = str(content).replace('<p>', '  ').replace('</p>', '  ').replace('<br/>', '\n')
            # Greedy and per line: removes from the first '<' to the last '>' on each line.
            scon = re.sub("<.*>", "", scon)
            # Removes everything on a line up to and including each ';'.
            # NOTE(review): both patterns can also delete chapter text that shares
            # a line with markup or a ';' - confirm against real pages before changing.
            scon = re.sub("(.*?);", "", scon)
            scon = '\n'.join(scon.split())  # collapse every whitespace run into one newline

            print(scon)
            outfile.write(sti+'\n'+line+'\n'+scon)

print('=========================下載結束=======================')
# Both files of every iteration are closed by their with-blocks; previously only
# the last pair was closed, and an empty lisy caused a NameError here.



# Delete the intermediate neirong* files from the output folder and keep
# everything else.
targetDir = s+d+namef
p = re.compile(r'neirong[0-9]{1}')  # matched against the start of each file name
for line in os.listdir(targetDir):
    if p.match(line) is None:
        print('保留檔案！')
        continue
    print("需要刪除的檔案"+s+d+namef+d+line+'!!')
    os.remove(s+d+namef+d+line)  # os.remove needs the full path

有時候會顯示連線失敗，然後程式報錯，所以應該先判斷 requests.get(url).status_code 是否等於 200。不過我加了這個判斷以後發現執行更慢了（每個頁面都要判斷）。汗，也可能是因為我這裡網速只有幾 K，才會出現連線異常。

下面是修改後的完善版。請慎用：由於判斷的專案和次數都增加了，速度極其慢。

#-*-coding:utf8-*-

#下載盜墓筆記小說
#2014-10-14
#ZJL

from bs4 import BeautifulSoup
import requests
import re
import os


# Fetch the site's front page and collect the URL of every book index page.
r = requests.get('http://www.nanpaisanshu.org/').content  # raw HTML of the front page
# Keep only anchors whose href is exactly the site root followed by a lowercase slug.
content = BeautifulSoup(r).findAll('a', href=re.compile(r'\Ahttp://www\.nanpaisanshu\.org/[a-z]+\Z'))

# Read each href straight from its tag. Splitting the string form of the result
# list on ',' and '"' raised IndexError when no anchor matched, broke on anchors
# whose text contained a comma, and returned the wrong attribute whenever href
# was not the first attribute of the tag.
lisy = []     # unique index-page URLs, in first-seen order; used by the later steps
seen = set()  # URLs already added, so duplicates are skipped without reordering
for anchor in content:
    href = anchor['href']
    if href not in seen:
        seen.add(href)
        lisy.append(href)


# Prepare the folder (inside the current working directory) that receives
# every file this script writes.

s = os.getcwd()  # current working directory
d = os.sep       # path separator of the running OS
namef = 'aaa'    # name of the output folder

f = os.path.exists(s + d + namef)  # True when the output folder is already there

if f:
    print(u'已經存在' + namef)
else:
    os.mkdir(s + d + namef)  # first run: create the folder

filenm = s + d + namef + d  # folder path with trailing separator; prefix for every file name

# Step 1: download every index page in lisy and save it as neirong<N>.html
# (N starts at 1 and follows the order of lisy).
for i, line in enumerate(lisy, 1):
    r = requests.get(line)
    print(r.content)
    print('\n')
    # The with-block closes each file right after it is written; previously
    # only the last file opened by the loop was ever closed.
    with open(filenm+'neirong'+str(i)+'.html', 'w') as tfile:
        tfile.write(r.content)

# Step 2: pull the chapter URLs out of each saved page and write them,
# one per line, to neirong<N>.txt.
p = re.compile(r'http://www\.nanpaisanshu\.org/.*?\.html')  # compiled once for all pages

for i in range(1, len(lisy)+1):
    with open(filenm+'neirong'+str(i)+'.html', "r") as fp:
        content = fp.read()

    with open(filenm+'neirong'+str(i)+'.txt', 'w') as of:
        for line in p.findall(content):
            of.write(line+'\n')
# No trailing close() calls are needed: every file is closed by its with-block,
# which also avoids a NameError when lisy is empty and no file was ever opened.


# For every book N: read the chapter URLs from neirong<N>.txt, download each
# chapter and write its title, URL and text to a fresh quanbu<N>.txt.
for i in range(1, len(lisy)+1):
    # Load and sort the chapter URLs; the with-block closes the file immediately.
    with open(filenm+'neirong'+str(i)+'.txt', 'r') as ot:
        li = sorted(line.replace('\n', '') for line in ot)

    if os.path.exists(filenm+'quanbu'+str(i)+'.txt'):
        print("已經存在"+filenm+'quanbu'+str(i)+'.txt'+'會先刪除再建立')
    else:
        print("新建"+filenm+'quanbu'+str(i)+'.txt')

    # Mode 'w' creates the file or empties an existing one, so a second run
    # never appends to old content and no separate os.remove() is needed.
    with open(filenm+'quanbu'+str(i)+'.txt', 'w') as outfile:
        for line in li:
            # Request each chapter exactly once. The earlier version issued up to
            # three GETs per chapter (two status checks plus the real download),
            # which is why it was so slow.
            try:
                resp = requests.get(line)
            except requests.exceptions.RequestException:
                # A failed connection raises instead of returning a status code,
                # so it has to be caught for the run to continue.
                resp = None

            if resp is None or resp.status_code != 200:
                print('因為網路原因，這個章節為空!')
                outfile.write('因為網路原因，這個章節為空')
                continue

            print('連線成功！')
            r = resp.content
            soup = BeautifulSoup(r)  # parse the page once and reuse it for title and body

            title_div = soup.find("div", {'class': "post_title"})
            if title_div is None:
                # The URL pattern matches any .html link on the index page, so a
                # page without a title block is skipped instead of crashing the run.
                print('找不到標題，跳過這個頁面：'+line)
                continue
            title = title_div.h2
            content = soup.findAll("div", {'class': "post_entry"})
            sti = str(title).replace('<h2>', '').replace('</h2>', '')  # title text only

            # Reduce the chapter markup to plain text.
            scon = str(content).replace('<p>', '  ').replace('</p>', '  ').replace('<br/>', '\n')
            # Greedy and per line: removes from the first '<' to the last '>' on each line.
            scon = re.sub("<.*>", "", scon)
            # Removes everything on a line up to and including each ';'.
            # NOTE(review): both patterns can also delete chapter text that shares
            # a line with markup or a ';' - confirm against real pages before changing.
            scon = re.sub("(.*?);", "", scon)
            scon = '\n'.join(scon.split())  # collapse every whitespace run into one newline

            print(scon)
            outfile.write(sti+'\n'+line+'\n'+scon)

print('=========================下載結束=======================')
# Both files of every iteration are closed by their with-blocks; previously only
# the last pair was closed, and an empty lisy caused a NameError here.



# Delete the intermediate neirong* files from the output folder and keep
# everything else.
targetDir = s+d+namef
p = re.compile(r'neirong[0-9]{1}')  # matched against the start of each file name
for line in os.listdir(targetDir):
    if p.match(line) is None:
        print('保留檔案！')
        continue
    print("需要刪除的檔案"+s+d+namef+d+line+'!!')
    os.remove(s+d+namef+d+line)  # os.remove needs the full path

python 網站爬蟲下載線上盜墓筆記小說到本地的指令碼

最近閒著沒事想看小說，找到一個全是南派三叔的小說的網站，決定都下載下來看看，於是動手，在很多QQ群裡高手的幫助下（本人正則表示式很爛，程式複雜的正則都是一些高手指導的），花了三四天寫了一個指令碼需要 BeautifulSoup 和 requests 兩個庫（我已經把註釋

python爬蟲-爬取盜墓筆記

本來今天要繼續更新scrapy爬取美女圖片系列文章，可是發現使用免費的代理ip都非常不穩定，有時候連線上，有時候連線不上，所以我想找到穩定的代理ip，下次再更新 scrap

用python爬蟲下載20張圖片到本地

資料全都是寫死的，有需要可以自行修改。 import requests from lxml import etree base_url = "https://unsplash.com/search/photos/flower" headers = {"User-Agent":

愛奇藝下載的盜墓筆記視訊怎麼轉換成mp4格式

《盜墓筆記》是南派三叔所著的小說，堪稱近年來中國出版界的經典之作，獲得百萬讀者狂熱追捧。南派三叔也憑此作名滿天下，躋身中國超級暢銷書作家行列。不僅如此，後來《盜墓筆記》還被改變成了電視，也是深受追捧，但是好像是愛奇藝獨播視訊，所以要想在其他播放器播放就很不容易，所以需要將下載的qsv視訊轉換為MP4格式。

python 編寫爬蟲常用包下載地址、工具網站以及相關安裝問題集合（持續更新）

轉載請標明出處，謝謝。以下連結出現問題請私戳或留言，我儘快解決。免費代理ip網站: http://www.xicidaili.com/nn/ geckodriver 下載地址: https://github.com/mozilla/geckodrive

python例項2-寫一個爬蟲下載小功能

主要是通過url，和re兩個模組對一個網頁的固定圖片進行模糊匹配後下載下來。 #! /usr/bin/python import re import urllib def gethtml(url):

python爬蟲下載網站磁力連結

設計分三步走： 1.獲取明星列表地址 2.獲取明星作品序列號 3.根據作品序列號查詢磁力連結一、獲取網站中明星列表的作品集地址 #coding=utf8 import requests import re import xlrd import xlwt import

盜墓筆記第一季全(12集)下載地址

主演有著有限公司 taobao div 技術分享 target 5.0 img 很多其它：http://www.webyang.net/Html/web/article_149.html 《盜墓筆記》是2014年歡瑞世紀影視傳媒股份有限公司出品的一部網絡季

python爬蟲--下載煎蛋網妹子圖到本地

eve 元素 download down find .get fault log arc 1 #coding:utf-8 2 """ 3 下載煎蛋妹子到本地，通過selenium、正則表達式、phantomjs、Beautifulsoup實現 4 """ 5

Python開發爬蟲之BeautifulSoup解析網頁篇：爬取安居客網站上北京二手房數據

澳洲 pytho 目標 www. 委托 user info .get web 目標：爬取安居客網站上前10頁北京二手房的數據，包括二手房源的名稱、價格、幾室幾廳、大小、建造年份、聯系人、地址、標簽等。網址為：https://beijing.anjuke.com/sale/

Python爬蟲：Xpath語法筆記

上一個 div 運算符 tar 爬蟲 att 語法 ont tab 常用的路勁表達式：表達式描述實例 nodename 選取nodename節點的所有子節點 xpath(‘//div’) 選取了div節點的所有子節點 / 從根節點選取 xpath

Python爬蟲下載whois server字典和whois自動化查詢

搜狗音樂爬蟲下載python

爬蟲 AC active ref ext json color pattern nbsp import requests import re session = requests.Session() r = session.get(‘http://www.kugou.c

爬蟲高玩教你用Python每秒鐘下載一張高清大圖，快不快？

on() print async tpc 多說 xxx ima 所有 mkdir 如果爬蟲需要展現速度，我覺得就是去下載圖片吧，原本是想選擇去煎蛋那裏下載圖片的，那裏的美女圖片都是高質量的，我稿子都是差不多寫好了的，無奈今天重新看下，妹子圖的入口給關了。至於

python簡單爬蟲筆記

wow write file except .com 下載 app sina retrieve python模擬遊覽器爬取相關頁面 import urllib.request url="http://blog.51cto.com/itstyle/2146899" #模擬

Python爬蟲入門 | 5 爬取小豬短租租房信息

圖片交流 ffffff 信息 jpg http 而已基本 mat 小豬短租是一個租房網站，上面有很多優質的民宿出租信息，下面我們以成都地區的租房信息為例，來嘗試爬取這些數據。小豬短租（成都）頁面：http://cd.xiaozhu.com/1.爬取租房標題按照慣例，

學習筆記-小甲魚Python3學習第一講：我和python的第一次親密接觸

idl print alt 什麽舉例 nag pat 程序員分享測試題： 0、python 是什麽類型的語言？python是一種腳本語言 IDLE 是什麽？是一種python shell，類似於windows的cmd窗口和linux的shell print()

學習筆記-小甲魚Python3學習第二講：用Python設計第一個遊戲

拼接 ilove lov love tab fish ins 小甲魚 cti 測試題：什麽是BIF？built-in function,是python內置函數的意思，python內置了非常多的函數，方便程序員直接調用，快速編寫腳本程序用課堂上小甲魚教的方法數一數

學習筆記-小甲魚Python3學習第五講：閑聊之python數據類型

轉換 water != utf-8 import tex 表示程序 type 數據類型：整型、浮點型、布爾型整型：1、234、54浮點型：12.234、2.3e5 = 230000.0、1.5e-3 = 0.0015布爾型：True、False。True + True 返

學習筆記-小甲魚Python3學習第六講：python之常用操作符

mar 邏輯 .... 運算操作 == 整數 image 臺階 size 常用操作符運算操作符：加+ 減- 乘* 除/ 余% 冪運算** 地板除//比較操作符： < ，> ,<=,>=,==,!=邏輯操作符： and,or,not優先級：冪運算符有點

python 網站爬蟲 下載線上盜墓筆記小說到本地的指令碼

相關推薦

python 網站爬蟲下載線上盜墓筆記小說到本地的指令碼