
12.9 The Final Step of the Crawler

At the very last step of the crawl I kept getting errors. I was at a loss, could not find the problem, and nobody in my group knew either.

In the end I asked the teacher, and it turned out the cause was creating one extra cursor.

Buggy code

from bs4 import BeautifulSoup
import requests
import time
import pymysql
class DB:
    def __init__(self,host='',port=3306,user='',password='',db='',charset='utf8'):
        self.conn=pymysql.connect(host=host,user=user,port=port,password=password,database=db,charset=charset)
        self.cur=self.conn.cursor()

    def __enter__(self):
        return self.cur

    def __exit__(self, exc_type, exc_val, exc_tb):
        self.conn.commit()
        self.cur.close()
        self.conn.close()

def dataUrl(url):
    # url='http://www.d3zww.com/book/5/5663/'
    header = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.67 Safari/537.36 Edg/87.0.664.52'}
    html = requests.get(url, headers=header)
    html.raise_for_status()
    html.encoding= html.apparent_encoding
    bs = BeautifulSoup(html.text, 'html.parser')
    return bs

def dataHtml(bs, db):
    xname=bs.find('div',{'class':'book_info'}).find('h1').string   # novel title
    zt=bs.find('div',{'class':"book_list"}).find_all('li')         # chapter list items
    # print(xname)
    # time.sleep(3)
    for i in zt:
        zname = i.find('a').get_text().strip()   # chapter name
        link=i.find('a')['href']                 # chapter link
        # print(zhangjiename,zhangjielink)
        urll='http://www.d3zww.com/'
        fullUrl=urll+link   # 'http://www.d3zww.com/' + the scraped chapter link
        link=fullUrl
        print(xname,zname,link)
        # print(fullUrl)  # full page URL
        para = [xname,zname,link]
        db.execute('insert into zhangjie(xname,zname,link) values(%s,%s,%s)', para)

def main(db):
    db.execute('select link from xiaoshuo ')   # query the link column
    result = DB.fetchall()                     # fetch all rows
    # print(result)
    for s in (result):
        # url = (''.format#(result)
        x = '-'.join(s)
        print(x)
        url = x.format(s)
        bs = dataUrl(url)
        dataHtml(bs,db)
        time.sleep(2)

if __name__ == '__main__':
    with DB(host='localhost',user='root',password='root',db='shixun') as db:
        db.execute('SET NAMES utf8')
        main(db)

Correct code

from bs4 import BeautifulSoup
import requests
import time
import pymysql
class DB:
    def __init__(self,host='',port=3306,user='',password='',db='',charset='utf8'):
        self.conn=pymysql.connect(host=host,user=user,port=port,password=password,database=db,charset=charset)
        self.cur=self.conn.cursor()

    def __enter__(self):
        return self.cur

    def __exit__(self, exc_type, exc_val, exc_tb):
        self.conn.commit()
        self.cur.close()
        self.conn.close()

def dataUrl(url):
    # url='http://www.d3zww.com/book/5/5663/'
    header = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.67 Safari/537.36 Edg/87.0.664.55'}
    html = requests.get(url, headers=header)
    html.raise_for_status()
    html.encoding= html.apparent_encoding
    bs = BeautifulSoup(html.text, 'html.parser')
    return bs

def dataHtml(bs, db):
    xname=bs.find('div',{'class':'book_info'}).find('h1').string   # novel title
    zt=bs.find('div',{'class':"book_list"}).find_all('li')         # chapter list items
    # print(xname)
    time.sleep(3)
    for i in zt:
        zhangjiename = i.find('a').get_text().strip()   # chapter name
        zhangjielink=i.find('a')['href']                 # chapter link
        #print(zhangjiename,zhangjielink)
        urll='http://www.d3zww.com/'
        fullUrl=urll+zhangjielink   # 'http://www.d3zww.com/' + the scraped chapter link
        print(xname,zhangjiename,fullUrl)
        # print(fullUrl)  # full page URL
        para = [xname,zhangjiename, zhangjielink]
        db.execute('insert into zhangjie(xname,zname,link) values(%s,%s,%s)', para)

def main(db):
    for i in range(3731,3741):
        url = "http://www.d3zww.com/book/3/{}/".format(i)
        bs = dataUrl(url)
        dataHtml(bs,db)
        time.sleep(2)

if __name__ == '__main__':
    with DB(host='localhost',user='root',password='root',db='shixun') as db:
        db.execute('SET NAMES utf8')
        main(db)
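
The insert statement assumes a zhangjie table already exists in the shixun database. The post never shows its schema, so the sketch below is only a guess: the table and column names come from the insert above, while the column types and sizes are my own assumptions. It reuses the DB context manager defined in the code above.

# Guessed schema for the zhangjie table the crawler writes to; the original post
# never shows the real DDL, so the column types and sizes here are assumptions.
with DB(host='localhost', user='root', password='root', db='shixun') as db:
    db.execute('''
        CREATE TABLE IF NOT EXISTS zhangjie (
            id    INT AUTO_INCREMENT PRIMARY KEY,
            xname VARCHAR(100),
            zname VARCHAR(200),
            link  VARCHAR(255)
        ) DEFAULT CHARSET=utf8
    ''')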

Comparing the two, you will find the programs are almost identical; the second one is just two lines shorter than the first.

Those two extra lines create one more cursor, and that single extra cursor is what triggers the error.

It is like one person getting married: the first time, the marriage certificate gets issued; go back for a second one, and the office certainly will not process it.

Let me explain what a cursor is.

Cursor: a data buffer that the system opens for the user to hold the result of executing an SQL statement. The records produced by your SQL are fetched from the cursor one at a time, assigned to host variables, and handed to Python for further processing; one set of host variables can hold only one record at a time.
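
A minimal sketch of that workflow with pymysql: one connection, one cursor, then execute and fetch rows one at a time through that same cursor. The connection parameters and the zhangjie table are assumed to match the ones used in the code above.

import pymysql

# Minimal pymysql cursor workflow; the connection parameters are assumed to be
# the same local MySQL setup used in the crawler above.
conn = pymysql.connect(host='localhost', user='root', password='root',
                       database='shixun', charset='utf8')
cur = conn.cursor()                        # the cursor: a buffer over the result set
cur.execute('select xname, zname, link from zhangjie')
row = cur.fetchone()                       # one record is handed to Python at a time
while row is not None:
    print(row)
    row = cur.fetchone()
cur.close()
conn.close()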

A word of caution to my fellow little web spiders: write your code carefully. There are so many details to watch out for; web crawling really is a vast and deep subject!