12.9 爬蟲最後一步
阿新 • • 發佈:2020-12-09
爬取最後一步,總是報錯,自己很崩潰,也找不到問題所在,問小組成員也沒人知道
最後請教老師才知道是多建立一個遊標
錯誤程式碼
from bs4 import BeautifulSoup import requests import time import pymysql class DB: def __init__(self,host='',port=3306,user='',password='',db='',charset='utf8'): self.conn=pymysql.connect(host=host,user=user,port=port,password=password,database=db,charset=charset) self.cur=self.conn.cursor() def __enter__(self): return self.cur def __exit__(self, exc_type, exc_val, exc_tb): self.conn.commit() self.cur.close() self.conn.close() def dataUrl(url): # url='http://www.d3zww.com/book/5/5663/' header = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.67 Safari/537.36 Edg/87.0.664.52'} html = requests.get(url, headers=header) html.raise_for_status() html.encoding= html.apparent_encoding bs = BeautifulSoup(html.text, 'html.parser') return bs def dataHtml(bs, db): xname=bs.find('div',{'class':'book_info'}).find('h1').string zt=bs.find('div',{'class':"book_list"}).find_all('li')#小說名 # print(xname) # time.sleep(3) for i in zt: zname = i.find('a').get_text().strip()#章節名稱 link=i.find('a')['href']#章節連結 #print(zhangjiename,zhangjielink) urll='http://www.d3zww.com/' fullUrl=urll+link # 'http://www.d3zww.com/'+抓取出的章節連結 link=fullUrl print(xname,zname,link) # print(fullUrl) # 完整的網頁 para = [xname,zname,link] db.execute('insert into zhangjie(xname,zname,link) values(%s,%s,%s)', para) def main(db): db.execute('select link from xiaoshuo ') # 查詢欄位 result = DB.fetchall() # 獲取所有資料 # print(result) for s in (result): # url = (''.format#(result) x = '-'.join(s) print(x) url = x.format(s) bs = dataUrl(url) dataHtml(bs,db) time.sleep(2) if __name__ == '__main__': with DB(host='localhost',user='root',password='root',db='shixun') as db: db.execute('SET NAMES utf8') main(db)
正確程式碼
from bs4 import BeautifulSoup import requests import time import pymysql class DB: def __init__(self,host='',port=3305,user='',password='',db='',charset='utf8'): self.conn=pymysql.connect(host=host,user=user,port=port,password=password,database=db,charset=charset) self.cur=self.conn.cursor() def __enter__(self): return self.cur def __exit__(self, exc_type, exc_val, exc_tb): self.conn.commit() self.cur.close() self.conn.close() def dataUrl(url): # url='http://www.d3zww.com/book/5/5663/' header = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.67 Safari/537.36 Edg/87.0.664.55'} html = requests.get(url, headers=header) html.raise_for_status() html.encoding= html.apparent_encoding bs = BeautifulSoup(html.text, 'html.parser') return bs def dataHtml(bs, db): xname=bs.find('div',{'class':'book_info'}).find('h1').string zt=bs.find('div',{'class':"book_list"}).find_all('li')#小說名 # print(xname) time.sleep(3) for i in zt: zhangjiename = i.find('a').get_text().strip()#章節名稱 zhangjielink=i.find('a')['href']#章節連結 #print(zhangjiename,zhangjielink) urll='http://www.d3zww.com/' fullUrl=urll+zhangjielink # 'http://www.d3zww.com/'+抓取出的章節連結 print(xname,zhangjiename,fullUrl) # print(fullUrl) # 完整的網頁 para = [xname,zhangjiename, zhangjielink] db.execute('insert into zhangjie(xname,zname,link) values(%s,%s,%s)', para) def main(db): for i in range(3731,3741): url = "http://www.d3zww.com/book/3/{}/".format(i) bs = dataUrl(url) dataHtml(bs,db) time.sleep(2) if __name__ == '__main__': with DB(host='localhost',user='root',password='root',db='shixun') as db: db.execute('SET NAMES utf8') main(db)
兩者相比較就會發現兩個程式碼沒多大區別,第二個程式碼比第一個少兩行程式碼,也就是以下兩行
多一個遊標就會出現錯誤,
就好比一個人結婚一樣,第一個已經領了結婚證,第二個再去肯定不會讓辦的。
解釋一下什麼是遊標
遊標(cursor):系統為使用者開通的一個數據緩衝區,用於存放SQL語句執行結果。使用者使用的sql會逐一的在遊標中獲取記錄,並賦值給主變數,交由Python進一步處理,一組主變數只能存放一條記錄。
告誡一下網路小蜘蛛打程式碼一定要認真,注意事項太多啦,爬蟲真是博大精深!