Python 原生爬蟲例項
阿新 • 發佈:2020-08-03
1、專案環境
Python 3.7
MySQL 5.6
Pycharm 2020
2、專案目錄
crawl_blogs.py 爬蟲主檔案
mysql_db.py mysql 連線 增刪改工具
result.txt 爬取的內容 儲存檔案
uploads 下載的檔案目錄
3、資料庫連線mysql_db.py
# mysql_db.py -- MySQL helper: routes reads to the read host and writes to
# the write host, executes one statement, and returns any fetched rows.
import pymysql

readHost = '127.0.0.1'     # replica used for SELECTs
writeHost = '110.1.58.75'  # primary used for INSERT/UPDATE/DELETE
userName = 'root'
passWord = 'abc@123456'
dataBase = 'py_db'
charset = 'utf8'


def run_sql(sql, dbtype='', params=None):
    """Execute *sql* and return its result.

    dbtype: 'fetchall' -> return every row (read host),
            'fetchone' -> return a single row (read host),
            ''         -> write statement, committed (write host).
    params: optional sequence of values substituted by the driver;
            prefer this over string-built SQL to avoid injection.
    Returns a tuple of rows, one row, or () on write / on error.
    """
    # BUG FIX: the original sent fetches to the write host and commits to
    # the read host — the inverse of its own read/write-routing comment.
    host = readHost if dbtype in ('fetchall', 'fetchone') else writeHost
    try:
        db = pymysql.connect(host=host, user=userName, password=passWord,
                             database=dataBase, port=3306, charset=charset)
    except pymysql.Error:
        print('資料庫連線失敗,請重試')
        # BUG FIX: the original fell through here and then used the
        # unbound name `db`, raising NameError on connection failure.
        return ()

    result = ()
    try:
        cursor = db.cursor()
        try:
            cursor.execute(sql, params)
            if dbtype == 'fetchall':
                result = cursor.fetchall()
            elif dbtype == 'fetchone':
                result = cursor.fetchone()
            else:
                db.commit()  # INSERT / UPDATE / DELETE
        except pymysql.Error:
            db.rollback()  # roll back the failed write
            print("Error: unable to fetch data")
        finally:
            cursor.close()  # BUG FIX: cursor was never closed
    finally:
        db.close()
    return result
4、爬取程式主檔案 crawl_blogs.py (以爬取部落格為例)
import json import requests from requests.exceptions import RequestException import re import time import mysql_db import urllib import os import random save_path = 'result.txt' #獲取某一頁的內容 def get_one_page(url): try: response = requests.get(url) if response.status_code == 200: return response.text return None except RequestException: return None #處理檔案路徑,相對路徑轉絕對 def get_file_path(file_url,prefix=''): if file_url.startswith('/https://') or file_url.startswith('/http://'): return file_url.lstrip('/') else: return prefix+file_url #解析當前頁html def parse_one_page(html): pattern = re.compile('<div class="row b-one-article">.*?"col-xs-12 col-md-12 col-lg-12".*?<a class="b-oa-title".*?href="(.*?)".*?>(.*?)</a>.*?<li class="col-xs-7 col-md-3 col-lg-3">.*?<i class="fa fa-calendar"></i>(.*?)</li>.*?<div class="col-xs-12 col-md-12 col-lg-12">.*?<img class="bjy-lazyload".*?data-src="(.*?)".*?>.*?</div>', re.S) items = re.findall(pattern, html) if items: for item in items: yield { 'href': item[0], 'title': item[1], 'time': item[2].strip()[2:],#去除換行符\n # 'img_url': item[3] if item[3].find('://') != -1 else 'https://baijunyao.com/'+item[3] 'img_url': get_file_path(item[3],'https://baijunyao.com/') } #寫入資料到檔案 def write_to_file(content): with open(save_path, 'a', encoding='utf-8') as f: f.write(json.dumps(content, ensure_ascii=False) + '\n') #迴圈爬取 def main(offset): url = 'https://baijunyao.com/?page=' + str(offset) html = get_one_page(url) for item in parse_one_page(html): print(item) write_to_file(item) #讀取檔案併入庫 def save_to_db(filepath): f = open(filepath,'rb') # lines = f.readlines()#讀取全部內容 # for line in lines: date = time.strftime('%Y%m%d') save_dir = os.path.abspath('.')+'/uploads/article/'+str(date)+'/' #目錄不存在 則建立目錄 if not os.path.exists(save_dir): os.makedirs(save_dir) begin_time = time.time() i = 0 for line in f: i+=1 item = line.decode() item = json.loads(item) href = item['href'] title = item['title'] insert_time = item['time'] timeArray = time.strptime(insert_time, "%y-%m-%d 
%H:%M:%S")# 轉換成時間陣列 insert_time = int(time.mktime(timeArray))# 轉換成時間戳 img_url = item['img_url'] #下載圖片 img_suffix = os.path.splitext(img_url)[-1] #獲取檔案字尾 file_name = str(int(time.time()))+'_'+str(random.randint(1000,9999))+img_suffix urllib.request.urlretrieve(img_url,save_dir+file_name) sql = "INSERT INTO blogs(title,href,insert_time,update_time,img_url) values('%s','%s',%d,%d,'%s')" %(title,href,insert_time,insert_time,img_url) print(sql) mysql_db.run_sql(sql) end_time = time.time() use_time = end_time-begin_time print('入庫完畢,共計'+(str)(i)+'條,耗時:'+str(use_time)+'秒') if __name__ == '__main__': #爬取資料並儲存到檔案 begin_time = time.time() for i in range(1,28): main(i) #time.sleep(1) #讀取檔案併入庫 end_time = time.time() use_time = end_time-begin_time print('爬取完畢,共耗時:'+str(use_time)+'秒') save_to_db(save_path)
5、爬取結果
6、下載的部落格封面圖
7、資料表儲存