
Python Native Crawler Example

1. Project Environment

  Python 3.7

  MySQL 5.6

  PyCharm 2020

2. Project Directory

  crawl_blogs.py: the crawler's main script

  mysql_db.py: MySQL connection helper (insert/delete/update)

  result.txt: file where the crawled content is saved

  uploads: directory for downloaded files

3. Database connection: mysql_db.py

import pymysql

readHost = '127.0.0.1'
writeHost = '110.1.58.75'
userName = 'root'
passWord = 'abc@123456'
dataBase = 'py_db'
charset = 'utf8'

# Run a SQL statement.
# dbtype: 'fetchall' / 'fetchone' for read queries, '' for writes (INSERT/UPDATE/DELETE)
def run_sql(sql, dbtype=''):
    # Route read queries to the read host and writes to the write host
    if dbtype == '':
        host = writeHost
    else:
        host = readHost
    try:
        db = pymysql.connect(host=host, user=userName, password=passWord,
                             database=dataBase, port=3306, charset=charset)
    except pymysql.MySQLError:
        print('Database connection failed, please retry')
        return None

    # Use the cursor() method to create a cursor object
    cursor = db.cursor()
    result = ()
    try:
        # Execute the SQL statement
        cursor.execute(sql)
        if dbtype == 'fetchall':
            # Fetch all matching rows as a list
            result = cursor.fetchall()
        elif dbtype == 'fetchone':
            # Fetch a single row with fetchone()
            result = cursor.fetchone()
        elif dbtype == '':
            # Commit the INSERT/UPDATE/DELETE to the database
            db.commit()
    except pymysql.MySQLError:
        db.rollback()  # roll back on error
        print("Error: unable to execute SQL")
    # Close the database connection
    db.close()
    return result
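
For reference, a minimal usage sketch of run_sql (the column names in the queries are hypothetical):

import mysql_db

# Read queries return rows
rows = mysql_db.run_sql("SELECT id, title FROM blogs", 'fetchall')
first = mysql_db.run_sql("SELECT id FROM blogs LIMIT 1", 'fetchone')

# Writes (INSERT/UPDATE/DELETE) are committed when dbtype is left as ''
mysql_db.run_sql("UPDATE blogs SET title = 'new title' WHERE id = 1")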

4. Crawler main script: crawl_blogs.py (crawling a blog as the example)

import json
import requests
from requests.exceptions import RequestException
import re
import time
import mysql_db
import urllib.request
import os
import random

save_path = 'result.txt'

# Fetch the content of a single page
def get_one_page(url):
    try:
        response = requests.get(url)
        if response.status_code == 200:
            return response.text
        return None
    except RequestException:
        return None
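
get_one_page sends a bare GET request. Some sites reject clients that send no User-Agent, and without a timeout one stalled response can hang the whole crawl; a hedged variant (the header value and the 10-second timeout are assumptions, not part of the original):

def get_one_page(url):
    try:
        # Present a browser-like User-Agent and bound the wait time
        response = requests.get(url, headers={'User-Agent': 'Mozilla/5.0'}, timeout=10)
        if response.status_code == 200:
            return response.text
        return None
    except RequestException:
        return None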

# Normalize a file URL: convert a relative path to an absolute URL
def get_file_path(file_url,prefix=''):
    if file_url.startswith('/https://') or file_url.startswith('/http://'):
        return file_url.lstrip('/')
    else:
        return prefix+file_url
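
How the two branches behave (the inputs are made-up examples):

# A URL stored with a leading slash has the slash stripped off
get_file_path('/https://cdn.example.com/a.jpg')  # 'https://cdn.example.com/a.jpg'

# A relative path gets the prefix prepended
get_file_path('uploads/a.jpg', 'https://baijunyao.com/')  # 'https://baijunyao.com/uploads/a.jpg'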

# Parse the HTML of the current page
def parse_one_page(html):
    pattern = re.compile('<div class="row b-one-article">.*?"col-xs-12 col-md-12 col-lg-12".*?<a class="b-oa-title".*?href="(.*?)".*?>(.*?)</a>.*?<li class="col-xs-7 col-md-3 col-lg-3">.*?<i class="fa fa-calendar"></i>(.*?)</li>.*?<div class="col-xs-12 col-md-12 col-lg-12">.*?<img class="bjy-lazyload".*?data-src="(.*?)".*?>.*?</div>', re.S)
    items = re.findall(pattern, html)
    if items:
        for item in items:
            yield {
                'href': item[0],
                'title': item[1],
                'time': item[2].strip()[2:],  # strip whitespace/newlines, then drop the leading "20" so the year matches the "%y" format used below
                # 'img_url': item[3] if item[3].find('://') != -1 else 'https://baijunyao.com/'+item[3]
                'img_url': get_file_path(item[3], 'https://baijunyao.com/')
            }
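
A quick self-test of the regex against a minimal, made-up HTML fragment shaped like the blog's markup (assuming crawl_blogs.py is importable; the URL and title are hypothetical):

from crawl_blogs import parse_one_page

sample = '''
<div class="row b-one-article">
  <div class="col-xs-12 col-md-12 col-lg-12">
    <a class="b-oa-title" href="https://baijunyao.com/article/1">Sample title</a>
  </div>
  <li class="col-xs-7 col-md-3 col-lg-3"><i class="fa fa-calendar"></i>2020-05-07 12:00:00</li>
  <div class="col-xs-12 col-md-12 col-lg-12">
    <img class="bjy-lazyload" data-src="uploads/cover.jpg">
  </div>
</div>
'''

for item in parse_one_page(sample):
    print(item)
# {'href': 'https://baijunyao.com/article/1', 'title': 'Sample title',
#  'time': '20-05-07 12:00:00', 'img_url': 'https://baijunyao.com/uploads/cover.jpg'}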

# Append an item to the result file
def write_to_file(content):
    with open(save_path, 'a', encoding='utf-8') as f:
        f.write(json.dumps(content, ensure_ascii=False) + '\n')

# Crawl one page of the article list
def main(offset):
    url = 'https://baijunyao.com/?page=' + str(offset)
    html = get_one_page(url)
    if html is None:  # request failed, skip this page
        return
    for item in parse_one_page(html):
        print(item)
        write_to_file(item)

# Read the result file and insert the records into the database
def save_to_db(filepath):
    f = open(filepath, 'rb')

    # lines = f.readlines()  # read the whole file at once
    # for line in lines:
    date = time.strftime('%Y%m%d')
    save_dir = os.path.abspath('.') + '/uploads/article/' + str(date) + '/'
    # Create the directory if it does not exist
    if not os.path.exists(save_dir):
        os.makedirs(save_dir)

    begin_time = time.time()
    i = 0
    for line in f:
        i+=1
        item = line.decode()
        item = json.loads(item)

        href = item['href']
        title = item['title']
        insert_time = item['time']
        timeArray = time.strptime(insert_time, "%y-%m-%d %H:%M:%S")  # parse into a struct_time ("%y" matches the two-digit year)
        insert_time = int(time.mktime(timeArray))  # convert to a Unix timestamp
        img_url = item['img_url']
        # Download the cover image
        img_suffix = os.path.splitext(img_url)[-1]  # file extension
        file_name = str(int(time.time())) + '_' + str(random.randint(1000, 9999)) + img_suffix
        urllib.request.urlretrieve(img_url, save_dir + file_name)

        sql = "INSERT INTO blogs(title,href,insert_time,update_time,img_url) values('%s','%s',%d,%d,'%s')" %(title,href,insert_time,insert_time,img_url)
        print(sql)
        mysql_db.run_sql(sql)

    f.close()
    end_time = time.time()
    use_time = end_time - begin_time
    print('Import finished: ' + str(i) + ' records in ' + str(use_time) + ' seconds')
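
The INSERT above splices values straight into the SQL string, so a title containing a quote breaks the statement, and the pattern is open to SQL injection. run_sql only accepts a finished string; a safer route is to let pymysql escape the values through cursor.execute parameters. A sketch of a hypothetical helper that could be added to mysql_db.py (not part of the original project):

def run_sql_params(sql, params):
    # Same connection settings as run_sql; pymysql fills each %s placeholder
    # with a properly escaped value
    db = pymysql.connect(host=writeHost, user=userName, password=passWord,
                         database=dataBase, port=3306, charset=charset)
    try:
        cursor = db.cursor()
        cursor.execute(sql, params)
        db.commit()
    except pymysql.MySQLError:
        db.rollback()
    finally:
        db.close()

# Usage in save_to_db would then be:
# run_sql_params("INSERT INTO blogs(title,href,insert_time,update_time,img_url) "
#                "VALUES (%s,%s,%s,%s,%s)",
#                (title, href, insert_time, insert_time, img_url))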


if __name__ == '__main__':
    # Crawl the data and save it to the file
    begin_time = time.time()
    for i in range(1, 28):
        main(i)
        # time.sleep(1)
    end_time = time.time()
    use_time = end_time - begin_time
    print('Crawl finished in ' + str(use_time) + ' seconds')

    # Read the file and import it into the database
    save_to_db(save_path)

5. Crawl Results

6. Downloaded Blog Cover Images

7. Database Table Storage
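
A hypothetical schema for the blogs table, matching the columns used by the INSERT in save_to_db (the column types and the id primary key are assumptions):

import mysql_db

mysql_db.run_sql("""
CREATE TABLE IF NOT EXISTS blogs (
    id INT UNSIGNED NOT NULL AUTO_INCREMENT PRIMARY KEY,
    title VARCHAR(255) NOT NULL,
    href VARCHAR(255) NOT NULL,
    insert_time INT UNSIGNED NOT NULL,
    update_time INT UNSIGNED NOT NULL,
    img_url VARCHAR(255) NOT NULL
) ENGINE=InnoDB DEFAULT CHARSET=utf8
""")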