
Python Native Crawler Example

1. Project Environment

  Python 3.7

  MySQL 5.6

  PyCharm 2020

2. Project Directory

  crawl_blogs.py: the crawler's main script

  mysql_db.py: MySQL connection helper (insert/delete/update)

  result.txt: file where the crawled content is saved

  uploads: directory for downloaded files

3. Database connection: mysql_db.py

import pymysql

readHost = '127.0.0.1'
writeHost = '110.1.58.75'
userName = 'root'
passWord = 'abc@123456'
dataBase = 'py_db'
charset = 'utf8'

# Run a SQL statement.
# dbtype: 'fetchall' / 'fetchone' for read queries, '' for writes (INSERT/UPDATE/DELETE)
def run_sql(sql, dbtype=''):
    # Route read queries to the read host and writes to the write host
    if dbtype == '':
        host = writeHost
    else:
        host = readHost
    try:
        db = pymysql.connect(host=host, user=userName, password=passWord,
                             database=dataBase, port=3306, charset=charset)
    except pymysql.MySQLError:
        print('Database connection failed, please retry')
        return None

    # Use the cursor() method to create a cursor object
    cursor = db.cursor()
    result = ()
    try:
        # Execute the SQL statement
        cursor.execute(sql)
        if dbtype == 'fetchall':
            # Fetch all matching rows as a list
            result = cursor.fetchall()
        elif dbtype == 'fetchone':
            # Fetch a single row with fetchone()
            result = cursor.fetchone()
        elif dbtype == '':
            # Commit the INSERT/UPDATE/DELETE to the database
            db.commit()
    except pymysql.MySQLError:
        db.rollback()  # roll back on error
        print("Error: unable to execute SQL")
    # Close the database connection
    db.close()
    return result
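
For reference, a minimal usage sketch of run_sql (the column names in the queries are hypothetical):

import mysql_db

# Read queries return rows
rows = mysql_db.run_sql("SELECT id, title FROM blogs", 'fetchall')
first = mysql_db.run_sql("SELECT id FROM blogs LIMIT 1", 'fetchone')

# Writes (INSERT/UPDATE/DELETE) are committed when dbtype is left as ''
mysql_db.run_sql("UPDATE blogs SET title = 'new title' WHERE id = 1")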

4. Crawler main script: crawl_blogs.py (crawling a blog as the example)

import json
import requests
from requests.exceptions import RequestException
import re
import time
import mysql_db
import urllib.request
import os
import random

save_path = 'result.txt'

# Fetch the content of a single page
def get_one_page(url):
    try:
        response = requests.get(url)
        if response.status_code == 200:
            return response.text
        return None
    except RequestException:
        return None
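
get_one_page sends a bare GET request. Some sites reject clients that send no User-Agent, and without a timeout one stalled response can hang the whole crawl; a hedged variant (the header value and the 10-second timeout are assumptions, not part of the original):

def get_one_page(url):
    try:
        # Present a browser-like User-Agent and bound the wait time
        response = requests.get(url, headers={'User-Agent': 'Mozilla/5.0'}, timeout=10)
        if response.status_code == 200:
            return response.text
        return None
    except RequestException:
        return None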

# Normalize a file URL: convert a relative path to an absolute URL
def get_file_path(file_url,prefix=''):
    if file_url.startswith('/https://') or file_url.startswith('/http://'):
        return file_url.lstrip('/')
    else:
        return prefix+file_url
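
How the two branches behave (the inputs are made-up examples):

# A URL stored with a leading slash has the slash stripped off
get_file_path('/https://cdn.example.com/a.jpg')  # 'https://cdn.example.com/a.jpg'

# A relative path gets the prefix prepended
get_file_path('uploads/a.jpg', 'https://baijunyao.com/')  # 'https://baijunyao.com/uploads/a.jpg'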

# Parse the HTML of the current page
def parse_one_page(html):
    pattern = re.compile('<div class="row b-one-article">.*?"col-xs-12 col-md-12 col-lg-12".*?<a class="b-oa-title".*?href="(.*?)".*?>(.*?)</a>.*?<li class="col-xs-7 col-md-3 col-lg-3">.*?<i class="fa fa-calendar"></i>(.*?)</li>.*?<div class="col-xs-12 col-md-12 col-lg-12">.*?<img class="bjy-lazyload".*?data-src="(.*?)".*?>.*?</div>', re.S)
    items = re.findall(pattern, html)
    if items:
        for item in items:
            yield {
                'href': item[0],
                'title': item[1],
                'time': item[2].strip()[2:],  # strip whitespace/newlines, then drop the leading "20" so the year matches the "%y" format used below
                # 'img_url': item[3] if item[3].find('://') != -1 else 'https://baijunyao.com/'+item[3]
                'img_url': get_file_path(item[3], 'https://baijunyao.com/')
            }
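
A quick self-test of the regex against a minimal, made-up HTML fragment shaped like the blog's markup (assuming crawl_blogs.py is importable; the URL and title are hypothetical):

from crawl_blogs import parse_one_page

sample = '''
<div class="row b-one-article">
  <div class="col-xs-12 col-md-12 col-lg-12">
    <a class="b-oa-title" href="https://baijunyao.com/article/1">Sample title</a>
  </div>
  <li class="col-xs-7 col-md-3 col-lg-3"><i class="fa fa-calendar"></i>2020-05-07 12:00:00</li>
  <div class="col-xs-12 col-md-12 col-lg-12">
    <img class="bjy-lazyload" data-src="uploads/cover.jpg">
  </div>
</div>
'''

for item in parse_one_page(sample):
    print(item)
# {'href': 'https://baijunyao.com/article/1', 'title': 'Sample title',
#  'time': '20-05-07 12:00:00', 'img_url': 'https://baijunyao.com/uploads/cover.jpg'}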

# Append an item to the result file
def write_to_file(content):
    with open(save_path, 'a', encoding='utf-8') as f:
        f.write(json.dumps(content, ensure_ascii=False) + '\n')

# Crawl one page of the article list
def main(offset):
    url = 'https://baijunyao.com/?page=' + str(offset)
    html = get_one_page(url)
    if html is None:  # request failed, skip this page
        return
    for item in parse_one_page(html):
        print(item)
        write_to_file(item)

# Read the result file and insert the records into the database
def save_to_db(filepath):
    f = open(filepath, 'rb')

    # lines = f.readlines()  # read the whole file at once
    # for line in lines:
    date = time.strftime('%Y%m%d')
    save_dir = os.path.abspath('.') + '/uploads/article/' + str(date) + '/'
    # Create the directory if it does not exist
    if not os.path.exists(save_dir):
        os.makedirs(save_dir)

    begin_time = time.time()
    i = 0
    for line in f:
        i+=1
        item = line.decode()
        item = json.loads(item)

        href = item['href']
        title = item['title']
        insert_time = item['time']
        timeArray = time.strptime(insert_time, "%y-%m-%d %H:%M:%S")  # parse into a struct_time ("%y" matches the two-digit year)
        insert_time = int(time.mktime(timeArray))  # convert to a Unix timestamp
        img_url = item['img_url']
        # Download the cover image
        img_suffix = os.path.splitext(img_url)[-1]  # file extension
        file_name = str(int(time.time())) + '_' + str(random.randint(1000, 9999)) + img_suffix
        urllib.request.urlretrieve(img_url, save_dir + file_name)

        sql = "INSERT INTO blogs(title,href,insert_time,update_time,img_url) values('%s','%s',%d,%d,'%s')" %(title,href,insert_time,insert_time,img_url)
        print(sql)
        mysql_db.run_sql(sql)

    f.close()
    end_time = time.time()
    use_time = end_time - begin_time
    print('Import finished: ' + str(i) + ' records in ' + str(use_time) + ' seconds')
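
The INSERT above splices values straight into the SQL string, so a title containing a quote breaks the statement, and the pattern is open to SQL injection. run_sql only accepts a finished string; a safer route is to let pymysql escape the values through cursor.execute parameters. A sketch of a hypothetical helper that could be added to mysql_db.py (not part of the original project):

def run_sql_params(sql, params):
    # Same connection settings as run_sql; pymysql fills each %s placeholder
    # with a properly escaped value
    db = pymysql.connect(host=writeHost, user=userName, password=passWord,
                         database=dataBase, port=3306, charset=charset)
    try:
        cursor = db.cursor()
        cursor.execute(sql, params)
        db.commit()
    except pymysql.MySQLError:
        db.rollback()
    finally:
        db.close()

# Usage in save_to_db would then be:
# run_sql_params("INSERT INTO blogs(title,href,insert_time,update_time,img_url) "
#                "VALUES (%s,%s,%s,%s,%s)",
#                (title, href, insert_time, insert_time, img_url))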


if __name__ == '__main__':
    # Crawl the data and save it to the file
    begin_time = time.time()
    for i in range(1, 28):
        main(i)
        # time.sleep(1)
    end_time = time.time()
    use_time = end_time - begin_time
    print('Crawl finished in ' + str(use_time) + ' seconds')

    # Read the file and import it into the database
    save_to_db(save_path)

5. Crawl Results

6. Downloaded Blog Cover Images

7. Database Table Storage
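
A hypothetical schema for the blogs table, matching the columns used by the INSERT in save_to_db (the column types and the id primary key are assumptions):

import mysql_db

mysql_db.run_sql("""
CREATE TABLE IF NOT EXISTS blogs (
    id INT UNSIGNED NOT NULL AUTO_INCREMENT PRIMARY KEY,
    title VARCHAR(255) NOT NULL,
    href VARCHAR(255) NOT NULL,
    insert_time INT UNSIGNED NOT NULL,
    update_time INT UNSIGNED NOT NULL,
    img_url VARCHAR(255) NOT NULL
) ENGINE=InnoDB DEFAULT CHARSET=utf8
""")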