1. 程式人生 > >爬蟲:輸入網頁之後爬取當前頁面的圖片和背景圖片,最後打包成exe

爬蟲:輸入網頁之後爬取當前頁面的圖片和背景圖片,最後打包成exe

環境:py3.6

核心庫:selenium(考慮到通用性,js載入的網頁)、pyinstaller

顏色顯示:colors.py

colors.py

 用於在命令列輸出文字時,帶有顏色,可有可無。

# -*- coding:utf-8 -*-#

# filename: prt_cmd_color.py

import ctypes, sys

STD_INPUT_HANDLE = -10
STD_OUTPUT_HANDLE = -11
STD_ERROR_HANDLE = -12

# 字型顏色定義 text colors
FOREGROUND_BLUE = 0x09  # blue.
FOREGROUND_GREEN = 0x0a  #
green. FOREGROUND_RED = 0x0c # red. FOREGROUND_YELLOW = 0x0e # yellow. # 背景顏色定義 background colors BACKGROUND_YELLOW = 0xe0 # yellow. # get handle std_out_handle = ctypes.windll.kernel32.GetStdHandle(STD_OUTPUT_HANDLE) def set_cmd_text_color(color, handle=std_out_handle): Bool = ctypes.windll.kernel32.SetConsoleTextAttribute(handle, color)
return Bool # reset white def resetColor(): set_cmd_text_color(FOREGROUND_RED | FOREGROUND_GREEN | FOREGROUND_BLUE) # green def printGreen(mess): set_cmd_text_color(FOREGROUND_GREEN) sys.stdout.write(mess) resetColor() # red def printRed(mess): set_cmd_text_color(FOREGROUND_RED) sys.stdout.write(mess) resetColor()
# yellow def printYellow(mess): set_cmd_text_color(FOREGROUND_YELLOW) sys.stdout.write(mess + '\n') resetColor() # white bkground and black text def printYellowRed(mess): set_cmd_text_color(BACKGROUND_YELLOW | FOREGROUND_RED) sys.stdout.write(mess + '\n') resetColor() if __name__ == '__main__': printGreen('printGreen:Gree Color Text') printRed('printRed:Red Color Text') printYellow('printYellow:Yellow Color Text')

spider.py

主要在於通用性的處理

# -*- coding: utf-8 -*-
## import some modules
import os
import time
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from bs4 import BeautifulSoup
from os import  path
import requests
import re
from urllib.parse import urlparse, urljoin
from colors import  *
d = path.dirname(__file__)
bar_length = 20
def output(List, percent, msg ,url):
    hashes = '#' * int(percent / len(List) * bar_length)
    spaces = ' ' * (bar_length - len(hashes))
    loadingStr = str(int(100 * percent / len(List)))+ u'%'
    length = len('100%')
    if len(loadingStr) < length:
        loadingStr += ' '*(length-len(loadingStr))
    sys.stdout.write("\rPercent: [%s %s]" % (hashes + spaces, loadingStr ))
    printYellow("         [%s] %s " % ( msg, url))
    sys.stdout.flush()
    time.sleep(2)

class Spider():
    '''spider class'''
    def __init__(self):
        self.url = 'https://www.cnblogs.com/cate/csharp/#p5'
        self.checkMsg = ''
        self.fileName = path.join(d, 'image/')
        self.fileDirName = ''
        self.chrome_options = Options()
        self.chrome_options.add_argument('--headless')
        self.chrome_options.add_argument('--disable-gpu')
        self.driver = webdriver.Chrome(chrome_options=self.chrome_options)
        self.topHostPostfix = (
        '.com', '.la', '.io', '.co', '.info', '.net', '.org', '.me', '.mobi',
        '.us', '.biz', '.xxx', '.ca', '.co.jp', '.com.cn', '.net.cn',
        '.org.cn', '.mx', '.tv', '.ws', '.ag', '.com.ag', '.net.ag',
        '.org.ag', '.am', '.asia', '.at', '.be', '.com.br', '.net.br',
        '.bz', '.com.bz', '.net.bz', '.cc', '.com.co', '.net.co',
        '.nom.co', '.de', '.es', '.com.es', '.nom.es', '.org.es',
        '.eu', '.fm', '.fr', '.gs', '.in', '.co.in', '.firm.in', '.gen.in',
        '.ind.in', '.net.in', '.org.in', '.it', '.jobs', '.jp', '.ms',
        '.com.mx', '.nl', '.nu', '.co.nz', '.net.nz', '.org.nz',
        '.se', '.tc', '.tk', '.tw', '.com.tw', '.idv.tw', '.org.tw',
        '.hk', '.co.uk', '.me.uk', '.org.uk', '.vg', ".com.hk")

    def inputUrl(self):
        '''input url'''
        self.url = input('please input your target: ')
        print('[*] url: %s' % self.url)

    def check(self):
        '''check url'''
        self.checkMsg = input('Are your sure to grab this site? [Y/N/Exit] :')
        if self.checkMsg == 'Y':
            self.middle = self.url.replace('http://', '')
            self.middle = self.middle.replace('https://', '')
            self.fileDirName = path.join(d, 'image/%s' % self.middle)
            self.makeFile()
            self.parse()
        elif self.checkMsg == 'N':
            self.inputUrl()
            self.check()
        elif self.checkMsg == 'Exit':
            sys.exit()
        else:
            print('please input one of [Y/N/Exit]!!')
            self.check()

    def makeFile(self):
        '''建立資料夾函式'''
        if os.path.exists(self.fileName):
            pass
        else:
            os.makedirs(self.fileName)

        if os.path.exists(self.fileDirName):
            pass
        else:
            os.makedirs(self.fileDirName)

    def getCssImage(self,url):
        '''獲取css中的image'''
        headers = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36'}
        try:
            response = requests.get(url, headers = headers, timeout=500).text
            bgCssList = re.findall("url\((.*?\))", response)
            bgCssSrc = []
            if len(bgCssList) > 0:
                for v in bgCssList:
                    v = v.replace('url(', '')
                    v = v.replace('\\', "")
                    v = v.replace(')', "")
                    print(v)
                    print('-----------------------------------')
                    bgCssSrc.append(v)
            return bgCssSrc
        except:
            print('connection timeout!!!')

    def getHostName(self, url):
        '''獲取url主域名'''
        regx = r'[^\.]+(' + '|'.join([h.replace('.', r'\.') for h in self.topHostPostfix]) + ')$'
        pattern = re.compile(regx, re.IGNORECASE)
        parts = urlparse(self.url)
        host = parts.netloc
        m = pattern.search(host)
        urlm = 'http://www.' + m.group() if m else host
        return urlm

    def joinUrl(self, url):
        '''圖片url處理'''
        # if url[:2] == '//':
        #     url = url.replace('//', '')
        #     url = 'http://' + url
        # elif url.startswith('/'):
        #     ## 需要處理
        #     regx = r'[^\.]+(' + '|'.join([h.replace('.', r'\.') for h in self.topHostPostfix]) + ')$'
        #     pattern = re.compile(regx, re.IGNORECASE)
        #     parts = urlparse(self.url)
        #     host = parts.netloc
        #     m = pattern.search(host)
        #     urlm = 'http://www.' + m.group() if m else host
        #     url = urlm + url
        # try:
        #     ## 處理字串   獲取 www   http   https
        #     if url[:2] == '//':
        #         url = url.replace('//', '')
        #         url = 'http://' + url
        #     elif url.startswith('/'):
        #         ## 需要處理
        #         regx = r'[^\.]+(' + '|'.join([h.replace('.', r'\.') for h in self.topHostPostfix]) + ')$'
        #         pattern = re.compile(regx, re.IGNORECASE)
        #         parts = urlparse(self.url)
        #         host = parts.netloc
        #         m = pattern.search(host)
        #         urlm = 'http://www/' + m.group() if m else host
        #         url = urlm + url
        #     else:
        #         try:
        #             url = url.split('www', 1)[1]
        #             url = u'http://www' + url
        #         except:
        #             try:
        #                 url = url.split('http', 1)[1]
        #                 url = u'http' + url
        #             except:
        #                 pass
        # except:
        #     pass
        ## ex1    '//example.png'
        ## ex2    'http://'
        if url.startswith('http'):
            return url
        else:
            return urljoin(self.url, url)

    def download(self, key, url):
        if key == 0:
            pass
        else:
            print('')
        url = self.joinUrl(url)
        try:
            imgType = os.path.split(url)[1]
            imgType = imgType.split('.',1)[1]
            imgType = imgType.split('?',1)[0]
        except:
            msg = u' Error '
            return msg
        fileName = int(time.time())
        path = self.fileDirName+ u'/'+str(fileName) + u'.' + imgType
        try:
            headers = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36'}
            try:
                response = requests.get(url, headers=headers, timeout=500).content
            except:
                msg = u' Error '
                return msg
            f = open(path, 'wb+')
            try:
                f.write(response.encode('utf-8'))
            except:
                f.write(response)
            f.close()
        except Exception as e:
            msg = u' Error '
            return msg
        return u'Success'

    def parse(self):
        '''parse html'''
        self.driver.get(self.url)
        time.sleep(3)
        html_content = self.driver.page_source
        bs = BeautifulSoup(html_content, "html.parser")
        ## 先獲取所有的圖片
        imgList = bs.find_all('img')
        srcList = []
        if len(imgList) > 0:
            for v in imgList:
                srcList.append(v['src'])
                print(v['src'])
                print('-----------------------------------')
            srcList = list(set(srcList))
        print('[*] Find %s image in page',len(srcList))
        ## 獲取當前頁面style裡面的背景圖
        bgStyleList = re.findall("url\((.*?\))", html_content)
        bgSrc = []
        if len(bgStyleList) > 0:
            for v in bgStyleList:
                v = v.replace('url(', '')
                v = v.replace('\\',"")
                v = v.replace(')', "")
                print(v)
                print('-----------------------------------')
                bgSrc.append(v)
        bgSrc = list(set(bgSrc))
        print('[*] Find %s image in Page style', len(bgSrc))
        ## 獲取所有的背景圖
        ## 獲取所有的css檔案
        cssList = re.findall('<link rel="stylesheet" href="(.*?)"',html_content)
        cssImageUrls = []
        if len(cssList) > 0:
            cssImageUrl = []
            for url in cssList:
                cssImageUrl += self.getCssImage(url)
            cssImageUrls = cssImageUrl
            cssImageUrls = list(set(cssImageUrls))
        print('[*] Find %s image in Page css', len(cssImageUrls))

        ## 開始獲取圖片https://www.cnblogs.com/shuangzikun/
        ## 開始下載標籤的圖片
        print('---------------------------------------------')


        if len(srcList) > 0:
            print('Start Load Image -- %s' % len(srcList))
            for percent,url in enumerate(srcList):
                percent += 1
                msg = self.download(percent, url)
                output(srcList, percent, msg ,url)


        if len(bgSrc) >0:
            print('\nStart Load Image In Style -- %s' % len(bgSrc))
            for percent, url in enumerate(bgSrc):
                percent += 1
                msg = self.download(percent, url)
                output(srcList, percent, msg, url)


        if len(cssImageUrls) > 0:
            print('\nStart Load Image In Css -- %s' % len(cssImageUrls))
            for percent, url in enumerate(cssImageUrls):
                percent += 1
                msg = self.download(percent, url)
                output(srcList, percent, msg, url)

        print('\nEnd----------------------------------Exit')



if __name__ == '__main__':
    print('''          ____  __  __  __      __       _______  _______
         /__  \\ \\_\\/_/ / /     / /____  / ___  / / ___  /
        / /_/ /  \\__/ / /___  / /__  / / /  / / / /  / /
       / ____/   / / / /___/ / /  / / / /__/ / / /  / /
      /_/       /_/ /_/___/ /_/  /_/  \\_____/ /_/  /_/ version 3.6''')
    descriptionL = ['T', 'h', 'i', 's', ' ', 'i', 's' , ' ', 'a', ' ', 's', 'p', 'i', 'd', 'e', 'r', ' ','p', 'r', 'o', 'c', 'e', 'd', 'u', 'r', 'e', ' ', '-', '-', '-',' IMGSPIDER', '\n']

    for j in range(len(descriptionL)):
        sys.stdout.write(descriptionL[j])
        sys.stdout.flush()
        time.sleep(0.1)
    urlL = ['[First Step]', ' input ', 'a', ' url ' , 'as ', 'your ', 'target ~ \n']

    for j in range(len(urlL)):
        sys.stdout.write(urlL[j])
        sys.stdout.flush()
        time.sleep(0.2)
    pathL = ['[Second Step]', ' check ', 'this ', 'url ~\n']

    for j in range(len(pathL)):
        sys.stdout.write(pathL[j])
        sys.stdout.flush()
        time.sleep(0.2)
    ## new spider
    MySpider = Spider()
    ## input url path
    # MySpider.inputUrl()
    # ## checkMsg
    MySpider.check()

執行效果

 打包

使用到其它擴充套件

pyinstaller -f spider.py   打包成單一檔案。

由於要在其它電腦上使用,需要修改下谷歌驅動的位置,把谷歌驅動放在spider.exe的同目錄下。

        try:
            self.chrome_options.add_argument(r"user-data-dir = %s" % path.join('Chrome\Application'))
            self.driver = webdriver.Chrome(path.join(d,'chromedriver.exe'),chrome_options=self.chrome_options)
        except Exception as e:
            print(e)

點選spider.exe,初始化沒有報錯即ok了。