1. 程式人生 > >python爬某視訊網站將網站下所有視訊連結儲存到TXT檔案

python爬某視訊網站將網站下所有視訊連結儲存到TXT檔案

import re
import time
from urllib.parse import urljoin

import requests

# Module-level configuration and shared state ("macro definitions").
#title = 'https://8*8*5*r*i*.com'
# Base URL of the site: fetched as the start page and prefixed to the
# relative links found in the pages.
title = 'http://www.gaoqing.la/'
# Directory prefix (with trailing separator) for the generated .txt files.
txtRoute = 'D:\\MySeGF\\'
# Result lines, in order; written to the result file by writeToLocal().
contextGF = []
# Values recorded by IsWriteLog(); written to a log file only when failFlag != 0.
logList = []
# Number of failures caught by the except handlers in this module.
failFlag = 0
# Logging mode read by IsWriteLog(): 0 = print and record in logList,
# 1 = print only, any other value = silent.
logPrintDebug = 0
# Function definitions.
def getTime():
    """Log and return the current local time as 'YYYY-MM-DD HH:MM:SS'."""
    # strftime() formats the current local time when no time tuple is given.
    stamp = time.strftime('%Y-%m-%d %H:%M:%S')
    IsWriteLog('curTime:', stamp)
    return stamp
def getFileTime():
    """Log and return the current local time as 'YYYYMMDD_HHMMSS'.

    The compact form contains no characters that are illegal in file names,
    so it can be embedded in an output file name.
    """
    now = time.localtime()
    stamp = time.strftime('%Y%m%d_%H%M%S', now)
    IsWriteLog('curTime:', stamp)
    return stamp
def IsWriteLog(first, *args):
    """Print a log entry and, depending on ``logPrintDebug``, record it.

    ``first`` is printed followed by a space (no newline); every extra
    argument is printed on its own line.

    Modes selected by the global ``logPrintDebug``:
        0 -- print and append each value to the global ``logList``
        1 -- print only
        anything else -- do nothing
    """
    if logPrintDebug not in (0, 1):
        return
    keep = logPrintDebug == 0

    print(first, end=' ')
    if keep:
        logList.append(first)

    for extra in args:
        print(extra)
        if keep:
            logList.append(extra)
def serchChildHtml(secTitle, timeout=30):
    """Scrape one section page and record a download link per video on it.

    Fetches ``secTitle`` and appends "<page title>:<secTitle>" to the global
    ``contextGF``. Every link captured after a ``class="text-overflow"``
    attribute is then resolved against the global ``title``, fetched, and one
    formatted entry (running index, file name, download URL) is appended to
    ``contextGF``.

    Any error (network failure, a pattern that does not match, ...) stops the
    processing of this section, increments the global ``failFlag`` and is
    logged. Entries appended before the error are kept.

    Args:
        secTitle: URL of the section page to scrape.
        timeout: Seconds to wait for the server on each HTTP request, so a
            stalled connection cannot hang the script indefinitely.
    """
    global failFlag
    try:
        respose = requests.get(secTitle, timeout=timeout)
        respose.encoding = 'utf-8'
        IsWriteLog('respose.status_code:', respose.status_code)  # HTTP status code
        titleB = re.findall(r'<title>(.*?)</title>', respose.text, re.S)[0]
        IsWriteLog('titleB:', titleB)
        contextGF.append(titleB + ':' + secTitle)
        # re.S lets '.' match newlines, so a pattern may span several lines.
        urls = re.findall(r'class="text-overflow".*?href="(.*?)"', respose.text, re.S)
        IsWriteLog('urls:', urls)
        for index, cont in enumerate(urls, start=1):
            # urljoin handles hrefs with or without a leading '/', and
            # absolute hrefs, without producing 'host//path' URLs.
            url = urljoin(title, cont)
            IsWriteLog('url:', url)
            result = requests.get(url, timeout=timeout)
            result.encoding = 'utf-8'
            # Log the status of the video page itself, not of the section page.
            IsWriteLog('result.status_code:', result.status_code)
            mp4_url = re.findall(r'.*?download.*?"(.*?)"', result.text, re.S)[0]
            IsWriteLog('mp4_url:', mp4_url)
            # NOTE(review): the literal text in this pattern must match the
            # site's markup exactly -- confirm against a live page.
            fileName = re.findall(r'.*?var downurls.*?"(.*?)高清下載', result.text, re.S)[0]
            IsWriteLog(fileName)
            onelist = (str(index).rjust(3, ' ') + ' : ' + fileName.ljust(10, ' ')
                       + '  ' + '\n' + mp4_url)
            getTime()
            IsWriteLog('onelist:', onelist)
            contextGF.append(onelist)
    except Exception as exc:
        # Best effort: count the failure and keep going with the next section,
        # but leave a trace of what went wrong (Ctrl-C still propagates).
        failFlag = failFlag + 1
        IsWriteLog('serchChildHtml failed:', secTitle, repr(exc))
def writeToTxt(context,flag):
    """Write every item of ``context`` on its own line to a new text file.

    The file is created as ``txtRoute + <getFileTime()> + '_' + <flag> + '.txt'``
    and is encoded as UTF-8 so that non-ASCII page titles can always be
    written regardless of the platform's default encoding.

    Items are converted with ``str()`` before writing: the log list holds
    integers and lists as well as strings, and writing those directly raises
    ``TypeError`` and truncates the file.

    A failure to create or write the file does not raise; it increments the
    global ``failFlag`` and is logged.

    Args:
        context: Iterable of items to write, one per line.
        flag: Suffix for the file name (converted with ``str()``).
    """
    global failFlag
    TetDown = txtRoute + str(getFileTime()) + '_' + str(flag) + '.txt'
    IsWriteLog(TetDown)
    try:
        # The context manager closes the file even when a write fails.
        with open(TetDown, mode='w', encoding='utf-8') as file_handle:
            for html in context:
                file_handle.write(str(html))
                file_handle.write('\n')
    except Exception as exc:
        failFlag = failFlag + 1
        IsWriteLog('writeToTxt failed:', TetDown, repr(exc))
def writeToLocal():
    """Save the collected results, and the log as well if anything failed.

    The result file name carries the failure count at the time of the call.
    The failure check happens after the result file is written, so a failed
    write of the result file also triggers the log dump.
    """
    failure_suffix = str(failFlag)
    writeToTxt(contextGF, failure_suffix)
    if failFlag == 0:
        return
    writeToTxt(logList, 'log')
if __name__ == "__main__":
    # Script entry point: fetch the start page, visit every section link
    # under /html (except the /html/news ones), then write the results.
    contextGF.append(getTime())
    IsWriteLog(contextGF)
    # A timeout keeps a stalled connection from hanging the script forever.
    beginRes = requests.get(title, timeout=30)
    contextGF.append(title)
    IsWriteLog('beginRes.status_code:', beginRes.status_code)  # HTTP status code
    # re.S lets '.' match newlines, so the pattern may span several lines.
    titleL = re.findall(r'class=""><a.*?href=.*?"(.*?)".*?target="_blank"', beginRes.text, re.S)
    IsWriteLog('titleL:', titleL)
    for shortUrl in titleL:
        # Only section pages are scraped: links under /html, minus /html/news.
        if not shortUrl.startswith('/html') or shortUrl.startswith('/html/news'):
            continue
        IsWriteLog(shortUrl)
        # title ends with '/' and shortUrl starts with '/'; urljoin avoids
        # the 'host//html/...' URL that plain concatenation produces.
        secTitle = urljoin(title, shortUrl)
        IsWriteLog('secTitle:', secTitle)
        contextGF.append(secTitle)
        serchChildHtml(secTitle)
        time.sleep(5.5)  # pause 5.5 s between sections to go easy on the server
    writeToLocal()