Python 爬取某視訊網站,將該網站下所有視訊連結儲存到 TXT 檔案
阿新 • • 發佈:2018-12-15
"""Crawl a video index site and save every discovered download link to a text file.

The script fetches the site root (``title``), follows each ``/html`` section
link except ``/html/news``, visits every video page listed in a section,
extracts the download URL and display name, and finally writes the collected
lines to a timestamped ``.txt`` file under ``txtRoute``.  When at least one
step failed, the accumulated log is written to a second file as well.
"""
import os
import re
import time
from urllib.parse import urljoin

import requests

# "Macro" definitions: module-level configuration and shared state.
# title = 'https://8*8*5*r*i*.com'
title = 'http://www.gaoqing.la/'   # site root; relative links are resolved against it
txtRoute = 'D:\\MySeGF\\'          # output directory prefix for the result/log files
contextGF = []                     # lines that end up in the result file
logList = []                       # log messages, written out only when something failed
failFlag = 0                       # number of failed steps; also used in the result file name
logPrintDebug = 0                  # 0: print and record, 1: print only, anything else: silent
REQUEST_TIMEOUT = 30               # seconds; keeps a stalled server from hanging the crawl


# Function definitions.
def getTime():
    """Return the current local time as ``YYYY-MM-DD HH:MM:SS`` and log it."""
    curTime = time.strftime('%Y-%m-%d %H:%M:%S')
    IsWriteLog('curTime:', curTime)
    return curTime


def getFileTime():
    """Return the current local time as ``YYYYMMDD_HHMMSS`` (file-name safe) and log it."""
    curTime = time.strftime('%Y%m%d_%H%M%S')
    IsWriteLog('curTime:', curTime)
    return curTime


def IsWriteLog(first, *args):
    """Print a log message and, depending on ``logPrintDebug``, record it.

    Args:
        first: First item of the message (usually a label).
        *args: Further items; everything is joined with single spaces.

    The message is stringified immediately so that ``logList`` holds a
    snapshot (not a live reference to a mutable object) and can be written
    to a text file without type errors.
    """
    if logPrintDebug not in (0, 1):
        return
    message = ' '.join(str(item) for item in (first,) + args)
    print(message)
    if logPrintDebug == 0:
        logList.append(message)


def _first_group(pattern, text):
    """Return group 1 of the first match of *pattern* in *text* (DOTALL).

    Raises:
        LookupError: If the pattern does not occur, naming the pattern so the
            log shows which extraction step failed.
    """
    match = re.search(pattern, text, re.S)
    if match is None:
        raise LookupError('pattern not found: {!r}'.format(pattern))
    return match.group(1)


def serchChildHtml(secTitle):
    """Scrape one section page and append its video links to ``contextGF``.

    Args:
        secTitle: Absolute URL of the section page.

    For every link found on the page, the video page is fetched and one entry
    of the form ``"<index> : <name> \\n<download url>"`` is appended to
    ``contextGF``.  Any failure aborts the rest of this section, is logged,
    and increments ``failFlag``; it never propagates to the caller.
    """
    global failFlag
    try:
        respose = requests.get(secTitle, timeout=REQUEST_TIMEOUT)
        respose.encoding = 'utf-8'
        IsWriteLog('respose.status_code:', respose.status_code)  # HTTP status of the section page
        titleB = _first_group(r'<title>(.*?)</title>', respose.text)
        IsWriteLog('titleB:', titleB)
        contextGF.append(titleB + ':' + secTitle)
        # re.S lets '.' span newlines so the markup is matched as one line.
        urls = re.findall(r'class="text-overflow".*?href="(.*?)"', respose.text, re.S)
        IsWriteLog('urls:', urls)
        for index, cont in enumerate(urls, start=1):
            # urljoin avoids the doubled slash that plain concatenation
            # produces when both the root and the link carry one.
            url = urljoin(title, cont)
            IsWriteLog('url:', url)
            result = requests.get(url, timeout=REQUEST_TIMEOUT)
            result.encoding = 'utf-8'
            IsWriteLog('result.status_code:', result.status_code)
            mp4_url = _first_group(r'.*?download.*?"(.*?)"', result.text)
            IsWriteLog('mp4_url:', mp4_url)
            # NOTE(review): the literal was '高清下載' (Traditional); the
            # Simplified form is accepted too in case the page uses it -- confirm.
            fileName = _first_group(r'.*?var downurls.*?"(.*?)高清下[載载]', result.text)
            IsWriteLog(fileName)
            onelist = '{:>3} : {:<10} \n{}'.format(index, fileName, mp4_url)
            getTime()
            IsWriteLog('onelist:', onelist)
            contextGF.append(onelist)
    except Exception as exc:
        # Best-effort crawl: one broken section must not stop the run, but the
        # reason is recorded.  KeyboardInterrupt/SystemExit are not trapped.
        failFlag = failFlag + 1
        IsWriteLog('serchChildHtml failed:', secTitle, repr(exc))


def writeToTxt(context, flag):
    """Write every item of *context* on its own line to a timestamped file.

    Args:
        context: Iterable of items to write; each one is converted with ``str``.
        flag: Suffix placed after the timestamp in the file name.

    The file is ``txtRoute + <YYYYMMDD_HHMMSS> + '_' + flag + '.txt'``, encoded
    as UTF-8.  I/O errors are logged and counted in ``failFlag`` rather than
    raised.
    """
    global failFlag
    TetDown = txtRoute + str(getFileTime()) + '_' + str(flag) + '.txt'
    IsWriteLog(TetDown)
    try:
        directory = os.path.dirname(TetDown)
        if directory:
            os.makedirs(directory, exist_ok=True)
        with open(TetDown, mode='w', encoding='utf-8') as file_handle:
            file_handle.writelines(str(html) + '\n' for html in context)
    except OSError as exc:
        failFlag = failFlag + 1
        IsWriteLog('writeToTxt failed:', TetDown, repr(exc))


def writeToLocal():
    """Write the collected links, plus the log when any step has failed."""
    writeToTxt(contextGF, str(failFlag))
    if failFlag != 0:
        writeToTxt(logList, 'log')


def main():
    """Crawl the site root, scrape every qualifying section, then save the results."""
    global failFlag
    contextGF.append(getTime())
    IsWriteLog(contextGF)
    try:
        beginRes = requests.get(title, timeout=REQUEST_TIMEOUT)
    except requests.RequestException as exc:
        # Without the index page there is nothing to crawl; still persist the log.
        failFlag = failFlag + 1
        IsWriteLog('index request failed:', title, repr(exc))
        writeToLocal()
        return
    contextGF.append(title)
    IsWriteLog('beginRes.status_code:', beginRes.status_code)  # HTTP status of the index page
    # re.S lets '.' span newlines so the markup is matched as one line.
    titleL = re.findall(r'class=""><a.*?href=.*?"(.*?)".*?target="_blank"', beginRes.text, re.S)
    IsWriteLog('titleL:', titleL)
    for shortUrl in titleL:
        # Only '/html...' sections are followed, and '/html/news' is skipped.
        if not shortUrl.startswith('/html') or shortUrl.startswith('/html/news'):
            continue
        IsWriteLog(shortUrl)
        secTitle = urljoin(title, shortUrl)
        IsWriteLog('secTitle:', secTitle)
        contextGF.append(secTitle)
        serchChildHtml(secTitle)
        time.sleep(5.5)  # pause 5.5 seconds between section pages
    writeToLocal()


if __name__ == "__main__":
    main()