Python Crawler in Practice: Pansoso (盤搜搜)
阿新 · Published: 2018-05-21
Recently my company gave me a task: search for shared Baidu Netdisk files by keyword and download them.
After a few days of tinkering I wrote this simple demo; it has not been optimized any further.
The main idea: (1) crawl Pansoso (盤搜搜) for search results matching the keyword;
(2) parse the Pansoso detail page to get the URL that redirects to Baidu Netdisk;
(3) parse the Baidu Netdisk share page to get the real download URL, then download the file (a condensed sketch of the whole pipeline follows).
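Before the full listing, here is a rough sketch of how the three steps chain together, using the same function names the full code below defines. The keyword URL, the intermediate share link, and the save path are placeholders, and all error handling is omitted:

headers = get_UserAgent()                                                # random User-Agent
searchHtml = get_html('http://www.pansoso.com/zh/keyword', headers)      # step (1)
# ...pick a result from searchHtml and fetch its Pansoso detail page...
panUrl = get_redirects('http://to.pansoso.com/?a=to&url=...', headers)   # step (2)
panHtml = get_html(panUrl, headers)
yunData = getAll_contentForJs(panHtml, r'yunData.setData\({(.*?)}\)')
resp = json.loads(get_downLoad('{' + yunData[0] + '}'))                  # step (3)
save_file(resp['list'][0].get('dlink'), 'D://downLoad//file.zip')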
I originally wrote some IP-proxy handling, but free scraped proxies expire so quickly that they are basically unusable, so the demo below ships without it.
Without proxy rotation, however, your IP gets banned after downloading only a few files, so the code below is for learning purposes only.
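For anyone who wants to put the proxy handling back, a minimal sketch of what it could look like with requests follows. The pool addresses are placeholders, not working endpoints; in practice they would have to come from a paid proxy service:

import random
import requests

# Hypothetical proxy pool (placeholder addresses); free scraped proxies
# expire too fast, so a paid service would feed this list in practice.
PROXY_POOL = [
    'http://10.0.0.1:8080',
    'http://10.0.0.2:8080',
]

def get_with_proxy(url, headers, timeout=60):
    '''Fetch a URL through a randomly chosen proxy from the pool.'''
    proxy = random.choice(PROXY_POOL)
    # requests routes both http and https traffic through the chosen proxy
    proxies = {'http': proxy, 'https': proxy}
    return requests.get(url, headers=headers, proxies=proxies, timeout=timeout)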
# -*- coding=utf-8 -*-
import json
import os
import random
import re
import string
import threading
import time
import urllib

import requests
from lxml import etree
from urllib import request, parse


def get_UserAgent():
    '''Return a headers dict with a randomly chosen User-Agent.'''
    USER_AGENTS = [
        "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; AcooBrowser; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
        "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0; Acoo Browser; SLCC1; .NET CLR 2.0.50727; Media Center PC 5.0; .NET CLR 3.0.04506)",
        "Mozilla/4.0 (compatible; MSIE 7.0; AOL 9.5; AOLBuild 4337.35; Windows NT 5.1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
        "Mozilla/5.0 (Windows; U; MSIE 9.0; Windows NT 9.0; en-US)",
        "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 2.0.50727; Media Center PC 6.0)",
        "Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 1.0.3705; .NET CLR 1.1.4322)",
        "Mozilla/4.0 (compatible; MSIE 7.0b; Windows NT 5.2; .NET CLR 1.1.4322; .NET CLR 2.0.50727; InfoPath.2; .NET CLR 3.0.04506.30)",
        "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN) AppleWebKit/523.15 (KHTML, like Gecko, Safari/419.3) Arora/0.3 (Change: 287 c9dfb30)",
        "Mozilla/5.0 (X11; U; Linux; en-US) AppleWebKit/527+ (KHTML, like Gecko, Safari/419.3) Arora/0.6",
        "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.2pre) Gecko/20070215 K-Ninja/2.1.1",
        "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9) Gecko/20080705 Firefox/3.0 Kapiko/3.0",
        "Mozilla/5.0 (X11; Linux i686; U;) Gecko/20070322 Kazehakase/0.4.5",
        "Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.8) Gecko Fedora/1.9.0.8-1.fc10 Kazehakase/0.5.6",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_3) AppleWebKit/535.20 (KHTML, like Gecko) Chrome/19.0.1036.7 Safari/535.20",
        "Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; fr) Presto/2.9.168 Version/11.52",
    ]
    UserAgent = random.choice(USER_AGENTS)
    headers = {'User-Agent': UserAgent}
    return headers


def filterType(filename):
    '''Return the known file extension contained in filename, or '' if none.'''
    filter_type = ['.zip', '.pdf', '.doc', '.docx', '.xls', '.xlsx',
                   '.png', '.img', '.rar', '.txt']
    IsExist = ''
    if filename != '':
        for item in filter_type:
            if filename.find(item) != -1:
                IsExist = item
                break
    return IsExist


def save_file(downloadUrl, saveFilePath):
    '''File download, variant 1: requests with streaming.'''
    print('Downloading and saving file...')
    try:
        header_dict = get_UserAgent()
        with requests.get(downloadUrl, headers=header_dict,
                          timeout=6, stream=True) as web:
            print(web.status_code)
            # Write in binary mode to be safe and avoid encoding errors
            with open(saveFilePath, 'wb') as outfile:
                for chunk in web.iter_content(chunk_size=1024):
                    outfile.write(chunk)
        print('File download finished...')
    except Exception as ex:
        print(ex)


def save_file_retrieve(downloadUrl, saveFileName):
    '''File download, variant 2: urllib.request.urlretrieve with a progress hook.'''
    local = os.path.join('D://downLoad//', saveFileName)
    request.urlretrieve(downloadUrl, local, Schedule)


def Schedule(a, b, c):
    '''
    a: number of blocks downloaded so far
    b: size of one block
    c: total size of the remote file
    '''
    per = 100.0 * a * b / c
    if per > 100:
        per = 100
    print('%.2f%%' % per)


def get_file(downloadUrl, saveFilePath):
    '''File download, variant 3: plain urllib.request.urlopen.'''
    try:
        u = request.urlopen(downloadUrl)
        print('Downloading and saving file...')
        block_sz = 8192
        with open(saveFilePath, 'wb') as f:
            while True:
                buffer = u.read(block_sz)
                if buffer:
                    f.write(buffer)
                else:
                    break
        print('File download finished...')
    except urllib.error.HTTPError:
        # The URL matched but the file no longer exists
        print(downloadUrl, 'url file not found')
    except IOError as e:
        print(e)
def getAll_contentForJs(html, re_str):
    '''
    Extract data (e.g. the yunData block) from inline JS; returns the list of matches.
    html: page source
    re_str: regular expression to apply
    '''
    # e.g. re_str = r'yunData.setData\({(.*?)}\)'
    my_js = re.findall(re_str, html, re.S | re.M)
    return my_js


def getAll_contentFosXpath(html, myxpath):
    '''
    Extract the specified content from the page.
    html: page source
    myxpath: XPath expression
    '''
    myHtml = etree.HTML(html)
    mydata = myHtml.xpath(myxpath)
    return mydata


def get_postUrl(Jsparams):
    '''Build the URL of the POST request that resolves the real Baidu Netdisk download link.'''
    urlstr = 'https://pan.baidu.com/api/sharedownload?'
    params = json.loads(Jsparams)
    urlstr += 'sign=' + str(params.get('sign'))
    urlstr += '&timestamp=' + str(params.get('timestamp'))
    urlstr += '&bdstoken=' + str(params.get('bdstoken'))
    urlstr += '&channel=chunlei'
    urlstr += '&clienttype=0'
    urlstr += '&web=1'
    urlstr += '&app_id=250528'
    return urlstr


def get_postData(Jsparams):
    '''Build the body of the POST request that resolves the real download link.'''
    postdata = {}
    params = json.loads(Jsparams)
    postdata['encrypt'] = 0
    postdata['product'] = 'share'
    postdata['uk'] = str(params.get('uk'))
    postdata['primaryid'] = str(params.get('shareid'))
    postdata['fid_list'] = '[' + str(params['file_list']['list'][0].get('fs_id')) + ']'
    return postdata


def get_downLoad(Jsparams):
    '''Send the POST request and return the response containing the real download link.'''
    print('Sending POST request for the real download link...')
    try:
        header_dict = get_UserAgent()
        params = parse.urlencode(get_postData(Jsparams)).encode(encoding='UTF8')
        req = request.Request(url=get_postUrl(Jsparams), data=params,
                              headers=header_dict, method='POST')
        resp = request.urlopen(req)
        resp = resp.read().decode(encoding='utf-8')
        return resp
    except Exception as ex:
        print(ex)


def get_html(urlLink, headers):
    '''Fetch a page's HTML. Repeated requests from the same IP tend to time out.'''
    try:
        response = requests.get(url=urlLink, headers=headers, timeout=60)
        response.encoding = response.apparent_encoding
        if response.status_code == 200:
            return response.text
    except requests.exceptions.RequestException as e:
        print('RequestException! The bad Msg is %s' % e)
        return None
    except Exception as e:
        print('Unknown Errors! The bad Msg is %s ' % e)
        return None


def get_redirects(urlLink, headers):
    '''Request without following redirects and return the Location header.'''
    try:
        response = requests.get(url=urlLink, headers=headers,
                                timeout=60, allow_redirects=False)
        return response.headers['Location']
    except requests.exceptions.RequestException as e:
        print('RequestException! The bad Msg is %s' % e)
        return None
    except Exception as e:
        print('Unknown Errors! The bad Msg is %s ' % e)
        return None
def baiDuShare(bdUrl):
    try:
        print('Parsing the Pansoso detail page...')
        header_dict = get_UserAgent()
        shareHtml = get_html(bdUrl, header_dict)
        if shareHtml is not None:
            # Parse the page to get the Baidu Netdisk share URL
            # Name of the shared file
            share_file = getAll_contentFosXpath(
                shareHtml, '//*[@id="con"]/div/div[1]/h1')
            fileName = share_file[0].text
            # Size of the shared file (parsed but not used further)
            share_size = getAll_contentForJs(
                shareHtml, '<dd>文件大小:(.*?)MB</dd>')
            # Baidu Netdisk share link
            share_link = getAll_contentForJs(shareHtml, 'a=go&url=(.*?)&t=')
            share_url = 'http://to.pansoso.com/?a=to&url=' + share_link[0]
            panRedirects = get_redirects(share_url, header_dict)
            if panRedirects is not None:
                print(panRedirects)
                print(fileName)
                FirtHtml = get_html(panRedirects, header_dict)
                # File type of the shared file
                share_type = filterType(fileName)
                MyJS = getAll_contentForJs(
                    FirtHtml, r'yunData.setData\({(.*?)}\)')
                StrMyJS = '{' + MyJS[0] + '}'
                DownLink = json.loads(get_downLoad(StrMyJS))
                print(DownLink['list'][0].get('dlink'))
                # Some titles do not end with the extension, so strip it
                # wherever it appears and re-append it at the end
                save_file(DownLink['list'][0].get('dlink'),
                          'D://downLoad//' + str(fileName).replace(share_type, '') + share_type)
            else:
                print('Failed to resolve the Baidu Netdisk share')
        else:
            print('Failed to fetch the Pansoso detail page')
    except Exception as e:
        print('Unknown Errors! The bad Msg is %s ' % e)
        return None


if __name__ == '__main__':
    headers = get_UserAgent()  # customized request headers
    targeturl = 'http://www.pansoso.com'
    headers['Host'] = 'www.pansoso.com'
    headers['Accept-Language'] = 'zh-CN,zh;q=0.9'
    searchStr = input('Enter a keyword: ')
    searchUrl = 'http://www.pansoso.com/zh/%s' % searchStr
    searchUrl = request.quote(searchUrl, safe=string.printable)
    print('Searching netdisk shares for "%s": %s' % (searchStr, searchUrl))
    try:
        time.sleep(random.random() * 10)
        panSosoHtml = get_html(searchUrl, headers)
        if panSosoHtml is not None:
            panSosoTitle = getAll_contentFosXpath(
                panSosoHtml, '//div[@id="content"]/div[@class="pss"]/h2/a')
            baiduthreads = []
            for titleItem in panSosoTitle:
                # Keep only shares whose title contains the keyword and a known file type
                if filterType(titleItem.text) != '' and str(titleItem.text).find(searchStr) != -1:
                    print(targeturl + titleItem.attrib['href'])
                    Urlparam = targeturl + titleItem.attrib['href']
                    t = threading.Thread(target=baiDuShare, args=(Urlparam,))
                    baiduthreads.append(t)
            for s in baiduthreads:  # start the crawler threads, staggered by a random delay
                s.start()
                time.sleep(random.random() * 10)
            for e in baiduthreads:  # wait for all threads to finish
                e.join()
        else:
            print('Request failed')
    except Exception as e:
        print('Unknown Errors! The bad Msg is %s ' % e)