python爬蟲下載網站磁力連結
阿新 • 發佈:2019-02-03
設計分三步走:
1.獲取明星列表地址
2.獲取明星作品序列號
3.根據作品序列號查詢磁力連結
一、獲取網站中明星列表的作品集地址
二、獲取明星作品的番號#coding=utf8 import requests import re import xlrd import xlwt import time from bs4 import BeautifulSoup #新建excel表格用於儲存資料 myfile=xlwt.Workbook() table=myfile.add_sheet(u"資訊",cell_overwrite_ok=True) table.write(0,0,u"名字") table.write(0,1,u"連結") user_agent = 'Mozilla/5.0 (Windows; U; Windows NT 5.2) AppleWebKit/525.13 (KHTML, like Gecko) Chrome/0.2.149.27 ' headers = { 'User-Agent' : user_agent } class geturl(): def __init__(self,page): self.page = page def get_url(self): for p in range(1,self.page+1): url = 'https://avso.pw/cn/actresses/page/'+str(p) r = requests.get(url,headers=headers) html = r.text #print html soup = BeautifulSoup(html) i = (p-1)*50 + 1 for tag in soup.find_all(href=re.compile("https://avso.pw/cn/star")): #print tag.attrs['href'] table.write(i,1,tag.attrs['href']) i += 1 j = (p-1)*50 +1 for tag in soup.find_all(class_='photo-info'): for gg in tag.find_all('span'): #print gg.string table.write(j,0,gg.string) j += 1 print u"完成讀取第%s頁資訊"%p test = geturl(2) test.get_url() filename=str(time.strftime('%Y%m%d%H%M%S',time.localtime()))+"url.xlsx" myfile.save(filename) print u"完成%s的url備份"%time.strftime('%Y%m%d%H%M%S',time.localtime())
#coding=utf8 import requests import re import xlrd import xlwt import time import ConfigParser from bs4 import BeautifulSoup user_agent = 'Mozilla/5.0 (Windows; U; Windows NT 5.2) AppleWebKit/525.13 (KHTML, like Gecko) Chrome/0.2.149.27 ' headers = { 'User-Agent' : user_agent } myfile=xlwt.Workbook() wtable=myfile.add_sheet(u"資訊",cell_overwrite_ok=True) wtable.write(0,0,u"名字") wtable.write(0,1,u"連結") wtable.write(0,2,u"番號") class getserial(): def get_serial(self): data = xlrd.open_workbook('url.xls') table = data.sheets()[0] nrows = table.nrows for j in range(nrows): try: cf = ConfigParser.ConfigParser() cf.read("liao.ini") p = cf.getint('num','p') if j == 0: continue else: url = table.cell(j,1).value r = requests.get(url,headers=headers) html = r.text soup = BeautifulSoup(html) i = 0 for tag in soup.find_all('date'): if i%2 == 0: #print tag.string wtable.write(p,2,tag.string) wtable.write(p,0,table.cell(j,0).value) wtable.write(p,1,table.cell(j,1).value) p += 1 i+=1 print j cf.set("num", "p", p) cf.write(open("liao.ini", "w")) except: filename=str(time.strftime('%Y%m%d%H%M%S',time.localtime()))+"serial.xlsx" myfile.save(filename) print u"出現異常自動儲存%s的番號備份"%time.strftime('%Y%m%d%H%M%S',time.localtime()) test = getserial() test.get_serial() filename=str(time.strftime('%Y%m%d%H%M%S',time.localtime()))+"serial.xlsx" myfile.save(filename) print u"完成%s的番號備份"%time.strftime('%Y%m%d%H%M%S',time.localtime())
三、根據番號查詢對應的磁力連結
#coding=utf8 import requests import re import xlrd import xlwt import time import ConfigParser import threading from bs4 import BeautifulSoup user_agent = 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.87 Safari/537.36' headers = { 'Accept':'text/css,*/*;q=0.1', 'Accept-Encoding':'gzip, deflate, sdch, br', 'Accept-Language':'zh-CN,zh;q=0.8', 'Cache-Control':'max-age=0', 'Connection':'keep-alive', 'User-Agent' : user_agent , } class getlink(): def get_link(self,conf,excel): myfile=xlwt.Workbook() wtable=myfile.add_sheet(u"資訊",cell_overwrite_ok=True) wtable.write(0,0,u"名字") wtable.write(0,1,u"番號") wtable.write(0,2,u"檔案大小") wtable.write(0,3,u"檔案更新日期") wtable.write(0,4,u"連結") wtable.write(0,5,u"磁力連結") data = xlrd.open_workbook(excel) table = data.sheets()[0] nrows = table.nrows for j in range(nrows): try: cf = ConfigParser.ConfigParser() cf.read(conf) p = cf.getint('num','p') if j == 0: continue else: serial = table.cell(j,2).value url = 'https://btso.pw/search/' + serial #print url r = requests.get(url,headers=headers,timeout=30) html = r.text #print html soup = BeautifulSoup(html) for tag in soup.find_all('div',class_='row'): for gg in tag.find_all(class_='col-sm-2 col-lg-1 hidden-xs text-right size'): print gg.string wtable.write(p,0,table.cell(j,0).value) wtable.write(p,1,table.cell(j,2).value) wtable.write(p,2,gg.string) for aa in tag.find_all(class_='col-sm-2 col-lg-2 hidden-xs text-right date'): print aa.string wtable.write(p,3,aa.string) for xx in tag.find_all(href=re.compile("https://btso.pw/magnet/detail/hash")): print xx.attrs['href'] wtable.write(p,4,xx.attrs['href']) r1 = requests.get(xx.attrs['href'],headers=headers,timeout=30) html1 = r1.text #print html1 soup1 = BeautifulSoup(html1) for tag1 in soup1.find_all('textarea',id='magnetLink'): print tag1.string wtable.write(p,5,tag1.string) p += 1 cf.set("num", "p", p) cf.write(open(conf, "w")) except: 
filename=str(time.strftime('%Y%m%d%H%M%S',time.localtime()))+"link.xls" myfile.save(filename) print u"出現異常自動儲存%s的磁力連結備份"%time.strftime('%Y%m%d%H%M%S',time.localtime()) filename=str(time.strftime('%Y%m%d%H%M%S',time.localtime()))+"link.xls" myfile.save(filename) print u"自動儲存%s的磁力連結備份"%time.strftime('%Y%m%d%H%M%S',time.localtime()) if __name__ == '__main__': test = getlink() threads = [] t1 = threading.Thread(target=test.get_link,args=('link1.ini','serial1.xls',)) threads.append(t1) t2 = threading.Thread(target=test.get_link,args=('link2.ini','serial2.xls',)) threads.append(t2) t3 = threading.Thread(target=test.get_link,args=('link3.ini','serial3.xls',)) threads.append(t3) t4 = threading.Thread(target=test.get_link,args=('link4.ini','serial4.xls',)) threads.append(t4) t5 = threading.Thread(target=test.get_link,args=('link5.ini','serial5.xls',)) threads.append(t5) t6 = threading.Thread(target=test.get_link,args=('link6.ini','serial6.xls',)) threads.append(t6) for t in threads: t.setDaemon(True) t.start() t.join() print u"完成所有程序"
磁力連結丟到迅雷就可以下載了。
看看最後的excel: