一個爬取52破解的全部帖子地址的簡單爬蟲
阿新 • • 發佈:2018-03-28
軟件調試 ict print __main__ 逆向 慶典活動 exception requests 總頁數
# -*- coding:utf-8 -*-
"""Simple crawler that collects every thread title + URL from the 52pojie forum.

The user picks a forum section from a printed menu; every page of that
section is fetched and each thread's title and absolute link are appended
to a user-named text file.
"""
import requests
from bs4 import BeautifulSoup
import time


# Forum sections: one single-key dict per section, display name -> URL of
# the section's first page.  Index order matters: the menu number the user
# types is the index into this list.
title_list = [{'原創發布區': 'http://www.52pojie.cn/forum-2-1.html'},
              {'逆向資源區': 'http://www.52pojie.cn/forum-4-1.html'},
              {'脫殼破解區': 'http://www.52pojie.cn/forum-5-1.html'},
              {'動畫發布區': 'http://www.52pojie.cn/forum-6-1.html'},
              {'懸賞問答區': 'http://www.52pojie.cn/forum-8-1.html'},
              {'水漫金山': 'http://www.52pojie.cn/forum-10-1.html'},
              {'站點公告': 'http://www.52pojie.cn/forum-13-1.html'},
              {'精品軟件區': 'http://www.52pojie.cn/forum-16-1.html'},
              {'音樂視頻': 'http://www.52pojie.cn/forum-19-1.html'},
              {'編程語言區': 'http://www.52pojie.cn/forum-24-1.html'},
              {'申請專區': 'http://www.52pojie.cn/forum-25-1.html'},
              {'LCG Area': 'http://www.52pojie.cn/forum-28-1.html'},
              {'病毒分析區': 'http://www.52pojie.cn/forum-32-1.html'},
              {'周年慶典活動專區': 'https://www.52pojie.cn/forum-36-1.html'},
              {'招聘求職': 'http://www.52pojie.cn/forum-39-1.html'},
              {'病毒樣本區': 'http://www.52pojie.cn/forum-40-1.html'},
              {'安全工具區': 'http://www.52pojie.cn/forum-41-1.html'},
              {'電子書策劃制作區': 'http://www.52pojie.cn/forum-42-1.html'},
              {'Key|Patch|共享賬號': 'http://www.52pojie.cn/forum-44-1.html'},
              {'病毒救援區': 'http://www.52pojie.cn/forum-50-1.html'},
              {'影視推薦': 'http://www.52pojie.cn/forum-56-1.html'},
              {'LSG Area': 'http://www.52pojie.cn/forum-58-1.html'},
              {'軟件調試區': 'http://www.52pojie.cn/forum-59-1.html'},
              {'T恤活動作品區': 'http://www.52pojie.cn/forum-62-1.html'},
              {'移動安全區': 'http://www.52pojie.cn/forum-65-1.html'},
              {'福利經驗': 'http://www.52pojie.cn/forum-66-1.html'},
              {'2014CrackMe大賽': 'http://www.52pojie.cn/forum-67-1.html'},
              {'吾愛破解2016安全挑戰賽': 'http://www.52pojie.cn/forum-71-1.html'},
              {'站務處理': 'http://www.52pojie.cn/forum-72-1.html'}]


def get_html(url):
    """Fetch *url* and return the response body as text.

    Retries forever with a 10-second pause on any request failure, so the
    crawl survives transient network errors.  A per-request timeout keeps a
    dead connection from blocking the retry loop indefinitely.
    """
    while True:
        try:
            # timeout: without it a stalled connection would hang forever
            # and the retry loop could never fire.
            response = requests.get(url, timeout=10)
            return response.text
        except requests.RequestException:
            time.sleep(10)


def get_page(url):
    """Return the total number of pages in the forum section at *url*.

    Parses the 4th <label> on the section's first page; its <span> text is
    assumed to look like " / 123 頁", so characters [3:-2] are the count.
    (NOTE(review): slice offsets depend on the site's exact markup —
    confirm against a live page before relying on this.)
    """
    html = get_html(url)
    soup = BeautifulSoup(html, 'lxml')
    label_list = soup.find_all('label')
    page = int(label_list[3].span.string[3:-2])
    return page


def page_down(url):
    """Download every page of the section at *url*.

    Asks the user for an output filename, then appends each thread's title
    and absolute link (one per line) for every page of the section.
    """
    page = get_page(url)
    print("總頁數:" + str(page))
    txt = input("請輸入保存到的文件名(註意添加後綴):")
    for j in range(1, page + 1):
        print(("第" + str(j) + "頁下載中").center(40, "■"))
        # Section URLs end in "-1.html"; strip the last 7 chars and splice
        # in the current page number to build page j's URL.
        html = get_html(url[:-7] + '-' + str(j) + '.html')
        soup = BeautifulSoup(html, 'lxml')
        # Thread titles carry class "s xst" on this forum's list pages.
        a_list = soup.find_all('a', attrs={'class': 's xst'})
        # Open the file once per page (not once per link) and append.
        with open(txt, 'a+', encoding='utf-8') as f:
            for a in a_list:
                f.write(a.get_text())
                f.write('\n')
                f.write("https://www.52pojie.cn/" + a.attrs['href'])
                f.write('\n')
        print(("第" + str(j) + "頁下載完成").center(40, "■"))


def main():
    """Show the section menu, read a valid choice, and crawl that section."""
    # Print the menu two entries per row; `col` tracks the current column.
    # (The original named this counter `time`, shadowing the time module.)
    col = 0
    for i, title in enumerate(title_list):
        for key in title:
            if col == 1:
                print((str(i) + ':' + key).ljust(20))
                col = 0
            else:
                print((str(i) + ':' + key).ljust(20), end=" ")
                col += 1

    # Re-prompt until the input is an integer inside the menu range.
    while True:
        try:
            print()
            num = int(input('請輸入你要瀏覽的代號:'))
            if 0 <= num < len(title_list):
                break
            print('輸入有誤請重新輸入')
        except ValueError:
            # int() failed: not a number.
            print('輸入有誤請重新輸入')

    # Each entry is a single-key dict; iterate to reach its one URL.
    dict_t = title_list[num]
    for key in dict_t:
        print(dict_t[key])
        page_down(dict_t[key])


if __name__ == '__main__':
    main()
一個爬取52破解的全部帖子地址的簡單爬蟲