Python crawler: scraping a novel, single process vs. multiprocessing (study notes)
阿新 · Published: 2019-02-04
1 Single process:
# -*- coding:UTF-8 -*-
from bs4 import BeautifulSoup
import requests


class downloader(object):
    """Download the novel 《一念永恆》 from the Biqukan (筆趣看) site."""

    def __init__(self):
        self.server = 'http://www.biqukan.com/'
        self.target = 'http://www.biqukan.com/1_1094/'
        self.names = []   # chapter titles
        self.urls = []    # chapter links
        self.nums = 0     # number of chapters

    def get_download_url(self):
        """Collect the chapter titles and download links from the table of contents."""
        html = requests.get(self.target).text
        bs = BeautifulSoup(html, 'html.parser')
        div = bs.find_all('div', class_='listmain')
        bs = BeautifulSoup(str(div[0]), 'html.parser')
        chapters = bs.find_all('a')
        self.nums = len(chapters[15:])   # the first 15 links are not real chapters
        # print(chapters)
        for each in chapters[15:]:
            self.names.append(each.string)
            self.urls.append(self.server + each.get('href'))

    def get_content(self, target):
        """Fetch one chapter page and return its cleaned-up text."""
        html = requests.get(url=target).text
        bs = BeautifulSoup(html, 'html.parser')
        div = bs.find_all('div', class_='showtxt')
        texts = div[0].text.replace('\xa0' * 8, '\n\n')
        return texts

    def writer(self):
        """Download every chapter in order and append it to the output file."""
        f = open('一念永恆.txt', 'a', encoding='utf-8')
        for i in range(self.nums):
            f.write(self.names[i])
            f.write('\n')
            f.writelines(self.get_content(self.urls[i]))
            f.write('\n\n')
            print(i)   # crude progress indicator
        f.close()


if __name__ == "__main__":
    dl = downloader()
    dl.get_download_url()
    dl.writer()
    # print('《一念永恆》 download started:')
    # print('《一念永恆》 download finished')
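Before running the full download, it can be worth checking that the 'showtxt' selector still matches a single chapter page. Below is a minimal sketch of such a check; the chapter URL is just the example that appears commented out in the multiprocessing version and may no longer exist, and the timeout value is an arbitrary choice.

# Quick sanity check of the 'showtxt' selector on one chapter page.
# The URL is only an example and may have changed on the site.
from bs4 import BeautifulSoup
import requests

url = 'http://www.biqukan.com/1_1094/5403177.html'
html = requests.get(url, timeout=10).text                # fail fast if the site is slow
div = BeautifulSoup(html, 'html.parser').find_all('div', class_='showtxt')
if div:
    text = div[0].text.replace('\xa0' * 8, '\n\n')       # same cleanup as get_content()
    print(text[:200])                                    # preview the first 200 characters
else:
    print('showtxt block not found; the page layout may have changed')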
2 Multiprocessing
Note: dl.names[i] is what BeautifulSoup returns for each.string, a bs4 NavigableString rather than a plain str, and passing it straight through the Pool's args failed for me (I'm not sure exactly why; most likely the NavigableString, which keeps a reference back to the parse tree, does not pickle cleanly when sent to the worker process), so it is wrapped in str().
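A quick way to see the type that is actually being passed (the exact pickling behaviour depends on the bs4 version, but converting to a plain str is always safe):

# each.string is a bs4 NavigableString, not a plain str, which is why str() is applied
# before handing the chapter name to the Pool.
from bs4 import BeautifulSoup

soup = BeautifulSoup('<a href="/1_1094/1.html">第一章</a>', 'html.parser')
name = soup.find('a').string
print(type(name))        # <class 'bs4.element.NavigableString'>
print(type(str(name)))   # <class 'str'>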
# -*- coding:UTF-8 -*-
from bs4 import BeautifulSoup
import requests
from multiprocessing import Pool

headers = {
    'Cookie': r'UM_distinctid=164fd71debe478-0aed9f594fffa9-3c604504-1fa400-164fd71debf4c2; bcolor=; font=; size=; fontcolor=; width=; CNZZDATA1260938422=2084872231-1533260238-%7C1533551125',
    'Host': 'www.biqukan.com',
    # 'Referer': r'http://www.biqukan.com/1_1094/',
    'User-Agent': r'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'
}


class downloader(object):

    def __init__(self):
        self.server = 'http://www.biqukan.com/'
        self.target = 'http://www.biqukan.com/1_1094/'
        self.names = []   # chapter titles
        self.urls = []    # chapter links
        self.nums = 0     # number of chapters

    def get_download_url(self):
        req = requests.get(url=self.target, headers=headers)
        html = req.text
        div_bf = BeautifulSoup(html, "html.parser")
        div = div_bf.find_all('div', class_='listmain')
        a_bf = BeautifulSoup(str(div[0]), "html.parser")
        a = a_bf.find_all('a')
        self.nums = len(a[15:])   # drop the non-chapter links and count the chapters
        for each in a[15:]:
            self.names.append(each.string)
            self.urls.append(self.server + each.get('href'))


def get_contents(target):
    """Fetch one chapter page and return its cleaned-up text (runs in a worker process)."""
    req = requests.get(url=target)
    html = req.text
    bf = BeautifulSoup(html, "html.parser")
    texts = bf.find_all('div', class_='showtxt')
    texts = texts[0].text.replace('\xa0' * 8, '\n\n')
    return texts


def writer(name, path, url):
    """Download one chapter and append it to the output file."""
    ans = get_contents(url)
    with open(path, 'a', encoding='utf-8') as f:
        f.write(name + '\n')
        f.writelines(ans)
        f.write('\n\n')


if __name__ == "__main__":
    pool = Pool(15)
    dl = downloader()
    dl.get_download_url()
    print('《一念永恆》 download started:')
    for i in range(dl.nums):
        # print('1', get_contents('http://www.biqukan.com/1_1094/5403177.html'))
        pool.apply_async(writer, args=(str(dl.names[i]), '一念永恆.txt', dl.urls[i]))
    pool.close()
    pool.join()
    print('《一念永恆》 download finished')
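One caveat with this version: the apply_async workers each append to the same file as soon as they finish, so chapters can end up in the file out of order, and concurrent appends from 15 processes may interleave. A possible alternative, sketched below under the assumption that downloader and get_contents from the code above are available in the same module, is to let the workers only fetch text and have the parent process write sequentially; fetch_chapter is a hypothetical helper name.

# Sketch: keep chapter order by downloading in workers but writing in the parent.
# Assumes downloader and get_contents from the code above are defined in this module;
# fetch_chapter is a hypothetical helper added for this sketch.
from multiprocessing import Pool

def fetch_chapter(args):
    name, url = args
    return name, get_contents(url)   # runs in a worker process

if __name__ == "__main__":
    dl = downloader()
    dl.get_download_url()
    with Pool(15) as pool:
        # imap yields results in input order, so chapters are written in sequence
        results = pool.imap(fetch_chapter, zip(map(str, dl.names), dl.urls))
        with open('一念永恆.txt', 'a', encoding='utf-8') as f:
            for name, text in results:
                f.write(name + '\n' + text + '\n\n')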