python3爬蟲-快速入門-爬取圖片和標題
阿新 • • 發佈:2018-07-16
瀏覽器 requests 大致 應用
直接上代碼,先來個爬取豆瓣圖片的,大致思路就是發送請求-得到響應數據-儲存數據,原理的話可以先看看這個
https://www.cnblogs.com/sss4/p/7809821.html
import os  # used to create the output folder
import requests  # sends HTTP requests and receives responses
from bs4 import BeautifulSoup  # parses the HTML responses


def GetHtmlText(url):
    """Fetch *url* and return the response body as text.

    Returns an empty string when the request fails for any reason so the
    caller can feed the result straight into BeautifulSoup.
    """
    try:
        r = requests.get(url, timeout=10)  # timeout so a dead server cannot hang the crawl
        r.raise_for_status()  # raise on HTTP 4xx/5xx status codes
        r.encoding = 'utf-8'  # force UTF-8 decoding of the body
        return r.text
    except requests.RequestException:  # network/HTTP errors only; a bare except would hide real bugs
        return ''


def main(pages):
    """Download the celebrity photos from douban, *pages* pages of 30 images each.

    Images are saved as 1.jpg, 2.jpg, ... into a '爬的圖片' folder under the
    current working directory.
    """
    filepath = os.path.join(os.getcwd(), '爬的圖片')  # os.path.join instead of hand-built '\\' paths
    if not os.path.exists(filepath):  # create the folder on first run
        os.makedirs(filepath)
    fnum = 1  # running counter used as the image file name
    for page in range(pages):
        # each listing page shows 30 photos, so the query offset is page * 30
        url = ('https://movie.douban.com/celebrity/1048000/photos/'
               '?type=C&start=' + str(page * 30) + '&sortby=like&size=a&subtype=a')
        html = GetHtmlText(url)
        soup = BeautifulSoup(html, 'html.parser')  # html.parser is the stdlib parser
        # the photo grid is the <ul class="poster-col3 clearfix"> element
        uls = soup.find_all('ul', class_='poster-col3 clearfix')
        for ul in uls:
            for img in ul.find_all('img'):  # every thumbnail in the grid
                imgurl = img['src']
                # .content is the raw (binary) image data
                imgcontent = requests.get(imgurl, timeout=10).content
                filename = str(fnum) + '.jpg'
                with open(os.path.join(filepath, filename), 'wb') as wf:  # write bytes
                    wf.write(imgcontent)
                fnum += 1


if __name__ == '__main__':
    main(9)
再來個爬取標題類的
import requests  # sends HTTP requests
from bs4 import BeautifulSoup  # parses the HTML response

url = "http://www.jianshu.com"
# Browser-like User-Agent header; without it the site rejects the request as a bot.
headers = {'User-Agent': 'SE 2.X MetaSr 1.0'}
page = requests.get(url=url, headers=headers, timeout=10)  # timeout so a dead server cannot hang
page_info = page.text
page_bf = BeautifulSoup(page_info, 'html.parser')  # html.parser is the stdlib parser
# print(page_bf.prettify())
# article links on the front page are <a class="title" href="...">
titles = page_bf.find_all('a', 'title')
for title in titles:
    print(title.string)  # the article title text
    # hrefs are relative, so prepend the site root to get a full URL
    print('http://www.jianshu.com' + title.get('href'))
# dump the same titles + URLs to a text file, one blank line between entries
with open(r"D:\untitled\爬蟲爬到的標題.txt", "w", encoding='utf-8') as file:
    for title in titles:
        file.write(title.string + '\n')
        file.write("http://www.jianshu.com" + title.get('href') + '\n\n')
這個是下載小說的---(別人的代碼)
from bs4 import BeautifulSoup  # parses the HTML responses
import requests, sys


class downloader(object):
    """Download the novel 一念永恒 chapter by chapter from biqukan.com."""

    def __init__(self):
        self.server = "http://www.biqukan.com/"        # site root, prepended to relative chapter links
        self.target = "http://www.biqukan.com/1_1094"  # table-of-contents page of the novel
        self.name = []   # chapter titles
        self.urls = []   # absolute chapter URLs
        self.nums = 0    # number of chapters

    def get_download_url(self):
        """Fill self.name / self.urls / self.nums from the table-of-contents page."""
        req = requests.get(url=self.target)
        html = req.text
        # pass the parser explicitly; BeautifulSoup(html) alone emits a warning
        div_bf = BeautifulSoup(html, 'html.parser')
        div = div_bf.find_all('div', class_='listmain')
        a_bf = BeautifulSoup(str(div[0]), 'html.parser')
        a = a_bf.find_all('a')
        # the first 15 links are the "latest chapters" box, not the real list -- skip them
        self.nums = len(a[15:])
        for each in a[15:]:
            self.name.append(each.string)
            self.urls.append(self.server + each.get('href'))

    def get_contents(self, target):
        """Return the body text of the chapter page at *target*."""
        req = requests.get(url=target)
        html = req.text
        bf = BeautifulSoup(html, 'html.parser')
        texts = bf.find_all('div', class_='showtxt')
        # the site indents paragraphs with 8 non-breaking spaces; turn them into blank lines
        texts = texts[0].text.replace('\xa0' * 8, '\n\n')
        return texts

    def writer(self, name, path, text):
        """Append one chapter (*name* heading followed by *text*) to the file at *path*."""
        with open(path, "a", encoding='utf-8') as f:
            f.write(name + '\n')
            f.writelines(text)
            f.write('\n\n')


dl = downloader()
dl.get_download_url()
print("開始下載")
for i in range(dl.nums):
    dl.writer(dl.name[i], '一念永恒.txt', dl.get_contents(dl.urls[i]))
    # i / dl.nums is a fraction; multiply by 100 so the printed "%" is a real percentage
    sys.stdout.write("  已下載:%.3f%%" % (i / dl.nums * 100) + '\r')
    sys.stdout.flush()
print('《一念永恒》下載完成')
python3爬蟲-快速入門-爬取圖片和標題