python爬蟲—練習題(re,request&BeautifulSoup,selenium)
阿新 • • 發佈:2019-02-17
一、使用 正則 獲取51job職位資訊
網頁分析(
python3.x
環境)
import re #匯入re模組 import xlwt import chardet from urllib import request import random def getHtml(url): # 獲取網頁內容 USER_AGENTS = [] # 瀏覽器(末尾附瀏覽器) proxies = [] # 代理IP(末尾附IP) req = request.Request(url) # 設定url地址
req.add_header('User-Agent', random.choice(USER_AGENTS)) # 隨機選取瀏覽器 proxy_support = request.ProxyHandler({"http": random.choice(proxies)}) # 隨機選取IP地址 opener = request.build_opener(proxy_support) # 獲取網站訪問的物件 request.install_opener(opener) res = request.urlopen(req) # 處理瀏覽器返回的物件html = res.read() return html def get_Datalist(page_number, jobname): # 網址分析 URL = "https://search.51job.com/list/020000,000000,0000,00,9,99," \ +urllib.parse.quote(jobname)+",2," + str(page_number) + ".html?lang=c&stype=&postchannel\ =0000&workyear=99&cotype=99°reefrom=99&jobterm=99&companysize=\ 99&providesalary=99&lonlat=0%2C0&radius=-1&ord_field=0&confirmdate=\ 9&fromType=&dibiaoid=0&address=&line=&specialarea=00&from=&welfare="html = getHtml(URL) # 傳入需要分析網頁 code = chardet.detect(html)["encoding"] # 獲取網頁編碼 html = html.decode(code,'replace').encode("utf-8") # 解編碼,轉成utf-8編碼 # 設定正則表示式 reg = re.compile(r'<p class="t1 ">.*?<a target="_blank" title="(.*?)" .*?' r'<span class="t2"><a target="_blank" title="(.*?)" .*?' r'<span class="t3">(.*?)</span>.*?' r'<span class="t4">(.*?)</span>.*?' r'<span class="t5">(.*?)</span>', re.S) result = re.findall(reg, html.decode("utf8",'replace')) #replace:替換非法字元 return result datalist = [] # 全域性資料列表 def solve_data(page_number, jobname): # 向全域性變數新增資料 global datalist for k in range(int(page_number)): # 設定頁數,迴圈獲取 data = get_Datalist(k + 1, jobname) for i in data: datalist.append(i) def save_Excel(jobname, filename): # 設定儲存函式 book = xlwt.Workbook(encoding="utf-8") # 建立工作簿 sheet = book.add_sheet("51job" + str(jobname) + "職位資訊") col = ('職位名', '公司名', '工作地點', '薪資', '釋出時間') for i in range(len(col)): sheet.write(0, i, col[i]) for i in range(len(datalist)): # 控制行 for j in range(len(datalist[i])): # 控制列 sheet.write(i + 1, j, datalist[i][j]) book.save(u'51job' + filename + u'職位資訊.xls') def main(jobname, page_number, filename): solve_data(page_number, jobname) save_Excel(jobname, filename) main(u"機器學習工程師", "2", u"機器學習職業1") # 爬取職業,爬取多少頁碼,儲存檔名
二、使用 正則 爬取智聯招聘
import requests
import chardet
import xlwt
from urllib import request
import random
import re
def getHtml(): #獲取連結
USER_AGENTS = [
"Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; AcooBrowser; \
.NET CLR 1.1.4322; .NET CLR 2.0.50727)",
"Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0; Acoo Browser; SLCC1;\
.NET CLR 2.0.50727; Media Center PC 5.0; .NET CLR 3.0.04506)",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.11 (KHTML, like Gecko)\
Chrome/20.0.1132.11 TaoBrowser/2.0 Safari/536.11",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) \
Chrome/21.0.1180.71 Safari/537.1 LBBROWSER",
"Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732;\
.NET4.0C; .NET4.0E; LBBROWSER)",
"Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.1; WOW64; Trident/5.0; \
SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; \
Media Center PC 6.0; .NET4.0C; .NET4.0E)"
] # 瀏覽器
proxies = [{"HTTP":"117.63.78.64:6666"},
{"HTTPS":"114.225.169.215:53128"},
{"HTTPS":"222.185.22.108:6666"}] # 代理IP
url = "https://sou.zhaopin.com/jobs/searchresult.ashx?jl=%E4%B8%8A%E6%B5%B7&kw=AI&p=1&isadv=0"
r=requests.get(url,
headers={"User-Agent":random.choice(USER_AGENTS)},
proxies=random.choice(proxies))
code = chardet.detect(r.content)["encoding"]
return r.content.decode(code)
DataList = []
def Parser_HTML():
html = getHtml()
reg = re.compile(r'<td class="zwmc" style="width: 250px;">.*?(<b>AI</b>)(.*?)</a>.*?'
r'<td class="gsmc"><a href=".*?" target="_blank">(.*?)</a>.*?'
r'<td class="zwyx">(.*?)</td>.*?'
r'<td class="gzdd">(.*?)</td>.*?', re.S)
result = re.findall(reg,html)
for i in result:
DataList.append(i)
print(DataList)
return DataList
def Save_Excel():
wbk = xlwt.Workbook(encoding="utf-8")
sheet1 = wbk.add_sheet("AI職位薪資資訊")
field = ("職位名稱","公司名稱","職位薪資","工作地點")
for i in range(len(field)):
sheet1.write(0,i,field[i])
for j in range(len(DataList)):
for k in range(len(field)):
sheet1.write(j+1,k,DataList[j][k])
wbk.save("AI職業2.xls")
def main():
Parser_HTML()
Save_Excel()
main()
三、使用requests & BeautifulSoup 抓取 豆瓣電影Top250
網頁分析(
Python3.x
環境)
import requests from bs4 import BeautifulSoup import chardet import re import xlwt import time def getHtml(index): # 獲取某頁的內容 USER_AGENTS = [ ... ] # 瀏覽器列表 proxies = [{"HTTP":"117.63.78.64:6666"}, {"HTTPS":"114.225.169.215:53128"}, {"HTTPS":"222.185.22.108:6666"}] # 代理IP(臨時的) print('正在抓取第',index+1,'頁資訊') url = 'https://movie.douban.com/top250?start='+str(index*25)+'&filter=' r = requests.get(url, headers={"User-Agent":random.choice(USER_AGENTS)}, proxies=random.choice(proxies)) code = chardet.detect(r.content)['encoding'] return r.content.decode(code) reg = re.compile('.*(\d{4}).*') #獲取年份正則 def getData(n): datalist = [] for step in range(n): global reg time.sleep(0.2) html = getHtml(step) soup = BeautifulSoup(html,'html.parser') parent = soup.find('div',attrs={'id':'content'}) #父節點 lis = parent.find_all('li') #獲取所有li for li in lis: data = [] film_name = li.find('div',attrs={'class':'hd'}).find('span').get_text() data.append(film_name) #獲取電影名稱 film_time_str = li.find('div',attrs={'class':'bd'}).find('p').get_text() film_time = re.findall(reg,film_time_str)[0] data.append(film_time) # 獲取上映時間 film_score = li.find('div',attrs={'class':'star'}).\ find_all('span')[1].get_text() data.append(film_score) # 獲取電影評分 person_number = li.find('div',attrs={'class':'star'}).\ find_all('span')[3].get_text() number = re.findall(re.compile('\d*'),person_number)[0] data.append(number) #獲取評價人數 # 獲取 簡評,因為有個別沒有簡評標籤,所以加判斷 if li.find('div',attrs={'class':'bd'}).\ find('p',attrs={'class':'quote'}): evaluate = li.find('div',attrs={'class':'bd'}).\ find('p',attrs{'class':'quote'}).find('span').get_text() else: evaluate = '' data.append(evaluate) datalist.append(data) #存入datalist return datalist def saveToExcel(n,fileName): #儲存到excel book = xlwt.Workbook() sheet = book.add_sheet('豆瓣電影Top250') data=getData(n) col = ('電影名稱','上映年份','電影評分','評分人數','電影簡評') for k,v in enumerate(col): #寫入首行 sheet.write(0, k, v) for i,each in enumerate(data): #寫入電影資料 for j,value in enumerate(each): sheet.write(i+1, j, value) book.save(fileName) saveToExcel(10,'豆瓣.xls') #設定獲取頁碼,命名 print('結束')
四、使用requests&BeautifulSoup爬取西刺代理網站
分析網頁
(python 3.x環境)
import requests from bs4 import BeautifulSoup import re import chardet import random data_dic_http=[] data_dic_https =[] # 定義函式獲取天數至少大於10天的代理IP,n代表獲取兩種代理IP至少分別為n個,預設值為5 def get_IP(n=5): userAgent = [……] # 瀏覽器列表 proxies = [{"HTTP": "117.63.78.64:6666"}, {"HTTPS": "222.185.22.108:6666"}] # 代理IP(臨時的) url = "http://www.xicidaili.com/" r = requests.get(url, headers={"User-Agent":random.choice(USER_AGENTS)}, proxies=random.choice(proxies)) code = chardet.detect(r.content)["encoding"] html=r.content.decode(code) soup=BeautifulSoup(html,"html.parser") parentTable=soup.find("table",attrs={"id":"ip_list"}) trs=parentTable.find_all("tr") for i in range(2): #刪除標題(兩行) trs.pop(0) for each in trs: #迴圈查詢 # data_dic_http={"http":[]} # data_dic_https = {"https": []} if each.find_all("td"): tds=each.find_all("td") reg=re.compile("(\d+)天") # print(tds[6]) days=re.findall(reg,tds[6].string) if days: if tds[5].string=="HTTPS" and int(days[0])>=10: data_dic_https.append(tds[1].string+":"+tds[2].string) elif tds[5].string=="HTTP" and int(days[0])>=10: data_dic_http.append(tds[1].string + ":" + tds[2].string) else: continue else: continue if len(data_dic_http)>=n and len(data_dic_http)>=n: break return data_dic_http,data_dic_https http_list,https_list=get_IP(10) # 獲取兩個代理IP列表,且每個列表種至少10個10天以上的IP print(http_list) >>> ['222.185.22.247:6666', '123.134.87.136:61234', '14.118.255.8:6666', \ '117.67.11.136:8118', '115.28.90.79:9001', '112.115.57.20:3128', \ '123.57.217.208:3128', '222.185.22.247:6666', '123.134.87.136:61234', '14.118.255.8:6666'] print(https_list) >>> ['115.204.25.93:6666', '120.78.78.141:8888', '1.196.161.172:9999', \ '121.231.32.205:6666', '122.72.18.35:80', '101.37.79.125:3128', \ '118.212.137.135:31288', '120.76.231.27:3128', '122.72.18.34:80', \ '115.204.25.93:6666', '1.196.161.172:9999', '121.231.32.205:6666', \ '122.72.18.35:80', '101.37.79.125:3128', '118.212.137.135:31288', \ '120.76.231.27:3128', '122.72.18.34:80']
四、使用BeautifulSoup爬取美女圖片
import requests
import chardet
from bs4 import BeautifulSoup
import random
from urllib import request
import os
import time
def getHtml(number): #獲取連結
USER_AGENTS = [
"Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; AcooBrowser; \
.NET CLR 1.1.4322; .NET CLR 2.0.50727)",
"Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0; Acoo Browser; SLCC1;\
.NET CLR 2.0.50727; Media Center PC 5.0; .NET CLR 3.0.04506)",
"Mozilla/4.0 (compatible; MSIE 7.0; AOL 9.5; AOLBuild 4337.35; Windows NT 5.1;\
.NET CLR 1.1.4322; .NET CLR 2.0.50727)",
"Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; \
.NET4.0C; .NET4.0E)"
] # 瀏覽器
proxies = [{"HTTP":"117.63.78.64:6666"},
{"HTTPS":"114.225.169.215:53128"},
{"HTTPS":"222.185.22.108:6666"}] # 代理IP
url = "http://www.27270.com/ent/meinvtupian/list_11_"+str(number+1)+".html"
r=requests.get(url,
headers={"User-Agent":random.choice(USER_AGENTS)},
proxies=random.choice(proxies))
code = chardet.detect(r.content)["encoding"]
return r.content.decode(code)
imgList=[] # 全域性變數(用來儲存圖片資訊)
def Get_ImgData(pageNum): # 獲取圖片資訊
for k in range(pageNum):
time.sleep(2)
html = getHtml(pageNum)
soup = BeautifulSoup(html, "html.parser")
Parents = soup.find("div",{'class':"MeinvTuPianBox"})
img = Parents.find_all("img")
for i in img:
imgList.append(i)
return imgList
def SaveFile(name): #儲存檔案
if os.path.exists(name): #建立資料夾
os.rmdir("photos")
else:
os.mkdir(name)
os.chdir(name)
for img in imgList:
image_name = img['alt']
suffix = img['src']
print("圖片名:",image_name)
print("圖片連結:", suffix)
request.urlretrieve(suffix,image_name+str('.jpg')) #強制轉成jpg
return
def main(pageNum,name): #主函式
Get_ImgData(pageNum)
SaveFile(name)
main(2,'美女_01') #獲取頁碼,檔名
五、使用selenium爬取網易雲音樂歌單
from selenium import webdriver
import xlwt
url = 'https://music.163.com/#/discover/toplist'
driver = webdriver.Chrome()
driver.get(url)
driver.maximize_window()
driver.switch_to.frame('contentFrame')
# 切換視窗失敗的;先獲取frame物件,再切換
# myframe = driver.find_elements_by_tag_name('contentFrame')
# driver.switch_to.frame(myframe)
parents = driver.find_element_by_id("song-list-pre-cache")
table = parents.find_elements_by_tag_name("table")[0]
tbody = table.find_elements_by_tag_name("tbody")[0]
trs = tbody.find_elements_by_tag_name('tr')
SongList = []
for each in trs:
song_Num = each.find_elements_by_tag_name("td")[0].text
song_Name = each.find_elements_by_tag_name("td")[1].\
find_elements_by_tag_name('b')[0].get_attribute('title')
song_time = each.find_elements_by_tag_name("td")[2].text
singer = each.find_elements_by_tag_name("td")[3].\
find_elements_by_tag_name('div')[0].get_attribute('title')
SongList.append([song_Num,song_Name,song_time,singer])
#print(SongList)
book = xlwt.Workbook(encoding="utf-8") # 建立工作簿
sheet = book.add_sheet("Netcloud_song")
col = ('排名', '歌名', '歌曲時長', '歌手')
for i in range(len(col)):
sheet.write(0, i, col[i])
for i in range(len(SongList)): # 控制行
for j in range(len(SongList[i])): # 控制列
sheet.write(i + 1, j, SongList[i][j])
book.save(u'網易雲音樂.xls')