python爬蟲—練習題（re，request&BeautifulSoup,selenium）

阿新 • • 發佈：2019-02-17

一、使用正則獲取51job職位資訊

網頁分析（`python3.x`環境）

這裡寫圖片描述

import re                       #匯入re模組
import xlwt
import chardet
from urllib import request
import random

def getHtml(url):               # 獲取網頁內容
    USER_AGENTS = []            # 瀏覽器(末尾附瀏覽器)
    proxies = []                # 代理IP（末尾附IP）
    req = request.Request(url)  # 設定url地址 

    req.add_header('User-Agent', random.choice(USER_AGENTS))  # 隨機選取瀏覽器
    proxy_support = request.ProxyHandler({"http": random.choice(proxies)})  # 隨機選取IP地址
    opener = request.build_opener(proxy_support)  # 獲取網站訪問的物件
    request.install_opener(opener)
    res = request.urlopen(req)                    # 處理瀏覽器返回的物件 

    html = res.read()
    return html

def get_Datalist(page_number, jobname):
    # 網址分析
    URL = "https://search.51job.com/list/020000,000000,0000,00,9,99," \
           +urllib.parse.quote(jobname)+",2," + str(page_number) + ".html?lang=c&stype=&postchannel\
           =0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=\
           99&providesalary=99&lonlat=0%2C0&radius=-1&ord_field=0&confirmdate=\
           9&fromType=&dibiaoid=0&address=&line=&specialarea=00&from=&welfare=" 

    html = getHtml(URL)                                 # 傳入需要分析網頁
    code = chardet.detect(html)["encoding"]             # 獲取網頁編碼
    html = html.decode(code,'replace').encode("utf-8")  # 解編碼，轉成utf-8編碼
    # 設定正則表示式
    reg = re.compile(r'<p class="t1 ">.*?<a target="_blank" title="(.*?)" .*?'
                     r'<span class="t2"><a target="_blank" title="(.*?)" .*?'  
                     r'<span class="t3">(.*?)</span>.*?'
                     r'<span class="t4">(.*?)</span>.*?'
                     r'<span class="t5">(.*?)</span>', re.S)
    result = re.findall(reg, html.decode("utf8",'replace'))  #replace:替換非法字元
    return result

datalist = []                              # 全域性資料列表
def solve_data(page_number, jobname):      # 向全域性變數新增資料
    global datalist
    for k in range(int(page_number)):      # 設定頁數，迴圈獲取
        data = get_Datalist(k + 1, jobname)
        for i in data:
            datalist.append(i)

def save_Excel(jobname, filename):          # 設定儲存函式
    book = xlwt.Workbook(encoding="utf-8")  # 建立工作簿
    sheet = book.add_sheet("51job" + str(jobname) + "職位資訊")
    col = ('職位名', '公司名', '工作地點', '薪資', '釋出時間')
    for i in range(len(col)):
        sheet.write(0, i, col[i])
    for i in range(len(datalist)):         # 控制行
        for j in range(len(datalist[i])):  # 控制列
            sheet.write(i + 1, j, datalist[i][j])
    book.save(u'51job' + filename + u'職位資訊.xls')

def main(jobname, page_number, filename):
    solve_data(page_number, jobname)
    save_Excel(jobname, filename)

main(u"機器學習工程師", "2", u"機器學習職業1")  # 爬取職業，爬取多少頁碼，儲存檔名

二、使用正則爬取智聯招聘

import requests
import chardet
import xlwt
from urllib import request
import random
import re

def getHtml():    #獲取連結
    USER_AGENTS = [
        "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; AcooBrowser; \
        .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
        "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0; Acoo Browser; SLCC1;\
         .NET CLR 2.0.50727; Media Center PC 5.0; .NET CLR 3.0.04506)",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.11 (KHTML, like Gecko)\
         Chrome/20.0.1132.11 TaoBrowser/2.0 Safari/536.11",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) \
        Chrome/21.0.1180.71 Safari/537.1 LBBROWSER",
        "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732;\
         .NET4.0C; .NET4.0E; LBBROWSER)",
        "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.1; WOW64; Trident/5.0; \
        SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; \
        Media Center PC 6.0; .NET4.0C; .NET4.0E)"
    ]  # 瀏覽器
    proxies = [{"HTTP":"117.63.78.64:6666"},
               {"HTTPS":"114.225.169.215:53128"},
               {"HTTPS":"222.185.22.108:6666"}]  # 代理IP
    url = "https://sou.zhaopin.com/jobs/searchresult.ashx?jl=%E4%B8%8A%E6%B5%B7&kw=AI&p=1&isadv=0"
    r=requests.get(url,
                   headers={"User-Agent":random.choice(USER_AGENTS)},
                   proxies=random.choice(proxies))
    code = chardet.detect(r.content)["encoding"]
    return r.content.decode(code)

DataList = []
def Parser_HTML():
    html = getHtml()
    reg = re.compile(r'<td class="zwmc" style="width: 250px;">.*?（<b>AI</b>）(.*?)</a>.*?'
                     r'<td class="gsmc"><a href=".*?" target="_blank">(.*?)</a>.*?'
                     r'<td class="zwyx">(.*?)</td>.*?'
                     r'<td class="gzdd">(.*?)</td>.*?', re.S)
    result = re.findall(reg,html)
    for i in result:
        DataList.append(i)
    print(DataList)
    return DataList

def Save_Excel():
    wbk = xlwt.Workbook(encoding="utf-8")
    sheet1 = wbk.add_sheet("AI職位薪資資訊")
    field = ("職位名稱","公司名稱","職位薪資","工作地點")
    for i in range(len(field)):
        sheet1.write(0,i,field[i])
    for j in range(len(DataList)):
        for k in range(len(field)):
            sheet1.write(j+1,k,DataList[j][k])
    wbk.save("AI職業2.xls")

def main():
    Parser_HTML()
    Save_Excel()

main()

三、使用requests & BeautifulSoup 抓取豆瓣電影Top250

網頁分析（`Python3.x`環境）

這裡寫圖片描述

import requests
from bs4 import BeautifulSoup
import chardet
import re
import xlwt
import time

def getHtml(index):                   # 獲取某頁的內容
    USER_AGENTS = [ ... ]             # 瀏覽器列表
    proxies = [{"HTTP":"117.63.78.64:6666"},
               {"HTTPS":"114.225.169.215:53128"},
               {"HTTPS":"222.185.22.108:6666"}]  # 代理IP（臨時的）
    print('正在抓取第',index+1,'頁資訊')
    url = 'https://movie.douban.com/top250?start='+str(index*25)+'&filter='
    r = requests.get(url,
                   headers={"User-Agent":random.choice(USER_AGENTS)},
                   proxies=random.choice(proxies))
    code = chardet.detect(r.content)['encoding']
    return r.content.decode(code)

reg = re.compile('.*(\d{4}).*')       #獲取年份正則
def getData(n):
    datalist = []
    for step in range(n):
        global reg
        time.sleep(0.2)
        html = getHtml(step)
        soup = BeautifulSoup(html,'html.parser')
        parent = soup.find('div',attrs={'id':'content'})    #父節點
        lis = parent.find_all('li')                         #獲取所有li

        for li in lis:
            data = []
            film_name = li.find('div',attrs={'class':'hd'}).find('span').get_text()
            data.append(film_name)              #獲取電影名稱
            film_time_str = li.find('div',attrs={'class':'bd'}).find('p').get_text()
            film_time = re.findall(reg,film_time_str)[0]
            data.append(film_time)              # 獲取上映時間
            film_score = li.find('div',attrs={'class':'star'}).\
                         find_all('span')[1].get_text()
            data.append(film_score)             # 獲取電影評分
            person_number = li.find('div',attrs={'class':'star'}).\
                            find_all('span')[3].get_text()
            number = re.findall(re.compile('\d*'),person_number)[0]
            data.append(number)                 #獲取評價人數
            # 獲取 簡評,因為有個別沒有簡評標籤，所以加判斷
            if li.find('div',attrs={'class':'bd'}).\
                        find('p',attrs={'class':'quote'}):
                evaluate = li.find('div',attrs={'class':'bd'}).\
                           find('p',attrs{'class':'quote'}).find('span').get_text()
            else:
                evaluate = ''
            data.append(evaluate)
            datalist.append(data)               #存入datalist
    return datalist

def saveToExcel(n,fileName):                    #儲存到excel
    book = xlwt.Workbook()
    sheet = book.add_sheet('豆瓣電影Top250')
    data=getData(n)
    col = ('電影名稱','上映年份','電影評分','評分人數','電影簡評')
    for k,v in enumerate(col):                  #寫入首行
        sheet.write(0, k, v)
    for i,each in enumerate(data):              #寫入電影資料
        for j,value in enumerate(each):
            sheet.write(i+1, j, value)
    book.save(fileName)

saveToExcel(10,'豆瓣.xls')                      #設定獲取頁碼，命名
print('結束')

四、使用requests&BeautifulSoup爬取西刺代理網站

分析網頁`(python 3.x環境)`

這裡寫圖片描述

import requests
from bs4 import BeautifulSoup
import re
import chardet
import random
data_dic_http=[]
data_dic_https =[]
 # 定義函式獲取天數至少大於10天的代理IP,n代表獲取兩種代理IP至少分別為n個，預設值為5
def get_IP(n=5):
    userAgent = [……]                              # 瀏覽器列表
    proxies = [{"HTTP": "117.63.78.64:6666"},
               {"HTTPS": "222.185.22.108:6666"}]  # 代理IP（臨時的）
    url = "http://www.xicidaili.com/"
    r = requests.get(url,
                     headers={"User-Agent":random.choice(USER_AGENTS)},
                     proxies=random.choice(proxies))
    code = chardet.detect(r.content)["encoding"]
    html=r.content.decode(code)
    soup=BeautifulSoup(html,"html.parser")
    parentTable=soup.find("table",attrs={"id":"ip_list"})
    trs=parentTable.find_all("tr")
    for i in range(2):                      #刪除標題（兩行）
        trs.pop(0)
    for each in trs:                        #迴圈查詢
        # data_dic_http={"http":[]}
        # data_dic_https = {"https": []}
        if  each.find_all("td"):
            tds=each.find_all("td")
            reg=re.compile("(\d+)天")
            # print(tds[6])
            days=re.findall(reg,tds[6].string)
            if days:
                if tds[5].string=="HTTPS" and int(days[0])>=10:
                    data_dic_https.append(tds[1].string+":"+tds[2].string)
                elif tds[5].string=="HTTP" and int(days[0])>=10:
                    data_dic_http.append(tds[1].string + ":" + tds[2].string)
                else:
                    continue
        else:
            continue
        if len(data_dic_http)>=n and len(data_dic_http)>=n:
            break
    return data_dic_http,data_dic_https
http_list,https_list=get_IP(10) # 獲取兩個代理IP列表，且每個列表種至少10個10天以上的IP
print(http_list)
>>> ['222.185.22.247:6666', '123.134.87.136:61234', '14.118.255.8:6666', \
    '117.67.11.136:8118', '115.28.90.79:9001', '112.115.57.20:3128', \
    '123.57.217.208:3128', '222.185.22.247:6666', '123.134.87.136:61234', '14.118.255.8:6666']

print(https_list)
>>> ['115.204.25.93:6666', '120.78.78.141:8888', '1.196.161.172:9999', \
    '121.231.32.205:6666', '122.72.18.35:80', '101.37.79.125:3128', \
    '118.212.137.135:31288', '120.76.231.27:3128', '122.72.18.34:80', \
    '115.204.25.93:6666', '1.196.161.172:9999', '121.231.32.205:6666', \
    '122.72.18.35:80', '101.37.79.125:3128', '118.212.137.135:31288', \
    '120.76.231.27:3128', '122.72.18.34:80']

四、使用BeautifulSoup爬取美女圖片

這裡寫圖片描述

import requests
import chardet
from bs4 import BeautifulSoup
import random
from urllib import request
import os
import time

def getHtml(number):         #獲取連結
    USER_AGENTS = [
        "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; AcooBrowser; \
        .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
        "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0; Acoo Browser; SLCC1;\
         .NET CLR 2.0.50727; Media Center PC 5.0; .NET CLR 3.0.04506)",
        "Mozilla/4.0 (compatible; MSIE 7.0; AOL 9.5; AOLBuild 4337.35; Windows NT 5.1;\
         .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
        "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; \
        .NET4.0C; .NET4.0E)"
    ]                       # 瀏覽器
    proxies = [{"HTTP":"117.63.78.64:6666"},
               {"HTTPS":"114.225.169.215:53128"},
               {"HTTPS":"222.185.22.108:6666"}]  # 代理IP
    url = "http://www.27270.com/ent/meinvtupian/list_11_"+str(number+1)+".html"
    r=requests.get(url,
                   headers={"User-Agent":random.choice(USER_AGENTS)},
                   proxies=random.choice(proxies))
    code = chardet.detect(r.content)["encoding"]
    return r.content.decode(code)

imgList=[]                  # 全域性變數（用來儲存圖片資訊）
def Get_ImgData(pageNum):   # 獲取圖片資訊
    for k in range(pageNum):
        time.sleep(2)
        html = getHtml(pageNum)
        soup = BeautifulSoup(html, "html.parser")
        Parents = soup.find("div",{'class':"MeinvTuPianBox"})
        img = Parents.find_all("img")
        for i in img:
            imgList.append(i)
        return imgList

def SaveFile(name):           #儲存檔案
    if os.path.exists(name):  #建立資料夾
        os.rmdir("photos")
    else:
        os.mkdir(name)
    os.chdir(name)
    for img in imgList:
        image_name = img['alt']
        suffix = img['src']
        print("圖片名：",image_name)
        print("圖片連結：", suffix)
        request.urlretrieve(suffix,image_name+str('.jpg'))  #強制轉成jpg
    return

def main(pageNum,name):     #主函式
    Get_ImgData(pageNum)
    SaveFile(name)

main(2,'美女_01')           #獲取頁碼，檔名

五、使用selenium爬取網易雲音樂歌單

from selenium import webdriver
import xlwt

url = 'https://music.163.com/#/discover/toplist'
driver = webdriver.Chrome()
driver.get(url)
driver.maximize_window()
driver.switch_to.frame('contentFrame')

 # 切換視窗失敗的;先獲取frame物件，再切換
 # myframe = driver.find_elements_by_tag_name('contentFrame')
 # driver.switch_to.frame(myframe)

parents = driver.find_element_by_id("song-list-pre-cache")
table = parents.find_elements_by_tag_name("table")[0]
tbody = table.find_elements_by_tag_name("tbody")[0]
trs = tbody.find_elements_by_tag_name('tr')
SongList = []
for each in trs:
    song_Num = each.find_elements_by_tag_name("td")[0].text
    song_Name = each.find_elements_by_tag_name("td")[1].\
    find_elements_by_tag_name('b')[0].get_attribute('title')
    song_time = each.find_elements_by_tag_name("td")[2].text
    singer = each.find_elements_by_tag_name("td")[3].\
    find_elements_by_tag_name('div')[0].get_attribute('title')
    SongList.append([song_Num,song_Name,song_time,singer])
    #print(SongList)

book = xlwt.Workbook(encoding="utf-8")  # 建立工作簿
sheet = book.add_sheet("Netcloud_song")
col = ('排名', '歌名', '歌曲時長', '歌手')
for i in range(len(col)):
    sheet.write(0, i, col[i])
for i in range(len(SongList)):         # 控制行
    for j in range(len(SongList[i])):  # 控制列
        sheet.write(i + 1, j, SongList[i][j])
book.save(u'網易雲音樂.xls')

python爬蟲—練習題（re，request&BeautifulSoup,selenium）

一、使用正則獲取51job職位資訊網頁分析（python3.x環境） import re #匯入re模組 import xlwt import chardet from urllib

Python畫曲線圖（論文，報告等常用）

<pre name="code" class="python">在很多時候，例如寫論文，例如寫報告，例如做ppt，都需要花很多很多曲線圖，讓人家信服畢竟資料視覺化是人的本能。假如讀者您很不幸，像我一樣不會用matlab之類的東西畫圖或者沒辦法用matlab畫圖，那麼可以稍微關注一下p

servlet jsp傳參的理解（session，request如何使用）

servlet從jsp中得到提交的引數應用 session，request servlst中的doGet函式中寫的引數一般不變化 protected void doGet(HttpServletRequest request, HttpServletRes

Python爬蟲系列（一）：從零開始，安裝環境

tar 公司 pip nal 網頁解析目標 http caption 在上一個系列，我們學會使用rabbitmq。本來接著是把公司的celery分享出來，但是定睛一看，celery4.0已經不再支持Windows。公司也逐步放棄了服役多年的celery項目。恰好，公司找

Python爬蟲系列（四）（簡單）Dota排行榜爬取，並存入Excel表格

在編寫Python程式的時候，有很多庫供我們選擇，如urllib、requests，BeautifulSoup，lxml，正則表示式等等，使得我們在獲取網頁原始碼或者選擇元素的時候

正則表示式（re模組，匹配單個字元，匹配多個字元，匹配分組，python貪婪和非貪婪，r的作用）

re.match() 能夠匹配出以xxx開頭的字串匹配單個字元示例1： . #coding=utf-8 import re ret = re.match(".","M") print(ret.group()) ret = re.match("t.o","too") print

python爬蟲系列（3）：使用Selenium和BeautifulSoup獲取12306一個月內所有車次車票情況

首先針對標題說明一下，本次的獲取資料是指定出發地和目的地之間的車次，不是整個網站所有車次。在此操作之前，請確保自己的相關的庫都已經安裝完全，這裡可沒有教安裝庫的方法哦~~~~好的，往下走，這次的目標網頁是 https://kyfw.12306.cn/otn/leftTic

Python/ MySQL練習題（一）

姓名 insert avg when 並且記錄有效完全 cas Python/ MySQL練習題（一） 2、查詢“生物”課程比“物理”課程成績高的所有學生的學號 1 SELECT 2 * 3 FROM 4 ( 5 SELEC

Python 爬蟲 2 （轉）

規範 return python 爬蟲直接 htm str 保存 urn find 一，獲取整個頁面數據首先我們可以先獲取要下載圖片的整個頁面信息。 getjpg.py #coding=utf-8 import urllib def getHtml(url):

Python爬蟲系列（四）：Beautiful Soup解析HTML之把HTML轉成Python對象

調用 nor 結束版本現在 name屬性 data 官方文檔 get 在前幾篇文章，我們學會了如何獲取html文檔內容，就是從url下載網頁。今天開始，我們將討論如何將html轉成python對象，用python代碼對文檔進行分析。 (牛小妹在學校折騰了好幾天，也沒把h

Python爬蟲學習（一）

code time response utf path urllib quest ext .com Python訪問網頁主要使用包urllib 打開網頁使用 urllib.request.urlopen(url, data=None, [timeout, ]*, cafi

python學習——練習題（1）

二進制位機制三位數結果整除參考數字打印最大值 """ 題目：有四個數字：1、2、3、4，能組成多少個互不相同且無重復數字的三位數？各是多少？ """ import itertools def answer1(): """自己思考完成，一

python學習——練習題（2）

tle 浮點數參考 port space 100萬直接方法但是 """ 題目：企業發放的獎金根據利潤提成。利潤(I)低於或等於10萬元時，獎金可提10%；利潤高於10萬元，低於20萬元時，低於10萬元的部分按10%提成，高於10萬元的部分，可提成

python學習——練習題（4）

某年時間 ftime true 題目 error == highlight exc """ 題目：輸入某年某月某日，判斷這一天是這一年的第幾天？ """ import datetime import time from functools import reduce

python學習——練習題（8）

turn form () 開始 class 註意 {} 百分號 int """ 題目：輸出 9*9 乘法口訣表。 """ def answer1(): """ 自己用最普通的雙重循環來輸出 :return: """ print(

python學習——練習題（11）

div 習題 .com www pytho htm itl 就是兔子 """ 題目：古典問題：有一對兔子，從出生後第3個月起每個月都生一對兔子，小兔子長到第三個月後每個月又生一對兔子，假如兔子都不死，問每個月的兔子總數為多少？ 1 1 2 3 5 8 13 就是斐波

python學習——練習題（12）

fin ans 進行根據版權 set 如果分布 while """ 題目：判斷101-200之間有多少個素數，並輸出所有素數。質數（prime number）又稱素數，有無限個。質數定義為在大於1的自然數中，除了1和它本身以外不再有其他因數。 """ impo

Python爬蟲學習（1）

數據 bin des fin load 寫入 all pytho urlopen 接觸python不久，也在慕課網學習了一些python相關基礎，對於爬蟲初步認為是依靠一系列正則獲取目標內容數據於是參照著慕課網上的教學視頻，完成了我的第一個python爬蟲，雞凍 >

python基礎（集合，文件操作）

但是 upd 寫入 ren eight 無序必須分享圖片 iss 一.集合（set）集合：1.無序的，不重復的　　　2.他裏面的元素必須是可哈希的. int str bool () ，但是它本身是不可哈希的. 　　 3.集合不能更改裏面的元素　　 4.集合

Python爬蟲學習（3）

collect nbsp pri div time urlparse links ews 是否在慕課網學習並創建了一個簡單的爬蟲包，爬取百度百科相關詞條信息程序中會用到第三方解析包（BeautifulSoup4），Windows環境下安裝命令：pip install B

python爬蟲—練習題（re，request&BeautifulSoup,selenium）

一、使用 正則 獲取51job職位資訊

網頁分析（python3.x環境）

二、使用 正則 爬取智聯招聘

三、使用requests & BeautifulSoup 抓取 豆瓣電影Top250

網頁分析（Python3.x環境）

四、使用requests&BeautifulSoup爬取西刺代理網站

分析網頁(python 3.x環境)

四、使用BeautifulSoup爬取美女圖片

五、使用selenium爬取網易雲音樂歌單

相關推薦

一、使用正則獲取51job職位資訊

網頁分析（`python3.x`環境）

二、使用正則爬取智聯招聘

三、使用requests & BeautifulSoup 抓取豆瓣電影Top250

網頁分析（`Python3.x`環境）

分析網頁`(python 3.x環境)`