Python Basics: Web Scraping, Day 03
阿新 · Published: 2018-12-17
day02
1. Regex parsing
- Grouping (wrap whatever you want to capture in parentheses ())
- Regex methods
    p = re.compile('...')
    r_list = p.findall(html)
    Result: [(), (), (), ()]
- Greedy match: .*
- Non-greedy match: .*?
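A quick sketch of grouping and the difference between greedy and non-greedy matching; the HTML snippet here is made up purely for illustration.

# Minimal sketch: grouping plus greedy vs. non-greedy matching.
import re

html = '<p class="name">Tom</p><p class="name">Jerry</p>'

# Greedy .* runs to the last </p>, so one over-long match is captured
greedy = re.findall(r'<p class="name">(.*)</p>', html)
print(greedy)       # ['Tom</p><p class="name">Jerry']

# Non-greedy .*? stops at the first </p>, capturing each name separately
non_greedy = re.compile(r'<p class="name">(.*?)</p>').findall(html)
print(non_greedy)   # ['Tom', 'Jerry']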
2. Scraping workflow
- Find the URL
- Write the regular expression
- Define a class and sketch the program structure
- Fill in the code (a minimal skeleton follows this list)
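A minimal skeleton following the four steps above; the class name, URL, and regex are placeholders for illustration, not the course's actual code.

# Skeleton of the workflow: URL -> regex -> class frame -> fill in the code.
import re
import requests

class DemoSpider:
    def __init__(self):
        self.url = "http://example.com/"                          # step 1: find the URL
        self.pattern = re.compile(r'<title>(.*?)</title>', re.S)  # step 2: write the regex
        self.headers = {"User-Agent": "Mozilla/5.0"}

    def get_page(self):                                           # step 3: program frame
        html = requests.get(self.url, headers=self.headers).text
        return self.parse_page(html)

    def parse_page(self, html):                                   # step 4: fill in the code
        return self.pattern.findall(html)

if __name__ == "__main__":
    print(DemoSpider().get_page())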
3. Saving to a CSV file (a complete example follows below)
- import csv
    with open('xxx.csv', 'a', newline="", encoding="") as f:
        writer = csv.writer(f)
        writer.writerow([..., ..., ...])
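A runnable version of the template above; the file name, the utf-8 encoding, and the sample row are assumptions for the example (the row reuses the Maoyan sample data that appears later in these notes).

# Complete CSV example; file name, encoding and row are assumed values.
import csv

with open("maoyan.csv", "a", newline="", encoding="utf-8") as f:
    writer = csv.writer(f)
    writer.writerow(["霸王別姬", "張國榮", "1994-01-01"])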
4. Commonly used Fiddler panels
- Inspector: shows both the request and the response
- Frequently used tabs
    - Headers
    - WebForms
    - Raw: the request as plain text
5. Cookies and sessions
- cookie: stored on the client
- session: stored on the web server
6. Request methods
- GET
- POST
- Cookie-based login simulation
    - Log in successfully once and capture the Cookie with a packet-capture tool
    - Turn the Request Headers (including the Cookie) into a dict and send it as a parameter with the request (see the sketch after this list)
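A minimal sketch of that flow, assuming placeholder values: the target URL and the Cookie string below stand in for whatever the packet-capture tool actually recorded.

# Sketch of reusing a captured login Cookie; URL and header values are placeholders.
import requests

url = "http://www.example.com/profile"      # assumed page that requires login
headers = {
    "User-Agent": "Mozilla/5.0",
    "Cookie": "paste-the-captured-cookie-string-here",
}

res = requests.get(url, headers=headers)
res.encoding = "utf-8"
print(res.text)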
7. Installing modules
- Anaconda Prompt: conda install <module name>
- Windows cmd: python -m pip install <module name>
8. The requests module
- get(url, params=params, headers=headers)
    params: query parameters as a dict; no encoding or manual URL concatenation needed
- post(url, data=data, headers=headers)
    data: form data as a dict; no encoding or conversion needed
- Response object attributes (see the example at the end of this section)
    - encoding: response character encoding, e.g. res.encoding = 'utf-8'
    - text: body as a string
    - content: body as bytes
    - status_code: HTTP status code
    - url: the URL that actually returned the data
- Saving unstructured data
    html = res.content
    with open("XXX", "wb") as f:
        f.write(html)
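As referenced above, a small sketch of passing query parameters and reading the response attributes; httpbin.org (already used later in these notes) simply echoes the request, and the query dict is made up for the example.

# requests.get() with a query-parameter dict plus the response attributes listed above.
import requests

url = "http://httpbin.org/get"
params = {"wd": "python", "pn": "1"}        # dict; no manual URL-encoding needed
headers = {"User-Agent": "Mozilla/5.0"}

res = requests.get(url, params=params, headers=headers)
res.encoding = "utf-8"
print(res.status_code)   # HTTP status code, e.g. 200
print(res.url)           # URL actually requested, with ?wd=python&pn=1 appended
print(res.text)          # body as str; res.content would be the raw bytes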
day03
1. The requests module
- Proxies (parameter name: proxies)
- Sites that list proxy IPs
    西刺代理
    快代理
    全網代理
- Regular proxy
    - proxies = {'protocol': 'protocol://IP:port'}
      proxies = {'http': 'http://203.86.26.9:3128'}

'''01_普通代理示例.py'''
import requests

url = "http://www.baidu.com/"
proxies = {"http": "http://183.129.207.82:11597"}
headers = {"User-Agent": "Mozilla/5.0"}

res = requests.get(url, proxies=proxies, headers=headers)
print(res.status_code)
- Private proxy
    proxies = {"http": "http://309435365:[email protected]:16817"}

'''02_私密代理示例.py'''
import requests

url = "http://httpbin.org/get"
headers = {"User-Agent": "Mozilla/5.0"}
proxies = {"http": "http://309435365:[email protected]:16817"}

res = requests.get(url, proxies=proxies, headers=headers)
res.encoding = "utf-8"
print(res.text)
Review examples for pymysql and pymongo:

'''Create a MySQL database spiderdb, create table t1, and insert records'''
import pymysql
import warnings

# create the database connection object
db = pymysql.connect("localhost", "root", "123456", charset="utf8")
# create the cursor object
cursor = db.cursor()
# execute the statements, suppressing warnings
warnings.filterwarnings("ignore")
try:
    cursor.execute("create database if not exists spiderdb")
    cursor.execute("use spiderdb")
    cursor.execute("create table if not exists t1(id int)")
except Warning:
    pass

ins = "insert into t1 values(%s)"
cursor.execute(ins, [1])
cursor.execute(ins, [2])
# commit
db.commit()
# close
cursor.close()
db.close()

'''04_pymongo回顧.py'''
import pymongo

# create the connection object
conn = pymongo.MongoClient("localhost", 27017)
# create the database object; spiderdb is the database name
db = conn.spiderdb
# use the database object to create a collection object
myset = db.t1
# insert a document
myset.insert({"name": "Tom"})

Check the result in the mongo shell:
show dbs
use spiderdb
show tables
db.t1.find().pretty()
- Case 1: scrape Lianjia second-hand housing listings and store them in MySQL
    - Regex: '<div class="houseInfo".*?data-el="region">(.*?)</a>.*?<div class="totalPrice">.*?<span>(.*?)</span>(.*?)</div>', re.S
    - Write the code
Inserting into MongoDB:

'''05_鏈家資料ToMongo.py'''
import requests
import re
import pymongo

class LianjiaSpider:
    def __init__(self):
        self.baseurl = "https://bj.lianjia.com/ershoufang/pg"
        self.page = 1
        self.headers = {"User-Agent": "Mozilla/5.0"}
        self.proxies = {"http": "http://309435365:[email protected]:16817"}
        self.conn = pymongo.MongoClient("localhost", 27017)
        self.db = self.conn.Lianjia
        self.myset = self.db.housePrice

    def getPage(self, url):
        res = requests.get(url, proxies=self.proxies, headers=self.headers, timeout=5)
        res.encoding = "utf-8"
        html = res.text
        print("Page fetched, parsing...")
        self.parsePage(html)

    def parsePage(self, html):
        p = re.compile('<div class="houseInfo".*?data-el="region">(.*?)</a>.*?<div class="totalPrice">.*?<span>(.*?)</span>(.*?)</div>', re.S)
        r_list = p.findall(html)
        # [("天通苑","480","萬"), () ...]
        print("Page parsed, saving to the database...")
        self.writeTomongo(r_list)

    def writeTomongo(self, r_list):
        for r_tuple in r_list:
            D = {"houseName": r_tuple[0].strip(),
                 "totalPrice": float(r_tuple[1].strip()) * 10000}
            self.myset.insert(D)
        print("Saved to the database")

    def workOn(self):
        while True:
            c = input("Press y to scrape (q to quit): ")
            if c.strip().lower() == "y":
                url = self.baseurl + str(self.page) + "/"
                self.getPage(url)
                self.page += 1
            else:
                print("Scraping finished, thanks for using!")
                break

if __name__ == "__main__":
    spider = LianjiaSpider()
    spider.workOn()
Inserting into MySQL:

'''05_鏈家資料ToMongo.py'''
import requests
import re
import pymysql
import warnings

class LianjiaSpider:
    def __init__(self):
        self.baseurl = "https://bj.lianjia.com/ershoufang/pg"
        self.page = 1
        self.headers = {"User-Agent": "Mozilla/5.0"}
        self.proxies = {"http": "http://309435365:[email protected]:16817"}
        self.db = pymysql.connect("localhost", "root", "123456", charset="utf8")
        self.cursor = self.db.cursor()

    def getPage(self, url):
        res = requests.get(url, proxies=self.proxies, headers=self.headers, timeout=5)
        res.encoding = "utf-8"
        html = res.text
        print("Page fetched, parsing...")
        self.parsePage(html)

    def parsePage(self, html):
        p = re.compile('<div class="houseInfo".*?data-el="region">(.*?)</a>.*?<div class="totalPrice">.*?<span>(.*?)</span>(.*?)</div>', re.S)
        r_list = p.findall(html)
        # [("天通苑","480","萬"), () ...]
        print("Page parsed, saving to the database...")
        self.writeTomysql(r_list)

    def writeTomysql(self, r_list):
        c_db = "create database if not exists Lianjiadb character set utf8"
        u_db = "use Lianjiadb"
        c_tab = "create table if not exists housePrice( \
                 id int primary key auto_increment, \
                 housename varchar(50), \
                 totalprice int)charset=utf8"
        # suppress warnings
        warnings.filterwarnings("ignore")
        try:
            self.cursor.execute(c_db)
            self.cursor.execute(u_db)
            self.cursor.execute(c_tab)
        except Warning:
            pass

        ins = "insert into housePrice(housename,totalprice) values(%s,%s)"
        for r_tuple in r_list:
            name = r_tuple[0].strip()
            price = float(r_tuple[1].strip()) * 10000
            L = [name, price]
            self.cursor.execute(ins, L)
            self.db.commit()
        print("Saved to the database")

    def workOn(self):
        while True:
            c = input("Press y to scrape (q to quit): ")
            if c.strip().lower() == "y":
                url = self.baseurl + str(self.page) + "/"
                self.getPage(url)
                self.page += 1
            else:
                self.cursor.close()
                self.db.close()
                print("Scraping finished, thanks for using!")
                break

if __name__ == "__main__":
    spider = LianjiaSpider()
    spider.workOn()
Following up on day02: storing the Maoyan Top 100 movies in MySQL.

'''06_貓眼電影top100抓取.py'''
import requests
import re
import pymysql
import warnings

class MaoyanSpider:
    def __init__(self):
        self.baseurl = "http://maoyan.com/board/4?offset="
        self.headers = {"User-Agent": "Mozilla/5.0"}
        self.page = 1
        self.offset = 0
        self.proxies = {"http": "http://309435365:[email protected]:16817"}
        self.db = pymysql.connect("localhost", "root", "123456", "Lianjiadb", charset="utf8")
        self.cursor = self.db.cursor()

    # download the page
    def loadPage(self, url):
        res = requests.get(url, headers=self.headers)
        res.encoding = "utf-8"
        html = res.text
        self.parsePage(html)

    # parse the page
    def parsePage(self, html):
        p = re.compile('<div class="movie-item-info">.*?title="(.*?)".*?<p class="star">(.*?)</p>.*?releasetime">(.*?)</p>', re.S)
        r_list = p.findall(html)
        # print(r_list)
        # [("霸王別姬","張國榮","1994-01-01"), (), () ...]
        self.writeTomysql(r_list)

    def writeTomysql(self, r_list):
        c_tab = "create table if not exists top100( \
                 id int primary key auto_increment, \
                 name varchar(50), \
                 star varchar(100), \
                 releasetime varchar(50) \
                 )charset=utf8"
        ins = "insert into top100(name,star,releasetime) values(%s,%s,%s)"
        # suppress warnings
        warnings.filterwarnings("ignore")
        try:
            self.cursor.execute(c_tab)
        except Warning:
            pass

        for r_tuple in r_list:
            name = r_tuple[0].strip()
            star = r_tuple[1].strip()
            releasetime = r_tuple[2].strip()
            L = [name, star, releasetime]
            self.cursor.execute(ins, L)
            self.db.commit()
        print("Saved to the database")

    def workOn(self):
        while True:
            c = input("Press y to scrape (y/n): ")
            if c.strip().lower() == "y":
                self.offset = (self.page - 1) * 10
                url = self.baseurl + str(self.offset)
                self.loadPage(url)
                self.page += 1
            else:
                print("Scraping finished, thanks for using!")
                break

if __name__ == "__main__":
    spider = MaoyanSpider()
    spider.workOn()
Storing in MongoDB:

'''06_貓眼電影top100抓取.py'''
import requests
import re
import pymongo

class MaoyanSpider:
    def __init__(self):
        self.baseurl = "http://maoyan.com/board/4?offset="
        self.headers = {"User-Agent": "Mozilla/5.0"}
        self.page = 1
        self.offset = 0
        self.proxies = {"http": "http://309435365:[email protected]:16817"}
        self.conn = pymongo.MongoClient("localhost", 27017)
        self.db = self.conn.Film
        self.myset = self.db.top100

    # download the page
    def loadPage(self, url):
        res = requests.get(url, headers=self.headers)
        res.encoding = "utf-8"
        html = res.text
        self.parsePage(html)

    # parse the page
    def parsePage(self, html):
        p = re.compile('<div class="movie-item-info">.*?title="(.*?)".*?<p class="star">(.*?)</p>.*?releasetime">(.*?)</p>', re.S)
        r_list = p.findall(html)
        # print(r_list)
        # [("霸王別姬","張國榮","1994-01-01"), (), () ...]
        self.writeTomysql(r_list)

    # note: despite the name (carried over from the MySQL version), this writes to MongoDB
    def writeTomysql(self, r_list):
        for r_tuple in r_list:
            name = r_tuple[0].strip()
            star = r_tuple[1].strip()
            releasetime = r_tuple[2].strip()
            D = {"name": name,
                 "star": star,
                 "releasetime": releasetime}
            self.myset.insert(D)
        print("Saved to the database")

    def workOn(self):
        while True:
            c = input("Press y to scrape (y/n): ")
            if c.strip().lower() == "y":
                self.offset = (self.page - 1) * 10
                url = self.baseurl + str(self.offset)
                self.loadPage(url)
                self.page += 1
            else:
                print("Scraping finished, thanks for using!")
                break

if __name__ == "__main__":
    spider = MaoyanSpider()
    spider.workOn()
- Web client authentication (parameter name: auth)
    - auth = ('username', 'password')
      auth = ('tarenacode', 'code_2013')
- Example:

'''09_Web客戶端驗證.py'''
import requests
import re

class NoteSpider:
    def __init__(self):
        self.headers = {"User-Agent": "Mozilla/5.0"}
        self.url = "http://code.tarena.com.cn/"
        self.proxies = {"http": "http://309435365:[email protected]:16817"}
        # the auth parameter holds the user name and password (must be a tuple)
        self.auth = ("tarenacode", "code_2013")

    def getParsePage(self):
        res = requests.get(self.url,
                           proxies=self.proxies,
                           headers=self.headers,
                           auth=self.auth,
                           timeout=3)
        res.encoding = "utf-8"
        html = res.text
        # print(html)
        p = re.compile('<a href=".*?>(.*?)</a>', re.S)
        r_list = p.findall(html)
        # print(r_list)
        self.writePage(r_list)

    def writePage(self, r_list):
        print("Writing to file...")
        with open("達內科技.txt", "a") as f:
            for r_str in r_list:
                f.write(r_str + "\n\n")
        print("Write succeeded")

if __name__ == "__main__":
    spider = NoteSpider()
    spider.getParsePage()
- SSL certificate verification (parameter name: verify)
    - verify = True: the default; verify the SSL certificate
    - verify = False: skip verification

'''10_SSL證書認證示例.py'''
import requests

url = "https://www.12306.cn/mormhweb/"
headers = {"User-Agent": "Mozilla/5.0"}

res = requests.get(url, headers=headers, verify=False)
res.encoding = "utf-8"
print(res.text)
2. Handler processors in urllib.request
- Definition
    A way to customize urlopen(): urlopen() is a special, pre-built opener that does not support proxies and similar features, so you build a custom opener object out of Handler processor objects.
- Common methods
    - build_opener(handler object): build the opener object
    - opener.open(url, arguments)
- Usage flow
    - Create the relevant Handler processor object
        http_handler = urllib.request.HTTPHandler()
    - Build the custom opener object
        opener = urllib.request.build_opener(http_handler)
    - Open the URL with the opener object
        req = urllib.request.Request(url, headers=headers)
        res = opener.open(req)

'''Handler處理器示例.py'''
import urllib.request

url = "http://www.baidu.com/"
# create the Handler processor object
http_handler = urllib.request.HTTPHandler()
# proxy_handler = urllib.request.ProxyHandler()
# build the custom opener object
opener = urllib.request.build_opener(http_handler)
# send the request with the opener's open() method
req = urllib.request.Request(url)
res = opener.open(req)
print(res.read().decode("utf-8"))
- Handler processor types
    - HTTPHandler(): no special features
    - ProxyHandler(regular proxy)
        proxy dict: {'protocol': 'IP:port'}

'''12_ProxyHandler示例.py'''
import urllib.request

url = "http://www.baidu.com/"
proxy = {"http": "127.0.0.1:8888"}
# create the Handler processor object
pro_hand = urllib.request.ProxyHandler(proxy)
# build the custom opener object
opener = urllib.request.build_opener(pro_hand)
# send the request with the opener's open() method
req = urllib.request.Request(url)
res = opener.open(req)
print(res.read().decode("utf-8"))
    - ProxyBasicAuthHandler(password manager object): private proxy
    - HTTPBasicAuthHandler(password manager object): web client authentication
- What the password manager object is used for
    - Private proxies
    - Web client authentication
- Implementation flow (a full sketch follows this list)
    - Create the password manager object
        pwdmg = urllib.request.HTTPPasswordMgrWithDefaultRealm()
    - Add the credentials to the password manager
        pwdmg.add_password(None, webserver, user, passwd)
    - Create the Handler processor object
        - Private proxy
            proxy = urllib.request.ProxyBasicAuthHandler(pwdmg)
        - Web client
            webbasic = urllib.request.HTTPBasicAuthHandler(pwdmg)
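Putting the flow above together, a hedged sketch for a private proxy: the proxy address and credentials are placeholders (the course's real values are redacted earlier in these notes), and a ProxyHandler is combined with the ProxyBasicAuthHandler so the request is actually routed through the proxy.

# Sketch of the password-manager flow for a private proxy; host, port,
# user name and password below are placeholders.
import urllib.request

proxy_server = "1.2.3.4:16817"           # placeholder proxy address
user, passwd = "proxyuser", "proxypass"  # placeholder credentials

# 1. create the password manager and register the credentials
pwdmg = urllib.request.HTTPPasswordMgrWithDefaultRealm()
pwdmg.add_password(None, proxy_server, user, passwd)

# 2. create the handlers and build the custom opener
proxy_handler = urllib.request.ProxyHandler({"http": "http://" + proxy_server})
auth_handler = urllib.request.ProxyBasicAuthHandler(pwdmg)
opener = urllib.request.build_opener(proxy_handler, auth_handler)

# 3. open the URL through the opener
req = urllib.request.Request("http://www.baidu.com/",
                             headers={"User-Agent": "Mozilla/5.0"})
res = opener.open(req)
print(res.read().decode("utf-8"))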
Installation:
- Windows: install selenium
    Run in the Anaconda Prompt: python -m pip install selenium
- Ubuntu: install the Scrapy framework
    #### Scrapy has quite a few dependencies; the full list follows, and some may already be installed ####
    sudo apt-get install libssl-dev
sudo apt-get install libffi-dev
sudo apt-get install python3-dev
sudo apt-get install build-essential
sudo apt-get install libxml2
sudo apt-get install libxml2-dev
sudo apt-get install libxslt1-dev
sudo apt-get install zlib1g-dev
- sudo pip3 install Scrapy
Today's code