爬蟲例項
阿新 • • 發佈:2020-08-15
目錄
爬拉鉤網資訊
# Fetch Python job listings from lagou.com.
# Browser-facing endpoint, for reference:
# https://www.lagou.com/jobs/positionAjax.json?city=%E4%B8%8A%E6%B5%B7&needAddtionalResult=false
import requests

# The Ajax URL that is actually requested for the data.
url = 'https://www.lagou.com/jobs/positionAjax.json?needAddtionalResult=false'
payload = {
    'first': 'true',
    'pn': '1',
    'kd': 'python',
}
header = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36',
    'Referer': 'https://www.lagou.com/jobs/list_python?labelWords=&fromSearch=true&suginput=',
    'Accept': 'application/json, text/javascript, */*; q=0.01'
}
# The original search page URL.
urls = 'https://www.lagou.com/jobs/list_python?labelWords=&fromSearch=true&suginput='
# Create a session so cookies carry over between the two requests.
s = requests.Session()
# Request the search page first to obtain its cookies.
s.get(urls, headers=header, timeout=3)
# The cookies obtained by that request.
cookie = s.cookies
# Post the query with those cookies and read the response body as text.
response = s.post(url, data=payload, headers=header, cookies=cookie, timeout=5).text
print(response)
爬紅樓夢小說
# Download every chapter of "Dream of the Red Chamber" into hlm.txt.
# Source page: http://www.shicimingju.com/book/hongloumeng.html
import requests
from bs4 import BeautifulSoup

ret = requests.get('https://www.shicimingju.com/book/hongloumeng.html')
# print(ret.text)
soup = BeautifulSoup(ret.text, 'lxml')
# Each <li> under the element with class "book-mulu" holds a link to one chapter.
li_list = soup.find(class_='book-mulu').find('ul').find_all('li')
with open('hlm.txt', 'w', encoding='utf-8') as f:
    for li in li_list:
        link = li.find('a')
        content = link.text
        # The href is joined onto the site root to form the chapter URL.
        url = 'https://www.shicimingju.com' + link.get('href')
        f.write(content)
        f.write('\n')
        res_content = requests.get(url)
        soup2 = BeautifulSoup(res_content.text, 'lxml')
        content_detail = soup2.find(class_='chapter_content').text
        f.write(content_detail)
        f.write('\n')
        print(content, '寫入了')
爬肯德基門店資訊
# Query the KFC store-list endpoint by keyword and print the JSON reply.
# http://www.kfc.com.cn/kfccda/ashx/GetStoreList.ashx?op=keyword
import requests

header = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.149 Safari/537.36'
}
# Form fields sent in the POST body.
data = {
    'cname': '',
    'pid': 20,
    'keyword': '浦東',
    'pageIndex': 1,
    'pageSize': 10
}
ret = requests.post('http://www.kfc.com.cn/kfccda/ashx/GetStoreList.ashx?op=keyword', data=data, headers=header)
print(ret.json())
爬糗事百科段子
# Print the text of every post on one qiushibaike listing page.
# https://www.qiushibaike.com/text/page/2/
import requests
from bs4 import BeautifulSoup

ret = requests.get('https://www.qiushibaike.com/text/page/2/')
# print(ret.text)
soup = BeautifulSoup(ret.text, 'html.parser')
# Every post is an element carrying the class "article".
article_list = soup.find_all(class_='article')
# print(article_list)
for article in article_list:
    content = article.find(class_='content').text
    print(content)
    print('-------')
xpath選擇器使用
# XPath is a language for locating information in XML documents.
# /        : select from the root node (root only)
# //       : select matching nodes anywhere, regardless of position
# /@name   : read the attribute called name
# /text()  : read the text of the current node
# Tip: an XPath expression can also be copied (e.g. from the browser's developer tools).
doc='''
<html>
<head>
<base href='http://example.com/' />
<title>Example website</title>
</head>
<body>
<div id='images'>
<a href='image1.html' aa='bb'>Name: My image 1 <br /><img src='image1_thumb.jpg' /></a>
<a href='image2.html'>Name: My image 2 <br /><img src='image2_thumb.jpg' /></a>
<a href='image3.html'>Name: My image 3 <br /><img src='image3_thumb.jpg' /></a>
<a href='image4.html'>Name: My image 4 <br /><img src='image4_thumb.jpg' /></a>
<a href='image5.html' class='li li-item' name='items'>Name: My image 5 <br /><img src='image5_thumb.jpg' /></a>
<a href='image6.html' name='items'><span><h5>test</h5></span>Name: My image 6 <br /><img src='image6_thumb.jpg' /></a>
</div>
</body>
</html>
'''
from lxml import etree
# Parse the HTML string above into an element tree.
html=etree.HTML(doc)
# Alternative: parse from a file instead of a string.
# html=etree.parse('search.html',etree.HTMLParser())

# Uncomment exactly one of the examples below to try it; the last one is
# enabled so that the final print(a) always has something to show.

# 1 All nodes
# a=html.xpath('//*')
# 2 A specific node (the result is a list)
# a=html.xpath('//head')
# 3 Children and descendants
# a=html.xpath('//div/a')
# a=html.xpath('//body/a') # no results: <a> is not a direct child of <body>
# a=html.xpath('//body//a')
# 4 Parent node
# a=html.xpath('//body//a[@href="image1.html"]/..')
# a=html.xpath('//body//a[1]/..')
# Equivalent, using the parent axis
# a=html.xpath('//body//a[1]/parent::*')
# 5 Attribute matching
# a=html.xpath('//body//a[@href="image1.html"]')
# 6 Getting text (important): /text() returns the text of the current tag
# a=html.xpath('//body//a[@href="image1.html"]/text()')
# a=html.xpath('//body//a/text()')
# 7 Getting attributes: @href returns that attribute of the current tag
# a=html.xpath('//body//a/@href')
# Note: positions start at 1, not 0
# a=html.xpath('//body//a[1]/@href')
# 8 Matching an attribute that holds several values
# When a tag has several classes an exact match fails; use contains() instead
# a=html.xpath('//body//a[@class="li"]')
# a=html.xpath('//body//a[contains(@class,"li")]')
# a=html.xpath('//body//a[contains(@class,"li")]/text()')
# 9 Matching on several attributes
# a=html.xpath('//body//a[contains(@class,"li") or @name="items"]')
# a=html.xpath('//body//a[contains(@class,"li") and @name="items"]/text()')
# a=html.xpath('//body//a[contains(@class,"li")]/text()')
# 10 Selecting by position
# a=html.xpath('//a[2]/text()')
# a=html.xpath('//a[2]/@href')
# The last one
# a=html.xpath('//a[last()]/@href')
# Positions lower than 3
# a=html.xpath('//a[position()<3]/@href')
# Third from last (use last()-1 for the second to last)
# a=html.xpath('//a[last()-2]/@href')
# 11 Selecting along an axis
# ancestor: ancestor nodes
# * selects every ancestor
# a=html.xpath('//a/ancestor::*')
# Only the div ancestors
# a=html.xpath('//a/ancestor::div')
# attribute: attribute values
# a=html.xpath('//a[1]/attribute::*')
# a=html.xpath('//a[1]/@aa')
# child: direct children
# a=html.xpath('//a[1]/child::*')
# a=html.xpath('//a[1]/child::img/@src')
# descendant: all descendants
# a=html.xpath('//a[6]/descendant::*')
# a=html.xpath('//a[6]/descendant::h5/text()')
# following: every node after the current one (siblings and the nodes inside them)
# a=html.xpath('//a[1]/following::*')
# a=html.xpath('//a[1]/following::*[1]/@href')
# following-sibling: nodes after the current one at the same level (siblings only)
# a=html.xpath('//a[1]/following-sibling::*')
# a=html.xpath('//a[1]/following-sibling::a')
# a=html.xpath('//a[1]/following-sibling::*[2]')
a=html.xpath('//a[1]/following-sibling::*[2]/@href')
print(a)
# Summary of the syntax used above:
# /
# //
# /@name
# /text()
# From here on, tags can be located with bs4's find, CSS selectors, or XPath (the general-purpose option).
selenium使用
# 為了解決requests無法直接執行JavaScript程式碼的問題
#
# pip3 install selenium
# 瀏覽器驅動:http://npm.taobao.org/mirrors/chromedriver/
# 驅動要跟瀏覽器版本對應 84.0.4147.105:驅動用84.0.4147.30/向上相容
# 下載完解壓就是個exe(不同平臺的可執行檔案)
# from selenium import webdriver
# import time
# # bro=webdriver.Chrome() # 得到一個谷歌瀏覽器物件,
# # 指定使用跟那個驅動
# bro=webdriver.Chrome(executable_path='./chromedriver.exe') # 得到一個谷歌瀏覽器物件,
#
# time.sleep(2)
# bro.get('https://www.baidu.com/') # 在位址列裡輸入了百度
# time.sleep(2)
# print(bro.page_source)
# time.sleep(2)
# bro.close()
# 模擬登陸百度
# from selenium import webdriver
# import time
# bro=webdriver.Chrome(executable_path='./chromedriver.exe')
#
# bro.get('https://www.baidu.com/')
# time.sleep(0.01)
# input_k=bro.find_element_by_id('kw')
# input_k.send_keys('美女') # 在框裡寫入美女
# time.sleep(2)
# sou=bro.find_element_by_id('su') # 找到搜尋按鈕
# sou.click() # 點選搜尋按鈕
# time.sleep(4)
# bro.close()
# from selenium import webdriver
# import time
# bro=webdriver.Chrome(executable_path='./chromedriver.exe')
# bro.implicitly_wait(5) # 隱式等待:查詢控制元件時,如果控制元件還沒有載入出來,最多等待5秒;只需要寫這一句,之後查詢所有控制元件都會套用這個等待
# bro.get('https://www.baidu.com/')
#
# d_button=bro.find_element_by_link_text('登入')
#
# d_button.click()
#
# login_u=bro.find_element_by_id('TANGRAM__PSP_11__footerULoginBtn')
# login_u.click()
#
# username=bro.find_element_by_id('TANGRAM__PSP_11__userName')
# username.send_keys('yxp654799481')
# password=bro.find_element_by_id('TANGRAM__PSP_11__password')
# password.send_keys('yxp997997')
# time.sleep(3)
# submit=bro.find_element_by_id('TANGRAM__PSP_11__submit')
#
# submit.click()
# time.sleep(10)
#
# print(bro.get_cookies())
#
# bro.close()
# ##############選擇器(find系列)
# ===============所有方法===================
# 1、find_element_by_id # 通過id查詢控制元件
# 2、find_element_by_link_text # 通過a標籤內容找
# 3、find_element_by_partial_link_text # 通過a標籤內容找,模糊匹配
# 4、find_element_by_tag_name # 標籤名
# 5、find_element_by_class_name # 類名
# 6、find_element_by_name # name屬性
# 7、find_element_by_css_selector # 通過css選擇器
# 8、find_element_by_xpath # 通過xpath選擇器
# 強調:
# 1、find_elements_by_xxx的形式是查詢到多個元素,結果為列表
# 獲取元素屬性
# 重點
# tag.get_attribute('href') # 找當前控制元件 的href屬性對的值
# tag.text # 獲取文字內容
# 瞭解
# print(tag.id) # 當前控制元件id號
# print(tag.location) # 當前控制元件在頁面位置
# print(tag.tag_name) # 標籤名
# print(tag.size) #標籤的大小
####無介面瀏覽器(phantomjs)
#谷歌瀏覽器支援不開啟頁面
# from selenium.webdriver.chrome.options import Options
# from selenium import webdriver
# chrome_options = Options()
# chrome_options.add_argument('window-size=1920x3000') #指定瀏覽器解析度
# chrome_options.add_argument('--disable-gpu') #谷歌文件提到需要加上這個屬性來規避bug
# chrome_options.add_argument('--hide-scrollbars') #隱藏滾動條, 應對一些特殊頁面
# chrome_options.add_argument('blink-settings=imagesEnabled=false') #不載入圖片, 提升速度
#
#
# chrome_options.add_argument('--headless') #瀏覽器不提供視覺化頁面. linux下如果系統不支援視覺化不加這條會啟動失敗
#
#
# bro=webdriver.Chrome(chrome_options=chrome_options,executable_path='./chromedriver.exe')
# bro.get('https://www.baidu.com/')
# print(bro.page_source)
# bro.close()
######元素互動
# tag.send_keys() # 往裡面寫內容
# tag.click() # 點選控制元件
# tag.clear() # 清空控制元件內容
#####執行js(有什麼用?)
# from selenium import webdriver
# import time
# bro=webdriver.Chrome(executable_path='./chromedriver.exe')
# bro.implicitly_wait(5) # 隱式等待:查詢控制元件時,如果控制元件還沒有載入出來,最多等待5秒;只需要寫這一句,之後查詢所有控制元件都會套用這個等待
# bro.get('https://www.baidu.com/')
#
#
# bro.execute_script('window.open()')
# bro.execute_script('window.open()')
# time.sleep(2)
# bro.close()
####模擬瀏覽器前進後退
# from selenium import webdriver
# import time
# browser=webdriver.Chrome(executable_path='./chromedriver.exe')
# browser.get('https://www.baidu.com')
# browser.get('https://www.taobao.com')
# browser.get('http://www.sina.com.cn/')
#
# browser.back()
# time.sleep(1)
# browser.forward()
#
# browser.close()
#####獲取cookie
# bro.get_cookies()
#### 選項卡管理(瞭解)
# from selenium import webdriver
# import time
# browser=webdriver.Chrome()
# browser.get('https://www.baidu.com')
# browser.execute_script('window.open()')
#
# print(browser.window_handles) #獲取所有的選項卡
# browser.switch_to_window(browser.window_handles[1])
# browser.get('https://www.taobao.com')
# time.sleep(2)
# browser.switch_to_window(browser.window_handles[0])
# browser.get('https://www.sina.com.cn')
# browser.close()
##### 異常處理
# from selenium import webdriver
# from selenium.common.exceptions import TimeoutException,NoSuchElementException,NoSuchFrameException
# browser=webdriver.Chrome()
# try:
#
# browser.get('')
# except Exception as e:
# print(e)
# finally:
# # 無論是否出異常,最終都要關掉
# browser.close()
#####動作鏈()
#### 如何把螢幕拉到最後(js控制)
# bro.execute_script('window.scrollTo(0,document.body.offsetHeight)')
爬取京東商品資訊
from selenium import webdriver
import time
# 模擬鍵盤輸入
from selenium.webdriver.common.keys import Keys
# Create the Chrome driver, using the chromedriver executable at the given path.
bro=webdriver.Chrome(executable_path='./chromedriver.exe')
# Set an implicit wait of 10 seconds for element lookups on this driver.
bro.implicitly_wait(10)
def get_goods_info(bro):
    """Print the details of every product on the current page, then keep paging.

    For each element matching ``.gl-item`` the name, price, link, comment
    count and image URL are printed. Items for which any of those lookups
    fails are skipped. After a page is processed the link whose text contains
    '下一頁' is clicked and the next page is handled the same way.

    Pages are walked in a loop rather than by calling this function again for
    each page, so the call stack does not grow with the number of pages.

    Args:
        bro: a driver object providing ``find_elements_by_css_selector`` and
            ``find_element_by_partial_link_text``.

    Raises:
        Whatever ``bro`` raises when the next-page link cannot be found or
        clicked; that is the only way this function stops.
    """
    while True:
        # Equivalent lookups: find_elements_by_class_name('gl-item'), or the
        # <li> tags under the element with class 'gl-warp'.
        goods = bro.find_elements_by_css_selector('.gl-item')
        for good in goods:
            try:
                price = good.find_element_by_css_selector('.p-price i').text
                name = good.find_element_by_css_selector('.p-name em').text
                url = good.find_element_by_css_selector('.p-img a').get_attribute('href')
                commits = good.find_element_by_css_selector('.p-commit strong>a').text
                photo_url = good.find_element_by_css_selector('.p-img img').get_attribute('src')
            except Exception:
                # Deliberate best effort: skip entries that lack any of the fields.
                continue
            print('''
商品名字:%s
商品價格:%s
商品地址:%s
商品評論數:%s
商品圖片地址:%s
''' % (name, price, url, commits, photo_url))
        next_button = bro.find_element_by_partial_link_text('下一頁')
        # Short pause before moving on to the next page.
        time.sleep(1)
        next_button.click()
# Search jd.com for a keyword and print the products found.
try:
    bro.get('https://www.jd.com/')
    input_k = bro.find_element_by_id('key')
    input_k.send_keys('奶牛')
    # Simulate pressing the Enter key to submit the search.
    input_k.send_keys(Keys.ENTER)
    # Paging only stops when an exception is raised, which is reported below.
    get_goods_info(bro)
except Exception as e:
    print(e)
finally:
    # quit() ends the whole driver session; close() would only close the
    # current window and leave the driver process running.
    bro.quit()
自動登入12306
from selenium import webdriver
import time
#pillow
from PIL import Image
# 引入超級鷹
from chaojiying import Chaojiying_Client
from selenium.webdriver import ActionChains
def _parse_click_points(pic_str):
    """Turn an answer such as '260,133|123,233' into [[260, 133], [123, 233]].

    A single point ('260,133') yields a one-element list. Malformed input
    raises ValueError, which the script's outer handler reports.
    """
    points = []
    for pair in pic_str.split('|'):
        x_text, y_text = pair.split(',')[:2]
        points.append([int(x_text), int(y_text)])
    return points


bro = webdriver.Chrome(executable_path='./chromedriver.exe')
bro.implicitly_wait(10)
try:
    bro.get('https://kyfw.12306.cn/otn/resources/login.html')
    bro.maximize_window()  # maximize the browser window
    # Switch to the account/password login tab.
    button_z = bro.find_element_by_css_selector('.login-hd-account a')
    button_z.click()
    time.sleep(2)
    # Screenshot the whole page.
    bro.save_screenshot('./main.png')
    # Position and size of the captcha image.
    img_t = bro.find_element_by_id('J-loginImg')
    print(img_t.size)
    print(img_t.location)
    size = img_t.size
    location = img_t.location
    # (left, upper, right, lower) box of the captcha inside the screenshot.
    img_tu = (int(location['x']), int(location['y']), int(location['x'] + size['width']), int(location['y'] + size['height']))
    # Cut the captcha out of the screenshot and save it as its own image.
    with Image.open('./main.png') as img:
        fram = img.crop(img_tu)
        fram.save('code.png')
    # Ask Chaojiying to solve the captcha.
    # NOTE(review): account credentials are hard-coded here and below; load
    # them from configuration or the environment before real use.
    # The third argument is the software ID generated in the Chaojiying user center.
    chaojiying = Chaojiying_Client('306334678', 'lqz12345', '903641')
    # Read the captcha bytes and close the file straight away.
    with open('code.png', 'rb') as f:
        im = f.read()
    # 9004 is the captcha type code passed to the service — presumably the
    # click-coordinates type; confirm against the Chaojiying documentation.
    res = chaojiying.PostPic(im, 9004)
    print(res)
    result = res['pic_str']
    # The answer may hold several points, e.g. '260,133|123,233'.
    all_list = _parse_click_points(result)
    print(all_list)
    # Click each returned point on the captcha image with an action chain.
    # NOTE(review): the origin used for the offset (top-left corner vs. centre
    # of the element) depends on the Selenium version — confirm for the one in use.
    for x, y in all_list:
        ActionChains(bro).move_to_element_with_offset(img_t, x, y).click().perform()
        time.sleep(1)
    username = bro.find_element_by_id('J-userName')
    username.send_keys('306334678')
    password = bro.find_element_by_id('J-password')
    password.send_keys('lqz12345')
    time.sleep(3)
    submit_login = bro.find_element_by_id('J-login')
    submit_login.click()
    time.sleep(3)
    print(bro.get_cookies())
    time.sleep(10)
    bro.get('https://www.12306.cn/index/')
    time.sleep(5)
except Exception as e:
    print(e)
finally:
    # quit() ends the whole driver session; close() would only close the
    # current window and leave the driver process running.
    bro.quit()
cookies池講解
# 如何搭建cookie池
# selenium寫一套(一堆小號),跑起指令碼,自動登入,手動參與
# 拿到cookie,放到redis中
# django搭建一個服務:127.0.0.1/get,隨機返回一個cookie
# request傳送請求爬資料(selenium拿到的cookie),cookie失效
抓包工具介紹
# 1 瀏覽器除錯模式
# 2 fiddler,charles(自己研究一下)