selenium的使用與scrapy的簡介
阿新 • • 發佈:2022-12-07
selenium的使用與scrapy的簡介
-
Xpath的使用
# XPath usage demo with lxml.etree against an in-memory HTML document.
# (Reformatted from collapsed notes; prose headings converted to comments.)
doc = ''' <html> <head> <base href='http://example.com/' /> <title>Example website</title> </head> <body> <div id='images'> <a href='image1.html' id='id_a'>Name: My image 1 <br/><img src='image1_thumb.jpg' /></a> <a href='image2.html'>Name: My image 2 <br /><img src='image2_thumb.jpg' /></a> <a href='image3.html'>Name: My image 3 <br /><img src='image3_thumb.jpg' /></a> <a href='image4.html'>Name: My image 4 <br /><img src='image4_thumb.jpg' /></a> <a href='image5.html' class='li li-item' name='items'>Name: My image 5 <br /><img src='image5_thumb.jpg' /></a> <a href='image6.html' name='items'><span><h5>test</h5></span>Name: My image 6 <br /><img src='image6_thumb.jpg' /></a> </div> </body> </html> '''
from lxml import etree

# Parse the in-memory document; html is the root <html> Element.
html = etree.HTML(doc)
# BUG in the original notes: the next line overwrote the parse above with a
# local file that may not exist (raises OSError). Kept for reference only.
# html = etree.parse('search.html', etree.HTMLParser())

# 1. All nodes under the root.
a = html.xpath('//*')
# 2. A specific tag (result is always a list), e.g. [<Element head at 0x...>].
a = html.xpath('//head')
# 3. Children vs. descendants: / is direct child, // is any descendant.
a = html.xpath('//div/a')
a = html.xpath('//body/a')   # body has no direct <a> child -> empty list
a = html.xpath('//body//a')
# 4. Parent node: '..' or the parent:: axis.
a = html.xpath('//body//a[@href="image1.html"]/..')
a = html.xpath('//body//a[1]/..')
a = html.xpath('//body//a[1]/parent::*')
a = html.xpath('//body//a[1]/parent::div')
# 5. Attribute match (relative path: body is a child of the html root).
a = html.xpath('body//a[@href="image1.html"]')
# 6. Text content via text().
a = html.xpath('//body//a[@href="image1.html"]/text()')
# 7. Attribute values via @; note XPath positions start at 1, not 0.
a = html.xpath('//body//a/@href')   # every href
a = html.xpath('//body//a/@id')
a = html.xpath('//body//a[1]/@id')
# 8. Multi-valued attributes: @class="li" fails on class="li li-item";
#    use contains() instead.
a = html.xpath('//body//a[@class="li"]')
a = html.xpath('//body//a[@name="items"]')
a = html.xpath('//body//a[contains(@class,"li")]')
a = html.xpath('//body//a[contains(@class,"li")]/text()')
# 9. Combining predicates with and/or.
a = html.xpath('//body//a[contains(@class,"li") or @name="items"]')
# BUG fix: original ended with '/text' (selects a child element named
# "text"); text() is the function that extracts the text nodes.
a = html.xpath('//body//a[contains(@class,"li") and @name="items"]/text()')
# 10. Positional selection.
a = html.xpath('//a[2]/text()')
a = html.xpath('//a[3]/@href')
a = html.xpath('//a[last()]/@href')        # last one
a = html.xpath('//a[position()<3]/@href')  # positions 1 and 2
# NOTE(review): the original comment said "second to last", but
# a[last()-2] is the third from the end; second-to-last is a[last()-1].
a = html.xpath('//a[last()-2]/@href')
# 11. Node axes.
a = html.xpath('//a/ancestor::*')          # all ancestors
a = html.xpath('//a/ancestor::div')        # only div ancestors
a = html.xpath('//a[1]/attribute::*')      # all attributes
a = html.xpath('//a[1]/attribute::href')
a = html.xpath('//a[1]/child::*')          # direct children
a = html.xpath('//a[6]/descendant::*')     # all descendants
a = html.xpath('//a[1]/following::*')      # everything after this node
a = html.xpath('//a[1]/following::*[1]/@href')
a = html.xpath('//a[1]/following-sibling::*')   # later siblings
a = html.xpath('//a[1]/following-sibling::a')
a = html.xpath('//a[1]/following-sibling::*[2]')
a = html.xpath('//a[1]/following-sibling::*[2]/@href')
print(a)
- selenium動作鏈
# Selenium ActionChains demo: drag the "draggable" box onto "droppable"
# inside the runoob jQuery UI iframe. (Reformatted from a collapsed line.)
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver import ActionChains
import time

bro = webdriver.Chrome(executable_path='./chromedriver.exe')
bro.get('http://www.runoob.com/try/try.php?filename=jqueryui-api-droppable')
bro.implicitly_wait(10)
try:
    # The demo widgets live inside an iframe; switch into it first.
    bro.switch_to.frame('iframeResult')
    sourse = bro.find_element(by=By.ID, value='draggable')
    target = bro.find_element(by=By.ID, value='droppable')

    # Option 1: a single action-chain sequence executed at once.
    # actions = ActionChains(bro)           # build the chain object
    # actions.drag_and_drop(sourse, target) # queue the drag-and-drop
    # actions.perform()                     # run the queued actions

    # Option 2: separate chains, moving a small offset each step.
    ActionChains(bro).click_and_hold(sourse).perform()
    distance = target.location['x'] - sourse.location['x']
    print('目標距離源的軸距離:', distance)
    trace = 0
    while trace < distance:
        # Move 4px right per step until we have covered the distance.
        ActionChains(bro).move_by_offset(xoffset=4, yoffset=0).perform()
        trace += 4
    ActionChains(bro).release().perform()
    time.sleep(10)
except Exception as e:
    print(e)
finally:
    bro.close()
- 自動登入某網站
# Auto-login demo for 12306: fill the form, then drag the slider captcha.
# (Reformatted from a collapsed line.)
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver import ActionChains
import time
from selenium.webdriver.chrome.options import Options

options = Options()
# Hide the "browser is controlled by automated software" banner so the
# site's bot detection is less likely to trigger.
options.add_argument("--disable-blink-features=AutomationControlled")
bro = webdriver.Chrome(executable_path='./chromedriver.exe', options=options)
bro.get('https://kyfw.12306.cn/otn/resources/login.html')
bro.implicitly_wait(10)
try:
    username = bro.find_element(by=By.ID, value='J-userName')
    username.send_keys('')   # fill in your account here
    password = bro.find_element(by=By.ID, value='J-password')
    password.send_keys('')   # fill in your password here
    time.sleep(3)
    btn = bro.find_element(by=By.ID, value='J-login')
    btn.click()
    # Slider captcha: hold the handle and drag it 300px to the right.
    span = bro.find_element(by=By.ID, value='nc_1_n1z')
    ActionChains(bro).click_and_hold(span).perform()
    ActionChains(bro).move_by_offset(xoffset=300, yoffset=0).perform()
    time.sleep(5)
except Exception as e:
    print(e)
finally:
    bro.close()
-
打碼平臺的使用
打碼平臺其實就是我們將驗證碼圖片發往第三方,然後第三方幫我們解決,我們只需要用錢就可以了,簡單的驗證碼花錢少一些,複雜的驗證碼花錢較多例如steam的就需要很多錢
-
打碼平臺自動登入
import time
from selenium import webdriver
from selenium.webdriver.common.by import By
from chaojiying import Chaojiying_Client
from PIL import Image
# Log in to chaojiying.com using its own captcha-solving API:
# screenshot the page, crop the captcha image, send it to the platform,
# then type the recognized code back into the form.
# (Indentation of the try/except body was lost in the original paste.)
bro = webdriver.Chrome(executable_path='./chromedriver.exe')
bro.get('http://www.chaojiying.com/user/login/')
bro.implicitly_wait(10)
bro.maximize_window()
try:
    username = bro.find_element(by=By.XPATH, value='/html/body/div[3]/div/div[3]/div[1]/form/p[1]/input')
    password = bro.find_element(by=By.XPATH, value='/html/body/div[3]/div/div[3]/div[1]/form/p[2]/input')
    code = bro.find_element(by=By.XPATH, value='/html/body/div[3]/div/div[3]/div[1]/form/p[3]/input')
    btn = bro.find_element(by=By.XPATH, value='/html/body/div[3]/div/div[3]/div[1]/form/p[4]/input')
    username.send_keys('306334678')
    password.send_keys('lqz123')
    # Screenshot the whole page, then crop out just the captcha <img>.
    bro.save_screenshot('main.png')
    img = bro.find_element(By.XPATH, '/html/body/div[3]/div/div[3]/div[1]/form/div/img')
    location = img.location
    size = img.size
    print(location)
    print(size)
    # Crop box: (left, top, right, bottom) in page-pixel coordinates.
    img_tu = (int(location['x']), int(location['y']), int(location['x'] + size['width']), int(location['y'] + size['height']))
    screenshot = Image.open('./main.png')
    fram = screenshot.crop(img_tu)
    fram.save('code.png')
    # Credentials for the captcha platform go here.
    chaojiying = Chaojiying_Client('', '', '')
    with open('code.png', 'rb') as f:
        im = f.read()
    # FIX: the original called PostPic twice — each call costs money and
    # may return a different answer. Call once and reuse the result.
    result = chaojiying.PostPic(im, 1902)   # 1902 = captcha type code
    print(result)
    res_code = result['pic_str']
    code.send_keys(res_code)
    time.sleep(5)
    btn.click()
    time.sleep(5)
except Exception as e:
    print(e)
finally:
    bro.close()
- 爬取某東商品
from selenium import webdriver
from selenium.webdriver.common.by import By
import time
from selenium.webdriver.common.keys import Keys
def get_goods(driver):
    """Scrape every product card on the current JD result page, print its
    details, then click "next page" and recurse onto the following page.

    driver: a selenium WebDriver already showing a JD search-result page.
    Any scraping error is caught and printed (best-effort crawl).
    """
    try:
        goods = driver.find_elements(by=By.CLASS_NAME, value='gl-item')
        for good in goods:
            name = good.find_element(by=By.CSS_SELECTOR, value='.p-name em').text
            price = good.find_element(by=By.CSS_SELECTOR, value='.p-price i').text
            commit = good.find_element(by=By.CSS_SELECTOR, value='.p-commit a').text
            url = good.find_element(by=By.CSS_SELECTOR, value='.p-name a').get_attribute('href')
            img = good.find_element(by=By.CSS_SELECTOR, value='.p-img img').get_attribute('src')
            if not img:
                # Lazy-loaded images keep the real URL in data-lazy-img.
                img = 'https://' + good.find_element(by=By.CSS_SELECTOR, value='.p-img img').get_attribute(
                    'data-lazy-img')
            print('''
            商品名字:%s
            商品價格:%s
            商品連結:%s
            商品圖片:%s
            商品評論:%s
            ''' % (name, price, url, img, commit))
        # BUG fix: original used find_elements (plural), which returns a
        # list — lists have no .click(). Use find_element for one link.
        button = driver.find_element(by=By.PARTIAL_LINK_TEXT, value='下一頁')
        button.click()
        time.sleep(1)
        get_goods(driver)
    except Exception as e:
        print(e)
def spider(url, keyword):
    """Open *url* in Chrome, search for *keyword* via the JD search box
    (element id 'key'), and hand the result page to get_goods().

    The driver is always closed, even if scraping raises.
    """
    driver = webdriver.Chrome(executable_path='./chromedriver.exe')
    driver.get(url)
    driver.implicitly_wait(10)
    try:
        input_tag = driver.find_element(by=By.ID, value='key')
        input_tag.send_keys(keyword)
        input_tag.send_keys(Keys.ENTER)  # submit the search
        get_goods(driver)
    finally:
        driver.close()
# Script entry point: crawl JD for the given keyword.
if __name__ == '__main__':
    spider('https://www.jd.com/', keyword='精品內衣')
-
scrapy介紹
# 前面學的都是模組,做專業的爬蟲,可以使用框架 (django:web) scrapy:爬蟲框架
-做爬蟲用的東西,都封裝好了,只需要在固定的位置寫固定的程式碼即可
# scrapy 號稱爬蟲界的django
-django 大而全,做web相關的它都用
-scrapy 大而全,做爬蟲的,它都用
# 介紹
Scrapy一個開源和協作的框架,其最初是為了頁面抓取 (更確切來說, 網路抓取 )所設計的,使用它可以以快速、簡單、可擴充套件的方式從網站中提取所需的資料。但目前Scrapy的用途十分廣泛,可用於如資料探勘、監測和自動化測試等領域,也可以應用在獲取API所返回的資料或者通用的網路爬蟲
# 安裝 scrapy
-mac,linux:
pip3 install scrapy
-win:看人品
-pip3 install scrapy
-人品不好:
1、pip3 install wheel #安裝後,便支援通過wheel檔案安裝軟體 xx.whl
2、pip3 install lxml
3、pip3 install pyopenssl
4、下載並安裝pywin32:https://sourceforge.net/projects/pywin32/files/pywin32/
5、下載twisted的wheel檔案:http://www.lfd.uci.edu/~gohlke/pythonlibs/#twisted
6、執行pip3 install 下載目錄\Twisted-17.9.0-cp36-cp36m-win_amd64.whl
7、pip3 install scrapy
# 釋放出scrapy 可執行檔案
-以後使用這個建立爬蟲專案 ---》django-admin建立django專案
# 建立爬蟲專案
scrapy startproject myfirstscrapy
# 建立爬蟲 [django建立app]
scrapy genspider cnblogs www.cnblogs.com
# 啟動爬蟲
scrapy crawl cnblogs --nolog
# pycharm中執行
新建run.py
from scrapy.cmdline import execute
execute(['scrapy', 'crawl', 'cnblogs','--nolog'])