二、爬蟲輔助工具 selenium
阿新 • • 發佈:2018-12-31
一、應用場景
- 頁面點選拖動,載入內容
- 元素拖動,驗證碼破解
- 執行js,資料獲取
- 模擬登入
- 引數無法獲取
二、準備
- 瀏覽器:chrome
- 驅動包:依照瀏覽器版本,到 http://chromedriver.storage.googleapis.com/index.html 下載對應版本的 chromedriver 驅動
- Java對應webdriver 依賴的jar包
三、專案
(一)新增依賴
<!-- Selenium Java client; the examples below import org.openqa.selenium.* classes -->
<dependency>
<groupId>org.seleniumhq.selenium</groupId>
<artifactId>selenium-java</artifactId>
<version>3.141.59</version>
</dependency>
<!-- Chrome driver bindings; the examples below use org.openqa.selenium.chrome.ChromeDriver / ChromeOptions -->
<dependency>
<groupId>org.seleniumhq.selenium</groupId>
<artifactId>selenium-chrome-driver</artifactId>
<version>3.141.59</version>
</dependency>
<!-- NOTE(review): none of the example code below appears to import server classes; confirm this artifact is needed -->
<dependency>
<groupId>org.seleniumhq.selenium</groupId>
<artifactId>selenium-server</artifactId>
<version>3.141.59</version>
</dependency>
(二)模擬百度登入
package org.pc.demo;
import org.openqa.selenium.By;
import org. openqa.selenium.Cookie;
import org.openqa.selenium.WebDriver;
import org.openqa.selenium.chrome.ChromeDriver;
import org.openqa.selenium.chrome.ChromeOptions;
import java.util.Scanner;
import java.util.Set;
/**
* @author 鹹魚
* @date 2018/12/21 21:00
*/
public class SeleniumTest {
    /** Cookies read from the browser session right after the login form is submitted. */
    private static Set<Cookie> cookies;

    /**
     * Entry point: runs the Baidu login demo.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        login();
    }

    /**
     * Opens the Baidu passport page in Chrome, fills in the account, password and a
     * captcha typed by the operator on standard input, submits the form and prints
     * every cookie of the session as {@code name:value}.
     *
     * <p>The browser is always shut down, even when a step fails (for example when an
     * element cannot be found), so no Chrome / chromedriver process is left behind.
     */
    public static void login() {
        // Location of the chromedriver executable
        System.setProperty("webdriver.chrome.driver", "E:\\demo\\crawler\\chromedriver.exe");
        ChromeOptions options = new ChromeOptions();
        // Location of the Chrome browser binary
        options.setBinary("C:\\Program Files (x86)\\Google\\Chrome\\Application\\chrome.exe");
        WebDriver driver = new ChromeDriver(options);
        try {
            // Open the Baidu login page
            driver.get("https://passport.baidu.com/v2/?login&tpl=mn");
            // Switch to username/password login
            driver.findElement(By.id("TANGRAM__PSP_3__footerULoginBtn")).click();
            // Account
            typeInto(driver, "TANGRAM__PSP_3__userName", "***");
            // Password
            typeInto(driver, "TANGRAM__PSP_3__password", "***");
            // The captcha is solved by a human and typed on stdin.
            // System.in is deliberately not closed: closing the Scanner would close it too.
            Scanner sc = new Scanner(System.in);
            String captcha = sc.nextLine();
            typeInto(driver, "TANGRAM__PSP_3__verifyCode", captcha);
            driver.findElement(By.id("TANGRAM__PSP_3__submit")).click();
            // Read the cookies of the session
            // NOTE(review): there is no wait between the submit click and this read;
            // confirm the login response has arrived before relying on these cookies.
            cookies = driver.manage().getCookies();
            for (Cookie cookie : cookies) {
                System.out.println(cookie.getName() + ":" + cookie.getValue());
            }
            // The cookies can later be added to request headers
        } finally {
            driver.quit();
        }
    }

    /**
     * Clears the input element with the given id and types {@code text} into it.
     * The element is looked up once per operation, as the original code did.
     */
    private static void typeInto(WebDriver driver, String elementId, String text) {
        By locator = By.id(elementId);
        driver.findElement(locator).clear();
        driver.findElement(locator).sendKeys(text);
    }
}
(三)用selenium模擬登入,破解滑動驗證碼
package org.pc.demo;
import java.awt.image.BufferedImage;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Base64;
import java.util.List;
import java.util.Random;
import javax.imageio.ImageIO;
import org.apache.http.HttpEntity;
import org.apache.http.client.ClientProtocolException;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;
import org.openqa.selenium.By;
import org.openqa.selenium.JavascriptExecutor;
import org.openqa.selenium.Point;
import org.openqa.selenium.WebDriver;
import org.openqa.selenium.WebElement;
import org.openqa.selenium.chrome.ChromeDriver;
import org.openqa.selenium.chrome.ChromeOptions;
import org.openqa.selenium.interactions.Actions;
import sun.misc.BASE64Decoder;
/**
* 新版本極驗官網測試舊版滑塊演算法(失敗)
* 有需求可對接打碼平臺
*/
public class JiYan {
/**
 * Entry point: runs the slider-captcha demo.
 *
 * @param args unused
 * @throws Exception whatever {@code run1()} propagates
 */
public static void main(final String[] args) throws Exception {
    run1();
}
/**
 * Decodes a Base64 string (optionally carrying a {@code data:...;base64,} data-URL
 * prefix) and writes the resulting bytes to {@code imgPath}, replacing any existing file.
 *
 * <p>Uses {@link java.util.Base64} instead of the JDK-internal
 * {@code sun.misc.BASE64Decoder}. The MIME decoder is used so that line breaks and
 * other non-alphabet characters in the input are ignored rather than rejected.
 *
 * <p>Failures are best-effort, as before: the stack trace is printed and the method
 * returns normally. The payload is decoded before the file is opened, so a malformed
 * payload no longer truncates an existing file.
 *
 * @param base64  Base64 text, with or without a data-URL prefix
 * @param imgPath path of the file to write
 */
public static void decodeBase64ToImage(String base64, String imgPath) {
    // Same removal the original performed for PNG data URLs
    String payload = base64.replace("data:image/png;base64,", "");
    // Generalisation: drop any other data-URL header (e.g. image/jpeg) up to the first comma
    if (payload.startsWith("data:")) {
        int comma = payload.indexOf(',');
        if (comma >= 0) {
            payload = payload.substring(comma + 1);
        }
    }
    try {
        byte[] decoded = Base64.getMimeDecoder().decode(payload);
        // try-with-resources closes the stream even when write() throws
        try (FileOutputStream out = new FileOutputStream(imgPath)) {
            out.write(decoded);
        }
    } catch (IOException | IllegalArgumentException e) {
        e.printStackTrace();
    }
}
/**
 * Sends an HTTP GET to {@code url} with a desktop-Chrome User-Agent header and prints
 * {@code "doGet" + statusCode} to standard output. The response body is drained and
 * discarded.
 *
 * <p>Both the client and the response are closed via try-with-resources; the original
 * never closed the response. I/O failures (including failures while closing) are
 * printed and not rethrown.
 *
 * @param url address to request
 * @throws Exception kept for compatibility with existing callers
 */
public static void doGet(String url) throws Exception {
    HttpGet httpGet = new HttpGet(url);
    httpGet.addHeader("User-Agent",
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.112 Safari/537.36");
    try (CloseableHttpClient httpClient = HttpClients.createDefault();
            CloseableHttpResponse response = httpClient.execute(httpGet)) {
        System.out.println("doGet" + response.getStatusLine().getStatusCode());
        // The body is not used; consume it so the underlying connection can be released
        EntityUtils.consume(response.getEntity());
    } catch (IOException e) {
        // ClientProtocolException is an IOException, so one handler covers both cases
        e.printStackTrace();
    }
}
/**
 * Tells whether the pixels at ({@code x}, {@code y}) of the two images are close:
 * returns {@code false} as soon as the red, green or blue channel differs by more
 * than 80, {@code true} otherwise.
 */
private static boolean isSimilar(BufferedImage image1, BufferedImage image2, int x, int y) {
    final int[] first = getRGB(image1.getRGB(x, y));
    final int[] second = getRGB(image2.getRGB(x, y));
    boolean similar = true;
    int channel = 0;
    // Stop at the first channel whose difference exceeds the threshold
    while (similar && channel < 3) {
        similar = Math.abs(first[channel] - second[channel]) <= 80;
        channel++;
    }
    return similar;
}
/**
 * Splits a packed pixel value into its three low bytes.
 *
 * @param pixel packed pixel value; bits above the lowest 24 are ignored
 * @return array {@code {bits 16-23, bits 8-15, bits 0-7}}, each in 0..255
 */
private static int[] getRGB(int pixel) {
    return new int[] {
        (pixel >> 16) & 0xff,
        (pixel >> 8) & 0xff,
        pixel & 0xff
    };
}
/**
 * Finds the x coordinate of the first column (scanning left to right, each column
 * top to bottom) that contains a pixel for which {@code isSimilar} reports a
 * mismatch between the two images.
 *
 * <p>The scan covers at most 260 columns and 116 rows, as before, but is now also
 * clamped to the actual size of both images, so images smaller than that window no
 * longer cause an out-of-bounds exception.
 *
 * @param image1 first image
 * @param image2 second image
 * @return x of the first differing column, or 0 when no difference is found
 */
private static int getDiffLocation(BufferedImage image1, BufferedImage image2) {
    int maxX = Math.min(260, Math.min(image1.getWidth(), image2.getWidth()));
    int maxY = Math.min(116, Math.min(image1.getHeight(), image2.getHeight()));
    for (int x = 0; x < maxX; x++) {
        for (int y = 0; y < maxY; y++) {
            if (!isSimilar(image1, image2, x, y)) {
                return x;
            }
        }
    }
    return 0;
}
/**
 * Drives Chrome through the Geetest demo page and attempts the slider captcha:
 * opens the page, clicks through to the slider demo, exports the two captcha canvases
 * to {@code E:\a1.png} and {@code E:\a2.png}, derives the drag distance from the first
 * differing column, then drags the slider along the offsets returned by
 * {@code getTrackList3} and releases it.
 *
 * <p>Compared with the original, the browser is now always shut down in a
 * {@code finally} block and the image input streams are closed after reading.
 *
 * @throws Exception on interruption, I/O failure or any WebDriver error
 */
public static void run1() throws Exception {
    // Path to the chromedriver executable
    System.setProperty("webdriver.chrome.driver", "E:\\demo\\crawler\\chromedriver.exe");
    ChromeOptions options = new ChromeOptions();
    // Path to the Chrome browser binary
    options.setBinary("C:\\Program Files (x86)\\Google\\Chrome\\Application\\chrome.exe");
    WebDriver driver = new ChromeDriver(options);
    try {
        driver.get("http://www.geetest.com/type/");
        // Crude fixed delays between the navigation clicks
        Thread.sleep(1000);
        driver.findElement(By.xpath("//div[@class='products-content']/ul/li[3]")).click();
        Thread.sleep(1000);
        driver.findElement(By.xpath("//div[@class='products-content']/ul/li[2]")).click();
        Thread.sleep(1000);
        driver.findElement(By.xpath("//div[@class='geetest_radar_btn']/div[@class='geetest_radar_tip']/span")).click();
        System.out.println(driver.findElement(By.xpath("//div[@class='geetest_radar_tip']/span")).getText());
        Thread.sleep(1000);

        // Run JS to export each captcha canvas as a Base64 data URL and save it to disk
        JavascriptExecutor js = (JavascriptExecutor) driver;
        decodeBase64ToImage(
                canvasDataUrl(js, "geetest_canvas_fullbg geetest_fade geetest_absolute"), "E:\\a1.png");
        decodeBase64ToImage(
                canvasDataUrl(js, "geetest_canvas_bg geetest_absolute"), "E:\\a2.png");

        // Work out the drag distance from the first column where the two images differ
        Random random = new Random();
        BufferedImage image1;
        BufferedImage image2;
        try (FileInputStream in1 = new FileInputStream("E:/a1.png");
                FileInputStream in2 = new FileInputStream("E:/a2.png")) {
            image1 = ImageIO.read(in1);
            image2 = ImageIO.read(in2);
        }
        int ranAddLoc = -2;
        int loc = (getDiffLocation(image1, image2) - 5) + ranAddLoc;
        System.out.println(loc);

        // Per-step horizontal offsets for the drag
        List<Integer> trackList = getTrackList3(loc);
        Thread.sleep(200);

        // Locate the slider knob, then press and hold it
        WebElement slider = driver.findElement(By.xpath("//div[@class='geetest_slider_button']"));
        Actions actions = new Actions(driver);
        actions.clickAndHold(slider).perform();
        System.out.println(slider.getLocation().toString());
        Thread.sleep(500 + random.nextInt(500));
        System.out.println(slider.getLocation().toString());

        // Queue one move per track step; the last fifth of the steps gets longer pauses.
        // NOTE(review): clickAndHold(slider) is queued again on every step, exactly as in
        // the original; confirm this is intended rather than a plain moveByOffset.
        for (int i = 0; i < trackList.size(); i++) {
            actions.clickAndHold(slider).moveByOffset(trackList.get(i), 0);
            if (i < trackList.size() * 4 / 5) {
                actions.pause(random.nextInt(20) + 100);
            } else {
                actions.pause(random.nextInt(20) + 200);
            }
        }
        System.out.println("釋放前" + slider.getLocation().toString());
        actions.release(slider).build().perform();
        System.out.println("釋放後" + slider.getLocation().toString());
    } finally {
        // Always shut the browser down so no Chrome / chromedriver process is left running
        driver.quit();
    }
}

/** Runs the same script the original built inline: returns toDataURL() of the first element with the given class names. */
private static String canvasDataUrl(JavascriptExecutor js, String classNames) {
    String script = "var oCanvas =document.getElementsByClassName('" + classNames + "')[0];"
            + "return oCanvas.toDataURL();";
    return (String) js.executeScript(script);
}
/**
 * Splits a total distance into five consecutive steps using integer arithmetic:
 * 4/15, 2/15, 4/15 and 5/24 of {@code loc}, followed by whatever remains so that
 * the five steps always add up to {@code loc}.
 *
 * @param loc total distance to cover
 * @return a new mutable list holding the five step sizes in order
 */
public static List<Integer> getTrackList(int loc) {
    final int fourFifteenths = loc * 4 / 15;
    final int twoFifteenths = loc * 2 / 15;
    final int fiveTwentyFourths = loc * 5 / 24;
    final int covered = fourFifteenths + twoFifteenths + fourFifteenths + fiveTwentyFourths;

    List<Integer> steps = new ArrayList<>(5);
    steps.add(fourFifteenths);
    steps.add(twoFifteenths);
    steps.add(fourFifteenths);
    steps.add(fiveTwentyFourths);
    // The last step absorbs the rounding loss of the integer divisions above
    steps.add(loc - covered);
    return steps;
}
//獲取軌跡2
public void getTrackList2(