1. 程式人生 > 二、爬蟲輔助工具 selenium

二、爬蟲輔助工具 selenium

一、應用場景

  1. 頁面點選拖動,載入內容
  2. 元素拖動,驗證碼破解
  3. 執行js,資料獲取
  4. 模擬登入
  5. 引數無法獲取

二、準備

  1. 瀏覽器:chrome
  2. 驅動包:依照瀏覽器版本,到 http://chromedriver.storage.googleapis.com/index.html 下載對應的驅動
  3. Java對應webdriver 依賴的jar包

三、專案

(一)新增依賴

		<dependency>
			<groupId>org.seleniumhq.selenium</groupId>
			<artifactId>selenium-java</artifactId>
			<version>3.141.59</version>
		</dependency>
		<dependency>
			<groupId>org.seleniumhq.selenium</groupId>
			<artifactId>selenium-chrome-driver</artifactId>
			<version>3.141.59</version>
		</dependency>
		<dependency>
			<groupId>org.seleniumhq.selenium</groupId>
			<artifactId>selenium-server</artifactId>
			<version>3.141.59</version>
		</dependency>

(二)模擬百度登入

package org.pc.demo;

import org.openqa.selenium.By;
import org.
openqa.selenium.Cookie; import org.openqa.selenium.WebDriver; import org.openqa.selenium.chrome.ChromeDriver; import org.openqa.selenium.chrome.ChromeOptions; import java.util.Scanner; import java.util.Set; /** * @author 鹹魚 * @date 2018/12/21 21:00 */ public class SeleniumTest { private static Set<Cookie> cookies; public static void main(String[] args) { login(); } public static void login() { //驅動位置 System.getProperties().setProperty("webdriver.chrome.driver", "E:\\demo\\crawler\\chromedriver.exe"); ChromeOptions options = new ChromeOptions(); //瀏覽器位置 options.setBinary("C:\\Program Files (x86)\\Google\\Chrome\\Application\\chrome.exe"); WebDriver driver = new ChromeDriver(options); //開啟百度登入頁面 driver.get("https://passport.baidu.com/v2/?login&tpl=mn"); //切換使用者名稱密碼登入 driver.findElement(By.id("TANGRAM__PSP_3__footerULoginBtn")).click(); //輸入賬號 driver.findElement(By.id("TANGRAM__PSP_3__userName")).clear(); driver.findElement(By.id("TANGRAM__PSP_3__userName")).sendKeys("***"); //輸入密碼 driver.findElement(By.id("TANGRAM__PSP_3__password")).clear(); driver.findElement(By.id("TANGRAM__PSP_3__password")).sendKeys("***"); //人工輸入驗證碼 Scanner sc = new Scanner(System.in); String s = sc.nextLine(); driver.findElement(By.id("TANGRAM__PSP_3__verifyCode")).clear(); driver.findElement(By.id("TANGRAM__PSP_3__verifyCode")).sendKeys(s); driver.findElement(By.id("TANGRAM__PSP_3__submit")).click(); //獲取cookie資訊*/ cookies = driver.manage().getCookies(); for (Cookie cookie : cookies) { System.out.println(cookie.getName() + ":" + cookie.getValue()); } //後續把cookie 新增進header driver.quit(); } }

(三)用selenium模擬登入,破解滑動驗證碼

package org.pc.demo;

import java.awt.image.BufferedImage;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Base64;
import java.util.List;
import java.util.Random;
import javax.imageio.ImageIO;
import org.apache.http.HttpEntity;
import org.apache.http.client.ClientProtocolException;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;
import org.openqa.selenium.By;
import org.openqa.selenium.JavascriptExecutor;
import org.openqa.selenium.Point;
import org.openqa.selenium.WebDriver;
import org.openqa.selenium.WebElement;
import org.openqa.selenium.chrome.ChromeDriver;
import org.openqa.selenium.chrome.ChromeOptions;
import org.openqa.selenium.interactions.Actions;

// NOTE(review): JDK-internal class that is not available on newer JDKs;
// remove this import once nothing in the file references it.
import sun.misc.BASE64Decoder;

/**
 * 新版本極驗官網測試舊版滑塊演算法(失敗)
 * 有需求可對接打碼平臺
 */
public class JiYan {


    /**
     * Program entry point: delegates to {@link #run1()}.
     *
     * @param args command-line arguments (not used)
     * @throws Exception whatever {@code run1()} throws
     */
    public static void main(String[] args) throws Exception {
        run1();
    }

    /**
     * Decodes a Base64 string (optionally carrying the
     * {@code data:image/png;base64,} data-URL prefix) and writes the bytes to
     * {@code imgPath}, replacing any existing file.
     *
     * <p>I/O failures are printed to standard error and not rethrown.
     *
     * @param base64  Base64 payload, with or without the PNG data-URL prefix
     * @param imgPath path of the file to write
     * @throws IllegalArgumentException if the payload is not valid Base64
     */
    public static void decodeBase64ToImage(String base64, String imgPath) {
        String payload = base64.replace("data:image/png;base64,", "");
        // Decode before opening the file so a bad payload does not truncate an existing image.
        // The MIME decoder tolerates line breaks inside the payload.
        byte[] imageBytes = Base64.getMimeDecoder().decode(payload);
        // FileOutputStream truncates an existing file, so no explicit delete is needed;
        // try-with-resources closes the stream even when the write fails.
        try (FileOutputStream out = new FileOutputStream(new File(imgPath))) {
            out.write(imageBytes);
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /**
     * Issues an HTTP GET to {@code url} with a desktop Chrome User-Agent,
     * prints {@code "doGet"} followed by the status code, and reads the body.
     *
     * <p>The body is read as UTF-8 and discarded; the method returns nothing.
     * I/O failures are printed to standard error and not rethrown.
     *
     * @param url address to request
     * @throws Exception kept in the signature for existing callers
     */
    public static void doGet(String url) throws Exception {
        // Build the request first so an invalid URL cannot leak an already-created client.
        HttpGet httpGet = new HttpGet(url);
        httpGet.addHeader("User-Agent",
                "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.112 Safari/537.36");

        // try-with-resources closes the response and then the client, in that order.
        try (CloseableHttpClient httpClient = HttpClients.createDefault();
             CloseableHttpResponse response = httpClient.execute(httpGet)) {
            HttpEntity entity = response.getEntity();
            System.out.println("doGet" + response.getStatusLine().getStatusCode());
            if (entity != null) {
                // Read the whole body, as the original did; the text itself is not used.
                EntityUtils.toString(entity, "UTF-8");
            }
        } catch (IOException e) {
            // ClientProtocolException is an IOException, so one handler covers both.
            e.printStackTrace();
        }
    }

    // 判斷畫素是否相差過大
    /**
     * Compares the pixel at ({@code x}, {@code y}) in both images channel by channel.
     *
     * @param image1 first image
     * @param image2 second image
     * @param x      column of the pixel to compare
     * @param y      row of the pixel to compare
     * @return {@code false} as soon as the red, green or blue value differs by
     *         more than 80 between the two images, {@code true} otherwise
     */
    private static boolean isSimilar(BufferedImage image1, BufferedImage image2, int x, int y) {
        final int[] first = getRGB(image1.getRGB(x, y));
        final int[] second = getRGB(image2.getRGB(x, y));

        int channel = 0;
        while (channel < 3) {
            final int delta = first[channel] - second[channel];
            if (delta > 80 || delta < -80) {
                return false;
            }
            channel++;
        }
        return true;
    }

    // 返回RGB陣列
    /**
     * Splits a packed pixel value into its three low colour bytes.
     *
     * @param pixel packed pixel; bits 16-23, 8-15 and 0-7 are extracted
     * @return a three-element array holding those bytes in that order
     *         (red, green, blue), each in the range 0-255
     */
    private static int[] getRGB(int pixel) {
        final int red = (pixel >> 16) & 0xff;
        final int green = (pixel >> 8) & 0xff;
        final int blue = pixel & 0xff;
        return new int[] {red, green, blue};
    }

    // 計算移動距離
    /**
     * Finds the first column in which the two images differ noticeably.
     *
     * <p>Columns are scanned left to right and each column top to bottom. The
     * scan covers at most 260 columns and 116 rows, and never more than both
     * images actually contain, so smaller images no longer cause an
     * out-of-bounds error.
     *
     * @param image1 first image
     * @param image2 second image
     * @return the x coordinate of the first column holding a pixel for which
     *         {@code isSimilar} is false, or 0 when no such pixel is found
     */
    private static int getDiffLocation(BufferedImage image1, BufferedImage image2) {
        // Clamp the fixed scan window to the area both images share.
        final int width = Math.min(260, Math.min(image1.getWidth(), image2.getWidth()));
        final int height = Math.min(116, Math.min(image1.getHeight(), image2.getHeight()));
        for (int x = 0; x < width; x++) {
            for (int y = 0; y < height; y++) {
                if (!isSimilar(image1, image2, x, y)) {
                    return x;
                }
            }
        }
        return 0;
    }

    /**
     * Drives Chrome through the Geetest demo page and attempts to solve the
     * slider captcha.
     *
     * <p>Steps: open the page, click through to the slider demo, dump the two
     * captcha canvases to {@code E:\a1.png} and {@code E:\a2.png}, compute the
     * horizontal offset where they differ, then drag the slider along a
     * generated track.
     *
     * @throws Exception on any WebDriver, I/O or interruption failure
     */
    public static void run1() throws Exception {
        // Location of the chromedriver executable.
        System.getProperties().setProperty("webdriver.chrome.driver", "E:\\demo\\crawler\\chromedriver.exe");
        ChromeOptions options = new ChromeOptions();
        // Location of the Chrome browser binary.
        options.setBinary("C:\\Program Files (x86)\\Google\\Chrome\\Application\\chrome.exe");
        // NOTE(review): the driver is never quit in this method, so the browser stays
        // open after the run - confirm that this is intended.
        WebDriver driver = new ChromeDriver(options);
        driver.get("http://www.geetest.com/type/");
        // Crude fixed delays instead of explicit waits.
        Thread.sleep(1000);
        driver.findElement(By.xpath("//div[@class='products-content']/ul/li[3]")).click();
        Thread.sleep(1000);
        driver.findElement(By.xpath("//div[@class='products-content']/ul/li[2]")).click();
        Thread.sleep(1000);
        driver.findElement(By.xpath("//div[@class='geetest_radar_btn']/div[@class='geetest_radar_tip']/span")).click();
        System.out.println(driver.findElement(By.xpath("//div[@class='geetest_radar_tip']/span")).getText());
        Thread.sleep(1000);
        // Run JavaScript to export each captcha canvas as a Base64 data URL.
        JavascriptExecutor js = (JavascriptExecutor) driver;
        String jsstr = "var oCanvas =document.getElementsByClassName('geetest_canvas_fullbg geetest_fade geetest_absolute')[0];" +
                "return oCanvas.toDataURL();";
        String o = (String) js.executeScript(jsstr);
        decodeBase64ToImage(o, "E:\\a1.png");
        String jsstr1 = "var oCanvas =document.getElementsByClassName('geetest_canvas_bg geetest_absolute')[0];" +
                "return oCanvas.toDataURL();";
        String o1 = (String) js.executeScript(jsstr1);
        decodeBase64ToImage(o1, "E:\\a2.png");

        // Compute the distance to move.
        Random random = new Random();
        BufferedImage image1;
        BufferedImage image2;
        // ImageIO.read(InputStream) leaves the stream open, so close both explicitly.
        try (FileInputStream firstIn = new FileInputStream("E:/a1.png");
             FileInputStream secondIn = new FileInputStream("E:/a2.png")) {
            image1 = ImageIO.read(firstIn);
            image2 = ImageIO.read(secondIn);
        }
        // Fixed corrections applied to the detected column (-5, then -2).
        int ranAddLoc = -2;
        int loc = (getDiffLocation(image1, image2) - 5) + ranAddLoc;
        System.out.println(loc);
        // Split the distance into individual movement steps.
        List<Integer> trackList = getTrackList3(loc);
        Thread.sleep(200);
        // Locate the slider knob.
        WebElement slider = driver.findElement(By.xpath("//div[@class='geetest_slider_button']"));

        // Press and hold the slider with the mouse.
        Actions actions = new Actions(driver);
        actions.clickAndHold(slider).perform();
        System.out.println(slider.getLocation().toString());
        Thread.sleep(500+random.nextInt(500));

        System.out.println(slider.getLocation().toString());
        for (int i = 0; i < trackList.size(); i++) {
            // NOTE(review): clickAndHold is queued again on every step although the
            // button is already held - left unchanged; confirm whether it is needed.
            actions.clickAndHold(slider).moveByOffset(trackList.get(i), 0);
            // Shorter pauses for the first four fifths of the track, longer ones at the end.
            if(i<trackList.size()*4/5) {
                actions.pause(random.nextInt(20)+100);
            }else {
                actions.pause(random.nextInt(20)+200);
            }
        }

        System.out.println("釋放前"+slider.getLocation().toString());
        // The queued moves and pauses run here, followed by the release.
        actions.release(slider).build().perform();
        System.out.println("釋放後"+slider.getLocation().toString());
    }
    //獲取軌跡
    /**
     * Splits a total distance into five movement steps.
     *
     * <p>The first four steps are {@code loc*4/15}, {@code loc*2/15},
     * {@code loc*4/15} and {@code loc*5/24} (integer arithmetic); the fifth is
     * whatever remains, so the steps always add up to {@code loc}.
     *
     * @param loc total distance to cover
     * @return a mutable list holding the five steps in order
     */
    public static List<Integer> getTrackList(int loc) {
        final int[] steps = new int[5];
        steps[0] = loc * 4 / 15;
        steps[1] = loc * 2 / 15;
        steps[2] = loc * 4 / 15;
        steps[3] = loc * 5 / 24;
        // The last step absorbs the rounding loss of the integer divisions.
        steps[4] = loc - (steps[0] + steps[1] + steps[2] + steps[3]);

        List<Integer> track = new ArrayList<>();
        for (int step : steps) {
            track.add(step);
        }
        return track;
    }
    //獲取軌跡2
    public  void getTrackList2(