java+chromeDriver實現微博爬蟲
阿新 • • 發佈:2019-02-16
首先在 Maven 的 pom.xml 中匯入 Selenium 依賴：
<dependencies>
<!-- Selenium Java library (selenium-java 3.4.0); supplies the org.openqa.selenium classes imported by the crawler code. -->
<dependency>
<groupId>org.seleniumhq.selenium</groupId>
<artifactId>selenium-java</artifactId>
<version>3.4.0</version>
</dependency>
</dependencies>
- 1
- 2
- 3
- 4
- 5
- 6
- 7
然後下載 chromeDriver，並將執行檔放在專案的 src/chromedriver 路徑下（程式碼中寫死了這個路徑；本人是在 Linux 下執行測試的）
直接上程式碼:
import org.openqa.selenium.By; import org.openqa.selenium.WebDriver; import org.openqa.selenium.WebElement; import org.openqa.selenium.chrome.ChromeDriverService; import org.openqa.selenium.remote.DesiredCapabilities; import org.openqa.selenium.remote.RemoteWebDriver; import java.io.File; import java.io.IOException; import java.time.LocalDate; import java.util.List; /** * @author yinren */ public class Login { private static ChromeDriverService service; private static WebDriver webDriver; /** * 建立一個瀏覽器例項 * @return webDriver */ public WebDriver getChromeDriver(){ System.setProperty("webdriver.chrome.driver","src/chromedriver"); //建立一個 ChromeDriver 介面 service = new ChromeDriverService.Builder().usingDriverExecutable(new File("src/chromedriver")).usingAnyFreePort().build(); try { service.start(); } catch (IOException e) { e.printStackTrace(); System.out.println("ChromeDriverService啟動異常"); } //建立一個 chrome 瀏覽器例項 return new RemoteWebDriver(service.getUrl(), DesiredCapabilities.chrome()); } /** * 模擬新浪微博登入 * @param name 使用者名稱 * @param password 密碼 */ public void login(String name,String password){ webDriver = getChromeDriver(); webDriver.get("http://login.sina.com.cn/"); WebElement elementName = webDriver.findElement(By.name("username")); elementName.sendKeys(name); WebElement elementPassword = webDriver.findElement(By.name("password")); elementPassword.sendKeys(password); WebElement elementClick = webDriver.findElement(By.xpath("//*[@id=\"vForm\"]/div[2]/div/ul/li[7]/div[1]/input")); elementClick.click(); } /** * 進行爬取 * @param key 用於獲取正確的微博連結:http://s.weibo.com/weibo/%25E9%2598%259A%25E6%25B8%2585%25E5%25AD%2590%25E5%259B%259E%25E5%25BA%2594%25E5%2588%2586%25E6%2589%258B%25E4%25BC%25A0%25E9%2597%25BB */ public void search(String key){ webDriver.get("http://s.weibo.com/"); WebElement elementKey = webDriver.findElement(By.className("searchInp_form")); elementKey.sendKeys(key); WebElement elementClick = webDriver.findElement(By.className("searchBtn")); elementClick.click(); 
//搜尋特定日期的微博內容 LocalDate localDate = LocalDate.now(); String currentUrl = webDriver.getCurrentUrl().split("&")[0]; System.out.println("currentUrl: " + currentUrl); String url = currentUrl + "&typeall=1&suball=1×cope=custom:" + localDate + ":&Refer=g"; webDriver.get(url); //處理當前頁面內容 handlePage(); } //頁面處理 public void handlePage(){ while (true){ //sleep的作用是對付微博的反爬蟲機制,抓取太快可能會判定為機器人,需要輸入驗證碼 try { Thread.sleep(2000); } catch (InterruptedException e) { e.printStackTrace(); } //先判斷是否有內容 if (checkContent()){ getContent(); //判斷是否有下一頁按鈕 if (checkButton()){ //拿到下一頁按鈕 WebElement elementButton = webDriver.findElement(By.xpath("//a[@class='page next S_txt1 S_line1']")); elementButton.click(); }else { System.out.println("沒有下一頁"); break; } }else { System.out.println("內容搜尋完畢"); break; } } } /** * 檢查頁面是否還有內容 * @return */ public Boolean checkContent(){ boolean flag; try { webDriver.findElement(By.xpath("//div[@class='pl_noresult']")); flag = false; }catch (Exception e){ flag = true; } return flag; } /** * 檢查是否有下一頁 * @return */ public Boolean checkButton(){ boolean flag; try { webDriver.findElement(By.xpath("//a[@class='page next S_txt1 S_line1']")); flag = true; }catch (Exception e){ flag = false; } return flag; } public void getContent(){ List<WebElement> elementNodes = webDriver.findElements(By.xpath("//div[@class='WB_cardwrap S_bg2 clearfix']")); //在執行過程中微博數==0的情況,可能是微博反爬機制,需要輸入驗證碼 if (elementNodes == null){ String url = webDriver.getCurrentUrl(); webDriver.get(url); getContent(); return; } for (WebElement element : elementNodes){ String bz_name = element.findElement(By.xpath(".//div[@class='feed_content wbcon']/a[@class='W_texta W_fb']")).getText(); System.out.println("博主暱稱: " + bz_name); String bz_homePage = element.findElement(By.xpath(".//div[@class='feed_content wbcon']/a[@class='W_texta W_fb']")).getAttribute("href"); System.out.println("博主主頁: " + bz_homePage); String wb_approve; try { wb_approve = element.findElement(By.xpath(".//div[@class='feed_content 
wbcon']/a[@class='approve_co']")).getAttribute("title"); }catch (Exception e){ wb_approve = ""; } System.out.println("微博認證: " + wb_approve); String wb_intelligent; try { wb_intelligent = element.findElement(By.xpath(".//div[@class='feed_content wbcon']/a[@class='ico_club']")).getAttribute("title"); }catch (Exception e){ wb_intelligent = ""; } System.out.println("微博達人: " + wb_intelligent); String wb_content; try { wb_content = element.findElement(By.xpath(".//div[@class='feed_content wbcon']/p[@class='comment_txt']")).getText(); }catch (Exception e){ wb_content = ""; } System.out.println("微博內容: " + wb_content); String publishTime; try { publishTime = element.findElement(By.xpath(".//div[@class='feed_from W_textb']/a[@class='W_textb']")).getText(); }catch (Exception e){ publishTime = ""; } System.out.println("釋出時間: " + publishTime); String wb_address; try { wb_address = element.findElement(By.xpath(".//div[@class='feed_from W_textb']/a[@class='W_textb']")).getAttribute("href"); }catch (Exception e){ wb_address = ""; } System.out.println("微博地址: " + wb_address); String wb_source; try { wb_source = element.findElement(By.xpath(".//div[@class='feed_from W_textb']/a[@rel]")).getText(); }catch (Exception e){ wb_source = ""; } System.out.println("微博來源: " + wb_source); String transmitText; int transmitNum = 0; try { transmitText = element.findElement(By.xpath(".//a[@action-type='feed_list_forward']//em")).getText(); transmitNum = Integer.parseInt(transmitText); }catch (Exception e){ } System.out.println("轉發次數: " + transmitNum); int commentNum = 0; try { String commentText = element.findElement(By.xpath(".//a[@action-type='feed_list_comment']//em")).getText(); commentNum = Integer.parseInt(commentText); }catch (Exception e){ } System.out.println("評論次數: " + commentNum); int praiseNum = 0; try { String praiseText = element.findElement(By.xpath(".//a[@action-type='feed_list_like']//em")).getText(); praiseNum = Integer.parseInt(praiseText); }catch (Exception e){ } 
System.out.println("點贊次數: " + praiseNum); System.out.print("-----------------------------------------------------------"); System.out.println(); } } } public static void main(String[] args) { Login login = new Login(); login.login("XXX","XXX"); login.search("安吉吃火鍋"); }