虛擬瀏覽器(WebClient)應用簡單例子
阿新 • • 發佈:2019-01-29
WebClient 是 HtmlUnit 中一個類似虛擬瀏覽器的網頁抓取工具類,主要特點是適合抓取動態頁面,例如由 JavaScript 動態生成的網頁(Jsoup 似乎無法處理這類頁面)。
首先要引入依賴包,主要是 htmlunit;不過它的依賴比較分散,要執行還得另外引入一大堆相關的包,如下:
以下是一個簡單的應用例子:
package j2seTest2; import java.net.URL; import com.gargoylesoftware.htmlunit.JavaScriptPage; import com.gargoylesoftware.htmlunit.NicelyResynchronizingAjaxController; import com.gargoylesoftware.htmlunit.Page; import com.gargoylesoftware.htmlunit.WebClient; import com.gargoylesoftware.htmlunit.html.HtmlDivision; import com.gargoylesoftware.htmlunit.html.HtmlElement; import com.gargoylesoftware.htmlunit.html.HtmlForm; import com.gargoylesoftware.htmlunit.html.HtmlPage; public class WebClientTest { public static void main(String[] args) { // final String url = "http://tv.cntv.cn/epg"; // final String url = "http://weixin.sogou.com/gzh?openid=oIWsFt3aMWa50-g2CZwbXYUqhdpI"; final String format = "http://weixin.sogou.com/gzhjs?cb=sogou.weixin.gzhcb&openid=%s&page=%s&t=%s"; final String openid="oIWsFt9udi0U5dw56-s0dPWW85pM"; final String url = String.format(format, openid, "" + 1, System.currentTimeMillis()); final WebClient client = new WebClient(); client.getOptions().setJavaScriptEnabled(true);// 預設執行js client.getOptions().setCssEnabled(false); client.setAjaxController(new NicelyResynchronizingAjaxController()); client.getOptions().setThrowExceptionOnScriptError(false); try { Page page = client.getPage(new URL(url)); if (page instanceof HtmlPage) { HtmlPage hPage = (HtmlPage) page; System.out.println("~~~HtmlPage"); System.out.println(hPage.getTitleText()); // HtmlForm form = hPage.getForms().get(0); // HtmlDivision div = (HtmlDivision) form.getByXPath("//div[@id='zhu1']").get(1); // List<HtmlElement> ahtmpr = div.getHtmlElementsByTagName("a"); // System.out.println(hPage.asXml()); System.out.println(hPage.asText()); } else if (page instanceof JavaScriptPage) { JavaScriptPage jPage = (JavaScriptPage) page; System.out.println("~~~JavaScriptPage"); System.out.println("statusCode:" + jPage.getWebResponse().getStatusCode()); System.out.println(jPage.getContent()); } } catch (Exception e) { e.printStackTrace(); } finally { client.closeAllWindows(); } } }