1. 程式人生 > 虛擬瀏覽器(WebClient)應用簡單例子

虛擬瀏覽器(WebClient)應用簡單例子

WebClient 是 HtmlUnit 提供的類別,作用類似一個沒有介面的虛擬瀏覽器,可用來抓取網頁。它的主要特點是適合抓取動態頁面,例如由 JavaScript 動態生成的網頁(Jsoup 只解析靜態 HTML、不會執行 JavaScript,所以做不了)。

首先要引入包,主要是 htmlunit;不過它的相依套件比較零散,要執行還得一併引入一大堆相依的包(原文的相依清單在此處缺漏)。

以下是一個簡單的應用例子:

package j2seTest2;

import java.net.URL;

import com.gargoylesoftware.htmlunit.JavaScriptPage;
import com.gargoylesoftware.htmlunit.NicelyResynchronizingAjaxController;
import com.gargoylesoftware.htmlunit.Page;
import com.gargoylesoftware.htmlunit.WebClient;
import com.gargoylesoftware.htmlunit.html.HtmlDivision;
import com.gargoylesoftware.htmlunit.html.HtmlElement;
import com.gargoylesoftware.htmlunit.html.HtmlForm;
import com.gargoylesoftware.htmlunit.html.HtmlPage;

/**
 * Minimal HtmlUnit example: fetches one URL with a {@link WebClient} that has
 * JavaScript enabled and prints what came back to standard output.
 *
 * <p>What gets printed depends on the page type HtmlUnit returns:
 * <ul>
 *   <li>{@link HtmlPage}: the title and the rendered text;</li>
 *   <li>{@link JavaScriptPage}: the HTTP status code and the script source;</li>
 *   <li>any other {@link Page}: the HTTP status code and the raw response body.</li>
 * </ul>
 */
public class WebClientTest {

	/**
	 * Builds the request URL, fetches it and dumps the result.
	 *
	 * @param args command-line arguments (unused)
	 */
	public static void main(String[] args) {
		// Other pages that can be tried instead of the endpoint below:
//		final String url = "http://tv.cntv.cn/epg";
//		final String url = "http://weixin.sogou.com/gzh?openid=oIWsFt3aMWa50-g2CZwbXYUqhdpI";
		final String format = "http://weixin.sogou.com/gzhjs?cb=sogou.weixin.gzhcb&openid=%s&page=%s&t=%s";
		final String openid = "oIWsFt9udi0U5dw56-s0dPWW85pM";
		final int pageNo = 1;
		// The current time is passed as the "t" query parameter.
		final String url = String.format(format, openid, pageNo, System.currentTimeMillis());

		final WebClient client = new WebClient();
		client.getOptions().setJavaScriptEnabled(true); // run the page's JavaScript
		client.getOptions().setCssEnabled(false);
		client.setAjaxController(new NicelyResynchronizingAjaxController());
		// Script errors on the target page should not abort the fetch.
		client.getOptions().setThrowExceptionOnScriptError(false);

		try {
			final Page page = client.getPage(new URL(url));
			printPage(page);
		} catch (Exception e) {
			// Top-level boundary of a demo program: report the failure and exit.
			e.printStackTrace();
		} finally {
			client.closeAllWindows();
		}
	}

	/**
	 * Prints a summary of the fetched page, choosing the output by page type.
	 *
	 * @param page the page returned by {@link WebClient#getPage(URL)}
	 */
	private static void printPage(Page page) {
		if (page instanceof HtmlPage) {
			HtmlPage hPage = (HtmlPage) page;
			System.out.println("~~~HtmlPage");
			System.out.println(hPage.getTitleText());
			// Example of drilling into the DOM (needs java.util.List):
//			HtmlForm form = hPage.getForms().get(0);
//			HtmlDivision div = (HtmlDivision) form.getByXPath("//div[@id='zhu1']").get(1);
//			List<HtmlElement> ahtmpr = div.getHtmlElementsByTagName("a");
//			System.out.println(hPage.asXml());
			System.out.println(hPage.asText());
		} else if (page instanceof JavaScriptPage) {
			JavaScriptPage jPage = (JavaScriptPage) page;
			System.out.println("~~~JavaScriptPage");
			System.out.println("statusCode:" + jPage.getWebResponse().getStatusCode());
			System.out.println(jPage.getContent());
		} else {
			// Any other page type: previously nothing was printed at all, which
			// made an unexpected content type look like an empty result.
			System.out.println("~~~" + page.getClass().getSimpleName());
			System.out.println("statusCode:" + page.getWebResponse().getStatusCode());
			System.out.println(page.getWebResponse().getContentAsString());
		}
	}

}