1. 程式人生 > >JAVA抓匯實務示範(3) by htmlunit

JAVA抓匯實務示範(3) by htmlunit

JAVA抓匯實務示範(3) by htmlunit

Coding:

(這邊我的專案是有把新的url一層一層往下一個getData丟,這裡就不多做更動)

/**
* 連結抓取資料處理 網頁連線 or 規則複雜的資料擷取
*/
@Override
public String getData(String url) throws Exception {
HTML htmlx = new HTMLX(url, false);
htmlx.setConnectTimeout(parameter.getConnectTimeout(), parameter.getRetry());
// 設定網頁連結多久中斷 ,為避免連線過久卡在網頁
// htmlx.setReadTimeout(parameter.getReadTimeout(),
// parameter.getRetry());
htmlx.connect();
htmlx.extractInputStream(parameter.getEncoding());
String source = htmlx.getCodeStringtype();
Document doc = Jsoup.parse(source);
Document table = Jsoup.parseBodyFragment(doc.getElementById(“content-main”).toString());
String a = table.select(“h4 a”).first().attr(“href”);
String source1 = a.toString();
return source1;
}
public String getData2(String url) throws Exception {
HTML htmlx = new HTMLX(url, false);
htmlx.setConnectTimeout(parameter.getConnectTimeout(), parameter.getRetry());
// 設定網頁連結多久中斷 ,為避免連線過久卡在網頁
// htmlx.setReadTimeout(parameter.getReadTimeout(),
// parameter.getRetry());
htmlx.connect();
htmlx.extractInputStream(parameter.getEncoding());
String source = htmlx.getCodeStringtype();
Document doc = Jsoup.parse(source);
Document table = Jsoup.parseBodyFragment(doc.getElementById(“content”).toString());
String source1 = table.toString();
String source2 = “”;
source2 = url.replace(“index.aspx”, getUrl(source1));
return source2;
}
public String getData3(String url) throws Exception {
HTML htmlx = new HTMLX(url, false);
htmlx.setConnectTimeout(parameter.getConnectTimeout(), parameter.getRetry());
// 設定網頁連結多久中斷 ,為避免連線過久卡在網頁
// htmlx.setReadTimeout(parameter.getReadTimeout(),
// parameter.getRetry());
htmlx.connect();
htmlx.extractInputStream(parameter.getEncoding());
String source = htmlx.getCodeStringtype();
Document doc = Jsoup.parse(source);
Document table = Jsoup.parseBodyFragment(doc.getElementsByClass(“Table740”).toString());
String source1 = table.toString();
String source2 = “”;
source2 = url.replace(“weoselgr.aspx”, getUrl1(source1));
return source2;
}

上面這些Jsoup段的說明暫略

/**
* 模擬瀏覽器
*/
public void BuildBrowser() {
// 關閉日誌輸出(紅色內部執行錯誤)
LogFactory.getFactory().setAttribute(“org.apache.commons.logging.Log”,
“org.apache.commons.logging.impl.NoOpLog”);
webClient = new WebClient(BrowserVersion.FIREFOX_24);
// JavaScript元件
webClient.getOptions().setJavaScriptEnabled(false);
// CSS元件
webClient.getOptions().setCssEnabled(false);
// AJAX元件
webClient.setAjaxController(new NicelyResynchronizingAjaxController());
// 網頁TIMEOUT
webClient.getOptions().setTimeout(80000);
// 跳轉
webClient.getOptions().setRedirectEnabled(true);
// IE元件
webClient.getOptions().setActiveXNative(true);
// 是否丟擲頁面javascript錯誤
webClient.getOptions().setThrowExceptionOnScriptError(false);
// 是否丟擲response的錯誤
webClient.getOptions().setThrowExceptionOnFailingStatusCode(false);
// SSL憑證更新
webClient.getOptions().setUseInsecureSSL(true);
// 等待JavaScript執行完後的延遲時間
webClient.waitForBackgroundJavaScript(10000);
// 等待JavaScript執行前的延遲時間
webClient.waitForBackgroundJavaScriptStartingBefore(10000);
webClient.getCookieManager().setCookiesEnabled(true);
webClient.getCurrentWindow().setInnerHeight(Integer.MAX_VALUE);
}

開始預備做htmlunit的行為

// HtmlUnit client used by getData4..getData7; assigned in BuildBrowser().
private WebClient webClient;
public String getData4(String url) throws Exception {
HtmlPage page = webClient.getPage(url);
Page page1 = null;
List<DomElement> atags = page.getElementsByTagName(“input”);
for (DomElement atag : atags) {
if (atag.getAttribute(“id”).equals(“bc”)) {
page1 = ((HtmlImageInput) atag).click();
}
}
String source = page1.getUrl().toString();
return source;
}
public String getData5(String url) throws Exception {
HtmlPage page = webClient.getPage(url);
Page page1 = null;
List<DomElement> atags = page.getElementsByTagName(“input”);
for (DomElement atag : atags) {
// System.out.println(atag);
if (atag.getAttribute(“value”).equals(“NGDPDPC”)) {
page1 = ((HtmlCheckBoxInput) atag).setChecked(true);
}
if (atag.getAttribute(“value”).equals(“NGSD_NGDP”)) {
page1 = ((HtmlCheckBoxInput) atag).setChecked(true);
}
if (atag.getAttribute(“value”).equals(“LUR”)) {
page1 = ((HtmlCheckBoxInput) atag).setChecked(true);
}
if (atag.getAttribute(“value”).equals(“LE”)) {
page1 = ((HtmlCheckBoxInput) atag).setChecked(true);
}
if (atag.getAttribute(“value”).equals(“LP”)) {
page1 = ((HtmlCheckBoxInput) atag).setChecked(true);
}
}
List<DomElement> atags1 = page.getElementsByTagName(“input”);
for (DomElement atag : atags1) {
if (atag.getAttribute(“id”).equals(“bc”)) {
page1 = ((HtmlImageInput) atag).click();
}
}
String source = page1.getUrl().toString();
return source;
}
public String getData6(String url) throws Exception {
HtmlPage page = webClient.getPage(url);
HtmlSelect select = page.getElementByName(“sy”);
List<HtmlOption> options = select.getOptions();
for (HtmlOption Option : options) {
if (Option.getAttribute(“value”).equals(“1980”))
page = select.setSelectedAttribute(Option, true);
}
HtmlSelect select2 = page.getElementByName(“ey”);
List<HtmlOption> options2 = select2.getOptions();
for (HtmlOption Option : options2) {
if (Option.getAttribute(“value”).equals(date.substring(0, 4))){
page = select2.setSelectedAttribute(Option, true);
}
}
Page page1 = null;
List<DomElement> atags = page.getElementsByTagName(“input”);
for (DomElement atag : atags) {
if (atag.getAttribute(“id”).equals(“pr”)) {
page1 = ((HtmlImageInput) atag).click();
}
}
String source = page1.getUrl().toString();
return source;
}
public String getData7(String url) throws Exception {
HtmlPage page = webClient.getPage(url);
List<DomElement> atags = page.getElementsByTagName(“a”);
Page downloadPage = null;
for (DomElement atag : atags) {
if(atag.asText().contains(“Your WEO Report”))
downloadPage = ((HtmlAnchor) atag).click();
}
download(downloadPage,”WEO_Data”);
return “”;
}
/**
* 分析資料 存入暫存 網頁資料剖析 , 外部檔欄位
*/
private String getUrl(String source) {
Document doc = Jsoup.parse(source);
String url = “”;
Elements linkAll = doc.select(“a[href]”);
for (Element link : linkAll) {
String value = link.attr(“href”);
if (link.toString().contains(“By Countries (country-level data)”)) {
url = value;
}
}
return url;
}
private String getUrl1(String source) {
Document doc = Jsoup.parse(source);
String url = “”;
Elements linkAll = doc.select(“a[href]”);
for (Element link : linkAll) {
String value = link.attr(“href”);
if (link.toString().contains(“All countries”)) {
url = value;
}
}
return url;
}
// 下載xls 用 輸入檔名 丟入temp
private void download(Page page,String filename) throws IOException{
InputStream is = page.getWebResponse().getContentAsStream();
OutputStream fos = new FileOutputStream(System.getProperty(“user.dir”)+”/data/”+date+filename+”.xls”);
byte[] buffer=new byte[1024*30];
int len=-1;
while((len=is.read(buffer))>0){
fos.write(buffer, 0, len);
}
fos.flush();
fos.close();
}
20180907只把source code po上來 沒有整理