htmlunit爬取js非同步載入後的頁面
阿新 • • 發佈:2019-02-06
直接上程式碼:
一、 index.html
頁面載入完成後，以 jQuery 非同步 POST 呼叫後臺介面，並將回傳的資料以 JSON 字串顯示在 id 為 content 的 div 中。
<!DOCTYPE html>
<html>
<head>
  <meta charset="UTF-8">
  <script type="text/javascript" src="./jquery.min.js"></script>
</head>
<body>
  <h2>Hello World!</h2>
  <!-- Empty at load time; filled in asynchronously by the script below. -->
  <div id="content"></div>
  <script type="text/javascript">
    // Once the DOM is ready, POST to the backend and render the response
    // as a JSON string inside #content.
    $(document).ready(function () {
      $.post("/evh/test/testList", {}, function (data) {
        $("#content").text(JSON.stringify(data));
      });
    });
  </script>
</body>
</html>
二、TestController.java
/test/testList介面從後臺資料庫獲取資料。
package com.everhomes.proxy.controller;
import javax.annotation.Resource;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.web.bind.annotation.ExceptionHandler;
import org.springframework.web.bind.annotation.RequestMapping;
import org.springframework.web.bind.annotation.RestController;
import com.everhomes.proxy.mapper.TestMapper;
/**
 * REST controller mapped under {@code /test}. Data access is delegated to the
 * injected {@link TestMapper}; any exception raised by a handler is logged and
 * reported back to the caller as a plain string.
 */
@RestController
@RequestMapping("/test")
public class TestController {

    private static final Logger logger = LoggerFactory.getLogger(TestController.class);

    @Resource
    private TestMapper testMapper;

    /**
     * Handles the {@code testList} mapping.
     *
     * @return whatever {@code testMapper.testList()} returns
     */
    @RequestMapping("testList")
    public Object testList() {
        Object rows = testMapper.testList();
        return rows;
    }

    /**
     * Handles any {@link Exception} thrown by this controller's handlers.
     *
     * @param e the exception that was thrown
     * @return the text {@code "error: "} followed by {@code e.toString()}
     */
    @ExceptionHandler(Exception.class)
    public Object exception(Exception e) {
        logger.error("error: ", e);
        final String description = e.toString();
        return "error: " + description;
    }
}
三、Crawler.java
package com.everhomes.generate;
import java.io.IOException;
import com.gargoylesoftware.htmlunit.BrowserVersion;
import com.gargoylesoftware.htmlunit.NicelyResynchronizingAjaxController;
import com.gargoylesoftware.htmlunit.WebClient;
import com.gargoylesoftware.htmlunit.html.HtmlPage;
/**
 * Loads a page with HtmlUnit, gives its background JavaScript time to run,
 * and saves the resulting DOM (serialized as XML) to a file.
 */
public class Crawler {

    /**
     * Fetches {@code http://localhost:8080/evh/index.html}, waits for background
     * JavaScript, and writes the rendered page to {@code cc.html}.
     *
     * @param args unused
     * @throws IOException if the page cannot be loaded
     * @throws InterruptedException kept in the signature for compatibility with existing callers
     */
    public static void main(String[] args) throws IOException, InterruptedException {
        // try-with-resources closes the client even when loading or saving throws;
        // previously close() was only reached on the success path.
        try (WebClient webClient = new WebClient(BrowserVersion.CHROME)) {
            webClient.getOptions().setJavaScriptEnabled(true);
            webClient.getOptions().setCssEnabled(false);
            webClient.getOptions().setRedirectEnabled(true);
            // Script errors on the page should not abort the crawl.
            webClient.getOptions().setThrowExceptionOnScriptError(false);
            webClient.getOptions().setTimeout(50000);

            HtmlPage rootPage = webClient.getPage("http://localhost:8080/evh/index.html");
            // Give the page's asynchronous request time to finish and update the DOM
            // before the page is serialized.
            webClient.waitForBackgroundJavaScript(10000);

            // NOTE(review): FileUtils and DIRECTORY are not defined or imported in this
            // snippet; presumably a project helper and an output-directory constant.
            // Confirm they are in scope.
            FileUtils.createFile(DIRECTORY + "cc.html", rootPage.asXml());
        }
    }
}
四、pom.xml
新增相關依賴。
<dependency>
    <groupId>commons-lang</groupId>
    <artifactId>commons-lang</artifactId>
    <version>2.6</version>
</dependency>
<!-- Keep htmlunit-core-js on the same version as htmlunit: an explicit
     declaration overrides the version htmlunit pulls in transitively, so a
     mismatch here pairs htmlunit with a JavaScript engine build it was not
     released with. -->
<dependency>
    <groupId>net.sourceforge.htmlunit</groupId>
    <artifactId>htmlunit-core-js</artifactId>
    <version>2.25</version>
</dependency>
<dependency>
    <groupId>net.sourceforge.htmlunit</groupId>
    <artifactId>htmlunit</artifactId>
    <version>2.25</version>
</dependency>