使用HttpComponents抓取網頁內容
阿新 • • 發佈:2019-01-04
匯入HttpComponents的包
下載二進位制發行包(檔名帶 -bin)之後解壓,找到lib目錄,匯入httpclient、httpcore、commons-logging這三個jar包就行
或者是使用Maven,在pom.xml中加入以下依賴
<dependency>
<groupId>org.apache.httpcomponents</groupId>
<artifactId>httpclient</artifactId>
<version>4.5.3</version>
</dependency>
抓取程式碼
新建一個實體類,儲存抓取的資料
/**
 * Serializable holder for one item captured from a fetched web page.
 *
 * <p>The fields stay public so that any code reading or writing them directly keeps
 * working; the accessors below are what the crawling code in this article calls
 * ({@code setId}, {@code setContent}, {@code getContent}, {@code getUrl}).
 */
public class WebEntity implements Serializable {
    private static final long serialVersionUID = 1L;

    /** Identifier assigned to this entry. */
    public String id;
    /** Link target associated with this entry; may be {@code null} if never set. */
    public String url;
    /** Captured markup or text of this entry. */
    public String content;

    /** @return the identifier, or {@code null} if none has been set */
    public String getId() {
        return id;
    }

    /** @param id the identifier to store */
    public void setId(String id) {
        this.id = id;
    }

    /** @return the link target, or {@code null} if none has been set */
    public String getUrl() {
        return url;
    }

    /** @param url the link target to store */
    public void setUrl(String url) {
        this.url = url;
    }

    /** @return the captured content, or {@code null} if none has been set */
    public String getContent() {
        return content;
    }

    /** @param content the captured content to store */
    public void setContent(String content) {
        this.content = content;
    }
}
/**
 * Fetches the page at {@code path} with an HTTP GET and collects the anchor tags
 * found in its body.
 *
 * <p>This is best effort: if the response status is not 200, or the request or
 * parsing fails, the stack trace is printed and whatever has been collected so far
 * (normally an empty list) is returned.
 *
 * @param path URL of the page to fetch
 * @return the entities produced by {@code regxContent}; never {@code null}
 */
public static List<WebEntity> catchWebContent(String path) {
    List<WebEntity> list = new ArrayList<>();
    HttpGet httpGet = new HttpGet(path);
    // try-with-resources closes the response first and then the client, so neither
    // the connection nor the client's connection pool is leaked.
    try (CloseableHttpClient httpClient = HttpClients.createDefault();
         CloseableHttpResponse response = httpClient.execute(httpGet)) {
        // Only a 200 response is treated as success.
        if (response.getStatusLine().getStatusCode() == 200) {
            // The charset must match the fetched page's encoding, otherwise the
            // decoded text will be garbled.
            String content = EntityUtils.toString(response.getEntity(), "UTF-8");
            regxContent(content, list);
        }
    } catch (Exception e) {
        // Deliberately broad: the method reports the failure and still returns the list.
        e.printStackTrace();
    }
    return list;
}
/** Matches a whole anchor element whose body contains no nested tags. */
private static final Pattern ANCHOR_PATTERN = Pattern.compile("<a[^>]+>[^<]*</a>");

/** Matches an href attribute whose value is double-quoted, single-quoted or unquoted. */
private static final Pattern HREF_PATTERN = Pattern.compile(
        "\\shref\\s*=\\s*(?:\"([^\"]*)\"|'([^']*)'|([^\\s>]+))",
        Pattern.CASE_INSENSITIVE);

/**
 * Scans {@code content} for anchor tags and appends one {@link WebEntity} per match
 * to {@code list}.
 *
 * <p>Each entity receives a freshly generated id, the complete matched anchor markup
 * as its content, and the raw value of the anchor's {@code href} attribute as its url
 * ({@code null} when the tag has no {@code href}). The href value is stored as written
 * in the page; HTML entities are not decoded and relative links are not resolved.
 *
 * @param content page markup to scan; {@code null} is treated as "nothing to scan"
 * @param list    destination the matched entities are appended to
 */
public static void regxContent(String content, List<WebEntity> list) {
    if (content == null) {
        return;
    }
    Matcher matcher = ANCHOR_PATTERN.matcher(content);
    while (matcher.find()) {
        String anchor = matcher.group();
        WebEntity webEntity = new WebEntity();
        webEntity.setId(genUUID());
        webEntity.setContent(anchor);
        webEntity.setUrl(extractHref(anchor));
        list.add(webEntity);
    }
}

/** Returns the href attribute value of a single anchor tag, or null if it has none. */
private static String extractHref(String anchor) {
    Matcher hrefMatcher = HREF_PATTERN.matcher(anchor);
    if (!hrefMatcher.find()) {
        return null;
    }
    // Exactly one of the three alternatives (double-quoted, single-quoted, bare) captured.
    for (int group = 1; group <= hrefMatcher.groupCount(); group++) {
        String value = hrefMatcher.group(group);
        if (value != null) {
            return value;
        }
    }
    return null;
}
/**
 * Produces a random identifier: a freshly generated UUID rendered as text with
 * every hyphen removed.
 *
 * @return the hyphen-free UUID string
 */
public static String genUUID() {
    final UUID fresh = UUID.randomUUID();
    final String dashed = fresh.toString();
    return dashed.replace("-", "");
}
測試
// Fetch the Sina News home page and print every captured entry.
String url = "http://news.sina.com.cn/";
// catchWebContent is static, so it is called on the class; no HttpUtil instance is needed.
List<WebEntity> webEntities = HttpUtil.catchWebContent(url);
for (WebEntity webEntity : webEntities) {
    System.out.println(webEntity.getContent());
    System.out.println(webEntity.getUrl());
    System.out.println("===================================================");
}