Jsoup入門學習一

阿新 • • 發佈：2020-12-09

1、Jsoup是一款Java 的HTML解析器，可直接解析某個URL地址、HTML文字內容。它提供了一套非常省力的API，可通過DOM，CSS以及類似於jQuery的操作方法來取出和操作資料。

2、Jsoup 的主要功能，如下所示：

　　1)、從一個URL，檔案或字串中解析HTML；
　　2)、使用DOM或CSS選擇器來查詢、取出資料；
　　3)、可操作HTML元素、屬性、文字；
　　4)、Jsoup 是基於MIT協議釋出的，可放心使用於商業專案。

3、httpClient 結合Jsoup 獲取到網頁內容進行解析，首先需要引入httpClient和Jsoup的依賴，如下所示：

 <project xmlns="http://maven.apache.org/POM/4.0.0"
     xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
     xsi:schemaLocation="http://maven.apache.org/POM/4.0.0
       http://maven.apache.org/xsd/maven-4.0.0.xsd">
     <modelVersion>4.0.0</modelVersion>

     <groupId>com.bie</groupId>
     <artifactId>jsoup</artifactId>
     <version>0.0.1-SNAPSHOT</version>
     <packaging>jar</packaging>

     <name>jsoup</name>
     <url>http://maven.apache.org</url>

     <properties>
         <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
     </properties>

     <dependencies>
         <!-- HttpClient: performs the HTTP requests -->
         <dependency>
             <groupId>org.apache.httpcomponents</groupId>
             <artifactId>httpclient</artifactId>
             <version>4.5.2</version>
         </dependency>
         <!-- jsoup: parses the downloaded HTML -->
         <dependency>
             <groupId>org.jsoup</groupId>
             <artifactId>jsoup</artifactId>
             <version>1.8.3</version>
         </dependency>
         <!-- Commons IO: the JsoupDocument example uses org.apache.commons.io.FileUtils,
              which neither httpclient nor jsoup brings in transitively -->
         <dependency>
             <groupId>commons-io</groupId>
             <artifactId>commons-io</artifactId>
             <version>2.6</version>
         </dependency>
     </dependencies>
 </project>

雖然使用Jsoup可以替代HttpClient直接發起請求解析資料，但是往往不會這樣用，因為實際的開發過程中，需要使用到多執行緒，連線池，代理等等技術，而jsoup對這些技術的支援並不是很好，所以jsoup一般僅僅作為Html解析工具使用。

 1 package com.bie.spider.jsoup;
 2 
 3 import java.io.File;
 4 import java.io.IOException;
 5 import java.net.MalformedURLException;
 6 import java.net.URL;
 7 
 8 import org.apache.commons.io.FileUtils;
 9 import org.jsoup.Jsoup;
10 import org.jsoup.nodes.Document;
11 import org.jsoup.nodes.Element;
12 
13 /**
14  * 
15  * @author
16  *
17  */
18 public class JsoupDocument {
19 
20     /**
21      * 雖然使用Jsoup可以替代HttpClient直接發起請求解析資料，但是往往不會這樣用，
22      * 因為實際的開發過程中，需要使用到多執行緒，連線池，代理等等技術，
23      * 而jsoup對這些技術的支援並不是很好， 所以jsoup一般僅僅作為Html解析工具使用
24      * 
25      * @throws MalformedURLException
26      * @throws IOException
27      */
28     public static void inputUrl() throws MalformedURLException, IOException {
29         // Jsoup可以直接輸入url，它會發起請求並獲取資料，封裝為Document物件
30         // 使用jsoup解析url
31         Document doc = Jsoup.parse(new URL("https://www.autohome.com.cn/bestauto/"), 10000);
32 
33         // 把獲取的內容輸出為檔案
34         FileUtils.writeStringToFile(new File("D:/test.html"), doc.toString(), "UTF-8");
35 
36         // 獲取title
37         Element element = doc.getElementsByTag("title").first();
38 
39         // 列印title內容
40         System.out.println(element.text());
41     }
42 
43     /**
44      * 輸入字串，Jsoup可以直接輸入字串，並封裝為Document物件
45      * 
46      * @throws MalformedURLException
47      * @throws IOException
48      */
49     public static void inputCharacter() throws MalformedURLException, IOException {
50         // 讀取檔案，獲取字串
51         String html = FileUtils.readFileToString(new File("D:/test.html"), "UTF-8");
52 
53         // 解析字串
54         Document doc = Jsoup.parse(html);
55 
56         // 獲取title標籤
57         Element element = doc.getElementsByTag("title").first();
58 
59         // 列印title內容
60         System.out.println(element.text());
61     }
62 
63     /**
64      * 輸入檔案，Jsoup可以直接輸入檔案，並封裝為Document物件
65      * 
66      * @throws MalformedURLException
67      * @throws IOException
68      */
69     public static void inputFile() throws MalformedURLException, IOException {
70         // 使用jsoup解析檔案
71         Document doc = Jsoup.parse(new File("D:/test.html"), "UTF-8");
72 
73         // 獲取title標籤
74         Element element = doc.getElementsByTag("title").first();
75 
76         // 列印title內容
77         System.out.println(element.text());
78     }
79 
80     public static void main(String[] args) throws MalformedURLException, IOException {
81         // 輸入url
82         inputUrl();
83         // 輸入字串
84         inputCharacter();
85         // 輸入檔案
86         inputFile();
87     }
88 
89 }

httpClient 結合Jsoup 獲取到網頁內容進行解析，具體程式碼，如下所示：

 1 package com.bie.jsoup;
 2 
 3 import java.io.IOException;
 4 
 5 import org.apache.http.Header;
 6 import org.apache.http.HttpEntity;
 7 import org.apache.http.ParseException;
 8 import org.apache.http.StatusLine;
 9 import org.apache.http.client.ClientProtocolException;
10 import org.apache.http.client.methods.CloseableHttpResponse;
11 import org.apache.http.client.methods.HttpGet;
12 import org.apache.http.impl.client.CloseableHttpClient;
13 import org.apache.http.impl.client.HttpClients;
14 import org.apache.http.util.EntityUtils;
15 import org.jsoup.Jsoup;
16 import org.jsoup.nodes.Document;
17 import org.jsoup.nodes.Element;
18 import org.jsoup.select.Elements;
19 
20 public class HttpClientToJsoup {
21 
22     // 設定請求頭訊息 User-Agent 模擬瀏覽器
23     private static String UserAgent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:83.0) Gecko/20100101 Firefox/83.0";
24 
25     /**
26      * HttpGet請求操作
27      */
28     public static void httpGetRequest() {
29         // 第一步，獲取到httpClient客戶端例項，獲取到一個可關閉的httpClient客戶端例項。
30         CloseableHttpClient httpClient = HttpClients.createDefault();
31         // 第二步，建立HttpGet或者HttpPost例項。
32         String uri = "https://www.cnblogs.com/biehongli";
33         HttpGet httpGet = new HttpGet(uri);
34         // 設定請求頭訊息 User-Agent 模擬瀏覽器
35         httpGet.setHeader("User-Agent", UserAgent);
36 
37         // 定義一個可響應的例項物件。
38         CloseableHttpResponse response = null;
39         try {
40             // 第三步，釋出一個請求，使用httpClient例項傳送一個http協議的Get請求。
41             response = httpClient.execute(httpGet);
42             // 獲取到響應的狀態
43             StatusLine statusLine = response.getStatusLine();
44             System.out.println("響應狀態： " + statusLine.toString() + ", 響應碼： " + statusLine.getStatusCode());
45         } catch (ClientProtocolException e) {
46             // http協議異常
47             e.printStackTrace();
48         } catch (IOException e) {
49             // io異常
50             e.printStackTrace();
51         }
52         // 第四步，獲取到返回的實體物件
53         HttpEntity entity = response.getEntity();
54         // 獲取響應內容型別 Content-Type，獲取到響應型別，從而過濾一些不想要的東西
55         Header contentType = entity.getContentType();
56         // 列印響應資訊
57         System.out.println("name : " + contentType.getName() + " , value: " + contentType.getValue());
58 
59         // 將返回結果轉換為字串進行檢視(網頁內容)，引數一是請求返回的entity，引數二是字符集編碼
60         String result = null;
61         try {
62             result = EntityUtils.toString(entity, "UTF-8");
63         } catch (ParseException e) {
64             e.printStackTrace();
65         } catch (IOException e) {
66             e.printStackTrace();
67         }
68         // 列印請求返回的結果
69         // System.out.println(result.toString());
70 
71         // 講獲取到網頁轉換成為jsoup進行處理，返回結果為Document文件
72         Document document = Jsoup.parse(result);
73         Elements elements = document.getElementsByTag("title");
74         Element element = elements.get(0);
75         System.out.println("獲取到的標題內容： " + element.text().toString());
76 
77         // 獲取到自己的部落格題目名稱內容
78         Elements elementsByClass = document.getElementsByClass("postTitle2");
79         for (Element e : elementsByClass) {
80             System.out.println("獲取到的標題內容是： " + e.text());
81         }
82 
83         // 第五步，關閉流，釋放資源
84         try {
85             response.close();
86             httpClient.close();
87         } catch (IOException e) {
88             e.printStackTrace();
89         }
90     }
91 
92     public static void main(String[] args) {
93         HttpClientToJsoup.httpGetRequest();
94     }
95 
96 }

4、Jsoup的主要作用是，用HttpClient獲取到網頁後，具體的網頁提取需要的資訊的時候，就用到Jsoup，Jsoup可以使用強大的類似Jquery，css選擇器，來獲取需要的資料；

Jsoup官方地址：https://jsoup.org/

Jsoup最新下載：https://jsoup.org/download

Jsoup學習文件：https://jsoup.org/cookbook/introduction/parsing-a-document

5、Jsoup查詢DOM元素的主要方法，如下所示：

　　1)、getElementById(String id) 根據 id 來查詢 DOM。
　　2)、getElementsByTag(String tagName) 根據 tag 名稱來查詢 DOM。
　　3)、getElementsByClass(String className) 根據樣式名稱來查詢 DOM。
　　4)、getElementsByAttribute(String key) 根據屬性名，標籤的屬性元素來查詢 DOM。
　　5)、getElementsByAttributeValue(String key,String value) 根據屬性名和屬性值來查詢 DOM。

如果滿足不了自己的需求，可以使用Jsoup 的選擇器語法查詢 DOM 元素，如下所示：

  1 package com.bie.jsoup;
  2 
  3 import java.io.IOException;
  4 
  5 import org.apache.http.Header;
  6 import org.apache.http.HttpEntity;
  7 import org.apache.http.ParseException;
  8 import org.apache.http.StatusLine;
  9 import org.apache.http.client.ClientProtocolException;
 10 import org.apache.http.client.methods.CloseableHttpResponse;
 11 import org.apache.http.client.methods.HttpGet;
 12 import org.apache.http.impl.client.CloseableHttpClient;
 13 import org.apache.http.impl.client.HttpClients;
 14 import org.apache.http.util.EntityUtils;
 15 import org.jsoup.Jsoup;
 16 import org.jsoup.nodes.Document;
 17 import org.jsoup.nodes.Element;
 18 import org.jsoup.select.Elements;
 19 
 20 public class HttpClientToJsoup {
 21 
 22     // 設定請求頭訊息 User-Agent 模擬瀏覽器
 23     private static String UserAgent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:83.0) Gecko/20100101 Firefox/83.0";
 24 
 25     /**
 26      * HttpGet請求操作
 27      */
 28     public static void httpGetRequest() {
 29         // 第一步，獲取到httpClient客戶端例項，獲取到一個可關閉的httpClient客戶端例項。
 30         CloseableHttpClient httpClient = HttpClients.createDefault();
 31         // 第二步，建立HttpGet或者HttpPost例項。
 32         String uri = "https://www.cnblogs.com/biehongli";
 33         HttpGet httpGet = new HttpGet(uri);
 34         // 設定請求頭訊息 User-Agent 模擬瀏覽器
 35         httpGet.setHeader("User-Agent", UserAgent);
 36 
 37         // 定義一個可響應的例項物件。
 38         CloseableHttpResponse response = null;
 39         try {
 40             // 第三步，釋出一個請求，使用httpClient例項傳送一個http協議的Get請求。
 41             response = httpClient.execute(httpGet);
 42             // 獲取到響應的狀態
 43             StatusLine statusLine = response.getStatusLine();
 44             System.out.println("響應狀態： " + statusLine.toString() + ", 響應碼： " + statusLine.getStatusCode());
 45         } catch (ClientProtocolException e) {
 46             // http協議異常
 47             e.printStackTrace();
 48         } catch (IOException e) {
 49             // io異常
 50             e.printStackTrace();
 51         }
 52         // 第四步，獲取到返回的實體物件
 53         HttpEntity entity = response.getEntity();
 54         // 獲取響應內容型別 Content-Type，獲取到響應型別，從而過濾一些不想要的東西
 55         Header contentType = entity.getContentType();
 56         // 列印響應資訊
 57         System.out.println("name : " + contentType.getName() + " , value: " + contentType.getValue());
 58 
 59         // 將返回結果轉換為字串進行檢視(網頁內容)，引數一是請求返回的entity，引數二是字符集編碼
 60         String result = null;
 61         try {
 62             result = EntityUtils.toString(entity, "UTF-8");
 63         } catch (ParseException e) {
 64             e.printStackTrace();
 65         } catch (IOException e) {
 66             e.printStackTrace();
 67         }
 68         // 列印請求返回的結果
 69         // System.out.println(result.toString());
 70 
 71         // 講獲取到網頁轉換成為jsoup進行處理，返回結果為Document文件
 72         Document document = Jsoup.parse(result);
 73         Elements elements = document.getElementsByTag("title");
 74         Element element = elements.get(0);
 75         System.out.println("獲取到的標題內容： " + element.text().toString());
 76 
 77         // 通過選擇器查詢所有部落格連結DOM,獲取到自己的部落格題目名稱內容
 78         // 使用Jsoup的選擇器
 79         Elements select = document.select("#centercontent .day .postTitle .postTitle2");
 80         for (Element e : select) {
 81             System.out.println("部落格標題：" + e.text());
 82         }
 83 
 84         // 帶有href屬性的a元素
 85         Elements selectHref = document.select(".postTitle a[href]");
 86         for (Element e : selectHref) {
 87             System.out.println("部落格標題：" + e.text());
 88         }
 89 
 90         // 查詢副檔名為.png的圖片DOM節點
 91         Elements imgElements = document.select("img[src$=.png]");
 92         for (Element e : imgElements) {
 93             System.out.println(e.toString());
 94         }
 95 
 96         // 獲取tag是title的所有DOM元素
 97         Element first = document.getElementsByTag("title").first();
 98         String title = first.text(); // 返回元素的文字
 99         System.out.println("網頁標題是：" + title);
100 
101         // 第五步，關閉流，釋放資源
102         try {
103             response.close();
104             httpClient.close();
105         } catch (IOException e) {
106             e.printStackTrace();
107         }
108     }
109 
110     public static void main(String[] args) {
111         HttpClientToJsoup.httpGetRequest();
112     }
113 
114 }

6、使用選擇器語法查詢元素，Jsoup elements物件支援類似於CSS (或jquery)的選擇器語法，來實現非常強大和靈活的查詢功能。這個select 方法在Document, Element,或Elements物件中都可以使用。且是上下文相關的，因此可實現指定元素的過濾，或者鏈式選擇訪問。Select方法將返回一個Elements集合，並提供一組方法來抽取和處理結果。如下所示：

  1 package com.bie.jsoup;
  2 
  3 import java.io.IOException;
  4 
  5 import org.apache.http.Header;
  6 import org.apache.http.HttpEntity;
  7 import org.apache.http.ParseException;
  8 import org.apache.http.StatusLine;
  9 import org.apache.http.client.ClientProtocolException;
 10 import org.apache.http.client.methods.CloseableHttpResponse;
 11 import org.apache.http.client.methods.HttpGet;
 12 import org.apache.http.impl.client.CloseableHttpClient;
 13 import org.apache.http.impl.client.HttpClients;
 14 import org.apache.http.util.EntityUtils;
 15 import org.jsoup.Jsoup;
 16 import org.jsoup.nodes.Document;
 17 import org.jsoup.nodes.Element;
 18 import org.jsoup.select.Elements;
 19 
 20 public class HttpClientToJsoup {
 21 
 22     // 設定請求頭訊息 User-Agent 模擬瀏覽器
 23     private static String UserAgent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:83.0) Gecko/20100101 Firefox/83.0";
 24 
 25     /**
 26      * HttpGet請求操作
 27      */
 28     public static void httpGetRequest() {
 29         // 第一步，獲取到httpClient客戶端例項，獲取到一個可關閉的httpClient客戶端例項。
 30         CloseableHttpClient httpClient = HttpClients.createDefault();
 31         // 第二步，建立HttpGet或者HttpPost例項。
 32         String uri = "https://www.cnblogs.com/biehongli";
 33         HttpGet httpGet = new HttpGet(uri);
 34         // 設定請求頭訊息 User-Agent 模擬瀏覽器
 35         httpGet.setHeader("User-Agent", UserAgent);
 36 
 37         // 定義一個可響應的例項物件。
 38         CloseableHttpResponse response = null;
 39         try {
 40             // 第三步，釋出一個請求，使用httpClient例項傳送一個http協議的Get請求。
 41             response = httpClient.execute(httpGet);
 42             // 獲取到響應的狀態
 43             StatusLine statusLine = response.getStatusLine();
 44             System.out.println("響應狀態： " + statusLine.toString() + ", 響應碼： " + statusLine.getStatusCode());
 45         } catch (ClientProtocolException e) {
 46             // http協議異常
 47             e.printStackTrace();
 48         } catch (IOException e) {
 49             // io異常
 50             e.printStackTrace();
 51         }
 52         // 第四步，獲取到返回的實體物件
 53         HttpEntity entity = response.getEntity();
 54         // 獲取響應內容型別 Content-Type，獲取到響應型別，從而過濾一些不想要的東西
 55         Header contentType = entity.getContentType();
 56         // 列印響應資訊
 57         System.out.println("name : " + contentType.getName() + " , value: " + contentType.getValue());
 58 
 59         // 將返回結果轉換為字串進行檢視(網頁內容)，引數一是請求返回的entity，引數二是字符集編碼
 60         String result = null;
 61         try {
 62             result = EntityUtils.toString(entity, "UTF-8");
 63         } catch (ParseException e) {
 64             e.printStackTrace();
 65         } catch (IOException e) {
 66             e.printStackTrace();
 67         }
 68         // 列印請求返回的結果
 69         // System.out.println(result.toString());
 70 
 71         // 講獲取到網頁轉換成為jsoup進行處理，返回結果為Document文件
 72         Document document = Jsoup.parse(result);
 73 
 74         // Jsoup 使用selector選擇器
 75         Elements linkElements = document.select("#centercontent .day .postTitle .postTitle2"); // 通過選擇器查詢所有部落格連結DOM
 76         for (Element e : linkElements) {
 77             System.out.println("部落格標題：" + e.text());
 78             System.out.println("部落格地址：" + e.attr("href"));
 79             // System.out.println("target：" + e.attr("target"));
 80         }
 81 
 82         // Jsoup 使用selector選擇器
 83         Element linkElement = document.select(".postTitle2").first();
 84         System.out.println("純文字內容：" + linkElement.text());
 85         System.out.println("Html內容：" + linkElement.html());
 86 
 87         // 第五步，關閉流，釋放資源
 88         try {
 89             response.close();
 90             httpClient.close();
 91         } catch (IOException e) {
 92             e.printStackTrace();
 93         }
 94     }
 95 
 96     public static void main(String[] args) {
 97         HttpClientToJsoup.httpGetRequest();
 98     }
 99 
100 }

 1 1、Selector選擇器概述，如下所示：
 2 1)、tagname: 通過標籤查詢元素，比如：a。
 3 2)、ns|tag: 通過標籤在名稱空間查詢元素，比如：可以用 fb|name 語法來查詢 <fb:name> 元素。
 4 3)、#id: 通過ID查詢元素，比如：#logo。
 5 4)、.class: 通過class名稱查詢元素，比如：.masthead。
 6 5)、[attribute]: 利用屬性查詢元素，比如：[href]。
 7 6)、[attr=value]: 利用屬性值來查詢元素，比如：[width=500]。
 8 
 9 
10 // Usage examples for the basic selectors; `doc` and `str` are declared outside this snippet (presumably a parsed Document and a String — confirm)
11 // tagname: find elements by tag name, e.g. a
12 str = doc.select("a").first().text();
13 
14 // ns|tag: find elements by tag within a namespace, e.g. fb|name finds <fb:name> elements
15 str = doc.select("jsoup|li").first().text();
16 
17 // #id: find elements by id, e.g. #logo
18 str = doc.select("#auto-header-fenzhan").first().text();
19 
20 // .class: find elements by class name, e.g. .masthead
21 str = doc.select(".orangelink").first().text();
22 
23 // [attribute]: find elements that have the given attribute, e.g. [href]
24 str = doc.select("[abc]").first().text();
25 
26 // [attr=value]: find elements whose attribute has the given value, e.g. [width=500]
27 str = doc.select("[class=vlli]").first().text();
28 
29 
30 2、Selector選擇器組合使用，如下所示：
31 1)、el#id: 元素+ID，比如： div#logo。
32 2)、el.class: 元素+class，比如： div.masthead。
33 3)、el[attr]: 元素+屬性名，比如： a[href]。
34 4)、任意組合，比如：a[href].highlight。
35 5)、ancestor child: 查詢某個元素下子元素，比如：.body p 查詢"body"下的所有 p。
36 6)、parent > child: 查詢某個父元素下的直接子元素，比如：div.content > p 查詢 p。
37 7)、parent > * 查詢某個父元素下所有直接子元素。
38 
39 
40 // Usage examples for combined selectors; `doc` and `str` are declared outside this snippet (presumably a parsed Document and a String — confirm)
41 // el#id: element plus id, e.g. div#logo
42 str = doc.select("li#auto-header-fenzhan").first().text();
43 
44 // el.class: element plus class, e.g. div.masthead
45 str = doc.select("a.greylink").first().text();
46 
47 // el[attr]: element plus attribute, e.g. a[href]
48 str = doc.select("a[href]").first().attr("href");
49 
50 // Any combination, e.g. a[href].highlight
51 str = doc.select("a[href].greylink").first().attr("href");
52 
53 // ancestor child: find elements underneath another element, e.g. ".body p" finds every p under "body"
54 str = doc.select("div.mini-left a").text();
55 
56 // parent > child: find direct children of a parent, e.g. "div.content > p" finds the p elements
57 str = doc.select("div.mini-left ul > li").text();
58 
59 // parent > *: find all direct children of a parent; the loop prints each child's tag name
60 Elements elements = doc.select("div.mini-left > *");
61 for (Element ele : elements) {
62     System.out.println(ele.tagName());
63 }

Jsoup入門學習一

Jsoup入門學習一

HttpClient入門學習一

Epplus c# to excel 的入門學習(一)

【C++入門學習筆記】函式和物件！你需要這一篇文章入門C++！

記一次flink入門學習筆記

android入門學習-天氣預報app（一）

flask入門學習（一）

Mybatis-Plus入門學習筆記（一）

opencv學習(一)——影象入門

springboot學習一簡單入門

python入門學習篇十一

【SpringBoot學習一】開發入門--快速建立springboot程式

Hadoop入門學習筆記（一）

c語言入門這一篇就夠了-學習筆記(一萬字)

學習一波Vue3新特性

RabbitMQ入門（一）—— CentOS7 搭建 RabbitMQ

入門學習Linux常用必會60個命令例項詳解

mongodb資料庫入門學習筆記之下載、安裝、啟動、連線操作解析

python列表推導式入門學習解析

九、Mysql資料庫--基礎入門（一）

Jsoup入門學習一

相關推薦