爬蟲基礎之Jsoup解析HTML
阿新 • • 發佈:2018-12-14
Jsoup的Maven座標
<!-- Maven dependency on the jsoup library: org.jsoup:jsoup, version 1.7.2. -->
<dependency>
<groupId>org.jsoup</groupId>
<artifactId>jsoup</artifactId>
<version>1.7.2</version>
</dependency>
Jsoup解析HTML得到Document的幾種方式:
import org.jsoup.Jsoup; import org.jsoup.nodes.Document; import org.jsoup.select.Elements; import java.io.File; import java.io.IOException; /** * 解析HTML的DOM資料 */ public class JsoupDom { public static void main(String[] args) throws IOException { String html = "<!DOCTYPE html>\n" + "<html lang=\"en\">\n" + "<head>\n" + " <meta charset=\"UTF-8\">\n" + " <title>Title</title>\n" + "</head>\n" + "<body>\n" + "\n" + "</body>\n" + "</html>"; //方式一:獲取Document物件 Document document = Jsoup.parse(html); System.out.println(document.title()); //方式二:獲取Document物件 Document document1 = Jsoup.connect("http://www.bingosoft.net").get(); Elements elements = document1.select(".city h3"); System.out.println(elements+",,,"+elements.text()); //方式三:獲取Document物件 // Document document2 = Jsoup.parse(new File("html_path"), "UTF-8"); //方式四:獲取Document物件 String bodyHtml = "<a href='#'>連線</a>"; Document document3 = Jsoup.parseBodyFragment(bodyHtml); System.out.println(document3.text()); } }