Java 爬蟲 爬取html網頁解析
阿新 • • 發佈:2020-12-26
技術標籤:java
1、Spring Boot 專案,在 pom.xml 引入 jsoup 依賴
<dependency>
<groupId>org.jsoup</groupId>
<artifactId>jsoup</artifactId>
<version>1.10.2</version>
</dependency>
2、準備解析物件
Content.java
package com.asia.pojo; import lombok.AllArgsConstructor; import lombok.Data; import lombok.NoArgsConstructor; //java專案 www.fhadmin.org @Data @NoArgsConstructor @AllArgsConstructor public class Content { private String title; private String img; private String price; }
3、爬蟲工具類
HtmlParseUtil.java
package com.asia.utils;

import java.net.URL;
import java.net.URLDecoder;
import java.net.URLEncoder;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.List;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import com.asia.pojo.Content;

// Java project: www.fhadmin.org

/**
 * Fetches a JD search result page with jsoup and extracts one {@link Content}
 * per {@code <li>} found under the {@code J_goodsList} element.
 */
public class HtmlParseUtil {

    /** Base of the search URL; the encoded keyword is appended to it. */
    private static final String SEARCH_URL_PREFIX = "https://search.jd.com/Search?keyword=";

    /** Id of the element that contains the result items. */
    private static final String GOODS_LIST_ID = "J_goodsList";

    /** Timeout handed to jsoup when fetching the page, in milliseconds. */
    private static final int FETCH_TIMEOUT_MILLIS = 30000;

    /**
     * Demo entry point: searches for a fixed keyword and prints every parsed item.
     *
     * @param args unused
     * @throws Exception if the search page cannot be fetched
     */
    public static void main(String[] args) throws Exception {
        new HtmlParseUtil().parseJD("西瓜").forEach(System.out::println);
    }

    /**
     * Runs a search for {@code keywords} and parses the result page.
     *
     * @param keywords raw (un-encoded) search text; must not be {@code null}
     * @return one {@link Content} per {@code <li>} in the result container, in document
     *         order; an empty list when the page has no {@code J_goodsList} element
     * @throws Exception if the URL is malformed or the page cannot be fetched or parsed
     */
    public List<Content> parseJD(String keywords) throws Exception {
        String url = buildSearchUrl(keywords);

        // Jsoup returns a Document that mirrors the browser's Document object.
        Document document = Jsoup.parse(new URL(url), FETCH_TIMEOUT_MILLIS);

        List<Content> list = new ArrayList<>();

        // The container is missing when the response is not a normal result page;
        // report "no results" instead of failing with a NullPointerException.
        Element goodsList = document.getElementById(GOODS_LIST_ID);
        if (goodsList == null) {
            return list;
        }

        Elements items = goodsList.getElementsByTag("li");
        for (Element item : items) {
            // The image URL is read from the lazy-loading attribute of the first <img>.
            String src = item.getElementsByTag("img").eq(0).attr("data-lazy-img");
            String price = item.getElementsByClass("p-price").eq(0).text();
            String name = item.getElementsByClass("p-name").eq(0).text();
            list.add(new Content(name, src, price));
        }
        return list;
    }

    /**
     * Builds the search URL with the keyword percent-encoded as UTF-8.
     *
     * <p>The keyword has to be <em>encoded</em>, not decoded: decoding leaves non-ASCII
     * text untouched and mangles input containing {@code %} or {@code +}, and characters
     * such as {@code &} or spaces would otherwise break the query string.
     */
    private static String buildSearchUrl(String keywords) throws Exception {
        String encoded = URLEncoder.encode(keywords, StandardCharsets.UTF_8.name());
        // NOTE(review): enc=utf-8 is meant to tell the site how the keyword is encoded —
        // confirm the search endpoint honors this parameter.
        return SEARCH_URL_PREFIX + encoded + "&enc=utf-8";
    }
}