Jsoup抓取圖片
阿新 • • 發佈:2018-12-22
楔子
jsoup抓取圖片,其實就是分析頁面圖片地址,然後下載圖片
demo
import java.io.BufferedInputStream;
import java.io.File;
import java.io.IOException;
import org.apache.commons.io.FileUtils;
import org.apache.commons.io.FilenameUtils;
import org.jsoup.Connection;
import org.jsoup.Jsoup;
import org.jsoup.Connection. Response;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
/**
 * Small jsoup-based crawler: walks the first nine listing pages under
 * {@link #mainUrl}, follows every album link found there, and saves each
 * album's images below {@code c:/piccc/page_N/<album title>/}.
 *
 * <p>Failures are handled per album: if one album cannot be fetched or its
 * markup is not what the crawler expects, that album is skipped and the
 * remaining albums on the page are still processed.
 *
 * @author grq
 */
public class DownPic {
    static String mainUrl = "https://www.meitulu.com/t/toutiaonvshen/";

    // Not referenced anywhere in this class (downloads go to SAVE_ROOT below);
    // left in place so any outside code touching the field keeps compiling.
    File saveFile = new File("c://picc");

    /** Root directory that downloaded images are written under. */
    private static final String SAVE_ROOT = "c:/piccc/";

    /** Number of images assumed per album detail page when computing the page count. */
    private static final int PICS_PER_PAGE = 4;

    /** Suffix of album page URLs; paging inserts {@code _N} before it. */
    private static final String HTML_SUFFIX = ".html";

    /** Label that precedes the picture count in the listing text. */
    private static final String COUNT_PREFIX = "圖片: ";

    /**
     * Crawls listing pages 1 through 9. Page 1 is {@link #mainUrl} itself,
     * later pages are {@code mainUrl + N + ".html"}.
     *
     * @param args unused
     * @throws IOException declared for compatibility; listing and album
     *                     I/O errors are reported and skipped inside
     *                     {@link #parseMinPage}
     */
    public static void main(String[] args) throws IOException {
        for (int page = 1; page < 10; page++) {
            String pageUrl = (page == 1) ? mainUrl : mainUrl + page + HTML_SUFFIX;
            parseMinPage(pageUrl, "page_" + page);
        }
    }

    /**
     * Fetches one listing page and downloads every album it links to.
     *
     * @param url     listing page URL
     * @param pageNum label for this page, used as the first directory level
     *                of the save path
     */
    private static void parseMinPage(String url, String pageNum) {
        Elements albums;
        try {
            // Select the <li> entries directly instead of serialising the
            // matched elements and re-parsing the resulting HTML.
            albums = Jsoup.connect(url).get().getElementsByClass("img").select("li");
        } catch (IOException e) {
            System.out.println("主頁連線超時");
            e.printStackTrace();
            return;
        }

        for (Element album : albums) {
            // Expected shape: first child is the link wrapping the cover
            // image, second child carries the picture-count text.
            if (album.children().size() < 2) {
                continue;
            }
            Elements linkTree = album.child(0).getAllElements();
            if (linkTree.size() < 2) {
                continue;
            }
            String picURL = linkTree.get(0).attr("href");
            String attr = linkTree.get(1).attr("alt");
            if (picURL.isEmpty()) {
                continue;
            }

            int picCount = parsePicCount(album.child(1).text());
            if (picCount < 0) {
                System.out.println("skip album, picture count not found: " + picURL);
                continue;
            }

            try {
                downDetail(picURL, pageNum + "/" + toDirectoryName(attr), String.valueOf(picCount));
                System.out.println("down pic is " + attr + " 地址是:" + picURL);
            } catch (IOException e) {
                // One broken album must not abort the rest of the page.
                System.out.println("album download failed: " + picURL);
                e.printStackTrace();
            }
        }
    }

    /**
     * Extracts the picture count from the listing text. Scanning starts after
     * the label prefix and takes the first run of digits, so counts of any
     * length are read in full.
     *
     * @param text text of the element that carries the count
     * @return the count, or {@code -1} when no parsable number is present
     */
    private static int parsePicCount(String text) {
        int start = Math.min(COUNT_PREFIX.length(), text.length());
        while (start < text.length() && !Character.isDigit(text.charAt(start))) {
            start++;
        }
        int end = start;
        while (end < text.length() && Character.isDigit(text.charAt(end))) {
            end++;
        }
        if (start == end) {
            return -1;
        }
        try {
            return Integer.parseInt(text.substring(start, end));
        } catch (NumberFormatException e) {
            return -1;
        }
    }

    /**
     * Turns an album title into a single safe directory name: spaces are
     * removed and characters that are path separators or not allowed in
     * Windows file names are replaced with {@code _}.
     */
    private static String toDirectoryName(String title) {
        return title.replaceAll(" ", "").replaceAll("[\\\\/:*?\"<>|]", "_");
    }

    /**
     * Downloads every image of one album by walking its detail pages
     * ({@code base.html}, {@code base_2.html}, ...).
     *
     * @param picURL   URL of the album's first detail page
     * @param title    directory (relative to the save root) for the images
     * @param picCount total number of pictures in the album, as decimal text
     * @throws IOException if a detail page or an image cannot be fetched or saved
     */
    private static void downDetail(String picURL, String title, String picCount) throws IOException {
        // Only strip the suffix when it is actually there.
        String baseUrl = picURL.endsWith(HTML_SUFFIX)
                ? picURL.substring(0, picURL.length() - HTML_SUFFIX.length())
                : picURL;
        // Ceiling division, computed once instead of on every iteration.
        int pageCount = (Integer.parseInt(picCount.trim()) + PICS_PER_PAGE - 1) / PICS_PER_PAGE;

        for (int page = 1; page <= pageCount; page++) {
            String pageUrl = (page == 1)
                    ? baseUrl + HTML_SUFFIX
                    : baseUrl + "_" + page + HTML_SUFFIX;
            Elements containers = Jsoup.connect(pageUrl).get().getElementsByClass("content");
            if (containers.isEmpty()) {
                continue;
            }
            for (Element img : containers.get(0).getElementsByTag("img")) {
                String src = img.attr("src");
                if (!src.isEmpty()) {
                    downPic(src, title);
                }
            }
        }
    }

    /**
     * Fetches a single image and writes it to
     * {@code c:/piccc/<title>/<file name from the URL>}.
     *
     * @param picUrl image URL
     * @param title  directory (relative to the save root) for the image
     * @throws IOException if the request or the file write fails
     */
    private static void downPic(String picUrl, String title) throws IOException {
        Connection connect = Jsoup.connect(picUrl);
        // The request is sent with the site's own address as referrer.
        Response response = connect.referrer("https://www.meitulu.com/").ignoreContentType(true).execute();
        String name = FilenameUtils.getName(picUrl);
        try (BufferedInputStream body = response.bodyStream()) {
            FileUtils.copyInputStreamToFile(body, new File(SAVE_ROOT + title, name));
        }
    }
}
pom
<!-- Apache Commons IO: provides the FileUtils and FilenameUtils classes imported by the demo. -->
<dependency>
<groupId>commons-io</groupId>
<artifactId>commons-io</artifactId>
<version>2.4</version>
</dependency>
<!-- jsoup: provides the org.jsoup classes imported by the demo.
     NOTE: ${jsoup.version} is a Maven property that is not defined in this snippet;
     it has to be defined elsewhere (for example in the POM's <properties> section). -->
<dependency>
<groupId>org.jsoup</groupId>
<artifactId>jsoup</artifactId>
<version>${jsoup.version}</version>
</dependency>