Jsoup初接觸-發一個Jsoup抓取圖片的程式
阿新 • • 發佈:2019-02-19
主要有兩個執行緒:圖片url抓取執行緒、圖片下載儲存執行緒。
圖片下載儲存採用執行緒池處理,主要利用java的ThreadPoolExecutor實現。
url抓取執行緒:
package sys.gifspider;

import java.io.IOException;
import java.util.Properties;
import java.util.concurrent.BlockingQueue;
import java.util.concurrent.LinkedBlockingQueue;
import java.util.concurrent.ThreadPoolExecutor;
import java.util.concurrent.TimeUnit;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import sys.gifspider.utils.PropertyUtil;

/**
 * Crawler thread: takes listing-page URLs from a shared queue, follows every
 * thumbnail link to its detail page, extracts the real image URL and hands it
 * to a thread pool of {@link GifProcessor} tasks for download.
 *
 * <p>An instance is single-use: when {@link #run()} returns (because
 * {@link #setRunning(boolean) setRunning(false)} was called or the thread was
 * interrupted) the internal download pool is shut down.
 */
public class GifSpider implements Runnable
{
    /** How long to wait for a URL before re-checking the running flag. */
    private static final long POLL_TIMEOUT_SECONDS = 1;

    /** Pause between two listing pages, to be polite to the target site. */
    private static final long PAGE_DELAY_MILLIS = 1000;

    volatile boolean isRunning = true;
    private ThreadPoolExecutor threadPool;
    BlockingQueue<String> queue;

    /**
     * @param queue shared queue of listing-page URLs to crawl
     */
    public GifSpider(BlockingQueue<String> queue)
    {
        this.queue = queue;
        this.init();
    }

    /**
     * Builds the download thread pool from the {@code threadpool.*} properties.
     */
    private void init()
    {
        Properties pro = PropertyUtil.getProperties();
        int corePoolSize = Integer.parseInt(pro.getProperty("threadpool.corePoolSize"));
        int maxPoolSize = Integer.parseInt(pro.getProperty("threadpool.maxPoolSize"));
        int keepAliveSeconds = Integer.parseInt(pro.getProperty("threadpool.keepAliveSeconds"));
        int queueCap = Integer.parseInt(pro.getProperty("threadpool.queueCapacity"));
        // Named taskQueue so it no longer shadows the URL queue field.
        BlockingQueue<Runnable> taskQueue = new LinkedBlockingQueue<Runnable>(queueCap);
        // CallerRunsPolicy: when the bounded task queue is full the spider thread
        // downloads the image itself instead of dying on RejectedExecutionException.
        this.threadPool = new ThreadPoolExecutor(
                corePoolSize,
                maxPoolSize,
                keepAliveSeconds,
                TimeUnit.SECONDS,
                taskQueue,
                new ThreadPoolExecutor.CallerRunsPolicy());
    }

    public boolean isRunning()
    {
        return isRunning;
    }

    public void setRunning(boolean isRunning)
    {
        this.isRunning = isRunning;
    }

    /**
     * Main loop: crawls one listing page per iteration until the running flag
     * is cleared or the thread is interrupted, then shuts the download pool
     * down so already-submitted downloads finish and the JVM can exit.
     */
    @Override
    public void run()
    {
        try
        {
            while (this.isRunning)
            {
                // poll() with a timeout instead of take(): an empty queue must not
                // block forever, otherwise setRunning(false) would never be seen.
                String url = this.queue.poll(POLL_TIMEOUT_SECONDS, TimeUnit.SECONDS);
                if (url == null)
                {
                    continue;
                }
                this.crawlListPage(url);
                Thread.sleep(PAGE_DELAY_MILLIS);
            }
        }
        catch (InterruptedException e)
        {
            // Restore the interrupt status and stop crawling.
            Thread.currentThread().interrupt();
        }
        finally
        {
            this.threadPool.shutdown();
        }
    }

    /**
     * Fetches one listing page and processes every thumbnail link on it.
     * A failure on one detail page does not abort the rest of the listing.
     *
     * @param url absolute URL of the listing page
     */
    private void crawlListPage(String url)
    {
        System.out.println("請求url:" + url);
        Document doc;
        try
        {
            doc = Jsoup.connect(url).get();
        }
        catch (IOException e)
        {
            e.printStackTrace();
            return;
        }

        Element picList = doc.select("div.pic_list2").first();
        if (picList == null)
        {
            System.out.println("未找到圖片列表,跳過url:" + url);
            return;
        }

        for (Element link : picList.select("a[href]"))
        {
            // Each picture has an image link and a text link pointing to the same
            // detail page; only the image links are followed.
            Elements thumbs = link.select("img");
            if (thumbs.isEmpty())
            {
                continue;
            }
            try
            {
                this.submitImage(link.absUrl("href"), thumbs.attr("alt"));
            }
            catch (IOException e)
            {
                e.printStackTrace();
            }
        }
    }

    /**
     * Opens a detail page, resolves the real image URL and queues its download.
     *
     * @param detailUrl absolute URL of the detail page (empty if unresolvable)
     * @param title     alt text of the thumbnail, used as the image name
     * @throws IOException if the detail page cannot be fetched
     */
    private void submitImage(String detailUrl, String title) throws IOException
    {
        if (detailUrl.isEmpty())
        {
            return;
        }
        Document detail = Jsoup.connect(detailUrl).get();
        Element content = detail.getElementById("endtext");
        if (content == null)
        {
            System.out.println("未找到圖片內容,跳過url:" + detailUrl);
            return;
        }
        // The page source uses a relative path; abs:src resolves it to an absolute URL.
        String realUrl = content.select("img").attr("abs:src");
        if (realUrl.isEmpty())
        {
            System.out.println("未找到圖片,跳過url:" + detailUrl);
            return;
        }
        System.out.println("獲取圖片url:" + realUrl);
        // Hand the image URL over to the thread pool for download.
        this.threadPool.execute(new GifProcessor(title, realUrl));
    }
}
圖片處理執行緒很簡單,就是圖片下載和儲存:
下載儲存:
package sys.gifspider;

import sys.gifspider.utils.FileProcessor;

/**
 * Thread-pool task that downloads a single image and stores it on disk by
 * delegating to {@link FileProcessor}.
 */
public class GifProcessor implements Runnable
{
    private final String imgName;
    private final String imgUrl;

    /**
     * @param name display name of the image, used for the stored file
     * @param url  absolute URL the image is downloaded from
     */
    public GifProcessor(String name, String url)
    {
        imgName = name;
        imgUrl = url;
    }

    /**
     * Downloads and saves the image; any failure is reported on stdout/stderr
     * and does not propagate to the executing thread.
     */
    @Override
    public void run()
    {
        final FileProcessor saver = new FileProcessor(imgName, imgUrl);
        try
        {
            System.out.println("下載儲存圖片url:" + imgUrl);
            saver.saveGif();
        }
        catch (Exception failure)
        {
            System.out.println("下載儲存圖片失敗,url:" + imgUrl);
            failure.printStackTrace();
        }
    }
}
package sys.gifspider.utils;

import java.io.BufferedOutputStream;
import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.net.HttpURLConnection;
import java.net.URL;
import java.util.regex.Pattern;

/**
 * Downloads one image over HTTP and stores it in the directory configured by
 * the {@code dir} property.
 */
public class FileProcessor
{
    private static final int CONNECT_TIMEOUT_MILLIS = 10000;
    private static final int READ_TIMEOUT_MILLIS = 30000;

    /** Characters that are path separators or illegal in file names on common file systems. */
    private static final Pattern ILLEGAL_NAME_CHARS = Pattern.compile("[\\\\/:*?\"<>|\\p{Cntrl}]");

    private String imgName;
    private String imgUrl;

    /**
     * @param name display name of the image; becomes the file name (sanitised)
     * @param url  absolute URL of the image
     */
    public FileProcessor(String name, String url)
    {
        this.imgName = name;
        this.imgUrl = url;
    }

    /**
     * Returns the target directory, creating it (including missing parents)
     * when it does not exist yet.
     *
     * @return the configured directory path
     * @throws IOException if the directory is not configured or cannot be created
     */
    private String makeDir() throws IOException
    {
        String strdir = PropertyUtil.getProperties().getProperty("dir");
        if (strdir == null)
        {
            throw new IOException("Property 'dir' is not configured");
        }
        File dir = new File(strdir);
        // mkdirs() may lose a race against another worker creating the same
        // directory, hence the final isDirectory() re-check.
        if (!dir.exists() && !dir.mkdirs() && !dir.isDirectory())
        {
            throw new IOException("Cannot create directory: " + strdir);
        }
        return strdir;
    }

    /**
     * Builds a safe file name: the sanitised image name plus the extension
     * taken from the URL path (query string and fragment are ignored).
     *
     * @return file name without any directory component
     */
    private String buildFileName()
    {
        String path = this.imgUrl;
        int cut = path.indexOf('?');
        if (cut >= 0)
        {
            path = path.substring(0, cut);
        }
        cut = path.indexOf('#');
        if (cut >= 0)
        {
            path = path.substring(0, cut);
        }

        int slash = path.lastIndexOf('/');
        int dot = path.lastIndexOf('.');
        // Only a dot inside the last path segment marks a file extension.
        boolean hasExtension = dot > slash;
        String extension = hasExtension ? path.substring(dot) : "";

        String base = (this.imgName == null) ? "" : this.imgName.trim();
        if (base.isEmpty())
        {
            // No usable name: fall back to the file name found in the URL.
            base = path.substring(slash + 1, hasExtension ? dot : path.length());
        }
        base = ILLEGAL_NAME_CHARS.matcher(base).replaceAll("_");
        if (base.isEmpty())
        {
            base = "image";
        }
        return base + extension;
    }

    /**
     * Downloads the image and writes it to the target directory. Nothing is
     * written when the download yields no bytes.
     *
     * @throws Exception if the download or the file write fails
     */
    public void saveGif() throws Exception
    {
        String dir = makeDir();
        // File(parent, child) inserts the separator itself, so the configured
        // directory no longer needs a trailing slash.
        File target = new File(dir, buildFileName());
        byte[] bit = this.download();
        if (bit.length == 0)
        {
            return;
        }
        BufferedOutputStream out = new BufferedOutputStream(new FileOutputStream(target));
        try
        {
            out.write(bit);
            out.flush();
        }
        finally
        {
            out.close();
        }
    }

    /**
     * Fetches the image bytes over HTTP.
     *
     * @return the complete response body
     * @throws Exception if the URL is malformed, the server does not answer
     *                   with a 2xx status, or the transfer fails or times out
     */
    private byte[] download() throws Exception
    {
        URL url = new URL(this.imgUrl);
        HttpURLConnection httpConn = (HttpURLConnection) url.openConnection();
        // Without timeouts a stalled server would block a pool worker forever.
        httpConn.setConnectTimeout(CONNECT_TIMEOUT_MILLIS);
        httpConn.setReadTimeout(READ_TIMEOUT_MILLIS);
        InputStream cin = null;
        try
        {
            httpConn.connect();
            int status = httpConn.getResponseCode();
            if (status < 200 || status >= 300)
            {
                throw new IOException("Unexpected HTTP status " + status + " for " + this.imgUrl);
            }
            cin = httpConn.getInputStream();
            ByteArrayOutputStream outStream = new ByteArrayOutputStream();
            byte[] buffer = new byte[1024];
            int len;
            while ((len = cin.read(buffer)) != -1)
            {
                outStream.write(buffer, 0, len);
            }
            return outStream.toByteArray();
        }
        finally
        {
            if (cin != null)
            {
                try
                {
                    cin.close();
                }
                catch (IOException ignored)
                {
                    // A close failure must not mask the primary result or exception.
                }
            }
            httpConn.disconnect();
        }
    }
}
程式入口如下:
package sys.gifspider;
import java.io.IOException;
import java.util.Properties;
import java.util.concurrent.BlockingQueue;
import java.util.concurrent.LinkedBlockingQueue;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import sys.gifspider.utils.PropertyUtil;
/**
 * Program entry point: fills the URL queue with the configured listing pages
 * and starts the spider threads.
 */
public class Main
{
    public static void main(String[] args)
    {
        init();
    }

    /**
     * Reads {@code startPage}, {@code endPage}, {@code url} and
     * {@code spiderThread} from the properties, queues one listing-page URL per
     * page in {@code [startPage, endPage]} and starts the spiders.
     *
     * @throws IllegalArgumentException if {@code endPage} is less than {@code startPage}
     */
    public static void init()
    {
        Properties pro = PropertyUtil.getProperties();
        int startPage = Integer.parseInt(pro.getProperty("startPage"));
        int endPage = Integer.parseInt(pro.getProperty("endPage"));
        String url = pro.getProperty("url");
        if (endPage < startPage)
        {
            throw new IllegalArgumentException(
                    "endPage (" + endPage + ") must not be less than startPage (" + startPage + ")");
        }
        int count = endPage - startPage + 1;
        BlockingQueue<String> queue = new LinkedBlockingQueue<String>(count);
        // Iterate over the real page numbers; counting from 1 would ignore
        // startPage and crawl the wrong pages whenever startPage != 1.
        for (int page = startPage; page <= endPage; page++)
        {
            queue.add(String.format(url, page));
        }
        int spiderCount = Integer.parseInt(pro.getProperty("spiderThread"));
        for (int i = 0; i < spiderCount; i++)
        {
            GifSpider spider = new GifSpider(queue);
            Thread t = new Thread(spider);
            t.start();
        }
    }
}
配置檔案:
spiderThread=1
threadpool.corePoolSize=8
threadpool.maxPoolSize=10
threadpool.keepAliveSeconds=600
threadpool.queueCapacity=1000
startPage=1
endPage=20
url=http://www.haha365.com/gxtp/index_gif_%d.htm
dir=E:/spider/
用haha365的動態gif做了下測試,如果想爬別的網站,自己根據人家的html結構,改一下爬取規則即可。
程式中沒做過多的容錯處理,可能存在一定的bug。