1. 程式人生 > >Jsoup初接觸-發一個Jsoup抓取圖片的程式

Jsoup初接觸-發一個Jsoup抓取圖片的程式

主要有兩類執行緒:圖片url抓取執行緒、圖片下載儲存執行緒。

圖片下載儲存採用執行緒池處理,主要利用java的ThreadPoolExecutor實現。

url抓取執行緒:

package sys.gifspider;

import java.io.IOException;
import java.util.Properties;
import java.util.concurrent.BlockingQueue;
import java.util.concurrent.LinkedBlockingQueue;
import java.util.concurrent.ThreadPoolExecutor;
import java.util.concurrent.TimeUnit;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import sys.gifspider.utils.PropertyUtil;

/**
 * Crawler worker: takes list-page URLs from a shared queue, scrapes every page for
 * image links, resolves the real image URL on each detail page and submits a
 * {@link GifProcessor} download task to an internal thread pool.
 *
 * <p>The pool is configured from the properties {@code threadpool.corePoolSize},
 * {@code threadpool.maxPoolSize}, {@code threadpool.keepAliveSeconds} and
 * {@code threadpool.queueCapacity}. Each spider instance owns its own pool.
 *
 * <p>The worker keeps polling the queue until {@link #setRunning(boolean)} is called
 * with {@code false} or its thread is interrupted; it does not stop by itself when
 * the queue runs empty.
 */
public class GifSpider implements Runnable
{

  /** Maximum time one queue poll may block before the running flag is checked again. */
  private static final long POLL_TIMEOUT_SECONDS = 1;

  /** Pause between two list-page requests, in milliseconds. */
  private static final long PAGE_DELAY_MILLIS = 1000;

  volatile boolean isRunning = true;
  private ThreadPoolExecutor threadPool;
  BlockingQueue<String> queue;
  
  /**
   * Creates a spider that reads list-page URLs from {@code queue} and builds its
   * download thread pool from the application properties.
   *
   * @param queue shared queue of list-page URLs to crawl
   */
  public GifSpider(BlockingQueue<String> queue)
  {
    this.queue = queue;
    this.init();
  }
  
  /**
   * Initializes the download thread pool from the application properties.
   */
  private void init()
  {
    Properties pro = PropertyUtil.getProperties();
    int corePoolSize = Integer.parseInt(pro.getProperty("threadpool.corePoolSize"));
    int maxPoolSize = Integer.parseInt(pro.getProperty("threadpool.maxPoolSize"));
    int keepAliveSeconds = Integer.parseInt(pro.getProperty("threadpool.keepAliveSeconds"));
    int queueCap = Integer.parseInt(pro.getProperty("threadpool.queueCapacity"));
    BlockingQueue<Runnable> workQueue = new LinkedBlockingQueue<Runnable>(queueCap);
    // With the default AbortPolicy a full work queue would throw
    // RejectedExecutionException out of run() and kill the spider thread.
    // CallerRunsPolicy makes the spider download the image itself instead,
    // which also throttles crawling while the pool is saturated.
    this.threadPool = new ThreadPoolExecutor(
        corePoolSize, maxPoolSize, keepAliveSeconds, TimeUnit.SECONDS,
        workQueue, new ThreadPoolExecutor.CallerRunsPolicy());
  }

  /**
   * @return {@code true} while the crawl loop is allowed to continue
   */
  public boolean isRunning()
  {
    return isRunning;
  }

  /**
   * Sets the running flag; passing {@code false} makes {@link #run()} leave its loop
   * after the current page (or after the current poll timeout at the latest).
   *
   * @param isRunning new value of the running flag
   */
  public void setRunning(boolean isRunning)
  {
    this.isRunning = isRunning;
  }

  /**
   * Crawl loop: polls the URL queue, processes one list page at a time and waits
   * between pages. On exit the download pool is shut down so that already
   * submitted downloads can finish and the pool threads can terminate.
   */
  @Override
  public void run()
  {
    try
    {
      while (this.isRunning)
      {
        // A timed poll (instead of take()) lets the loop notice setRunning(false)
        // even when the queue stays empty.
        String url = this.queue.poll(POLL_TIMEOUT_SECONDS, TimeUnit.SECONDS);
        if (url == null)
        {
          continue;
        }
        try
        {
          this.crawlListPage(url);
        } catch (IOException e)
        {
          e.printStackTrace();
        }
        Thread.sleep(PAGE_DELAY_MILLIS);
      }
    } catch (InterruptedException e)
    {
      // Interruption is a stop request: restore the flag and leave the loop.
      Thread.currentThread().interrupt();
    } finally
    {
      this.threadPool.shutdown();
    }
  }

  /**
   * Fetches one list page and submits a download task for every image link found
   * inside the first {@code div.pic_list2} element.
   *
   * @param url absolute URL of the list page
   * @throws IOException if the list page itself cannot be fetched
   */
  private void crawlListPage(String url) throws IOException
  {
    System.out.println("請求url:" + url);
    Document doc = Jsoup.connect(url).get();
    Element list = doc.select("div.pic_list2").first();
    if (list == null)
    {
      // Page layout differs from what the selectors expect; nothing to crawl here.
      System.out.println("找不到圖片列表,跳過url:" + url);
      return;
    }
    // Every entry has an image link and a text link pointing to the same detail
    // page; only the links that wrap an <img> are followed.
    for (Element link : list.select("a[href]"))
    {
      Elements thumbs = link.select("img");
      if (thumbs.size() == 0)
      {
        continue;
      }
      String imgUrl = link.absUrl("href");
      if (imgUrl.isEmpty())
      {
        // absUrl yields "" when the href cannot be made absolute.
        continue;
      }
      String text = thumbs.attr("alt");
      try
      {
        String realUrl = this.resolveImageUrl(imgUrl);
        if (realUrl.isEmpty())
        {
          System.out.println("找不到圖片,跳過url:" + imgUrl);
          continue;
        }
        System.out.println("獲取圖片url:" + realUrl);
        // Hand the resolved image URL over to the download pool.
        this.threadPool.execute(new GifProcessor(text, realUrl));
      } catch (IOException e)
      {
        // A broken detail page must not abort the remaining links of this list page.
        System.out.println("解析圖片頁失敗,url:" + imgUrl);
        e.printStackTrace();
      }
    }
  }

  /**
   * Loads a detail page and returns the absolute {@code src} of the image inside
   * the element with id {@code endtext}.
   *
   * @param detailUrl absolute URL of the detail page
   * @return the absolute image URL, or an empty string when the page has no such image
   * @throws IOException if the detail page cannot be fetched
   */
  private String resolveImageUrl(String detailUrl) throws IOException
  {
    Document detail = Jsoup.connect(detailUrl).get();
    Element holder = detail.getElementById("endtext");
    if (holder == null)
    {
      return "";
    }
    // The page source uses relative paths; "abs:" resolves them against the page URL.
    return holder.select("img").attr("abs:src");
  }
  
}

圖片處理執行緒很簡單,就是圖片下載和儲存:
package sys.gifspider;

import sys.gifspider.utils.FileProcessor;

/**
 * Thread-pool task that downloads one image and stores it on disk by delegating
 * to {@link FileProcessor#saveGif()}.
 */
public class GifProcessor implements Runnable
{

  /** Name handed to the {@link FileProcessor} for the stored image. */
  private String imgName;
  /** URL the image is downloaded from. */
  private String imgUrl;

  /**
   * @param name name for the stored image
   * @param url  URL of the image to download
   */
  public GifProcessor(String name,String url)
  {
    imgName = name;
    imgUrl = url;
  }

  /**
   * Downloads and saves the image. Any {@link Exception} raised while doing so is
   * reported on the console together with its stack trace instead of being rethrown.
   */
  @Override
  public void run()
  {
    final FileProcessor saver = new FileProcessor(imgName, imgUrl);
    try
    {
      store(saver);
    }
    catch (Exception failure)
    {
      System.out.println("下載儲存圖片失敗,url:" + imgUrl);
      failure.printStackTrace();
    }
  }

  /**
   * Logs the image URL and lets the given processor fetch and persist the image.
   */
  private void store(FileProcessor saver) throws Exception
  {
    System.out.println("下載儲存圖片url:" + imgUrl);
    saver.saveGif();
  }

}
下載儲存:
package sys.gifspider.utils;

import java.io.BufferedOutputStream;
import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.FileOutputStream;
import java.io.InputStream;
import java.net.HttpURLConnection;
import java.net.URL;

/**
 * Downloads one image over HTTP and writes it into the directory configured by the
 * {@code dir} property.
 *
 * <p>The file name is built from the image name given to the constructor plus the
 * extension found in the path of the image URL.
 */
public class FileProcessor
{
  /** Upper bound for establishing the HTTP connection, in milliseconds. */
  private static final int CONNECT_TIMEOUT_MS = 10000;
  /** Upper bound for a single blocking read from the HTTP stream, in milliseconds. */
  private static final int READ_TIMEOUT_MS = 30000;

  private String imgName;
  private String imgUrl;
  
  /**
   * @param name base name (without extension) for the stored file
   * @param url  absolute URL of the image to download
   */
  public FileProcessor(String name,String url)
  {
    this.imgName = name;
    this.imgUrl = url;
  }
  
  /**
   * Returns the configured storage directory, creating it (including missing
   * parent directories) when it does not exist yet.
   *
   * @return the value of the {@code dir} property
   */
  private String makeDir()
  {
    String strdir = PropertyUtil.getProperties().getProperty("dir");
    File dir = new File(strdir);
    if (!dir.exists())
    {
      // mkdirs() also creates missing parents; mkdir() would silently fail for them.
      dir.mkdirs();
    }
    return strdir;
  }
  
  /**
   * Downloads the image and writes it to the storage directory. Nothing is written
   * when the download yields zero bytes.
   *
   * @throws Exception if the URL is malformed, the download fails or the file
   *                   cannot be written
   */
  public void saveGif() throws Exception
  {
    String dir = makeDir();
    // Resolve against the directory instead of concatenating strings so the result
    // is correct whether or not the configured path ends with a separator.
    File file = new File(dir, this.safeBaseName() + sanitize(this.extension()));
    BufferedOutputStream out = null;
    byte[] bit = this.download();
    if (bit.length > 0)
    {
      try
      {
        out = new BufferedOutputStream(new FileOutputStream(file));
        out.write(bit);
        out.flush();
      } finally
      {
        if (out != null)
          out.close();
      }
    }
  }

  /**
   * Builds a file-system-safe base name from the image name. The name comes from
   * scraped page content, so path separators and other characters that are illegal
   * in file names are replaced. An empty name falls back to one derived from the
   * image URL so that unnamed images do not all end up in the same file.
   *
   * @return a non-empty base name without path separators
   */
  private String safeBaseName()
  {
    String name = (this.imgName == null) ? "" : sanitize(this.imgName.trim());
    if (name.isEmpty())
    {
      name = "image_" + Integer.toHexString(this.imgUrl.hashCode());
    }
    return name;
  }

  /**
   * Extracts the extension (including the leading dot) from the path part of the
   * image URL, ignoring query string, fragment and dots in the host name.
   *
   * @return the extension, or an empty string when the last path segment has none
   * @throws Exception if the image URL is malformed
   */
  private String extension() throws Exception
  {
    String path = new URL(this.imgUrl).getPath();
    int dot = path.lastIndexOf('.');
    if (dot <= path.lastIndexOf('/') || dot == path.length() - 1)
    {
      return "";
    }
    return path.substring(dot);
  }

  /**
   * Replaces characters that are not allowed in file names on common file systems
   * (path separators, {@code : * ? " < > |} and control characters) with {@code _}.
   */
  private static String sanitize(String text)
  {
    return text.replaceAll("[\\\\/:*?\"<>|\\p{Cntrl}]", "_");
  }

  /**
   * Downloads the image into memory.
   *
   * @return the raw bytes of the HTTP response body
   * @throws Exception if the URL is malformed or the transfer fails or times out
   */
  private byte[] download() throws Exception  
  {
    URL url = new URL(this.imgUrl);
    HttpURLConnection httpConn = (HttpURLConnection) url.openConnection();
    // Without timeouts a stalled server would block a pool thread indefinitely.
    httpConn.setConnectTimeout(CONNECT_TIMEOUT_MS);
    httpConn.setReadTimeout(READ_TIMEOUT_MS);
    InputStream cin = null;
    try
    {
      httpConn.connect();
      cin = httpConn.getInputStream();
      ByteArrayOutputStream outStream = new ByteArrayOutputStream();
      byte[] buffer = new byte[1024];
      int len = 0;
      while ((len = cin.read(buffer)) != -1) {
          outStream.write(buffer, 0, len);
      }
      return outStream.toByteArray();
    } finally
    {
      // Release the stream and the connection even when the transfer failed.
      try
      {
        if (cin != null)
          cin.close();
      } finally
      {
        httpConn.disconnect();
      }
    }
  }
}

程式入口如下:
package sys.gifspider;

import java.io.IOException;
import java.util.Properties;
import java.util.concurrent.BlockingQueue;
import java.util.concurrent.LinkedBlockingQueue;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import sys.gifspider.utils.PropertyUtil;

/**
 * Program entry point: fills the URL queue with the configured list pages and
 * starts the spider threads.
 */
public class Main
{
  public static void main(String[] args)
  {
    init();
    
  }

  /**
   * Reads {@code startPage}, {@code endPage}, {@code url} and {@code spiderThread}
   * from the application properties, enqueues one URL per page in the inclusive
   * range {@code startPage..endPage} and starts {@code spiderThread} spider threads
   * that share the queue.
   *
   * <p>The {@code url} property is used as a {@link String#format} pattern that
   * receives the page number.
   *
   * @throws IllegalArgumentException if {@code endPage} is less than {@code startPage}
   * @throws NumberFormatException    if a numeric property is missing or malformed
   */
  public static void init()
  {
    Properties pro = PropertyUtil.getProperties();
    int startPage = Integer.parseInt(pro.getProperty("startPage"));
    int endPage = Integer.parseInt(pro.getProperty("endPage"));
    String url = pro.getProperty("url");
    if (endPage < startPage)
    {
      // Fail with a clear message instead of the opaque exception the queue
      // constructor throws for a non-positive capacity.
      throw new IllegalArgumentException(
          "endPage (" + endPage + ") must not be less than startPage (" + startPage + ")");
    }
    int count = endPage - startPage +1;
    BlockingQueue<String> queue = new LinkedBlockingQueue<String>(count);
    // Iterate over the real page numbers; counting from 1 would ignore startPage.
    for (int page = startPage; page <= endPage; page++)
    {
      queue.add(String.format(url, page));
    }
    int spiderCount = Integer.parseInt(pro.getProperty("spiderThread"));
    for (int i = 0; i < spiderCount; i++)
    {
      GifSpider spider = new GifSpider(queue);
      Thread t = new Thread(spider);
      t.start();
    }
  }
  
}

配置檔案:
spiderThread=1

threadpool.corePoolSize=8
threadpool.maxPoolSize=10
threadpool.keepAliveSeconds=600
threadpool.queueCapacity=1000

startPage=1
endPage=20
url=http://www.haha365.com/gxtp/index_gif_%d.htm

dir=E:/spider/

用haha365的動態gif做了下測試,如果想爬別的網站,自己根據目標網站的html結構,改一下爬取規則即可。

程式中沒做過多的容錯處理,可能存在一定的bug。

原始碼下載