Crawler Notes (4): Multi-threaded Image Crawling and Downloading
阿新 • Published: 2019-02-01
This post continues with the code from the previous articles in this series.
When the number of images we need to crawl gets large, we have to crawl and download them with multiple threads. Here we use a ForkJoinPool to handle the concurrency.
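The code below also relies on a few helper classes from the earlier articles in the series: CrawlerUtils (fetching pages and opening download streams), IOUtils (saving a stream to a file) and RegularUtils.getIMGUrl (pulling the image URLs out of the HTML with a regular expression). They are not repeated here; for anyone jumping in at this post, here is a rough sketch of the shape they take. The method bodies below are simplified placeholders I am assuming for illustration, not the exact code from the earlier posts.
// CrawlerUtils.java
package com.dyw.crawler.util;

import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.FileOutputStream;
import java.io.InputStream;
import java.io.OutputStream;
import java.net.HttpURLConnection;
import java.net.URL;

/**
 * Simplified stand-in for the crawler helpers; the real versions from the earlier
 * posts add request headers, encoding handling, etc.
 */
public class CrawlerUtils {
    // Fetch a page and return its body as a String
    public static String get(String url) throws Exception {
        try (InputStream in = downLoadFromUrl(url);
             ByteArrayOutputStream out = new ByteArrayOutputStream()) {
            byte[] buffer = new byte[4096];
            int len;
            while ((len = in.read(buffer)) != -1) {
                out.write(buffer, 0, len);
            }
            return out.toString("UTF-8");
        }
    }

    // Open a stream to the given URL; IOUtils.saveFile closes it after writing
    public static InputStream downLoadFromUrl(String url) throws Exception {
        HttpURLConnection conn = (HttpURLConnection) new URL(url).openConnection();
        conn.setConnectTimeout(5000);
        conn.setReadTimeout(10000);
        return conn.getInputStream();
    }
}

// IOUtils.java (separate file, same package)
public class IOUtils {
    // Copy an InputStream to a file and close both streams
    public static void saveFile(InputStream in, File file) throws Exception {
        try (InputStream input = in;
             OutputStream out = new FileOutputStream(file)) {
            byte[] buffer = new byte[4096];
            int len;
            while ((len = input.read(buffer)) != -1) {
                out.write(buffer, 0, len);
            }
        }
    }
}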
1. DownloadTask, the download task class
package com.dyw.crawler.util;

import java.io.File;
import java.io.InputStream;
import java.util.List;
import java.util.concurrent.RecursiveAction;

/**
 * Downloads images in parallel with a ForkJoinPool.
 * Created by dyw on 2017/9/7.
 */
public class DownloadTask extends RecursiveAction {
    // Maximum number of URLs a single task downloads directly; larger ranges get split
    private static final int THRESHOLD = 8;
    // Full list of image URLs to download
    private List<String> urls;
    // Start index (inclusive)
    private int start;
    // End index (exclusive)
    private int end;
    // Directory to save the images into
    private String path;

    /**
     * @param urls  list of image URLs
     * @param start start index (inclusive)
     * @param end   end index (exclusive)
     * @param path  directory to save the images into
     */
    public DownloadTask(List<String> urls, int start, int end, String path) {
        this.urls = urls;
        this.start = start;
        this.end = end;
        this.path = path;
    }

    @Override
    protected void compute() {
        if (end - start < THRESHOLD) {
            for (int i = start; i < end; i++) {
                String url = urls.get(i);
                String[] split = url.split("/");
                String imgName = split[split.length - 1];
                try {
                    // Save the image to the target directory
                    File file = new File(path + "/" + imgName);
                    InputStream inputStream = CrawlerUtils.downLoadFromUrl(url);
                    IOUtils.saveFile(inputStream, file);
                    System.out.println("success:" + url);
                } catch (Exception e) {
                    System.out.println("fail:" + url);
                }
            }
        } else {
            // When the range is larger than THRESHOLD, split the big task into two smaller tasks
            int middle = (start + end) / 2;
            DownloadTask left = new DownloadTask(urls, start, middle, path);
            DownloadTask right = new DownloadTask(urls, middle, end, path);
            // invokeAll forks both subtasks and waits for them to finish; fork() alone would
            // let this task complete before its subtasks are done
            invokeAll(left, right);
        }
    }
}
2. The main method
package com.dyw.crawler.project;

import com.dyw.crawler.util.CrawlerUtils;
import com.dyw.crawler.util.DownloadTask;
import com.dyw.crawler.util.IOUtils;
import com.dyw.crawler.util.RegularUtils;

import java.io.File;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.List;
import java.util.concurrent.ForkJoinPool;
import java.util.concurrent.TimeUnit;

/**
 * Downloads images with multiple threads.
 * Created by dyw on 2017/9/7.
 */
public class Project3 {
    public static void main(String[] args) {
        ForkJoinPool forkJoinPool = new ForkJoinPool();
        String path = "C:\\Users\\dyw\\Desktop\\crawler\\photo";
        String path1 = "C:\\Users\\dyw\\Desktop\\crawler\\photo1";
        String url = "http://www.tuigirlba.cc/page/show/";
        List<String> list = new ArrayList<>();
        try {
            // Collect all image URLs from pages 330 to 379
            for (int i = 330; i < 380; i++) {
                String htmlContent = CrawlerUtils.get(url + i);
                List<String> imgUrls = RegularUtils.getIMGUrl(htmlContent);
                list.addAll(imgUrls);
            }
            long l = System.currentTimeMillis();
            forkJoinPool.execute(new DownloadTask(list, 0, list.size(), path));
            forkJoinPool.shutdown();
            // Wait up to 20 seconds for the forkJoinPool to finish
            forkJoinPool.awaitTermination(20, TimeUnit.SECONDS);
            long l1 = System.currentTimeMillis() - l;
            long l2 = System.currentTimeMillis();
            // Sequential download with a plain for-each loop, for comparison
            list.forEach(imgUrl -> {
                String[] split = imgUrl.split("/");
                String imgName = split[split.length - 1];
                try {
                    File file1 = new File(path1 + "/" + imgName);
                    InputStream inputStream = CrawlerUtils.downLoadFromUrl(imgUrl);
                    IOUtils.saveFile(inputStream, file1);
                    System.out.println("success:" + imgUrl);
                } catch (Exception e) {
                    System.out.println("fail:" + imgUrl);
                }
            });
            long l3 = System.currentTimeMillis() - l2;
            System.out.println("forkjoin time: " + l1);
            System.out.println("sequential time: " + l3);
        } catch (Exception e) {
            throw new RuntimeException("Failed to fetch page content!", e);
        }
    }
}
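One caveat on the timing above: awaitTermination(20, TimeUnit.SECONDS) gives up after at most 20 seconds, so if the downloads run longer, the fork/join figure gets cut short. Because compute() waits for its subtasks via invokeAll, an alternative is to let the pool block until everything is done with invoke(); a minimal sketch:
ForkJoinPool pool = new ForkJoinPool();
long begin = System.currentTimeMillis();
// invoke() submits the root task and blocks until it, and all the subtasks it forked, have finished
pool.invoke(new DownloadTask(list, 0, list.size(), path));
System.out.println("forkjoin time: " + (System.currentTimeMillis() - begin));
pool.shutdown();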
3. Results
As you can see from the two screenshots below, it is much faster than the synchronous version!
If you have any suggestions for improving the code, please leave me a comment! ☺☺☺