1. 程式人生 > jsoup爬取指定網頁的url和圖片

jsoup爬取指定網頁的url和圖片

import org.apache.commons.io.IOUtils;
import org.apache.http.HttpEntity;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import java.io.*;
import java.util.ArrayList;
import java.util.List;
import java.util.concurrent.CountDownLatch;

/**
 * Small crawler example: fetches pages, extracts article links and image URLs
 * with jsoup, and downloads the images into a local directory.
 *
 * <p>Page fetching and the HTTP client come from {@code TestHTTPClient}, which
 * is defined outside this file.
 */
public class Crawler {

    // Neither field is referenced anywhere in this class at the moment.
    private static final Object signal = new Object();
    private static int count = 0;

    /**
     * Fetches a page through {@code TestHTTPClient.get}.
     *
     * @param url address of the page to fetch
     * @return whatever {@code TestHTTPClient.get(url)} returns, or {@code null}
     *         if that call threw an exception
     */
    public static String getRequestByURL(String url) {
        System.out.println("開啟網頁----");
        try {
            return TestHTTPClient.get(url);
        } catch (Exception e) {
            System.out.println("開啟網頁出錯---");
            e.printStackTrace();
            return null;
        }
    }

    /**
     * Collects the {@code href} of every {@code <a>} found under a
     * {@code span.link_title} element.
     *
     * @param pageContext HTML of the page; may be {@code null} (for example
     *                    when {@link #getRequestByURL(String)} failed)
     * @return the extracted hrefs in document order; empty when
     *         {@code pageContext} is {@code null} or nothing matches
     */
    public static List<String> getArticleURLs(String pageContext) {
        List<String> articleURLs = new ArrayList<String>();
        System.out.println("尋找專題");
        if (pageContext == null) {
            // A failed fetch yields null; report "no links" instead of handing null to jsoup.
            return articleURLs;
        }
        Document document = Jsoup.parseBodyFragment(pageContext);
        Elements anchors = document.select("span.link_title").select("a");
        for (Element anchor : anchors) {
            articleURLs.add(anchor.attr("href"));
        }
        return articleURLs;
    }

    /**
     * Collects the {@code src} of every {@code <img src>} nested inside an
     * {@code <a target="_blank">}.
     *
     * @param pageContext HTML of the page; may be {@code null} (for example
     *                    when {@link #getRequestByURL(String)} failed)
     * @return the extracted image URLs in document order; empty when
     *         {@code pageContext} is {@code null} or nothing matches
     */
    public static List<String> getImgURLS(String pageContext) {
        System.out.println("開始查詢圖片");
        List<String> imgURLs = new ArrayList<String>();
        if (pageContext == null) {
            // A failed fetch yields null; report "no images" instead of handing null to jsoup.
            return imgURLs;
        }
        Document document = Jsoup.parseBodyFragment(pageContext);
        Elements images = document.select("a[target=_blank] img[src]");
        for (Element image : images) {
            imgURLs.add(image.attr("src"));
        }
        return imgURLs;
    }

    /**
     * Downloads one image and writes it to {@code imgPath}, named after the
     * last path segment of the URL.
     *
     * <p>Failures are reported on standard output and the stack trace is
     * printed; no exception is propagated to the caller. The response and both
     * streams are released on every path.
     *
     * @param imgURL  URL of the image; {@code null} is ignored
     * @param imgPath directory the file is written into
     */
    public static void savePic(String imgURL, String imgPath) {
        if (imgURL == null) {
            return;
        }
        String fileName = fileNameFromUrl(imgURL);
        if (fileName.isEmpty()) {
            // Nothing usable to name the file after (e.g. the URL is just "/").
            System.out.println("圖片儲存失敗!");
            return;
        }
        String savePath = imgPath + File.separator + fileName;

        CloseableHttpResponse httpResponse = null;
        InputStream inputStream = null;
        OutputStream outputStream = null;
        try {
            // Built inside the try so a malformed URL fails only this image.
            HttpGet httpGet = new HttpGet(imgURL);
            CloseableHttpClient httpClient = TestHTTPClient.getHttpClient();
            httpResponse = httpClient.execute(httpGet);
            HttpEntity entity = httpResponse.getEntity();
            if (entity == null) {
                throw new IOException("Response has no body: " + imgURL);
            }
            inputStream = entity.getContent();
            outputStream = new FileOutputStream(savePath);
            IOUtils.copy(inputStream, outputStream);
            System.out.println("儲存圖片成功!");
        } catch (Exception e) {
            e.printStackTrace();
            System.out.println("圖片儲存失敗!");
        } finally {
            IOUtils.closeQuietly(outputStream);
            IOUtils.closeQuietly(inputStream);
            if (httpResponse != null) {
                try {
                    httpResponse.close();
                } catch (IOException ignored) {
                    // Best-effort cleanup; the outcome was already reported above.
                }
            }
            // The client itself is left open, exactly as before; its lifecycle
            // belongs to TestHTTPClient.
        }
    }

    /**
     * Derives a local file name from the last {@code /}-separated segment of
     * {@code imgURL}. Characters that are not allowed in Windows file names
     * are replaced with {@code _}; other names are returned unchanged.
     *
     * @return the file name, or an empty string when the URL has no segment
     */
    private static String fileNameFromUrl(String imgURL) {
        String[] segments = imgURL.split("/");
        if (segments.length == 0) {
            // split drops trailing empty strings, so "/" or "//" yields an empty array.
            return "";
        }
        return segments[segments.length - 1].replaceAll("[\\\\:*?\"<>|]", "_");
    }

    /**
     * Unfinished multi-threaded variant of the crawl.
     *
     * <p>NOTE: the ten threads are constructed but never started, and the loop
     * body does nothing with the extracted URLs, so calling this method has no
     * visible effect. Do not simply add {@code start()}: the {@code while (true)}
     * loop has no exit condition.
     *
     * @param pageContext HTML that each worker would scan for image URLs
     */
    public static void begin(final String pageContext) {
        for (int i = 0; i < 10; i++) {
            new Thread(new Runnable() {
                @Override
                public void run() {
                    while (true) {
                        System.out.println("當前進入執行緒的是:" + Thread.currentThread().getName());
                        List<String> imgURLS = getImgURLS(pageContext);
                        if (imgURLS != null && imgURLS.size() > 0) {
                            // Placeholder: nothing is done with the URLs yet.
                        }
                    }
                }
            });
        }
    }

    /**
     * Downloads the images of pages {@code roll_1.html} .. {@code roll_10.html}
     * of the configured site into {@code E:/img} and prints the elapsed time.
     */
    public static void main(String[] args) {
//        Crawl every article listed in a personal CSDN article index
//        String url="http://blog.csdn.net/huxiweng/article/list/";
//        int maxPage=6;
//        for (int i=0;i<maxPage;i++){
//            String pageContext= getRequestByURL(url+""+(i+1));
//            System.out.println("開始尋找第"+(i+1)+"頁面文章");
//            List<String> articleURLs = getArticleURLs(pageContext);
//            for (String articleURL : articleURLs) {
//                System.out.println(articleURL);
//            }
//        }

        // Crawl the images of the Firefox photo channel
        final String imgPath = "E:/img";
        File file = new File(imgPath);
        if (!file.isDirectory() && !file.mkdirs()) {
            // Without a target directory every download would fail, so stop here.
            System.out.println("無法建立目錄:" + imgPath);
            return;
        }
        String url = "http://photo.firefox.163.com/";
        System.out.println("begin");
        long begin = System.currentTimeMillis();
        for (int i = 0; i < 10; i++) {
            // Parenthesised so the page number is added, not concatenated ("01", "11", ...).
            System.out.println("爬取第" + (i + 1) + "圖片");
            String pageContext = getRequestByURL(url + "roll_" + (i + 1) + ".html");
            List<String> imgURLS = getImgURLS(pageContext);
            for (String imgURL : imgURLS) {
                savePic(imgURL, imgPath);
            }
        }

        System.out.println("耗時:" + (System.currentTimeMillis() - begin));
    }
}
其中 TestHTTPClient 使用的是上一篇文章中的類別。