程式人生 > 爬蟲記錄(2)——簡單爬取一個頁面的圖片並儲存

爬蟲記錄(2)——簡單爬取一個頁面的圖片並儲存

1、爬蟲工具類,用來獲取網頁內容

package com.dyw.crawler.util;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.net.HttpURLConnection;
import java.net.URL;
import java.net.URLConnection;
import java.nio.charset.StandardCharsets;
import java.util.zip.GZIPInputStream;

/**
 * Crawler helper built on {@code java.net}: fetches a page as text and opens
 * download streams for binary resources.
 * Created by dyw on 2017/9/1.
 */
public class CrawlerUtils {

    /**
     * Fetches the resource at {@code url} and returns its content as a string.
     *
     * <p>The body is decoded as UTF-8 (rather than the platform default charset) and every
     * line is terminated with {@code '\n'}, including the last one.
     *
     * @param url URL of the page to fetch
     * @return the whole page as a String
     * @throws Exception if the URL is malformed or the content cannot be read
     */
    public static String getHtml(String url) throws Exception {
        URLConnection connection = new URL(url).openConnection();
        StringBuilder sb = new StringBuilder();
        // try-with-resources closes the reader and the stream even when a read fails.
        try (InputStream in = connection.getInputStream();
             BufferedReader br = new BufferedReader(new InputStreamReader(in, StandardCharsets.UTF_8))) {
            String line;
            while ((line = br.readLine()) != null) {
                sb.append(line).append('\n');
            }
        }
        return sb.toString();
    }

    /**
     * Opens a download stream for the given URL.
     *
     * <p>The request advertises {@code Accept-Encoding: gzip}; when the server answers with
     * {@code Content-Encoding: gzip} the returned stream is wrapped so that callers always
     * read the decoded bytes. The caller is responsible for closing the returned stream.
     *
     * @param urlStr URL of the resource (must be an http/https URL, the connection is cast
     *               to {@link HttpURLConnection})
     * @return InputStream over the (decoded) response body
     * @throws IOException if the connection cannot be opened or the response cannot be read
     */
    public static InputStream downLoadFromUrl(String urlStr) throws IOException {
        URL url = new URL(urlStr);
        HttpURLConnection conn = (HttpURLConnection) url.openConnection();
        // Pretend to be a browser so the server does not reject the crawler with a 403.
        conn.setRequestProperty("User-Agent", "Mozilla/4.0 (compatible; MSIE 5.0; Windows NT; DigExt)");
        // Connect timeout of 3 seconds.
        conn.setConnectTimeout(3 * 1000);
        conn.setRequestProperty("Accept", "image/gif, image/x-xbitmap, image/jpeg, image/pjpeg, application/x-shockwave-flash, application/vnd.ms-powerpoint, application/vnd.ms-excel, application/msword, */*");
        conn.setRequestProperty("Accept-Language", "zh-cn");
        conn.setRequestProperty("UA-CPU", "x86");
        // Only gzip is advertised (no deflate); it is decoded below.
        conn.setRequestProperty("Accept-Encoding", "gzip");
        conn.setRequestProperty("Content-type", "text/html");
        conn.setRequestProperty("Connection", "keep-alive");

        InputStream body = conn.getInputStream();
        // HttpURLConnection does not decompress on its own; without this, gzip-encoded
        // responses would be handed to the caller (and saved to disk) still compressed.
        if ("gzip".equalsIgnoreCase(conn.getContentEncoding())) {
            try {
                return new GZIPInputStream(body);
            } catch (IOException e) {
                body.close();
                throw e;
            }
        }
        return body;
    }
}

2、正則工具類,用來匹配需要獲取的url地址

package com.dyw.crawler.util;

import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Regular-expression helpers for pulling links and image URLs out of HTML text.
 * Created by dyw on 2017/9/1.
 */
public class RegularUtils {
    // Matches one whole <img ...> tag that carries a src attribute. [^>] keeps the match
    // inside a single tag, so an <img> without src cannot swallow the src of a later tag.
    private static final Pattern IMG_TAG = Pattern.compile("<img[^>]*?src=[^>]*>");
    // Matches an href="..." attribute.
    private static final Pattern A_URL = Pattern.compile("href=\"(.*?)\"");
    // Matches scheme://...(png|jpg|bmp|gif). Quotes and angle brackets terminate the URL so
    // that adjacent URLs not separated by whitespace are not glued together.
    private static final Pattern IMG_SRC = Pattern.compile("[a-zA-Z]+://[^\\s\"'<>]*(?:png|jpg|bmp|gif)");

    /**
     * Finds every {@code href="..."} attribute in the given HTML.
     *
     * <p>Each returned element is the complete matched attribute text (including the
     * {@code href="} prefix and the closing quote), not only the URL inside it.
     *
     * @param html text to search
     * @return list of matches, empty if there are none
     */
    public static List<String> getAUrl(String html) {
        return match(A_URL, html);
    }

    /**
     * Finds the image URLs that appear inside {@code <img>} tags of the given HTML.
     *
     * @param html text to search
     * @return list of absolute image URLs ending in png/jpg/bmp/gif, empty if there are none
     */
    public static List<String> getIMGUrl(String html) {
        List<String> imgTags = match(IMG_TAG, html);
        return match(IMG_SRC, imgTags);
    }

    /**
     * Finds every absolute image URL (any scheme, ending in png/jpg/bmp/gif) anywhere in the
     * given text, whether or not it is inside an {@code <img>} tag.
     *
     * @param html text to search
     * @return list of image URLs, empty if there are none
     */
    public static List<String> getIMGSrc(String html) {
        return match(IMG_SRC, html);
    }

    /**
     * Collects every match of a pattern in one string.
     *
     * @param pattern compiled regular expression
     * @param html    text to search
     * @return the full text of every match, in order of appearance
     */
    private static List<String> match(Pattern pattern, String html) {
        List<String> found = new ArrayList<>();
        Matcher matcher = pattern.matcher(html);
        while (matcher.find()) {
            found.add(matcher.group());
        }
        return found;
    }

    /**
     * Collects every match of a pattern across a list of strings.
     *
     * @param pattern compiled regular expression
     * @param list    strings to search
     * @return the full text of every match, in list order then order of appearance
     */
    private static List<String> match(Pattern pattern, List<String> list) {
        List<String> found = new ArrayList<>();
        for (String text : list) {
            found.addAll(match(pattern, text));
        }
        return found;
    }
}

3、IO工具類,用來把獲取的html內容進行寫入到檔案中

package com.dyw.crawler.util;

import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;

/**
 * File I/O helpers: create files and write strings, byte arrays or streams to disk.
 * Created by dyw on 2017/9/1.
 */
public class IOUtils {

    /**
     * Creates the file if it does not exist yet, creating missing parent directories first.
     *
     * @param file file to create
     * @throws Exception wrapping the underlying failure if the file cannot be created
     */
    public static void createFile(File file) throws Exception {
        try {
            if (!file.exists()) {
                ensureParentDirectory(file);
                file.createNewFile();
            }
        } catch (Exception e) {
            throw new Exception("建立檔案的時候錯誤!", e);
        }
    }

    /**
     * Writes a string to a file, encoded as UTF-8. An existing file is overwritten.
     *
     * @param content  text to write
     * @param fileName destination file
     * @throws Exception if the content cannot be encoded or written
     */
    public static void writeFile(String content, File fileName) throws Exception {
        writeFile(content.getBytes("Utf-8"), fileName);
    }

    /**
     * Writes a byte array to a file. An existing file is overwritten and missing parent
     * directories are created.
     *
     * @param bytes    content to write
     * @param fileName destination file
     * @throws Exception wrapping the underlying failure if the file cannot be written
     */
    public static void writeFile(byte[] bytes, File fileName) throws Exception {
        try {
            ensureParentDirectory(fileName);
            // try-with-resources closes the stream even when write() throws.
            try (FileOutputStream out = new FileOutputStream(fileName)) {
                out.write(bytes);
            }
        } catch (Exception e) {
            throw new Exception("寫入檔案的時候錯誤!", e);
        }
    }

    /**
     * Reads the stream to its end, saves the bytes to a file and closes the stream.
     *
     * @param inputStream source stream; always closed by this method
     * @param fileName    destination file
     * @throws IllegalArgumentException if {@code inputStream} is null
     * @throws Exception                if reading or writing fails
     */
    public static void saveFile(InputStream inputStream, File fileName) throws Exception {
        if (inputStream == null) {
            throw new IllegalArgumentException("inputStream must not be null");
        }
        writeFile(readInputStream(inputStream), fileName);
    }

    /**
     * Reads all bytes from the stream and closes it, whether or not reading succeeds.
     *
     * @param inputStream source stream
     * @return every byte read from the stream
     * @throws IOException if reading fails
     */
    private static byte[] readInputStream(InputStream inputStream) throws IOException {
        try (InputStream in = inputStream;
             ByteArrayOutputStream bos = new ByteArrayOutputStream()) {
            byte[] buffer = new byte[8192];
            int len;
            while ((len = in.read(buffer)) != -1) {
                bos.write(buffer, 0, len);
            }
            return bos.toByteArray();
        }
    }

    /**
     * Creates the parent directory of {@code file} (and any missing ancestors) if needed.
     *
     * @param file file whose parent directory must exist
     * @throws IOException if the directory is missing and cannot be created
     */
    private static void ensureParentDirectory(File file) throws IOException {
        File parent = file.getAbsoluteFile().getParentFile();
        // The second isDirectory() check tolerates another process creating it concurrently.
        if (parent != null && !parent.isDirectory() && !parent.mkdirs() && !parent.isDirectory()) {
            throw new IOException("cannot create directory " + parent);
        }
    }
}

4、main方法執行

package com.dyw.crawler.project;

import com.dyw.crawler.util.CrawlerUtils;
import com.dyw.crawler.util.IOUtils;
import com.dyw.crawler.util.RegularUtils;

import java.io.File;
import java.io.InputStream;
import java.util.List;

/**
 * Downloads every image referenced by the {@code <img>} tags of one web page.
 * Created by dyw on 2017/9/4.
 */
public class Project1 {
    /**
     * Fetches the page, extracts its image URLs and saves each image into the output
     * directory under the last path segment of its URL.
     *
     * @param args optional: {@code args[0]} is the page URL, {@code args[1]} the output
     *             directory; the built-in defaults are used for whichever is absent
     */
    public static void main(String[] args) {
        // Page to crawl.
        String url = args.length > 0 ? args[0] : "http://blog.csdn.net/juewang_love";
        // Directory the images are saved into.
        String path = args.length > 1 ? args[1] : "C:\\Users\\dyw\\Desktop\\crawler";

        String htmlContent;
        try {
            htmlContent = CrawlerUtils.getHtml(url);
        } catch (Exception e) {
            throw new RuntimeException("獲取內容失敗!", e);
        }

        // Without the directory every single download would fail when opening the file.
        File targetDir = new File(path);
        if (!targetDir.isDirectory() && !targetDir.mkdirs()) {
            throw new RuntimeException("cannot create output directory: " + path);
        }

        // Download each image; one failure is reported and does not stop the others.
        List<String> imgUrls = RegularUtils.getIMGUrl(htmlContent);
        for (String imgUrl : imgUrls) {
            String[] split = imgUrl.split("/");
            String imgName = split[split.length - 1];
            File file1 = new File(path + "/" + imgName);
            // try-with-resources guarantees the stream is closed even if saving fails.
            try (InputStream inputStream = CrawlerUtils.downLoadFromUrl(imgUrl)) {
                IOUtils.saveFile(inputStream, file1);
                System.out.println("success:" + imgName);
            } catch (Exception e) {
                System.out.println("fail:" + imgUrl + " -> " + imgName + " (" + e + ")");
            }
        }
    }
}

5、修改 CrawlerUtils 工具類 用 httpclient 替代 urlConnection

package com.dyw.crawler.util;

import org.apache.commons.httpclient.HttpClient;
import org.apache.commons.httpclient.HttpMethod;
import org.apache.commons.httpclient.HttpStatus;
import org.apache.commons.httpclient.methods.GetMethod;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;

/**
 * Crawler helper built on Apache Commons HttpClient (replaces the URLConnection version).
 * Created by dyw on 2017/9/1.
 */
public class CrawlerUtils {

    /**
     * Sets the request headers sent with every request.
     *
     * @param httpMethod request to decorate
     */
    private static void setHead(HttpMethod httpMethod) {
        httpMethod.setRequestHeader("User-Agent", "Mozilla/4.0 (compatible; MSIE 5.0; Windows NT; DigExt)");
        // NOTE(review): "Utf-8" is not a media type and a GET has no body to describe;
        // this header looks unintended - confirm before changing what is sent.
        httpMethod.setRequestHeader("Content-Type", "Utf-8");
        httpMethod.setRequestHeader("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8");
    }

    /**
     * Fetches a page with a GET request and returns its body as a string.
     *
     * <p>The body is decoded as UTF-8 and every line is terminated with {@code '\n'}.
     *
     * @param url URL of the page
     * @return the whole page as a String
     * @throws Exception if the request fails, the status is not 200, or reading fails
     */
    public static String getHtml(String url) throws Exception {
        StringBuilder html = new StringBuilder();
        // Closing the reader closes the response stream even when a read fails.
        try (InputStream in = downLoadFromUrl(url);
             BufferedReader br = new BufferedReader(new InputStreamReader(in, "Utf-8"))) {
            String str;
            while ((str = br.readLine()) != null) {
                html.append(str).append('\n');
            }
        }
        return html.toString();
    }

    /**
     * Opens the response body of a GET request as a stream.
     *
     * <p>Never returns null: a non-200 status or a missing body is reported as an
     * {@link IOException}. The caller must close the returned stream.
     *
     * @param urlStr URL of the resource
     * @return InputStream over the response body
     * @throws IOException if the request fails, the status is not 200, or there is no body
     */
    public static InputStream downLoadFromUrl(String urlStr) throws IOException {
        HttpClient httpClient = new HttpClient();
        HttpMethod httpMethod = new GetMethod(urlStr);
        setHead(httpMethod);
        boolean handedOff = false;
        try {
            int status = httpClient.executeMethod(httpMethod);
            if (status != HttpStatus.SC_OK) {
                throw new IOException("unexpected HTTP status " + status + " for " + urlStr);
            }
            InputStream body = httpMethod.getResponseBodyAsStream();
            if (body == null) {
                throw new IOException("no response body for " + urlStr);
            }
            handedOff = true;
            return body;
        } finally {
            // On any failure the caller never sees a stream, so release the connection here.
            if (!handedOff) {
                httpMethod.releaseConnection();
            }
        }
    }
}