爬蟲記錄(2)——簡單爬取一個頁面的圖片並儲存
阿新 • • 發佈:2019-02-08
1、爬蟲工具類,用來獲取網頁內容
package com.dyw.crawler.util;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.net.HttpURLConnection;
import java.net.URL;
import java.net.URLConnection;
/**
* 爬蟲工具類
* Created by dyw on 2017/9/1.
*/
public class CrawlerUtils {
    /**
     * Fetches the page at {@code url} and returns the whole body as a String,
     * with lines re-joined by '\n'.
     *
     * @param url the URL to fetch
     * @return the whole page as a String
     * @throws Exception if the connection or the read fails
     */
    public static String getHtml(String url) throws Exception {
        URLConnection connection = new URL(url).openConnection();
        // try-with-resources guarantees the streams are closed even when reading
        // throws (the original closed them manually and leaked on exception).
        // Decode explicitly as UTF-8 instead of the platform-default charset.
        try (InputStream in = connection.getInputStream();
             BufferedReader br = new BufferedReader(new InputStreamReader(in, "UTF-8"))) {
            StringBuilder sb = new StringBuilder();
            String line;
            while ((line = br.readLine()) != null) {
                sb.append(line).append('\n');
            }
            return sb.toString();
        }
    }

    /**
     * Opens a GET connection to {@code urlStr} and returns the raw response stream.
     * The caller is responsible for closing the returned stream.
     *
     * @param urlStr url address to download from
     * @return the response body as an InputStream
     * @throws IOException if the connection fails
     */
    public static InputStream downLoadFromUrl(String urlStr) throws IOException {
        URL url = new URL(urlStr);
        HttpURLConnection conn = (HttpURLConnection) url.openConnection();
        // Pretend to be a browser so simple anti-crawler filters do not return 403.
        conn.setRequestProperty("User-Agent", "Mozilla/4.0 (compatible; MSIE 5.0; Windows NT; DigExt)");
        // 3-second connect timeout
        conn.setConnectTimeout(3 * 1000);
        conn.setRequestProperty("Accept",
                "image/gif, image/x-xbitmap, image/jpeg, image/pjpeg, application/x-shockwave-flash, application/vnd.ms-powerpoint, application/vnd.ms-excel, application/msword, */*");
        conn.setRequestProperty("Accept-Language", "zh-cn");
        conn.setRequestProperty("UA-CPU", "x86");
        // BUG FIX: the original sent "Accept-Encoding: gzip" but never decompressed
        // the response, so any server honouring it would deliver corrupted bytes.
        // The header is omitted now, so the body arrives identity-encoded.
        conn.setRequestProperty("Content-type", "text/html");
        conn.setRequestProperty("Connection", "keep-alive");
        return conn.getInputStream();
    }
}
2、正則工具類,用來匹配需要獲取的url地址
package com.dyw.crawler.util;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
* 正則表示式工具類
* Created by dyw on 2017/9/1.
*/
public class RegularUtils {
    // Matches a whole <img ...> tag (note: src value is captured loosely).
    // Patterns are compiled once and cached — the original recompiled them on
    // every call, and per element inside the list overload's loop.
    private static final Pattern IMGURL_PATTERN = Pattern.compile("<img.*src=(.*?)[^>]*?>");
    // Matches href="..." attributes (whole attribute is returned, quotes included).
    private static final Pattern AURL_PATTERN = Pattern.compile("href=\"(.*?)\"");
    // Matches a scheme://... URL ending in png|jpg|bmp|gif.
    // BUG FIX: the original char class was [a-zA-z], which also matched the
    // punctuation between 'Z' and 'a' ([ \ ] ^ _ `).
    private static final Pattern IMGSRC_PATTERN = Pattern.compile("[a-zA-Z]+://[^\\s]*(?:png|jpg|bmp|gif)");

    /**
     * Extracts every href="..." attribute from the given HTML.
     *
     * @param html content to scan
     * @return list of matched attribute strings (e.g. {@code href="http://..."})
     */
    public static List<String> getAUrl(String html) {
        return match(AURL_PATTERN, html);
    }

    /**
     * Extracts image URLs that appear inside {@code <img>} tags.
     *
     * @param html content to scan
     * @return list of image URLs found within img tags
     */
    public static List<String> getIMGUrl(String html) {
        // First isolate the <img> tags, then pull the http...png/jpg/bmp/gif URL
        // out of each tag.
        List<String> imgTags = match(IMGURL_PATTERN, html);
        return match(IMGSRC_PATTERN, imgTags);
    }

    /**
     * Extracts every image URL (scheme://...png|jpg|bmp|gif) from the given text.
     *
     * @param html content to scan
     * @return list of matched image URLs
     */
    public static List<String> getIMGSrc(String html) {
        return match(IMGSRC_PATTERN, html);
    }

    /**
     * Collects every match of {@code pattern} in {@code html}.
     *
     * @param pattern compiled pattern
     * @param html    content to scan
     * @return list of full-match strings, in order of appearance
     */
    private static List<String> match(Pattern pattern, String html) {
        Matcher matcher = pattern.matcher(html);
        List<String> list = new ArrayList<>();
        while (matcher.find()) {
            list.add(matcher.group());
        }
        return list;
    }

    /**
     * Collects every match of {@code pattern} across all strings in {@code list}.
     *
     * @param pattern compiled pattern
     * @param list    strings to scan
     * @return list of full-match strings, in order of appearance
     */
    private static List<String> match(Pattern pattern, List<String> list) {
        List<String> result = new ArrayList<>();
        list.forEach(string -> {
            Matcher matcher = pattern.matcher(string);
            while (matcher.find()) {
                result.add(matcher.group());
            }
        });
        return result;
    }
}
3、IO工具類,用來把獲取的html內容進行寫入到檔案中
package com.dyw.crawler.util;
import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
/**
* IO工具類
* Created by dyw on 2017/9/1.
*/
public class IOUtils {
    /**
     * Creates the file on disk if it does not exist yet.
     *
     * @param file file to create
     * @throws Exception wrapping any failure from the filesystem
     */
    public static void createFile(File file) throws Exception {
        try {
            if (!file.exists()) {
                file.createNewFile();
            }
        } catch (Exception e) {
            throw new Exception("建立檔案的時候錯誤!", e);
        }
    }

    /**
     * Writes a String to the given file, encoded as UTF-8.
     *
     * @param content  content to write
     * @param fileName destination file
     * @throws Exception wrapping any failure while writing
     */
    public static void writeFile(String content, File fileName) throws Exception {
        writeFile(content.getBytes("Utf-8"), fileName);
    }

    /**
     * Writes a byte array to the given file, replacing existing content.
     *
     * @param bytes    content to write
     * @param fileName destination file
     * @throws Exception wrapping any failure while writing
     */
    public static void writeFile(byte[] bytes, File fileName) throws Exception {
        // try-with-resources: the original leaked the FileOutputStream when
        // write() threw, because close() was only reached on success.
        try (FileOutputStream out = new FileOutputStream(fileName)) {
            out.write(bytes);
        } catch (Exception e) {
            throw new Exception("寫入檔案的時候錯誤!", e);
        }
    }

    /**
     * Drains an InputStream into the given file. The input stream is always
     * closed, even if reading fails.
     *
     * @param inputStream input to drain
     * @param fileName    destination file
     * @throws Exception wrapping any failure while reading or writing
     */
    public static void saveFile(InputStream inputStream, File fileName) throws Exception {
        writeFile(readInputStream(inputStream), fileName);
    }

    /**
     * Reads an entire InputStream into a byte array, closing it afterwards.
     *
     * @param inputStream input to drain (closed by this method)
     * @return the stream's full contents
     * @throws IOException if reading fails
     */
    private static byte[] readInputStream(InputStream inputStream) throws IOException {
        ByteArrayOutputStream bos = new ByteArrayOutputStream();
        try {
            byte[] buffer = new byte[1024];
            int len;
            while ((len = inputStream.read(buffer)) != -1) {
                bos.write(buffer, 0, len);
            }
        } finally {
            // the original only closed the stream on the success path
            inputStream.close();
        }
        return bos.toByteArray();
    }
}
4、main方法執行
package com.dyw.crawler.project;
import com.dyw.crawler.util.CrawlerUtils;
import com.dyw.crawler.util.IOUtils;
import com.dyw.crawler.util.RegularUtils;
import java.io.File;
import java.io.InputStream;
import java.util.List;
/**
* 下載網頁中的圖片
* Created by dyw on 2017/9/4.
*/
public class Project1 {
    /**
     * Crawls one page, extracts every image URL, and downloads each image
     * into a fixed local directory, named after the URL's last path segment.
     */
    public static void main(String[] args) {
        // directory where the downloaded images are stored
        String path = "C:\\Users\\dyw\\Desktop\\crawler";
        // page to crawl
        String url = "http://blog.csdn.net/juewang_love";
        // fetch the page HTML
        String htmlContent;
        try {
            htmlContent = CrawlerUtils.getHtml(url);
        } catch (Exception e) {
            throw new RuntimeException("獲取內容失敗!", e);
        }
        // every image URL found on the page
        List<String> imgUrls = RegularUtils.getIMGUrl(htmlContent);
        // download each image
        imgUrls.forEach(imgUrl -> {
            String[] segments = imgUrl.split("/");
            String imgName = segments[segments.length - 1];
            try {
                File target = new File(path + "/" + imgName);
                InputStream inputStream = CrawlerUtils.downLoadFromUrl(imgUrl);
                IOUtils.saveFile(inputStream, target);
                System.out.println("success:" + imgName);
            } catch (Exception e) {
                // BUG FIX: the original concatenated with "" (an empty string);
                // a space separator was clearly intended. Also report the cause
                // instead of silently discarding the exception.
                System.out.println("fail:" + imgUrl + " " + imgName + " (" + e + ")");
            }
        });
    }
}
5、修改 CrawlerUtils 工具類 用 httpclient 替代 urlConnection
package com.dyw.crawler.util;
import org.apache.commons.httpclient.HttpClient;
import org.apache.commons.httpclient.HttpMethod;
import org.apache.commons.httpclient.HttpStatus;
import org.apache.commons.httpclient.methods.GetMethod;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
/**
* 爬蟲工具類
* Created by dyw on 2017/9/1.
*/
public class CrawlerUtils {
    /**
     * Applies the common request headers to an HTTP method.
     *
     * @param httpMethod request to decorate
     */
    private static void setHead(HttpMethod httpMethod) {
        // Pretend to be a browser so simple anti-crawler filters do not return 403.
        httpMethod.setRequestHeader("User-Agent", "Mozilla/4.0 (compatible; MSIE 5.0; Windows NT; DigExt)");
        // NOTE(review): "Utf-8" is not a valid Content-Type value (and the header is
        // unusual on a GET); kept as-is to preserve the exact bytes sent on the wire.
        httpMethod.setRequestHeader("Content-Type", "Utf-8");
        httpMethod.setRequestHeader("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8");
    }

    /**
     * Fetches the page at {@code url} via GET and returns its body as a String,
     * decoded as UTF-8 with lines re-joined by '\n'.
     *
     * @param url url to fetch
     * @return the whole page as a String
     * @throws Exception if the request or the read fails
     */
    public static String getHtml(String url) throws Exception {
        // try-with-resources closes the reader and the underlying response
        // stream; the original never closed either.
        try (BufferedReader br = new BufferedReader(
                new InputStreamReader(downLoadFromUrl(url), "Utf-8"))) {
            StringBuilder sb = new StringBuilder();
            String line;
            while ((line = br.readLine()) != null) {
                sb.append(line).append('\n');
            }
            return sb.toString();
        }
    }

    /**
     * Performs a GET request and returns the response body stream.
     * The caller must close the returned stream.
     *
     * @param urlStr url address to fetch
     * @return the response body as an InputStream
     * @throws IOException on connection failure or a non-200 status
     */
    public static InputStream downLoadFromUrl(String urlStr) throws IOException {
        // httpclient replaces the earlier URLConnection-based implementation
        HttpClient httpClient = new HttpClient();
        HttpMethod httpMethod = new GetMethod(urlStr);
        setHead(httpMethod);
        int status = httpClient.executeMethod(httpMethod);
        // BUG FIX: the original returned null for any non-200 status, which made
        // getHtml() fail later with a NullPointerException. Fail fast instead —
        // IOException is already part of this method's declared contract.
        if (status != HttpStatus.SC_OK) {
            throw new IOException("GET " + urlStr + " failed with HTTP status " + status);
        }
        return httpMethod.getResponseBodyAsStream();
    }
}