實現抓取網頁圖片(JAVA實現)
最近學習網頁設計,想仿網路上的一個頁面,圖片素材一個一個地儲存起來太麻煩,就想著利用Java來實現一個小小的網頁圖片爬蟲。程式碼很簡單,不一會兒就寫好了,但是當我訪問https協議的圖片時,一直報javax.net.ssl.SSLKeyException異常;到儲存圖片的目錄一看,http協議的圖片已經爬取下來了,但https的沒有。花了三天,還是一直卡在這個異常上,在百度、Google上找了很久。看大神們的部落格原始碼,一個個寫得都差不多,我也照著寫了一遍,但就是沒有辦法實現HTTPS的訪問,一直出現javax.net.ssl.SSLKeyException。
沒辦法,不能只是copy。找了很多教程才發現,原來http與https是有區別的。簡單理解,https就是一種加密的http,也就是在HTTP上加入了SSL協議;SSL依靠證書來驗證伺服器的身份,進而對客戶端與伺服器之間的資料通訊進行加密……。切入正題,要進行HTTPS的連線就必須驗證證書,但是證書我們沒有,所以想要連線就必須先獲得證書。獲取網頁證書其實也挺簡單:訪問https網站時,瀏覽器地址欄會有個綠色小鎖標誌,
檢視證書
點選複製到檔案……,就可以儲存到本地。儲存到本地後,在連線https的時候讀取證書檔案作為驗證,就可以連線到https了。不過這個方法我沒有寫,因為每抓取一個網站的圖片就要先獲取這個網站的證書,還是有點麻煩的。所以我使用的是下面的第二種方法:跳過證書驗證,即訪問任何HTTPS網站都不驗證證書。雖然說這樣不安全,但是對於只是爬取圖片來說,沒什麼差別。廢話不多說,上程式碼。
- 跳過SSL證書驗證方法
/**
 * Builds an {@link SSLSocketFactory} that performs NO certificate validation.
 *
 * <p>Side effect: also installs the permissive hostname verifier and the
 * permissive socket factory as the JVM-wide {@link HttpsURLConnection} defaults.
 * This is insecure by design and only acceptable for throw-away tooling such as
 * this image crawler.
 *
 * @return a socket factory backed by {@link #myTrustManager}
 * @throws NoSuchAlgorithmException if no provider supports the "TLS" protocol
 * @throws KeyManagementException if the SSL context cannot be initialised
 */
public static SSLSocketFactory createSSL() throws KeyManagementException, NoSuchAlgorithmException, NoSuchProviderException, KeyStoreException, CertificateException, FileNotFoundException, IOException {
    TrustManager[] tm = new TrustManager[] { myTrustManager };
    SSLContext sslContext = SSLContext.getInstance("TLS");
    sslContext.init(null, tm, null);
    SSLSocketFactory ssf = sslContext.getSocketFactory();
    HttpsURLConnection.setDefaultHostnameVerifier(ignoreHostnameVerifier);
    HttpsURLConnection.setDefaultSSLSocketFactory(ssf);
    return ssf;
}

/** Trust manager that accepts every client and server certificate chain. */
public static TrustManager myTrustManager = new X509TrustManager() {
    @Override
    public void checkClientTrusted(X509Certificate[] arg0, String arg1) throws CertificateException {
        // Intentionally empty: every client certificate is accepted.
    }

    @Override
    public void checkServerTrusted(X509Certificate[] arg0, String arg1) {
        // Intentionally empty: every server certificate is accepted.
    }

    @Override
    public X509Certificate[] getAcceptedIssuers() {
        // The X509TrustManager contract requires a non-null (possibly empty) array.
        return new X509Certificate[0];
    }
};
這就解決了HTTPS連線時的驗證問題。其實網上的程式碼也是如此,但是我就是在這個地方卡了很久,網上搜索到的程式碼都是這個型別,卻都沒有提到我接下來遇到的問題。我覺得這個問題雖然很小,但是對於新手來說很致命,就像我一樣,所以我需要自己記下來,順便分享給同樣遇到這個問題的朋友:程式碼正確,卻還是報javax.net.ssl.SSLKeyException異常。其實關鍵在於JRE中lib/ext資料夾裡的檔案,具體為什麼我也不清楚,搜尋問題時有人提到這個資料夾,所以我就試了試。我把這個資料夾的內容匯入到Eclipse專案中,但是問題還是沒有解決,所以我對比了一下Java安裝目錄下的JDK和JRE,發現JRE/lib包含的東西比JDK中的多,好吧其實我也不懂。就抱著嘗試的態度把專案中的Java執行環境改成JRE,想不到可以執行了,簡直不可思議。在專案上右擊執行方式(英文版好像是Run As),開啟執行配置,選JRE,選擇備用JRE,點選已安裝
點進去後新增(Add)JRE,JRE主目錄填寫JRE安裝的位置(非JDK),完成。
完成後,選中你新增的JRE,點選應用,確認。此時再執行,發現不報錯了。本地也找到爬取的圖片,完成。
下面貼上完整的程式碼:Utils類
package com.get;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.UnsupportedEncodingException;
import java.net.HttpURLConnection;
import java.net.URL;
import java.security.KeyManagementException;
import java.security.KeyStoreException;
import java.security.NoSuchAlgorithmException;
import java.security.cert.X509Certificate;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import javax.net.ssl.SSLContext;
import org.apache.http.HttpEntity;
import org.apache.http.client.ClientProtocolException;
import org.apache.http.client.config.CookieSpecs;
import org.apache.http.client.config.RequestConfig;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.conn.ssl.SSLConnectionSocketFactory;
import org.apache.http.conn.ssl.SSLContextBuilder;
import org.apache.http.conn.ssl.TrustStrategy;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
/**
 * Helper routines for a small image crawler: fetching a page's HTML and
 * pulling image URLs and file names out of it with regular expressions.
 *
 * @author 90604
 */
public class Utils {
    /** Regex for a whole {@code <img ...>} tag (not used by the crawler itself). */
    public static final String IMGURL_REG = "<img.*src=(.*?)[^>]*?>";

    /**
     * Regex for an absolute URL: letters, "://", then everything up to whitespace.
     * The scheme class is {@code [a-zA-Z]}; the former {@code [a-zA-z]} range also
     * admitted the punctuation characters that sit between 'Z' and 'a'.
     */
    public static final String IMGSRC_REG = "[a-zA-Z]+://[^\\s]*";

    /**
     * Regex for a CSS {@code url(...)} reference; group 1 is the content of the
     * parentheses. The previous value used '/' where regex escapes were intended
     * and therefore could never match a real {@code url(...)}.
     */
    public static final String BACKGROUND_REG = "url\\((\\S+?)\\)";

    /**
     * Regex for an http/https link ending in .jpg, .gif or .png (the one the
     * crawler actually uses). The URL body excludes whitespace, quotes and angle
     * brackets so a single match cannot run across several attributes or tags.
     */
    public static final String HTTP_IMG = "(http|https)://[^\\s\"'<>]+\\.(jpg|gif|png)";

    /**
     * Downloads the page at {@code urlString} with {@link HttpURLConnection}.
     *
     * @param urlString absolute http/https URL of the page
     * @return the response body decoded as UTF-8
     * @throws IOException if the URL is malformed or the request fails
     */
    public static String getHtml(String urlString) throws IOException {
        URL url = new URL(urlString);
        HttpURLConnection hrc = (HttpURLConnection) url.openConnection();
        try {
            // convertStreamToString closes the stream once it has been drained.
            return Utils.convertStreamToString(hrc.getInputStream());
        } finally {
            hrc.disconnect();
        }
    }

    /**
     * Downloads the page at {@code url} with Apache HttpClient.
     *
     * @param url absolute URL to fetch
     * @return the response body decoded as UTF-8, or {@code null} when the request
     *         failed or the response carried no entity
     */
    public static String setImageConnectTool(String url) {
        String html = null;
        RequestConfig globalConfig = RequestConfig.custom()
                .setCookieSpec(CookieSpecs.STANDARD)
                // Max wait (ms) for a connection from the connection manager.
                .setConnectionRequestTimeout(5000)
                // Max wait (ms) for the TCP connection to be established.
                .setConnectTimeout(6000)
                .build();
        CloseableHttpClient httpClient = HttpClients.custom()
                .setDefaultRequestConfig(globalConfig)
                .build();
        CloseableHttpResponse response = null;
        try {
            HttpGet httpGet = new HttpGet(url);
            httpGet.addHeader("User-Agent","Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:53.0) Gecko/20100101 Firefox/53.0");
            response = httpClient.execute(httpGet);
            System.out.println(response.getStatusLine());
            HttpEntity entity = response.getEntity();
            if (entity != null) {
                // Keep the converted page; it used to be computed and thrown away,
                // so this method always returned null.
                html = Utils.convertStreamToString(entity.getContent());
            }
        } catch (ClientProtocolException e) {
            e.printStackTrace();
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            // Release the connection and the client's pool.
            closeQuietly(response);
            closeQuietly(httpClient);
        }
        return html;
    }

    /**
     * Reads {@code in} to the end as UTF-8 text and closes it.
     *
     * <p>Every line is terminated with the platform line separator. A read error
     * is printed and whatever was read up to that point is returned.
     *
     * @param in stream to drain; always closed before returning
     * @return the decoded text
     * @throws UnsupportedEncodingException if UTF-8 is unavailable
     */
    public static String convertStreamToString(InputStream in) throws UnsupportedEncodingException {
        BufferedReader reader = new BufferedReader(new InputStreamReader(in, "UTF-8"));
        StringBuilder sb = new StringBuilder();
        String lineSeparator = System.getProperty("line.separator");
        try {
            String line;
            while ((line = reader.readLine()) != null) {
                sb.append(line).append(lineSeparator);
            }
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            try {
                in.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
        return sb.toString();
    }

    /**
     * Collects every match of {@code reg} in {@code html}.
     *
     * @param html text to scan
     * @param reg  regular expression to apply
     * @return the full text of each match, in order of appearance
     */
    public static List<String> getImageUrl(String html, String reg) {
        Matcher matcher = Pattern.compile(reg).matcher(html);
        List<String> listimgurl = new ArrayList<String>();
        while (matcher.find()) {
            listimgurl.add(matcher.group());
        }
        return listimgurl;
    }

    /**
     * Extracts absolute URLs from the given fragments using {@link #IMGSRC_REG}.
     *
     * <p>The last character of every match is dropped; presumably this removes the
     * closing quote of the attribute value, which the pattern also consumes.
     *
     * @param listimageurl fragments to scan (for example {@code <img>} tags)
     * @return the extracted URLs
     */
    public List<String> getImageSrc(List<String> listimageurl) {
        List<String> listImageSrc = new ArrayList<String>();
        Pattern srcPattern = Pattern.compile(IMGSRC_REG); // compiled once, not per fragment
        for (String image : listimageurl) {
            Matcher matcher = srcPattern.matcher(image);
            while (matcher.find()) {
                String match = matcher.group();
                listImageSrc.add(match.substring(0, match.length() - 1));
            }
        }
        return listImageSrc;
    }

    /**
     * Returns the part of {@code urlName} after its last '/'.
     *
     * @param urlName URL or path
     * @return the trailing segment, or the whole string when it contains no '/'
     */
    public static String getImageName(String urlName) {
        return urlName.substring(urlName.lastIndexOf("/") + 1);
    }

    /** Closes {@code resource} if it is non-null, printing any close failure. */
    private static void closeQuietly(java.io.Closeable resource) {
        if (resource == null) {
            return;
        }
        try {
            resource.close();
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
}
ImageFile類(儲存圖片檔案):
package com.get;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.net.HttpURLConnection;
import java.net.MalformedURLException;
import java.net.URL;
import java.security.KeyManagementException;
import java.security.KeyStoreException;
import java.security.NoSuchAlgorithmException;
import java.security.NoSuchProviderException;
import java.security.cert.CertificateException;
import java.security.cert.X509Certificate;
import javax.net.ssl.HostnameVerifier;
import javax.net.ssl.HttpsURLConnection;
import javax.net.ssl.SSLContext;
import javax.net.ssl.SSLSession;
import javax.net.ssl.SSLSocketFactory;
import javax.net.ssl.TrustManager;
import javax.net.ssl.X509TrustManager;
/**
 * Runnable that downloads one image over HTTP or HTTPS and stores it under
 * {@link #PATH}.
 *
 * <p>HTTPS connections deliberately skip certificate and hostname validation
 * (see {@link #createSSL()}); this is insecure and only suitable for a
 * throw-away crawler.
 */
public class ImageFile implements Runnable {
    private String url;
    private String name;
    private static final String PATH = "D:\\img\\";
    private static final int TIMEOUT_MILLIS = 5000;
    private static final int BUFFER_SIZE = 1024;
    private static final String USER_AGENT =
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:53.0) Gecko/20100101 Firefox/53.0";

    /** Hostname verifier that accepts every host name. */
    static HostnameVerifier ignoreHostnameVerifier = new HostnameVerifier() {
        @Override
        public boolean verify(String arg0, SSLSession arg1) {
            return true;
        }
    };

    /**
     * @param url  absolute http/https URL of the image
     * @param name file name to store the image under, relative to {@link #PATH}
     */
    public ImageFile(String url, String name) {
        this.url = url;
        this.name = name;
    }

    /**
     * Downloads the image and writes it to {@code PATH + name}. Failures are
     * printed to stderr and never propagate out of the thread.
     */
    @Override
    public void run() {
        File dir = new File(PATH);
        if (!dir.exists()) {
            dir.mkdirs();
            System.out.println("圖片存放於"+PATH+"目錄下");
        }
        File file = new File(PATH + name);
        HttpURLConnection conn = null;
        InputStream in = null;
        OutputStream os = null;
        try {
            conn = openConnection(new URL(this.url));
            System.out.println(conn.getResponseCode() + " " + conn.getResponseMessage());
            in = conn.getInputStream();
            // Open the target only once the response stream is available, so a
            // failed request does not leave an empty file behind.
            os = new FileOutputStream(file);
            byte[] buff = new byte[BUFFER_SIZE];
            int read;
            while ((read = in.read(buff)) != -1) {
                os.write(buff, 0, read);
            }
        } catch (IOException e) {
            e.printStackTrace();
        } catch (Exception e) {
            // Thread boundary: also covers SSL setup failures and non-HTTP URLs.
            e.printStackTrace();
        } finally {
            // Each resource is null-checked; the old unconditional os.close()
            // threw a NullPointerException when the file could not be opened.
            closeQuietly(os);
            closeQuietly(in);
            if (conn != null) {
                conn.disconnect();
            }
        }
    }

    /** Opens and connects a GET request to {@code u}; HTTPS gets the permissive SSL setup. */
    private static HttpURLConnection openConnection(URL u)
            throws IOException, java.security.GeneralSecurityException {
        HttpURLConnection conn = (HttpURLConnection) u.openConnection();
        if (conn instanceof HttpsURLConnection) {
            HttpsURLConnection https = (HttpsURLConnection) conn;
            https.setSSLSocketFactory(createSSL());
            // A connection captures the default verifier when it is created, i.e.
            // before createSSL() replaced the default, so set it explicitly here.
            https.setHostnameVerifier(ignoreHostnameVerifier);
        }
        conn.setConnectTimeout(TIMEOUT_MILLIS);
        conn.setReadTimeout(TIMEOUT_MILLIS);
        conn.setRequestMethod("GET");
        conn.setRequestProperty("User-Agent", USER_AGENT);
        conn.connect();
        return conn;
    }

    /** Closes {@code resource} if it is non-null, printing any close failure. */
    private static void closeQuietly(java.io.Closeable resource) {
        if (resource == null) {
            return;
        }
        try {
            resource.close();
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /**
     * Builds an {@link SSLSocketFactory} that performs NO certificate validation.
     *
     * <p>Side effect: also installs {@link #ignoreHostnameVerifier} and the new
     * factory as the JVM-wide {@link HttpsURLConnection} defaults.
     *
     * @return a socket factory backed by {@link #myTrustManager}
     * @throws NoSuchAlgorithmException if no provider supports the "TLS" protocol
     * @throws KeyManagementException if the SSL context cannot be initialised
     */
    public static SSLSocketFactory createSSL() throws KeyManagementException, NoSuchAlgorithmException, NoSuchProviderException, KeyStoreException, CertificateException, FileNotFoundException, IOException {
        TrustManager[] tm = new TrustManager[] { myTrustManager };
        SSLContext sslContext = SSLContext.getInstance("TLS");
        sslContext.init(null, tm, null);
        SSLSocketFactory ssf = sslContext.getSocketFactory();
        HttpsURLConnection.setDefaultHostnameVerifier(ignoreHostnameVerifier);
        HttpsURLConnection.setDefaultSSLSocketFactory(ssf);
        return ssf;
    }

    /** Trust manager that accepts every client and server certificate chain. */
    public static TrustManager myTrustManager = new X509TrustManager() {
        @Override
        public void checkClientTrusted(X509Certificate[] arg0, String arg1) throws CertificateException {
            // Intentionally empty: every client certificate is accepted.
        }

        @Override
        public void checkServerTrusted(X509Certificate[] arg0, String arg1) {
            // Intentionally empty: every server certificate is accepted.
        }

        @Override
        public X509Certificate[] getAcceptedIssuers() {
            // The X509TrustManager contract requires a non-null (possibly empty) array.
            return new X509Certificate[0];
        }
    };
}
主函式:
package com.get;
import java.io.IOException;
import java.security.KeyManagementException;
import java.security.NoSuchAlgorithmException;
import java.security.NoSuchProviderException;
import java.util.ArrayList;
import java.util.List;
/**
 * Entry point of the crawler: fetches one page, extracts the image links and
 * downloads each of them with an {@link ImageFile} task.
 */
public class Main_getImage {
    /** Upper bound on simultaneous downloads. */
    private static final int DOWNLOAD_THREADS = 8;

    /**
     * Crawls the hard-coded page and downloads every image link found in it.
     *
     * @param args ignored
     */
    public static void main(String[] args) throws KeyManagementException, NoSuchAlgorithmException, NoSuchProviderException, IOException {
        String html;
        try {
            html = Utils.getHtml("http://m.lashou.com/");
        } catch (IOException e) {
            e.printStackTrace();
            // Without a page there is nothing to scan; continuing used to end in a
            // NullPointerException inside Utils.getImageUrl.
            return;
        }

        List<String> list = Utils.getImageUrl(html, Utils.HTTP_IMG);

        // Names already queued. A repeated name would otherwise make two tasks
        // write the same target file at the same time.
        java.util.Set<String> queuedNames = new java.util.HashSet<String>();
        // Bounded pool instead of one raw thread per image.
        java.util.concurrent.ExecutorService pool =
                java.util.concurrent.Executors.newFixedThreadPool(DOWNLOAD_THREADS);
        try {
            for (String match : list) {
                // A match may hold two links joined by a lazy-loading "original"
                // attribute; the split is a no-op when it holds just one. The old
                // extension test in front of this used '||' and was always true,
                // so every match already took this path.
                for (String imageUrl : match.split("\" original=\"")) {
                    String fileName = Utils.getImageName(imageUrl);
                    if (fileName.length() == 0 || !queuedNames.add(fileName)) {
                        continue;
                    }
                    System.out.println(imageUrl+"\n");
                    pool.execute(new ImageFile(imageUrl, fileName));
                }
            }
        } finally {
            // Already-queued downloads still finish; the JVM exits afterwards.
            pool.shutdown();
        }
    }
}