java網路程式設計____最簡單的爬蟲(爬取網站美女圖片)
阿新 • • 發佈:2019-01-07
package com.company.reptile;

import java.io.BufferedReader;
import java.io.DataInputStream;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.OutputStream;
import java.net.MalformedURLException;
import java.net.URL;
import java.net.URLConnection;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * A minimal single-page image crawler: fetches one HTML page, extracts the
 * http(s) URLs found inside its {@code <img>} tags and downloads each one.
 *
 * <p>It does not follow links (no breadth-first or depth-first traversal);
 * it is intended purely as a learning example.
 *
 * @author God
 */
public class JavaReptileUtil {

    /** Page that {@link #main(String[])} crawls. */
    private static final String WEB_SITE = "http://www.4493.com";

    /** Directory prefix that downloaded files are written under. */
    private static final String DOWNLOAD_DIR = "F:\\beauty\\";

    /**
     * Matches one complete {@code <img ...>} tag that carries a {@code src}
     * attribute. {@code [^>]*} keeps the match inside a single tag; a greedy
     * {@code .*} would run on to the last {@code src=} in the document.
     */
    private static final Pattern IMAGE_TAG_PATTERN =
            Pattern.compile("<img\\b[^>]*\\bsrc\\s*=[^>]*>", Pattern.CASE_INSENSITIVE);

    /**
     * Matches an absolute http or https URL inside a tag, stopping at
     * whitespace, either kind of quote, or an angle bracket.
     */
    private static final Pattern IMAGE_SRC_PATTERN =
            Pattern.compile("https?://[^\\s\"'<>]+", Pattern.CASE_INSENSITIVE);

    /**
     * Runs the crawler against {@code WEB_SITE}: fetch the page, collect the
     * image URLs, download them.
     *
     * @param args unused
     * @throws Exception if fetching the page or downloading an image fails
     */
    public static void main(String[] args) throws Exception {
        // 1. Fetch the HTML of the start page.
        String htmlInfo = getHtmlInfo(WEB_SITE);
        // 2. Extract the image URLs it references.
        List<String> imageSrc = getImageSrc(htmlInfo);
        // 3. Download every image.
        downloadImage(imageSrc);
    }

    /**
     * Downloads the document at {@code host} and returns it as text.
     *
     * <p>Lines are joined with {@code '\n'} so that tokens on adjacent lines
     * are not fused together. The bytes are decoded as UTF-8 so the result
     * does not depend on the platform default charset; the URLs extracted
     * later are plain ASCII, so a page in another ASCII-compatible encoding
     * still yields the same URLs.
     *
     * @param host absolute URL of the page to fetch
     * @return the page content, each line terminated by {@code '\n'}
     * @throws Exception if the URL is malformed or the page cannot be read
     */
    public static String getHtmlInfo(String host) throws Exception {
        URLConnection urlConnection = new URL(host).openConnection();
        StringBuilder html = new StringBuilder();
        // try-with-resources closes the reader (and the underlying stream)
        // even when reading fails part-way through.
        try (BufferedReader reader = new BufferedReader(
                new InputStreamReader(urlConnection.getInputStream(), StandardCharsets.UTF_8))) {
            String line;
            while ((line = reader.readLine()) != null) {
                html.append(line).append('\n');
            }
        }
        return html.toString();
    }

    /**
     * Extracts every absolute http(s) URL that appears inside an
     * {@code <img>} tag having a {@code src} attribute.
     *
     * <p>All such URLs in a tag are returned, not only the value of
     * {@code src} itself, in document order and without de-duplication.
     *
     * @param htmlInfo HTML text to scan
     * @return the URLs found; empty if there are none
     */
    public static List<String> getImageSrc(String htmlInfo) {
        List<String> imageSrc = new ArrayList<String>();
        Matcher tagMatcher = IMAGE_TAG_PATTERN.matcher(htmlInfo);
        while (tagMatcher.find()) {
            Matcher urlMatcher = IMAGE_SRC_PATTERN.matcher(tagMatcher.group());
            while (urlMatcher.find()) {
                imageSrc.add(urlMatcher.group());
            }
        }
        return imageSrc;
    }

    /**
     * Downloads each URL in {@code imageSrc} to a file under
     * {@code F:\beauty\}, named after the last path segment of the URL.
     *
     * <p>Entries that are not valid URLs are skipped. A failure while
     * creating the directory or transferring a file aborts the run with an
     * {@link IOException}.
     *
     * @param imageSrc URLs to download
     * @throws IOException if the target directory cannot be created, or a
     *     download or file write fails
     */
    public static void downloadImage(List<String> imageSrc) throws IOException {
        for (String src : imageSrc) {
            URL url;
            try {
                url = new URL(src);
            } catch (MalformedURLException ignored) {
                // Best effort: a malformed entry must not stop the other downloads.
                continue;
            }

            File target = new File(DOWNLOAD_DIR + NetUtil.getStrName(src));
            File parent = target.getParentFile();
            // Create the output directory on first use; re-check isDirectory()
            // after a failed mkdirs() in case it was created concurrently.
            if (parent != null && !parent.isDirectory() && !parent.mkdirs()
                    && !parent.isDirectory()) {
                throw new IOException("Cannot create download directory: " + parent);
            }

            // Both streams are closed even if the transfer fails mid-way.
            try (InputStream in = url.openStream();
                 OutputStream out = new FileOutputStream(target)) {
                byte[] bytes = new byte[1024];
                int length;
                while ((length = in.read(bytes)) != -1) {
                    out.write(bytes, 0, length);
                    System.out.println("下載中....");
                }
            }
            System.out.println("下載完成...");
        }
    }
}
//
package com.company.reptile;
public class NetUtil {

    /**
     * Returns the last non-empty {@code '/'}-separated segment of a URL,
     * which callers use as the image file name.
     *
     * <p>Trailing slashes are ignored, so {@code "http://a.com/img/"} yields
     * {@code "img"}. Input that is empty or consists only of slashes yields
     * the empty string instead of throwing
     * {@link ArrayIndexOutOfBoundsException}.
     *
     * @param url the URL (or path) to take the name from; must not be null
     * @return the last path segment, or {@code ""} if there is none
     * @throws NullPointerException if {@code url} is null
     */
    public static String getStrName(String url) {
        // Skip trailing slashes so "a/b/" resolves to "b".
        int end = url.length();
        while (end > 0 && url.charAt(end - 1) == '/') {
            end--;
        }
        // lastIndexOf returns -1 when no slash precedes the segment, which
        // makes start 0 and selects the whole remaining string.
        int start = url.lastIndexOf('/', end - 1) + 1;
        return url.substring(start, end);
    }
}
//執行結果
//圖片