jsoup爬取指定網頁的url和圖片
阿新 • • 發佈:2019-01-29
其中TestHttpClient用了上一篇文章的類!import org.apache.commons.io.IOUtils; import org.apache.http.HttpEntity; import org.apache.http.client.methods.CloseableHttpResponse; import org.apache.http.client.methods.HttpGet; import org.apache.http.impl.client.CloseableHttpClient; import org.jsoup.Jsoup; import org.jsoup.nodes.Document; import org.jsoup.nodes.Element; import org.jsoup.select.Elements; import java.io.*; import java.util.ArrayList; import java.util.List; import java.util.concurrent.CountDownLatch; /** * Created by Administrator on 2016/3/10. */ public class Crawler { private static final Object signal=new Object(); private static int count=0; public static String getRequestByURL(String url){ System.out.println("開啟網頁----"); try { String entity = TestHTTPClient.get(url); return entity; } catch (Exception e) { System.out.println("開啟網頁出錯---"); e.printStackTrace(); return null; } } public static List<String> getArticleURLs(String pageContext){ List<String> articleURLs=new ArrayList<String>(); System.out.println("尋找專題"); Document document = Jsoup.parseBodyFragment(pageContext); Elements elements = document.select("span.link_title"); elements=elements.select("a"); for (Element element : elements) { articleURLs.add(element.attr("href")); } return articleURLs; } public static List<String> getImgURLS(String pageContext){ System.out.println("開始查詢圖片"); List<String> imgURLs=new ArrayList<String>(); Document document = Jsoup.parseBodyFragment(pageContext); Elements elements = document.select("a[target=_blank] img[src]"); for (Element element : elements) { imgURLs.add(element.attr("src")); } return imgURLs; } public static void savePic(String imgURL,String imgPath){ if (imgURL==null){ return; } String[] str = imgURL.split("/"); String fileName = str[str.length - 1]; String savePath=imgPath+ File.separator+fileName; HttpGet httpGet=new HttpGet(imgURL); CloseableHttpClient httpClient = TestHTTPClient.getHttpClient(); try { CloseableHttpResponse httpResponse = httpClient.execute(httpGet); HttpEntity entity = httpResponse.getEntity(); InputStream inputStream = entity.getContent(); OutputStream outputStream=new FileOutputStream(savePath); IOUtils.copy(inputStream,outputStream); IOUtils.closeQuietly(inputStream); IOUtils.closeQuietly(outputStream); System.out.println("儲存圖片成功!"); } catch (Exception e) { e.printStackTrace(); System.out.println("圖片儲存失敗!"); } } public static void begin(final String pageContext){ for (int i=0;i<10;i++){ new Thread(new Runnable() { @Override public void run() { while (true){ System.out.println("當前進入執行緒的是:"+Thread.currentThread().getName()); List<String> imgURLS = getImgURLS(pageContext); if (imgURLS!=null&&imgURLS.size()>0){ } } } }); } } public static void main(String[] args) { // 爬取個人csdn目錄檢視所有文章 // String url="http://blog.csdn.net/huxiweng/article/list/"; // int maxPage=6; // for (int i=0;i<maxPage;i++){ // String pageContext= getRequestByURL(url+""+(i+1)); // System.out.println("開始尋找第"+(i+1)+"頁面文章"); // List<String> articleURLs = getArticleURLs(pageContext); // for (String articleURL : articleURLs) { // System.out.println(articleURL); // } // } //爬取火狐瀏覽器圖片頻道的圖片 final String imgPath="E:/img"; File file=new File(imgPath); if (!file.exists()){ file.mkdir(); } String url="http://photo.firefox.163.com/"; System.out.println("begin"); long begin = System.currentTimeMillis(); for (int i=0;i<10;i++){ System.out.println("爬取第"+i+1+"圖片"); String pageContext = getRequestByURL(url+"roll_"+(i+1)+".html"); List<String> imgURLS = getImgURLS(pageContext); for (String imgURL : imgURLS) { savePic(imgURL,imgPath); // System.out.println(imgURL); } } System.out.println("耗時:"+String.valueOf(System.currentTimeMillis()-begin)); } }