爬蟲：實現網站的全部圖片抓取

阿新 • • 發佈：2018-12-24

/**
 * Created by lewis on 2016/10/21.
 *
 * <p>Mutable holder pairing a URL with the headline it belongs to.
 * Both values are stored as given (no validation, {@code null} allowed).
 */
public class PictMsg {

    /** Headline associated with {@link #url}. */
    private String headline;

    /** Address this message refers to. */
    private String url;

    /**
     * Creates a message from a URL and its headline.
     *
     * @param url      address to store
     * @param headline headline to store
     */
    public PictMsg(String url, String headline) {
        this.headline = headline;
        this.url = url;
    }

    /** @return the stored URL */
    public String getUrl() {
        return this.url;
    }

    /** @param url new URL, replacing the current one */
    public void setUrl(String url) {
        this.url = url;
    }

    /** @return the stored headline */
    public String getHeadline() {
        return this.headline;
    }

    /** @param headline new headline, replacing the current one */
    public void setHeadline(String headline) {
        this.headline = headline;
    }

    /**
     * Renders the URL label, the URL, the headline label and the headline,
     * concatenated without any separator; {@code null} values print as "null".
     */
    @Override
    public String toString() {
        StringBuilder text = new StringBuilder();
        text.append("網址：").append(url);
        text.append("標題：").append(headline);
        return text.toString();
    }
}

        import org.apache.http.HttpEntity;
        import org.apache.http.client.methods.CloseableHttpResponse;
        import org.apache.http.client.methods.HttpGet;
        import org.apache.http.impl.client.CloseableHttpClient;
        import org.apache.http.impl.client.HttpClients;

        import java.io.*;
        import java.util.ArrayList;
        import java.util.regex.Matcher;
        import java.util.regex.Pattern;

/**
 * Created by lewis on 2016/10/20.
 *
 * <p>Static helpers that fetch HTML pages and image files through a shared
 * {@link CloseableHttpClient} and store the images below {@link #SAVE_ROOT},
 * one sub-directory per headline.
 */
public class DownLoad {

    /** Root directory under which one sub-directory per headline is created. */
    private static final String SAVE_ROOT = "D:\\youmzi";

    /** Captures the {@code src} value of the first {@code <img src='...' alt=} tag; compiled once. */
    private static final Pattern IMG_SRC = Pattern.compile("<img src='(.+?)' alt=");

    /** HTTP client shared by every request issued from this class. */
    public static CloseableHttpClient httpClient = HttpClients.custom().build();

    /**
     * Downloads a page and returns its body decoded as gb2312, with every line
     * terminated by {@code '\n'}.
     *
     * @param url address of the page
     * @return the page text, or {@code null} when {@code url} is {@code null},
     *         the response carries no body, or an I/O error occurs (the error
     *         is printed to standard error)
     */
    public static String downloadHtml(String url) {
        if (url == null) {
            return null;
        }
        HttpGet httpGet = new HttpGet(url);

        // Closing the response releases the connection even when reading fails early.
        try (CloseableHttpResponse response = httpClient.execute(httpGet)) {
            HttpEntity entity = response.getEntity();
            if (entity == null) {
                return null;
            }
            try (BufferedReader br =
                         new BufferedReader(new InputStreamReader(entity.getContent(), "gb2312"))) {
                StringBuilder page = new StringBuilder();
                String line;
                while ((line = br.readLine()) != null) {
                    page.append(line).append('\n');
                }
                return page.toString();
            }
        } catch (IOException e) {
            e.printStackTrace();
            return null;
        }
    }

    /**
     * Downloads one image and stores it as
     * {@code SAVE_ROOT/<headline>/<count><suffix>}.
     *
     * <p>Nothing is written when the message or its URL is missing, when the
     * server answers with a non-2xx status, or when the response has no body.
     * I/O errors are printed to standard error and otherwise ignored, so one
     * failing image does not stop a crawl.
     *
     * @param pictMsg image URL plus the headline used as the directory name
     * @param count   number used as the file name (without extension)
     */
    public static void downloadPict(PictMsg pictMsg, int count) {
        if (pictMsg == null) {
            return;
        }
        String url = pictMsg.getUrl();
        if (url == null || url.isEmpty()) {
            return;
        }

        String dir = SAVE_ROOT + File.separator + safeDirName(pictMsg.getHeadline());
        String target = dir + File.separator + count + suffixFor(url);

        try (CloseableHttpResponse response = httpClient.execute(new HttpGet(url))) {
            int status = response.getStatusLine().getStatusCode();
            HttpEntity entity = response.getEntity();
            if (status < 200 || status >= 300 || entity == null) {
                // Do not save an error page (e.g. a 404 body) under an image name.
                System.err.println("Skipped " + url + ": HTTP " + status);
                return;
            }

            CreateDir(dir);
            System.out.print("正在下載：" + target + ":");

            try (InputStream in = entity.getContent();
                 OutputStream out = new FileOutputStream(new File(target))) {
                byte[] buffer = new byte[8192];
                int read;
                while ((read = in.read(buffer)) != -1) {
                    out.write(buffer, 0, read);
                }
                out.flush();
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /**
     * Downloads every picture series in the list and then empties the list.
     *
     * @param Pict_link series entry points; {@code null} elements are skipped,
     *                  a {@code null} list is ignored
     */
    public static void downloadPict(ArrayList<PictMsg> Pict_link) {
        if (Pict_link == null) {
            return;
        }
        for (int i = 0; i < Pict_link.size(); i++) {
            PictMsg entry = Pict_link.get(i);
            if (entry != null) {
                DownLoad_All_PictSoruce(entry);
            }
        }
        Pict_link.clear();
    }

    /**
     * Creates the directory, including any missing parent directories, if it
     * does not exist yet. A failure is reported on standard error.
     *
     * @param dir path of the directory to create
     */
    public static void CreateDir(String dir) {
        File file = new File(dir);
        // mkdirs (not mkdir) so the first download also creates the root directory.
        if (!file.exists() && !file.mkdirs()) {
            System.err.println("Could not create directory: " + dir);
        }
    }

    /**
     * Collects all page links of one series via {@code Find_Link.Add_Page_Link},
     * extracts the image link of each page and downloads every image whose
     * link is not yet contained in {@code Main.set}.
     *
     * @param pictMsg first page of the series plus the headline used for storage
     */
    public static void DownLoad_All_PictSoruce(PictMsg pictMsg) {
        ArrayList<String> pages = new ArrayList<>();
        String url = pictMsg.getUrl();
        pages.add(url);

        // Keep following the last collected link until no further page is added.
        while (Find_Link.Add_Page_Link(url, pages)) {
            url = pages.get(pages.size() - 1);
        }

        for (int i = 0; i < pages.size(); i++) {
            String page = pages.get(i);
            if (page == null) {
                continue;
            }
            String link = Pict_down_Soruce(page);
            // A page without a matching image yields null; skip it instead of recording null.
            if (link == null || Main.set.contains(link)) {
                continue;
            }
            downloadPict(new PictMsg(link, pictMsg.getHeadline()), i);
            System.out.println("一共有：" + pages.size() + "," + "還剩下：" + (pages.size() - i));
            Main.set.add(link);
        }
    }

    /**
     * Downloads a page and returns the {@code src} of its first
     * {@code <img src='...' alt=} tag.
     *
     * @param url address of the page to scan
     * @return the image link, or {@code null} when the page could not be
     *         downloaded or contains no matching tag
     */
    public static String Pict_down_Soruce(String url) {
        String context = downloadHtml(url);
        if (context == null) {
            return null;
        }
        Matcher m = IMG_SRC.matcher(context);
        return m.find() ? m.group(1) : null;
    }

    /**
     * Chooses the file extension for an image URL: {@code .png} for PNG links,
     * {@code .jpg} for any other link ending in "g" (jpg/jpeg), otherwise
     * {@code .gif}. The comparison ignores case.
     */
    private static String suffixFor(String url) {
        String lower = url.toLowerCase(java.util.Locale.ROOT);
        if (lower.endsWith(".png")) {
            return ".png";
        }
        return lower.endsWith("g") ? ".jpg" : ".gif";
    }

    /**
     * Turns a scraped headline into a single directory name: characters that
     * are path separators or illegal in Windows file names become {@code '_'},
     * and names made only of dots ("." / "..") are replaced so the result can
     * never point outside {@link #SAVE_ROOT}.
     */
    private static String safeDirName(String headline) {
        String name = String.valueOf(headline).replaceAll("[\\\\/:*?\"<>|]", "_");
        return name.matches("\\.+") ? "_" : name;
    }
}

爬蟲：實現網站的全部圖片抓取

/** * Created by lewis on 2016/10/21. */ public class PictMsg { private String url; private String headline; public PictMsg(String url, Stri

python爬蟲實戰---今日頭條的圖片抓取

本文主要是在今日頭條裡面以“街拍路人”為搜尋條件，提取網頁的圖片和標題，並把標題當做資料夾的名稱，建立該資料夾，把圖片儲存到相應的資料夾下。匯入庫 from urllib.parse import urlencode---把字典裡面的資料拼接成如下字串格式： url

綜合使用python爬蟲技術，selenium模組動態抓取“視覺中國”網站上的圖片的url

一、匯入模組 import time from selenium import webdriver from lxml import etree 本文章純粹用來練手，於是我使用了etree,其實光使用find_elements…的方法也可以二、開始幹活 1.

Nginx反爬蟲攻略：禁止某些User Agent抓取網站

我們都知道網路上的爬蟲非常多，有對網站收錄有益的，比如百度蜘蛛（Baiduspider），也有不但不遵守robots規則對伺服器造成壓力，還不能為網站帶來流量的無用爬蟲，比如宜搜蜘蛛（YisouSpider）（最新補充

[Python爬蟲] 之十九：Selenium +phantomjs 利用 pyquery抓取超級TV網數據

images 判斷 nco dex onf etc lac lin 利用　　一、介紹　　　　本例子用Selenium +phantomjs爬取超級TV（http://www.chaojitv.com/news/index.html）的資訊信息，輸入給定關鍵字抓取

為何大量網站不能抓取?爬蟲突破封禁的6種常見方法 - 轉載

9.png 禁止 asi 屬於用戶訪問文件權限設置初始化大型右移傳送門：http://www.cnblogs.com/junrong624/p/5533655.html 在互聯網上進行自動數據采集（抓取）這件事和互聯網存在的時間差不多一樣長。今天大眾好像更傾向於

網站爬取-案例二：天貓爬取( 第一卷：首頁數據抓取)

img .com 我想提供商網站 col class scoller bubuko 說到網站數據的爬取，目前為止我見過最復雜的就是天貓了，現在我想對它進行整站的爬取我們先來看下天貓主頁的界面天貓頁面很明顯是動態頁面所以我們需要用selenium模塊首先

Python爬蟲教程：簡書文章的抓取與儲存

本文內容將與大家一起從簡書的文章頁面抓取文章標題、作者、釋出時間以及正文內容，並且將抓取到的這些資訊存入Excel表格中。本文對簡書文章的抓取僅為Python的學習交流，尊重作者著作權，不對抓取到的文章做其他用途。本文使用Chrome瀏覽器對頁面中需要抓取的內容進行分析。首先我們從簡書

為何大量網站不能抓取?爬蟲突破封禁的6種常見方法

在網際網路上進行自動資料採集（抓取）這件事和網際網路存在的時間差不多一樣長。今天大眾好像更傾向於用“網路資料採集”，有時會把網路資料採集程式稱為網路機器人（bots）。最常用的方法是寫一個自動化程式向網路伺服器請求資料（通常是用 HTML 表單或其他網頁檔案），然後對資料進行

Python3網路爬蟲：Scrapy入門實戰之爬取動態網頁圖片

Python版本： python3.+ 執行環境： Mac OS IDE： pycharm 一前言二 Scrapy相關方法介紹 1 搭建Scrapy專案 2 shell分析三網頁分析

Python爬蟲入門教程 18-100 煎蛋網XXOO圖片抓取

寫在前面很高興我這系列的文章寫到第18篇了，今天寫一個爬蟲愛好者特別喜歡的網站煎蛋網http://jandan.net/ooxx，這個網站其實還是有點意思的，網站很多人寫了N多的教程了，各種方式的都有，當然網站本身在爬蟲愛好者的不斷進攻下，也在不斷的完善，反爬措施也很多，今天我用selenium再揍他一波

初級爬蟲(一) requests模組實現網頁批量圖片爬取

思路分析: 已知網頁:如http://www.jiangxian.gov.cn/N20180821093426.html 1, 檢查網頁分析網頁中圖片的地址形式, 2,獲取網頁內容,正則匹配出所有圖片的地址, 3,拼接地址生成列表 4,迴圈列表,生成圖片地址的文字檔案 5,迴圈列表,取出

不和諧網站圖片抓取

using System; using System.Collections.Generic; using System.ComponentModel; using System.Data; using System.Drawing; using System.Linq; u

PHP + curl 實現 http 或 https 抓取資料：

/** * 抓取資料 https 或 http 形式 * @param $url 連結 * @param $data 引數 * @return mixed 返回資料 */ private

Python爬蟲新手教程：知乎文章圖片爬取器

1. 知乎文章圖片爬取器之二部落格背景昨天寫了知乎文章圖片爬取器的一部分程式碼，針對知乎問題的答案json進行了資料抓取，部落格

使用Chrome快速實現數據的抓取（二）

run 描述管理 opp socket 請求 icon err protoc 在前面的文章簡單的介紹了一下Chrome調試模式的啟動方式，但前面的API只能做到簡單的打開，關閉標簽操作，當我們需要對某個標簽頁進行詳細的操作時，則需要用到頁面管理API。首先我們還是來回顧下

python 圖片抓取

meid pat book png lis time env cep efault 1、圖片地址為下載地址訪問圖片地址可直接下載的　　#!/usr/bin/env python　　# -*- coding: utf-8 -*-　　import urllib2　　impor

使用Chrome快速實現數據的抓取（四）——優點

一個 java 海量 height 調試工具 -1 idt socket程序格式些一個抓取WEB頁面的數據程序比較簡單，大多數語言都有相應的HTTP庫，一個簡單的請求響應即可，程序發送Http請求給Web服務器，服務器返回HTML文件。交互方式如下：　　在使用

使用Chrome快速實現數據的抓取（五）—— puppeteer

ref google rem 官方簡單的 code web 驅動 ace 如果要以自動化的方式驅動Chrome進行數據抓取，必須實現Chrome Dev Protocol協議的客戶端。這個協議本身並不復雜，我在之前的文章中也簡單的介紹過一下。 Google本身有一個No

python 爬蟲2-正則表達式抓取拉勾網職位信息

headers mode data .cn 保存 time exc href ace import requestsimport re #正則表達式import time import pandas #保存成 CSV #header={‘User-Agent‘:‘M

爬蟲：實現網站的全部圖片抓取

相關推薦