1. 程式人生 > >php多線程爬蟲類

php多線程爬蟲類

php 多線程 爬蟲類

  • 代碼:
    <?php
    /**
     * Concurrent ("multi-threaded") crawler built on PHP's curl_multi API.
     *
     * @author Lee <[email protected]>
     *
     * Public properties:
     *  1. calltrigger  callback that starts the next crawl round, called as ($urls, $depth)
     *  2. calltodo     callback holding the business logic, called as ($content),
     *                  e.g. process the fetched page and store it in a database
     *  3. timeout      connect timeout in seconds, default 5
     *  4. depth        maximum number of redirects followed, default 3
     *  5. name         form field name used for uploads, default "file"
     *  6. cookie       local cookie file used for simulated logins; the handle key is
     *                  inserted before the extension (cookie.txt -> cookie_0.txt)
     *
     * Public methods (every method except run() returns $this for chaining):
     *  1.  ssl       switch to https               bool: true = https, false = leave as is
     *  2.  auth      enable HTTP authentication    user, pass
     *  3.  login     simulate a login and store the received cookies
     *  4.  cookie    send the cookies stored by login()
     *  5.  header    add request headers           data: list of header lines
     *  6.  proxy     use a proxy server            url, port
     *  7.  agent     set the User-Agent            browse: user agent string
     *  8.  get       simulate a GET request        data: request parameters
     *  9.  post      simulate a POST request       data: request parameters
     *  10. json      simulate a JSON POST request  data: value to encode
     *  11. upload    simulate a form upload        files: array|string of file paths
     *  12. download  save the responses to files   dir: target directory, e.g. a/b
     *  13. run       execute the requests          depth: crawl depth
     *
     * ssl() must be called before get()/post()/json()/upload()/download(), because
     * those methods build the request URL from the scheme stored at that moment.
     */
    class crawl
    {
        public $calltrigger = 'trigger';  # callback that starts the next crawl round
        public $calltodo = 'todo';        # callback holding the business logic
        public $timeout = 5;              # connect timeout in seconds, default 5
        public $depth = 3;                # maximum number of redirects, default 3
        public $name = 'file';            # form field name for uploads, default "file"
        public $cookie = 'cookie.txt';    # cookie file template, stored as cookie_<key>.txt
        private $schemes = array();
        private $hosts = array();
        private $ports = array();
        private $paths = array();
        private $querys = array();
        private $options = array();
        private $chs = array();
        private $fps = array();
        private $handle;
        private $urls = array();

        /**
         * Internal: split a URL into its components with every key present.
         *
         * A scheme-less input such as "www.example.com/a" is parsed as if it were
         * prefixed with "http://", so the host does not end up in "path".
         *
         * @param string $url URL to split
         * @return array keys scheme, user, pass, host, port, path, query ('' when absent;
         *               scheme falls back to "http")
         */
        private function parts($url)
        {
            $url = (string) $url;
            $info = parse_url($url);
            $bare = is_array($info) && empty($info['scheme']) && empty($info['host']);
            if ($bare && $url !== '' && $url[0] !== '/') {
                $retry = parse_url('http://' . $url);
                if (is_array($retry)) {
                    $info = $retry;
                }
            }
            if (!is_array($info)) {
                // parse_url() returns false for seriously malformed URLs.
                $info = array();
            }
            $info = array_merge(
                array(
                    'scheme' => '',
                    'user' => '',
                    'pass' => '',
                    'host' => '',
                    'port' => '',
                    'path' => '',
                    'query' => '',
                ),
                $info
            );
            if ($info['scheme'] === '') {
                $info['scheme'] = 'http';
            }
            return $info;
        }

        /**
         * Internal: build the request URL of one handle.
         *
         * @param mixed  $k    handle key
         * @param string $data extra, already encoded query string ('' for none)
         * @return string scheme://host[:port]path[?query[&data]]
         */
        private function requesturl($k, $data)
        {
            $url = $this->schemes[$k] . '://' . $this->hosts[$k];
            if ($this->ports[$k] !== '') {
                $url .= ':' . $this->ports[$k];
            }
            $url .= $this->paths[$k];
            // Join the original query and the extra data with "&", skipping empty parts.
            $query = implode('&', array_filter(array((string) $this->querys[$k], (string) $data), 'strlen'));
            return $query === '' ? $url : $url . '?' . $query;
        }

        /**
         * Internal: name of the cookie file belonging to one handle.
         *
         * @param mixed $k handle key
         * @return string "<name>_<key>.<ext>" derived from $this->cookie
         */
        private function cookiefile($k)
        {
            $arr = explode('.', $this->cookie);
            $ext = isset($arr[1]) ? $arr[1] : '';
            return $arr[0] . '_' . $k . '.' . $ext;
        }

        /**
         * Internal: extract the hyperlinks of a page.
         *
         * @param string $content page content
         * @return array unique href values of the <a> tags, in order of appearance
         */
        private function geturl($content)
        {
            // "<a" must be followed by whitespace so that <abbr>, <area>, ... are skipped.
            $preg = '/<a\s+(?:[^>]*?\s)?href\s*=\s*[\'"]?([^>\'"\s]*)[^>]*>/i';
            $urls = array();
            if (preg_match_all($preg, (string) $content, $res)) {
                $urls = $res[1];
            }
            return array_unique($urls);
        }

        /**
         * Internal: normalise an incomplete URL.
         *
         * @param string $url original URL
         * @return string scheme://[user:pass@]host[:port]path (query and fragment dropped)
         */
        private function reviseurl($url)
        {
            $info = $this->parts($url);
            $url = $info['scheme'] . '://';
            if ($info['user'] && $info['pass']) {
                $url .= $info['user'] . ':' . $info['pass'] . '@';
            }
            $url .= $info['host'];
            if ($info['port']) {
                $url .= ':' . $info['port'];
            }
            $url .= $info['path'];
            return $url;
        }

        /**
         * Internal: hand fetched content to the business callback.
         *
         * @param mixed $content argument passed to the callback
         */
        private function todo($content)
        {
            call_user_func($this->calltodo, $content);
        }

        /**
         * Internal: invoke the callback that starts the next crawl round.
         *
         * @param array $urls  URLs to process
         * @param int   $depth remaining depth
         */
        private function trigger($urls, $depth)
        {
            call_user_func($this->calltrigger, $urls, $depth);
        }

        /**
         * Internal: set the GET request URL on every handle.
         *
         * @param string $data encoded query string to append
         * @return $this
         */
        private function setget($data)
        {
            foreach ($this->chs as $k => $v) {
                $this->options[$k][CURLOPT_URL] = $this->requesturl($k, $data);
            }
            return $this;
        }

        /**
         * Internal: set the POST request URL and body on every handle.
         *
         * @param mixed $data request body (array or string)
         * @return $this
         */
        private function setpost($data)
        {
            foreach ($this->chs as $k => $v) {
                $this->options[$k][CURLOPT_URL] = $this->requesturl($k, '');
                $this->options[$k][CURLOPT_POST] = 1;
                $this->options[$k][CURLOPT_POSTFIELDS] = $data;
            }
            return $this;
        }

        /**
         * Internal: apply the collected options to the curl handles.
         *
         * timeout and depth are read here rather than in the constructor, so values
         * assigned to the public properties after construction take effect.
         *
         * @return $this
         */
        private function setopt()
        {
            foreach ($this->chs as $k => $ch) {
                $opts = isset($this->options[$k]) ? $this->options[$k] : array();
                // "+" keeps values that were set explicitly and only fills the gaps.
                $opts += array(
                    CURLOPT_CONNECTTIMEOUT => $this->timeout,
                    CURLOPT_MAXREDIRS => $this->depth,
                    CURLOPT_URL => $this->requesturl($k, ''),
                );
                curl_setopt_array($ch, $opts);
            }
            return $this;
        }

        /**
         * Constructor: create one curl handle per URL and set the initial options.
         *
         * @param array $urls request URLs
         */
        public function __construct($urls)
        {
            $this->urls = $urls;
            $this->handle = curl_multi_init();
            foreach ($urls as $k => $v) {
                $info = $this->parts($v);
                $this->schemes[$k] = $info['scheme'];
                $this->hosts[$k] = $info['host'];
                $this->ports[$k] = $info['port'];
                $this->paths[$k] = $info['path'];
                $this->querys[$k] = $info['query'];
                $this->chs[$k] = curl_init();
                $this->options[$k] = array(
                    CURLOPT_RETURNTRANSFER => 1,
                    CURLOPT_FOLLOWLOCATION => 1,
                    CURLINFO_HEADER_OUT => true,
                    CURLOPT_ENCODING => 'gzip',
                );
                curl_multi_add_handle($this->handle, $this->chs[$k]);
            }
        }

        /**
         * Switch every request to https.
         *
         * @param bool $bool true: https request, false: leave the scheme unchanged
         * @return $this
         */
        public function ssl($bool = false)
        {
            if ($bool) {
                foreach ($this->chs as $k => $v) {
                    $this->schemes[$k] = 'https';
                    // libcurl no longer accepts the value 1 for this option; 2 is the supported one.
                    $this->options[$k][CURLOPT_SSL_VERIFYHOST] = 2;
                    // NOTE(review): peer verification stays disabled as in the original;
                    // this accepts any certificate and should be reconsidered.
                    $this->options[$k][CURLOPT_SSL_VERIFYPEER] = false;
                }
            }
            return $this;
        }

        /**
         * Set the user name and password used for authentication.
         *
         * @param string $user user name
         * @param string $pass password
         * @return $this
         */
        public function auth($user, $pass)
        {
            foreach ($this->chs as $k => $v) {
                $this->options[$k][CURLOPT_USERPWD] = $user . ':' . $pass;
            }
            return $this;
        }

        /**
         * Simulate a login: store received cookies in the per-handle cookie file.
         *
         * @return $this
         */
        public function login()
        {
            foreach ($this->chs as $k => $v) {
                $this->options[$k][CURLOPT_COOKIEJAR] = $this->cookiefile($k);
                $this->options[$k][CURLOPT_RETURNTRANSFER] = 0;
            }
            return $this;
        }

        /**
         * Send the cookies stored in the per-handle cookie file.
         *
         * @return $this
         */
        public function cookie()
        {
            foreach ($this->chs as $k => $v) {
                $this->options[$k][CURLOPT_COOKIEFILE] = $this->cookiefile($k);
            }
            return $this;
        }

        /**
         * Add request headers; repeated calls accumulate.
         *
         * @param array $data header lines, e.g. array('Accept: text/html')
         * @return $this
         */
        public function header($data)
        {
            foreach ($this->chs as $k => $v) {
                $current = isset($this->options[$k][CURLOPT_HTTPHEADER])
                    ? $this->options[$k][CURLOPT_HTTPHEADER]
                    : array();
                $this->options[$k][CURLOPT_HTTPHEADER] = array_merge($current, $data);
            }
            return $this;
        }

        /**
         * Route every request through a proxy server.
         *
         * @param string $url  proxy server URL or host name
         * @param int    $port proxy server port
         * @return $this
         */
        public function proxy($url, $port)
        {
            $info = $this->parts($url);
            // The port belongs directly after the host; a path has no meaning for a proxy.
            $purl = $info['scheme'] . '://' . $info['host'] . ':' . $port;
            foreach ($this->chs as $k => $v) {
                $this->options[$k][CURLOPT_PROXY] = $purl;
            }
            return $this;
        }

        /**
         * Set the User-Agent header.
         *
         * @param string $browse user agent string
         * @return $this
         */
        public function agent($browse = 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko)')
        {
            foreach ($this->chs as $k => $v) {
                $this->options[$k][CURLOPT_USERAGENT] = $browse;
            }
            return $this;
        }

        /**
         * Simulate a GET request.
         *
         * @param array $data request parameters, appended to the query string
         * @return $this
         */
        public function get($data = array())
        {
            $this->setget(http_build_query($data));
            return $this;
        }

        /**
         * Simulate a POST request.
         *
         * @param mixed $data request body
         * @return $this
         */
        public function post($data = array())
        {
            $this->setpost($data);
            return $this;
        }

        /**
         * Simulate a POST request with a JSON body.
         *
         * @param mixed $data value to encode as JSON
         * @return $this
         */
        public function json($data = array())
        {
            $data = json_encode($data);
            $header = array(
                'Content-Type: application/json',
                'Content-Length:' . strlen($data),
            );
            $this->header($header);
            $this->setpost($data);
            return $this;
        }

        /**
         * Simulate a form upload; the field name is taken from $this->name.
         *
         * @param array|string $files path(s) of the file(s) to upload
         * @return $this
         */
        public function upload($files)
        {
            $data = array();
            $name = $this->name;
            if (is_array($files)) {
                foreach ($files as $k => $v) {
                    $data["{$name}[{$k}]"] = new CURLFile($v);
                }
            } else {
                $data["{$name}"] = new CURLFile($files);
            }
            $this->setpost($data);
            return $this;
        }

        /**
         * Save every response to a file named after the last segment of the URL path.
         *
         * The files are closed by run().
         *
         * @param string $dir target directory (created when missing); '' = current directory
         * @return $this
         * @throws RuntimeException when a target file cannot be opened for writing
         */
        public function download($dir = '')
        {
            if ($dir && !is_dir($dir)) {
                mkdir($dir, 0755, true);
            }
            foreach ($this->paths as $k => $path) {
                $file = basename($path);
                if ($file === '') {
                    // URL without a file name (e.g. "http://host/"): fall back to a fixed name.
                    $file = 'index.html';
                }
                $target = ($dir ? rtrim($dir, '/') : '.') . '/' . $file;
                $fp = fopen($target, 'w');
                if ($fp === false) {
                    throw new RuntimeException('Cannot open download target: ' . $target);
                }
                $this->fps[$k] = $fp;
                $this->options[$k][CURLOPT_FILE] = $fp;
            }
            $this->setget('');
            return $this;
        }

        /**
         * Execute all requests concurrently, pass each response to the calltodo
         * callback and hand the links found in it to the calltrigger callback.
         *
         * Nothing is fetched when $depth is 0 or less. The curl handles are closed
         * at the end of a run, so an instance can be executed only once.
         *
         * @param int $depth crawl depth, default 2
         */
        public function run($depth = 2)
        {
            $this->setopt();
            if ($depth <= 0) {
                return;
            }
            $depth--;
            $handle = $this->handle;
            $active = null;
            do {
                $mrc = curl_multi_exec($handle, $active);
            } while ($mrc == CURLM_CALL_MULTI_PERFORM);
            while ($active && $mrc == CURLM_OK) {
                // -1 means select could not wait; back off briefly instead of spinning.
                if (curl_multi_select($handle) == -1) {
                    usleep(100);
                }
                do {
                    $mrc = curl_multi_exec($handle, $active);
                } while ($mrc == CURLM_CALL_MULTI_PERFORM);
            }
            foreach ($this->chs as $k => $ch) {
                if (curl_error($ch) == '') {
                    $content = curl_multi_getcontent($ch);
                    $this->todo($content);
                    $links = $this->geturl($content);
                    if (!empty($links)) {
                        $base = $this->reviseurl($this->urls[$k]);
                        // Fresh list per page so links of earlier pages are not passed on again.
                        $next = array();
                        foreach ($links as $i => $link) {
                            $next[$i] = preg_match('/^http/', $link) ? $link : $base . '/' . $link;
                        }
                        $this->trigger($next, $depth);
                    }
                }
                curl_multi_remove_handle($handle, $ch);
                curl_close($ch);
            }
            curl_multi_close($handle);
            // Close the files opened by download() once curl no longer writes to them.
            foreach ($this->fps as $fp) {
                if (is_resource($fp)) {
                    fclose($fp);
                }
            }
            $this->fps = array();
        }
    }
  • 測試:
    /**
     * Business callback used by the crawler (crawl::$calltodo defaults to "todo").
     * Prints one line per fetched page.
     *
     * @param mixed $content body of the fetched page
     */
    function todo($content){
        echo 'ok'.PHP_EOL;
    }
    // Seed URLs; the scheme is omitted, the crawl class falls back to http.
    $urls = array(
        'www.baidu.com',
        'www.taobao.com'
    );
    /**
     * Crawl-round callback (crawl::$calltrigger defaults to "trigger"):
     * fetches the given URLs with GET and lets run() call back into this
     * function for the links it finds.
     *
     * @param array $urls  URLs to fetch in this round
     * @param int   $depth remaining crawl depth
     */
    function trigger($urls = array(), $depth = 2){
        $crawl = new crawl($urls);
        $crawl->get()->run($depth);
    }
    trigger($urls);
  • 輸出:
    ok
    ok
    ok
    ok
    ok
    ok
    ok
    ok
    ok
    ok
    ok
    ok
    ok
    ok
  • php多線程爬蟲類