php多線程爬蟲類
阿新 • • 發佈:2018-06-06
php 多線程 爬蟲類 代碼:
測試:
輸出:
<?php
/**
 * Concurrent crawler built on PHP's curl_multi API.
 *
 * Public properties:
 *  1. calltrigger  callback that re-triggers the crawler with newly found links
 *  2. calltodo     callback holding the business logic, e.g. storing fetched content
 *  3. timeout      connect timeout in seconds, default 5
 *  4. depth        maximum number of redirects to follow, default 3
 *  5. name         form field name used for uploads, default "file"
 *  6. cookie       local cookie file for simulated logins, default cookie.txt
 *                  (one file per request is derived from it: cookie_<key>.txt)
 *
 * Public methods (all but run() return $this so calls can be chained):
 *  1.  ssl       force https; bool: enable, verify: verify the certificate
 *  2.  auth      HTTP authentication; user, pass
 *  3.  login     simulated login that stores the received cookies
 *  4.  cookie    send the cookies stored by login()
 *  5.  header    add request headers; data: list of header lines
 *  6.  proxy     use a proxy server; url, port
 *  7.  agent     set the User-Agent string
 *  8.  get       GET request; data: query parameters
 *  9.  post      POST request; data: post fields
 *  10. json      POST a JSON document; data: value to encode
 *  11. upload    multipart form upload; files: array|string of file paths
 *  12. download  save the responses to files; dir: target directory, e.g. a/b
 *  13. run       execute all requests; depth: crawl depth
 *
 * @author Lee
 */
class crawl
{
    public $calltrigger = 'trigger';  # callback that re-triggers the crawler
    public $calltodo = 'todo';        # callback holding the business logic
    public $timeout = 5;              # connect timeout in seconds
    public $depth = 3;                # maximum number of redirects
    public $name = 'file';            # upload form field name
    public $cookie = 'cookie.txt';    # base name of the cookie files

    private $schemes = array();
    private $hosts = array();
    private $paths = array();
    private $querys = array();
    private $options = array();
    private $chs = array();
    private $fps = array();
    private $handle;
    private $urls = array();

    /**
     * Internal: split a URL into its components.
     *
     * parse_url() treats a scheme-less "www.example.com/x" as a bare path, so
     * the default scheme is prepended first to get a proper host/path split.
     *
     * @param string $url raw URL
     * @return array components as returned by parse_url(), empty array if the URL is malformed
     */
    private function parseurl($url)
    {
        $url = trim((string) $url);
        if (!preg_match('/^[a-z][a-z0-9+.\-]*:\/\//i', $url)) {
            $url = 'http://' . ltrim($url, '/');
        }
        $info = parse_url($url);

        return is_array($info) ? $info : array();
    }

    /**
     * Internal: build the "user:pass@host:port" part of a URL.
     *
     * @param array $info components as returned by parseurl()
     * @return string authority, empty string when no host is known
     */
    private function authority(array $info)
    {
        $auth = '';
        if (isset($info['user']) && $info['user'] !== '') {
            $auth .= $info['user'];
            if (isset($info['pass'])) {
                $auth .= ':' . $info['pass'];
            }
            $auth .= '@';
        }
        $auth .= isset($info['host']) ? $info['host'] : '';
        if (isset($info['port'])) {
            $auth .= ':' . $info['port'];
        }

        return $auth;
    }

    /**
     * Internal: extract the hyperlinks of a page.
     *
     * @param string $content page content
     * @return array unique, non-empty href values of the <a> tags
     */
    private function geturl($content)
    {
        // <a\b keeps tags such as <abbr> out; [^>]* keeps the match inside one tag.
        $preg = '/<a\b[^>]*?\shref\s*=\s*[\'"]?([^>\'"\s]*)[^>]*>/i';
        if (!preg_match_all($preg, (string) $content, $res)) {
            return array();
        }

        return array_values(array_unique(array_filter($res[1], 'strlen')));
    }

    /**
     * Internal: repair an incomplete URL.
     *
     * @param string $url original URL
     * @return string scheme://[user:pass@]host[:port]path, without query or fragment
     */
    private function reviseurl($url)
    {
        $info = $this->parseurl($url);
        $scheme = isset($info['scheme']) ? $info['scheme'] : 'http';
        $path = isset($info['path']) ? $info['path'] : '';

        return $scheme . '://' . $this->authority($info) . $path;
    }

    /**
     * Internal: turn a link found on a page into an absolute URL.
     *
     * @param string $base URL of the page the link was found on
     * @param string $link href value
     * @return string absolute URL
     */
    private function resolveurl($base, $link)
    {
        if (preg_match('/^https?:\/\//i', $link)) {
            return $link;
        }
        $info = $this->parseurl($base);
        $scheme = isset($info['scheme']) ? $info['scheme'] : 'http';
        if (strpos($link, '//') === 0) {
            // protocol-relative link: reuse the scheme of the page
            return $scheme . ':' . $link;
        }
        $root = $scheme . '://' . $this->authority($info);
        if (strpos($link, '/') === 0) {
            return $root . $link;
        }
        $path = (isset($info['path']) && $info['path'] !== '') ? $info['path'] : '/';
        if (strpos($link, '?') === 0) {
            return $root . $path . $link;
        }
        // document-relative link: resolve against the directory of the page
        $pos = strrpos($path, '/');
        $dir = ($pos === false) ? '/' : substr($path, 0, $pos + 1);

        return $root . $dir . $link;
    }

    /**
     * Internal: hand fetched content to the business callback.
     *
     * @param mixed $content argument passed to the callback
     */
    private function todo($content)
    {
        call_user_func($this->calltodo, $content);
    }

    /**
     * Internal: invoke the callback that re-triggers the crawler.
     *
     * @param array $urls  URLs to process next
     * @param int   $depth remaining depth
     */
    private function trigger($urls, $depth)
    {
        call_user_func($this->calltrigger, $urls, $depth);
    }

    /**
     * Internal: request URL of one handle without its query string.
     *
     * @param int|string $k handle key
     * @return string
     */
    private function baseurl($k)
    {
        return $this->schemes[$k] . '://' . $this->hosts[$k] . $this->paths[$k];
    }

    /**
     * Internal: configure every handle for a GET request.
     *
     * @param string $data url-encoded query string to append
     * @return $this
     */
    private function setget($data)
    {
        foreach ($this->chs as $k => $v) {
            // Join the query of the original URL and the extra data with "&".
            $parts = array_filter(array($this->querys[$k], (string) $data), 'strlen');
            $query = implode('&', $parts);
            $this->options[$k][CURLOPT_URL] = $this->baseurl($k) . ($query !== '' ? '?' . $query : '');
        }

        return $this;
    }

    /**
     * Internal: configure every handle for a POST request.
     *
     * @param array|string $data post fields
     * @return $this
     */
    private function setpost($data)
    {
        foreach ($this->chs as $k => $v) {
            $query = $this->querys[$k];
            $this->options[$k][CURLOPT_URL] = $this->baseurl($k) . ($query !== '' ? '?' . $query : '');
            $this->options[$k][CURLOPT_POST] = 1;
            $this->options[$k][CURLOPT_POSTFIELDS] = $data;
        }

        return $this;
    }

    /**
     * Internal: apply the collected options to the curl handles.
     *
     * @return $this
     */
    private function setopt()
    {
        foreach ($this->chs as $k => $ch) {
            $opts = $this->options[$k];
            // Read the public settings here so values assigned after
            // construction take effect; the keys already exist, so the order
            // in which the options are applied does not change.
            $opts[CURLOPT_CONNECTTIMEOUT] = $this->timeout;
            $opts[CURLOPT_MAXREDIRS] = $this->depth;
            curl_setopt_array($ch, $opts);
        }

        return $this;
    }

    /**
     * Internal: detach and close one curl handle and its download file.
     *
     * @param int|string $k handle key
     */
    private function release($k)
    {
        if (isset($this->chs[$k])) {
            curl_multi_remove_handle($this->handle, $this->chs[$k]);
            // curl_close() has no effect as of PHP 8.0.
            if (PHP_VERSION_ID < 80000) {
                curl_close($this->chs[$k]);
            }
            unset($this->chs[$k]);
        }
        if (isset($this->fps[$k])) {
            if (is_resource($this->fps[$k])) {
                fclose($this->fps[$k]);
            }
            unset($this->fps[$k]);
        }
    }

    /**
     * Internal: file name of the cookie store of one handle.
     *
     * @param int|string $k handle key
     * @return string e.g. cookie_0.txt for the default "cookie.txt"
     */
    private function cookiefile($k)
    {
        $info = pathinfo($this->cookie);
        $dir = '';
        if (isset($info['dirname']) && $info['dirname'] !== '.') {
            $dir = rtrim($info['dirname'], '/\\') . '/';
        }
        $ext = isset($info['extension']) ? '.' . $info['extension'] : '';

        return $dir . $info['filename'] . '_' . $k . $ext;
    }

    /**
     * Constructor: create one curl handle per URL with the base options.
     *
     * @param array|string $urls request URLs
     */
    public function __construct($urls)
    {
        $urls = (array) $urls;
        $this->urls = $urls;
        $this->handle = curl_multi_init();
        foreach ($urls as $k => $v) {
            $info = $this->parseurl($v);
            $this->schemes[$k] = isset($info['scheme']) ? $info['scheme'] : 'http';
            $this->hosts[$k] = $this->authority($info);
            $this->paths[$k] = isset($info['path']) ? $info['path'] : '';
            $this->querys[$k] = isset($info['query']) ? $info['query'] : '';
            $this->chs[$k] = curl_init();
            $this->options[$k][CURLOPT_CONNECTTIMEOUT] = $this->timeout;
            $this->options[$k][CURLOPT_RETURNTRANSFER] = 1;
            $this->options[$k][CURLOPT_FOLLOWLOCATION] = 1;
            $this->options[$k][CURLINFO_HEADER_OUT] = true;
            $this->options[$k][CURLOPT_ENCODING] = 'gzip';
            $this->options[$k][CURLOPT_MAXREDIRS] = $this->depth;
            curl_multi_add_handle($this->handle, $this->chs[$k]);
        }
    }

    /**
     * Switch the requests to https. Call before get()/post(), which build the URL.
     *
     * @param bool $bool   true: https request, false: leave the scheme untouched
     * @param bool $verify true: verify the peer certificate and host name;
     *                     false (default, as before): skip verification, which
     *                     is insecure and only suitable for testing
     * @return $this
     */
    public function ssl($bool = false, $verify = false)
    {
        if ($bool) {
            foreach ($this->chs as $k => $v) {
                $this->schemes[$k] = 'https';
                // Only 0 and 2 are valid for VERIFYHOST in current libcurl.
                $this->options[$k][CURLOPT_SSL_VERIFYHOST] = $verify ? 2 : 0;
                $this->options[$k][CURLOPT_SSL_VERIFYPEER] = (bool) $verify;
            }
        }

        return $this;
    }

    /**
     * Set the user name and password for HTTP authentication.
     *
     * @param string $user user name
     * @param string $pass password
     * @return $this
     */
    public function auth($user, $pass)
    {
        foreach ($this->chs as $k => $v) {
            $this->options[$k][CURLOPT_USERPWD] = $user . ':' . $pass;
        }

        return $this;
    }

    /**
     * Simulated login: store the cookies of each response in its cookie file.
     *
     * Note: this turns CURLOPT_RETURNTRANSFER off, so the response bodies are
     * not returned to run().
     *
     * @return $this
     */
    public function login()
    {
        foreach ($this->chs as $k => $v) {
            $this->options[$k][CURLOPT_COOKIEJAR] = $this->cookiefile($k);
            $this->options[$k][CURLOPT_RETURNTRANSFER] = 0;
        }

        return $this;
    }

    /**
     * Send the cookies stored by a previous login().
     *
     * @return $this
     */
    public function cookie()
    {
        foreach ($this->chs as $k => $v) {
            $this->options[$k][CURLOPT_COOKIEFILE] = $this->cookiefile($k);
        }

        return $this;
    }

    /**
     * Add request headers.
     *
     * @param array $data header lines, e.g. array('Accept: text/html')
     * @return $this
     */
    public function header($data)
    {
        foreach ($this->chs as $k => $v) {
            $current = isset($this->options[$k][CURLOPT_HTTPHEADER])
                ? $this->options[$k][CURLOPT_HTTPHEADER]
                : array();
            $this->options[$k][CURLOPT_HTTPHEADER] = array_merge($current, (array) $data);
        }

        return $this;
    }

    /**
     * Route the requests through a proxy server.
     *
     * @param string     $url  proxy server URL or host name
     * @param int|string $port proxy server port
     * @return $this
     */
    public function proxy($url, $port)
    {
        $info = $this->parseurl($url);
        $scheme = isset($info['scheme']) ? $info['scheme'] : 'http';
        $host = isset($info['host']) ? $info['host'] : '';
        // The port belongs right after the host; a path has no meaning for a proxy.
        $purl = $scheme . '://' . $host . ':' . $port;
        foreach ($this->chs as $k => $v) {
            $this->options[$k][CURLOPT_PROXY] = $purl;
        }

        return $this;
    }

    /**
     * Set the User-Agent string.
     *
     * @param string $browse User-Agent value
     * @return $this
     */
    public function agent($browse = 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko)')
    {
        foreach ($this->chs as $k => $v) {
            $this->options[$k][CURLOPT_USERAGENT] = $browse;
        }

        return $this;
    }

    /**
     * Prepare GET requests.
     *
     * @param array $data query parameters
     * @return $this
     */
    public function get($data = array())
    {
        return $this->setget(http_build_query($data));
    }

    /**
     * Prepare POST requests.
     *
     * @param array|string $data post fields
     * @return $this
     */
    public function post($data = array())
    {
        return $this->setpost($data);
    }

    /**
     * Prepare POST requests carrying a JSON body.
     *
     * @param mixed $data value to encode as JSON
     * @return $this
     * @throws InvalidArgumentException when $data cannot be encoded
     */
    public function json($data = array())
    {
        $data = json_encode($data);
        if ($data === false) {
            throw new InvalidArgumentException('json(): data cannot be encoded as JSON');
        }
        $header = array(
            'Content-Type: application/json',
            'Content-Length:' . strlen($data)
        );
        $this->header($header);

        return $this->setpost($data);
    }

    /**
     * Prepare multipart form uploads.
     *
     * @param array|string $files file path(s)
     * @return $this
     */
    public function upload($files)
    {
        $data = array();
        $name = $this->name;
        if (is_array($files)) {
            foreach ($files as $k => $v) {
                $data["{$name}[{$k}]"] = new CURLFile($v);
            }
        } else {
            $data["{$name}"] = new CURLFile($files);
        }

        return $this->setpost($data);
    }

    /**
     * Save the response of every request to a file named after the last
     * segment of its URL path.
     *
     * @param string $dir target directory (created when missing), default: current directory
     * @return $this
     */
    public function download($dir = '')
    {
        if ($dir && !is_dir($dir)) {
            mkdir($dir, 0755, true);
        }
        $prefix = $dir ? rtrim($dir, '/\\') . '/' : './';
        foreach ($this->paths as $k => $v) {
            $name = basename($v);
            if ($name === '') {
                // URL without a file name, e.g. "http://host/".
                $name = 'download_' . $k;
            }
            $fp = fopen($prefix . $name, 'w');
            if ($fp === false) {
                continue;
            }
            $this->fps[$k] = $fp;
            $this->options[$k][CURLOPT_FILE] = $fp;
        }

        return $this->setget('');
    }

    /**
     * Execute all requests concurrently, pass every successful response to the
     * calltodo callback and the links found in it to the calltrigger callback.
     * All handles are released afterwards, so run() is meant to be called once.
     *
     * @param int $depth crawl depth, default 2; nothing is fetched when <= 0
     */
    public function run($depth = 2)
    {
        if ($this->handle === null) {
            return;
        }
        $this->setopt();
        $handle = $this->handle;
        $urls = $this->urls;

        if ($depth > 0) {
            $depth--;
            $active = null;
            do {
                $mrc = curl_multi_exec($handle, $active);
            } while ($mrc === CURLM_CALL_MULTI_PERFORM);
            while ($active && $mrc === CURLM_OK) {
                // select() returns -1 when it cannot wait; back off briefly
                // instead of spinning.
                if (curl_multi_select($handle) === -1) {
                    usleep(100);
                }
                do {
                    $mrc = curl_multi_exec($handle, $active);
                } while ($mrc === CURLM_CALL_MULTI_PERFORM);
            }

            // Collect per-transfer results from the multi handle's message queue.
            $failed = array();
            while (($msg = curl_multi_info_read($handle)) !== false) {
                if ($msg['msg'] === CURLMSG_DONE && $msg['result'] !== CURLE_OK) {
                    $key = array_search($msg['handle'], $this->chs, true);
                    if ($key !== false) {
                        $failed[$key] = true;
                    }
                }
            }

            foreach (array_keys($this->chs) as $k) {
                $ch = $this->chs[$k];
                $ok = !isset($failed[$k]) && curl_error($ch) == '';
                $content = $ok ? curl_multi_getcontent($ch) : null;
                // After redirects the effective URL is the correct base for links.
                $base = $ok ? curl_getinfo($ch, CURLINFO_EFFECTIVE_URL) : '';
                $this->release($k);
                if (!$ok) {
                    continue;
                }
                if (!$base) {
                    $base = $this->reviseurl($urls[$k]);
                }

                $this->todo($content);

                $returl = array();
                foreach ($this->geturl($content) as $u) {
                    // Skip links that do not point to a fetchable document.
                    if ($u[0] === '#' || preg_match('/^(javascript|mailto|tel|data):/i', $u)) {
                        continue;
                    }
                    $returl[] = $this->resolveurl($base, $u);
                }
                if (!empty($returl)) {
                    $this->trigger($returl, $depth);
                }
            }
        }

        foreach (array_keys($this->chs) as $k) {
            $this->release($k);
        }
        curl_multi_close($handle);
        $this->handle = null;
    }
}
/**
 * Demo business callback: prints "ok" once for every fetched page.
 *
 * @param string $content body of the fetched page (not used by this demo)
 */
function todo($content){
    // Straight quotes restored; the typographic quotes of the published
    // listing are not string delimiters in PHP.
    echo 'ok'.PHP_EOL;
}
// Seed URLs for the demo crawl. They carry no scheme; the crawl class falls
// back to "http" when a URL has none.
$urls=array(
    'www.baidu.com',
    'www.taobao.com'
);
/**
 * Demo crawl callback: fetches the given URLs with GET requests.
 *
 * @param array $urls  URLs to fetch
 * @param int   $depth crawl depth handed to crawl::run()
 */
function trigger($urls = array(), $depth = 2)
{
    $crawler = new crawl($urls);
    $crawler->get();
    $crawler->run($depth);
}
// Start the demo crawl from the seed URLs, using trigger()'s default depth of 2.
trigger($urls);
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
php多線程爬蟲類