PHP抓取程式
阿新 • • 發佈:2019-01-04
程式正在開發中,仍存在問題
<?php

/**
 * Split a URL into the parts needed to request it and to store it locally.
 *
 * On top of the keys produced by parse_url() the result carries:
 *  - port:       defaults to 80 when the URL does not name one
 *  - path:       defaults to '/'
 *  - header_url: request target for the HTTP request line (path plus optional query)
 *  - file_name:  last path segment when it contains a dot, otherwise
 *                md5(header_url) . '.' . $suffix_name
 *  - file:       local storage path of the form '/<host>/<path>/<file_name>'
 *
 * @param string $url         Absolute URL.
 * @param string $suffix_name Extension used when the URL itself has no file name.
 *
 * @return array
 *
 * @throws InvalidArgumentException When the URL cannot be parsed or has no host.
 */
function getUrlInfo($url, $suffix_name = 'html')
{
    $url_info = parse_url($url);
    if (!is_array($url_info) || empty($url_info['host'])) {
        throw new InvalidArgumentException('Cannot determine the host of URL: ' . $url);
    }

    // Default port.
    if (empty($url_info['port'])) {
        $url_info['port'] = 80;
    }
    // Default path.
    if (empty($url_info['path'])) {
        $url_info['path'] = '/';
    }

    // A last path segment containing a dot is treated as a file name.
    $segments = explode('/', $url_info['path']);
    $last_segment = end($segments);
    if (strpos($last_segment, '.') !== false) {
        $url_info['file_name'] = $last_segment;
    }

    $url_info['header_url'] = $url_info['path'];
    if (!empty($url_info['query'])) {
        $url_info['header_url'] .= '?' . $url_info['query'];
    }

    if (empty($url_info['file_name'])) {
        // Directory-style URL: derive a stable file name from the request target.
        $url_info['file_name'] = md5($url_info['header_url']) . '.' . $suffix_name;
        $url_info['file'] = rtrim($url_info['path'], '/') . '/' . $url_info['file_name'];
    } else {
        $url_info['file'] = $url_info['path'];
    }
    $url_info['file'] = '/' . $url_info['host'] . '/' . ltrim($url_info['file'], '/');

    return $url_info;
}

/**
 * Return the root of a URL: "<scheme>://<host>[:<port>]/".
 *
 * A missing scheme falls back to "http"; the port is only included when the
 * URL names one explicitly.
 *
 * @param string $url
 *
 * @return string
 */
function getMainUrlPath($url)
{
    $url_info = parse_url($url);
    if (!is_array($url_info)) {
        $url_info = [];
    }

    $scheme = isset($url_info['scheme']) ? $url_info['scheme'] : 'http';
    $host = isset($url_info['host']) ? $url_info['host'] : '';
    $port = !empty($url_info['port']) ? ':' . $url_info['port'] : '';

    return $scheme . '://' . $host . $port . '/';
}

/**
 * Return the "directory" URL that relative links inside $url resolve against.
 *
 * A last path segment containing a dot is treated as a file name and dropped;
 * otherwise the whole path is treated as a directory. The result always ends
 * with '/' and carries no query string or fragment.
 *
 * @param string $url
 *
 * @return string
 */
function getFullUrlPath($url)
{
    // parse_url() yields null/false when there is no usable path component.
    $path = (string) parse_url($url, PHP_URL_PATH);

    // Drop empty segments so leading, trailing and doubled slashes cannot
    // produce '//' in the rebuilt URL.
    $segments = array_values(array_filter(explode('/', $path), 'strlen'));

    if ($segments !== [] && strpos(end($segments), '.') !== false) {
        array_pop($segments);
    }

    $directory = $segments === [] ? '' : implode('/', $segments) . '/';

    return getMainUrlPath($url) . $directory;
}

/**
 * Maps a response Content-Type to a local file extension.
 */
class SuffixName
{
    /** Media type => extension. Anything not listed falls back to 'html'. */
    private static $suffix_map = [
        'text/html'                => 'html',
        'application/javascript'   => 'js',
        'application/x-javascript' => 'js',
        'text/javascript'          => 'js',
        'text/css'                 => 'css',
        'image/gif'                => 'gif',
        'image/jpg'                => 'jpg',
        'image/jpeg'               => 'jpeg',
        'image/png'                => 'png',
    ];

    private $content_type = '';
    private $content = '';

    /**
     * @param string $content_type Raw Content-Type header value; parameters such
     *                             as "; charset=utf-8" are ignored.
     * @param string $content      Response body (stored, currently not inspected).
     */
    public function __construct($content_type, $content)
    {
        // Keep only the media type so "text/css; charset=utf-8" still matches.
        $media_type = explode(';', (string) $content_type, 2)[0];
        $this->content_type = strtolower(trim($media_type));
        $this->content = $content;
    }

    /**
     * @return string Extension without the leading dot.
     */
    public function getSuffixName()
    {
        return isset(self::$suffix_map[$this->content_type])
            ? self::$suffix_map[$this->content_type]
            : 'html';
    }
}

/**
 * Fetches one URL, collects the resources it links to, rewrites absolute
 * "http://" links to relative "../" paths and stores the result on disk.
 */
class Capture
{
    private $content;
    private $url;
    private $headers = [];
    private $content_urls = [];
    private $reset_content_urls = [];

    /**
     * @param string $url Absolute URL to capture.
     *
     * @throws InvalidArgumentException When the URL has no usable host.
     * @throws RuntimeException         When fetching or storing fails.
     */
    public function __construct($url)
    {
        $this->url = $url;

        // Fetch the content.
        $capture_tool_obj = new CaptureTool($url);
        $this->content = $capture_tool_obj->getContent();
        $this->headers = $capture_tool_obj->getHeaders();

        // Collect linked URLs and rewrite them inside the content.
        $this->setHtmlContentSrc();
        $this->resetContentSrc();

        // Pick the extension from the response Content-Type (may be absent).
        $content_type = isset($this->headers['content-type']) ? $this->headers['content-type'] : '';
        $suffix_name_obj = new SuffixName($content_type, $this->content);
        $file = getUrlInfo($url, $suffix_name_obj->getSuffixName())['file'];

        new CreateFile($file, $this->content);
    }

    /**
     * Extract <link href> and src attribute targets and resolve them to absolute URLs.
     */
    private function setHtmlContentSrc()
    {
        $css_pattern = "/<link[\s\S]*?href=(?:\"|\')(.*?)(?:\"|\')[\s\S]*?>/i";
        $src_pattern = "/src=(?:\"|\')(.*?)(?:\"|\')/i";

        $found = [];
        foreach ([$css_pattern, $src_pattern] as $pattern) {
            if (preg_match_all($pattern, $this->content, $matches)) {
                $found = array_merge($found, $matches[1]);
            }
        }

        $root = rtrim(getMainUrlPath($this->url), '/');
        $directory = getFullUrlPath($this->url);

        foreach ($found as $link) {
            // Empty attributes and fragment-only links point at nothing fetchable.
            if ($link === '' || $link[0] === '#') {
                continue;
            }
            $parts = parse_url($link);
            if ($parts === false) {
                continue;
            }
            if (empty($parts['host'])) {
                // A scheme without a host (data:, javascript:, mailto:) is not a
                // path on this server and must not be resolved as one.
                if (!empty($parts['scheme'])) {
                    continue;
                }
                if ($link[0] === '/') {
                    // Root-relative path.
                    $link = $root . $link;
                } else {
                    // Path relative to the current document.
                    $link = $directory . $link;
                }
            }
            $this->content_urls[] = $link;
        }

        // Replacement targets, index-aligned with $content_urls.
        // NOTE(review): a single '../' is only correct for documents stored
        // directly below the host directory - confirm the intended layout.
        $this->reset_content_urls = array_map(function ($url) {
            return str_replace('http://', '../', $url);
        }, $this->content_urls);
    }

    /**
     * @return array Absolute URLs of the resources referenced by the captured content.
     */
    public function getHtmlContentSrc()
    {
        return $this->content_urls;
    }

    /**
     * Replace every collected URL that occurs in the content with its rewritten form.
     */
    private function resetContentSrc()
    {
        $this->content = str_replace($this->content_urls, $this->reset_content_urls, $this->content);
    }
}

/**
 * Minimal HTTP GET client built on a raw socket.
 */
class CaptureTool
{
    private $url = '';
    private $content = '';
    private $headers = [];

    /**
     * Fetch $url immediately; an empty URL leaves content and headers empty.
     *
     * @param string $url
     *
     * @throws InvalidArgumentException When the URL has no usable host.
     * @throws RuntimeException         When the connection cannot be opened.
     */
    public function __construct($url = '')
    {
        if (empty($url)) {
            return;
        }
        $this->url = $url;
        $this->fopenSockCapture();
    }

    /**
     * Send the GET request and populate $headers and $content from the response.
     */
    private function fopenSockCapture()
    {
        $url_info = getUrlInfo($this->url);

        $fp = fsockopen($url_info['host'], $url_info['port'], $error_no, $error_str, 30);
        if ($fp === false) {
            throw new RuntimeException(
                'Cannot connect to ' . $url_info['host'] . ':' . $url_info['port']
                . ' (' . $error_no . ' ' . $error_str . ')'
            );
        }

        // The Host header must name a non-default port.
        $host_header = $url_info['host'];
        if ((int) $url_info['port'] !== 80) {
            $host_header .= ':' . $url_info['port'];
        }

        $out = 'GET ' . $url_info['header_url'] . ' HTTP/1.1' . "\r\n";
        $out .= 'Host: ' . $host_header . "\r\n";
        $out .= 'Connection: close' . "\r\n";
        $out .= 'Pragma: no-cache' . "\r\n";
        $out .= 'Cache-Control: no-cache' . "\r\n";
        $out .= 'User-Agent: Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/46.0.2490.86 Safari/537.36' . "\r\n";
        $out .= 'Accept-Language: zh-CN,zh;q=0.8' . "\r\n";
        $out .= "\r\n";

        $result = '';
        try {
            fwrite($fp, $out);
            while (!feof($fp)) {
                $chunk = fread($fp, 1024);
                if ($chunk === false) {
                    break;
                }
                $result .= $chunk;
            }
        } finally {
            fclose($fp);
        }

        // The response may lack the header/body separator (e.g. an empty reply).
        $parts = explode("\r\n\r\n", $result, 2);
        $this->headers = $this->parseHeaderStrToArr($parts[0]);
        $body = isset($parts[1]) ? $parts[1] : '';

        // HTTP/1.1 servers may frame the body with chunked transfer coding.
        if (isset($this->headers['transfer-encoding'])
            && stripos($this->headers['transfer-encoding'], 'chunked') !== false
        ) {
            $body = $this->decodeChunkedBody($body);
        }
        $this->content = $body;
    }

    /**
     * Parse a raw header block into a map of lower-cased name => value.
     *
     * The status line (which has no colon) is stored under its lower-cased
     * protocol token, e.g. 'http/1.1' => '200 OK'.
     *
     * @param string $headers_str
     *
     * @return array
     */
    private function parseHeaderStrToArr($headers_str = '')
    {
        $headers_arr = [];
        if (empty($headers_str)) {
            return $headers_arr;
        }

        foreach (explode("\r\n", $headers_str) as $header) {
            if ($header === '') {
                continue;
            }
            $colon = strpos($header, ':');
            if ($colon === false) {
                $pieces = explode(' ', $header, 2);
                $headers_arr[strtolower($pieces[0])] = isset($pieces[1]) ? trim($pieces[1]) : '';
                continue;
            }
            $key = strtolower(trim(substr($header, 0, $colon)));
            $headers_arr[$key] = trim(substr($header, $colon + 1));
        }

        return $headers_arr;
    }

    /**
     * Reassemble a body sent with "Transfer-Encoding: chunked".
     *
     * @param string $body Raw chunk-framed body.
     *
     * @return string Decoded payload; the input is returned untouched when the
     *                framing is malformed, so no data is silently dropped.
     */
    private function decodeChunkedBody($body)
    {
        $decoded = '';
        $offset = 0;
        $length = strlen($body);

        while ($offset < $length) {
            $line_end = strpos($body, "\r\n", $offset);
            if ($line_end === false) {
                break;
            }
            // The size line may carry ";extension" parameters after the hex size.
            $size_field = substr($body, $offset, $line_end - $offset);
            $size_hex = trim(explode(';', $size_field, 2)[0]);
            if ($size_hex === '' || !ctype_xdigit($size_hex)) {
                return $body;
            }
            $size = (int) hexdec($size_hex);
            if ($size === 0) {
                // Terminating chunk; optional trailers are ignored.
                break;
            }
            $decoded .= substr($body, $line_end + 2, $size);
            // Skip the chunk data and the CRLF that follows it.
            $offset = $line_end + 2 + $size + 2;
        }

        return $decoded;
    }

    /**
     * @return string Response body.
     */
    public function getContent()
    {
        return $this->content;
    }

    /**
     * @return array Response headers keyed by lower-cased name.
     */
    public function getHeaders()
    {
        return $this->headers;
    }
}

/**
 * Writes captured content to a file below ./capture/, creating directories as needed.
 */
class CreateFile
{
    private $dir = '';
    private $content = '';
    private $file = '';
    private $pre_create_dir = './capture/';

    /**
     * @param string $file    Storage path relative to the capture root, e.g. '/host/dir/name.html'.
     * @param string $content Bytes to write.
     *
     * @throws InvalidArgumentException When $file contains no usable path segment.
     * @throws RuntimeException         When the directory or file cannot be written.
     */
    public function __construct($file, $content)
    {
        // The path is derived from remote URLs: drop empty, '.' and '..'
        // segments so it can never climb out of the capture root.
        $segments = [];
        foreach (explode('/', str_replace('\\', '/', (string) $file)) as $segment) {
            if ($segment === '' || $segment === '.' || $segment === '..') {
                continue;
            }
            $segments[] = $segment;
        }
        if ($segments === []) {
            throw new InvalidArgumentException('No usable file path in: ' . $file);
        }

        $this->file = $this->pre_create_dir . implode('/', $segments);
        $this->dir = dirname($this->file);
        $this->content = $content;

        $this->createDir();
        $this->setContent();
    }

    /**
     * Create the target directory including all missing parents.
     */
    private function createDir()
    {
        // The trailing is_dir() tolerates another process creating it concurrently.
        if (!is_dir($this->dir) && !mkdir($this->dir, 0777, true) && !is_dir($this->dir)) {
            throw new RuntimeException('Cannot create directory: ' . $this->dir);
        }
    }

    /**
     * Write the content to the target file.
     *
     * @return int Number of bytes written.
     */
    private function setContent()
    {
        $written = file_put_contents($this->file, $this->content);
        if ($written === false) {
            throw new RuntimeException('Cannot write file: ' . $this->file);
        }

        return $written;
    }
}

// Breadth-first crawl starting from the seed URL.
// NOTE(review): links to other hosts are followed as well - confirm that is intended.
$queue = ['http://127.0.0.1/phpcms/install_package/'];
$visited = [];

while ($queue !== []) {
    $current_url = array_shift($queue);

    // Pages commonly link back to each other; fetch every URL only once.
    if (isset($visited[$current_url])) {
        continue;
    }
    $visited[$current_url] = true;

    try {
        $capture_obj = new Capture($current_url);
    } catch (Exception $e) {
        // One unreachable or malformed URL must not abort the whole crawl.
        error_log('capture failed for ' . $current_url . ': ' . $e->getMessage());
        continue;
    }

    foreach ($capture_obj->getHtmlContentSrc() as $linked_url) {
        if (!isset($visited[$linked_url])) {
            $queue[] = $linked_url;
        }
    }
}