1. 程式人生 > >PHP抓取程式

PHP抓取程式

程式正在開發中，仍存在問題。

<?php

/**
 * Split a URL into its components and derive the request target and the
 * local path under which the fetched document will be stored.
 *
 * In addition to the keys returned by parse_url(), the result contains:
 *   - port:       80 when the URL does not name a port
 *   - path:       '/' when the URL has no path
 *   - header_url: path, followed by '?query' when a query string is present
 *   - file_name:  the last path segment if it contains a dot, otherwise
 *                 md5(header_url) . '.' . $suffix_name
 *   - file:       '/<host>/<path to the file>'
 *
 * @param string $url          URL to analyse.
 * @param string $suffix_name  Extension used when a file name has to be generated.
 * @return array
 */
function getUrlInfo($url,$suffix_name = 'html'){
	$info = parse_url($url);

	// Default port.
	if(empty($info['port'])){
		$info['port'] = 80;
	}

	// Default path.
	if(empty($info['path'])){
		$info['path'] = '/';
	}

	// A last path segment containing a dot is taken to be a file name.
	if($info['path'] !== '/'){
		$segments = explode('/', $info['path']);
		$tail = end($segments);

		if(strpos($tail,'.') !== FALSE){
			$info['file_name'] = $tail;
		}
	}

	// Request target: the path, plus the query string when there is one.
	$info['header_url'] = empty($info['query'])
		? $info['path']
		: $info['path'].'?'.$info['query'];

	if(!empty($info['file_name'])){
		// The URL already names a file; store it under its own path.
		$info['file'] = $info['path'];
	}else{
		// No file name in the URL: generate one from a hash of the request target.
		$info['file_name'] = md5($info['header_url']).'.'.$suffix_name;
		$info['file'] = rtrim($info['path'],'/').'/'.$info['file_name'];
	}

	// Prefix the storage path with the host name.
	$info['file'] = '/'.$info['host'].'/'.ltrim($info['file'],'/');

	return $info;
}

/**
 * Build the origin of a URL in the form "scheme://host[:port]/".
 *
 * The port is included only when the URL names one explicitly. Components
 * that parse_url() does not report are left out of the result.
 *
 * @param string $url  URL to reduce to its origin.
 * @return string      Origin with a trailing slash.
 */
function getMainUrlPath($url){
	$url_info = parse_url($url);

	// The "://" separator must follow the scheme, otherwise the result
	// ("httpexample.com/") is not a usable URL.
	$scheme = !empty($url_info['scheme']) ? $url_info['scheme'].'://' : '';
	$host = !empty($url_info['host']) ? $url_info['host'] : '';
	$port = !empty($url_info['port']) ? ':'.$url_info['port'] : '';

	return $scheme.$host.$port.'/';
}

/**
 * Return the "directory" URL of a page, with a trailing slash, for use as
 * the base when resolving relative references found in that page.
 *
 * The query string and fragment are discarded. If the last path segment
 * contains a dot it is treated as a file name and removed; otherwise the
 * whole path is treated as a directory.
 *
 * @param string $url  URL of the page.
 * @return string      Base URL ending in '/'.
 */
function getFullUrlPath($url){
	// Query string and fragment are not part of the directory.
	$base = (string) preg_replace('/[?#].*$/s', '', $url);

	// Inspect the last path segment BEFORE normalising the trailing slash;
	// appending the slash first would always leave an empty last segment
	// and the file name would never be detected.
	$path = (string) parse_url($base, PHP_URL_PATH);
	$segments = explode('/', $path);
	$last_path = array_pop($segments);

	if($last_path !== '' && strpos($last_path,'.') !== FALSE){
		// Cut the file name off the end of the URL.
		$base = substr($base, 0, strlen($base) - strlen($last_path));
	}

	return rtrim($base,'/').'/';
}

/**
 * Maps an HTTP Content-Type value to the file extension used for the
 * saved copy of a response.
 */
class SuffixName{
	private $content_type = '';
	private $content = '';

	/**
	 * @param string|null $content_type  Raw Content-Type header value; may carry
	 *                                   parameters such as "; charset=utf-8".
	 * @param string      $content       Response body (stored, not inspected).
	 */
	public function __construct($content_type,$content){
		// Keep only the media type: parameters after ';' would otherwise
		// prevent a match (e.g. "text/css; charset=utf-8").
		$parts = explode(';', (string) $content_type, 2);
		$this->content_type = strtolower(trim($parts[0]));
		$this->content = $content;
	}

	/**
	 * Return the extension for the stored media type.
	 *
	 * @return string  Extension without a leading dot; 'html' for any
	 *                 media type that is not listed.
	 */
	public function getSuffixName(){
		$suffix_names = array(
			'text/html' => 'html',
			'application/javascript' => 'js',
			'application/x-javascript' => 'js',
			'text/javascript' => 'js',
			'text/css' => 'css',
			'image/gif' => 'gif',
			'image/jpg' => 'jpg',
			'image/jpeg' => 'jpeg',
			'image/png' => 'png',
		);

		if(isset($suffix_names[$this->content_type])){
			return $suffix_names[$this->content_type];
		}

		return 'html';
	}
}

/**
 * Fetches one URL, collects the resource URLs referenced by the response,
 * rewrites links in the content and writes the result to disk.
 *
 * All work happens in the constructor; it relies on CaptureTool,
 * SuffixName, CreateFile and the getUrlInfo()/getMainUrlPath()/
 * getFullUrlPath() helpers.
 */
class Capture{
	private $content;
	private $url;
	private $headers = array();
	private $content_urls = array();
	private $reset_content_urls = array();

	/**
	 * @param string $url  URL to fetch and store.
	 */
	public function __construct($url){
		$this->url = $url;

		// Fetch the response.
		$capture_tool_obj = new CaptureTool($url);
		$this->content = $capture_tool_obj->getContent();
		$this->headers = $capture_tool_obj->getHeaders();

		// Collect referenced URLs and rewrite them in the content.
		$this->setHtmlContentSrc();
		$this->resetContentSrc();

		// Pick the file extension; a response may lack a Content-Type header.
		$content_type = isset($this->headers['content-type']) ? $this->headers['content-type'] : '';
		$suffix_name_obj = new SuffixName($content_type,$this->content);
		$suffix_name = $suffix_name_obj->getSuffixName();

		// Write the (rewritten) content to its mirror location.
		$url_info = getUrlInfo($url,$suffix_name);
		new CreateFile($url_info['file'],$this->content);
	}

	/**
	 * Extract <link href> and src attribute values from the content, turn
	 * them into absolute URLs and prepare their rewritten counterparts.
	 */
	private function setHtmlContentSrc(){
		$found = array();

		$css_pattern = "/<link[\s\S]*?href=(?:\"|\')(.*?)(?:\"|\')[\s\S]*?>/i";
		if(preg_match_all($css_pattern, (string) $this->content, $css_matches)){
			$found = array_merge($found,$css_matches[1]);
		}

		$src_pattern = "/src=(?:\"|\')(.*?)(?:\"|\')/i";
		if(preg_match_all($src_pattern, (string) $this->content, $src_matches)){
			$found = array_merge($found,$src_matches[1]);
		}

		$urls = array();
		foreach($found as $content_url){
			// Empty attributes and fragment-only references point nowhere.
			if($content_url === '' || $content_url[0] == '#'){
				continue;
			}

			$host = parse_url($content_url,PHP_URL_HOST);
			if(empty($host)){
				// A scheme without a host (data:, javascript:, mailto:, ...)
				// is not something that can be fetched over the socket.
				$scheme = parse_url($content_url,PHP_URL_SCHEME);
				if(!empty($scheme)){
					continue;
				}

				if($content_url[0] == '/'){
					// Root-relative path: resolve against the origin.
					$content_url = rtrim(getMainUrlPath($this->url),'/').$content_url;
				}else{
					// Document-relative path: resolve against the page directory.
					$content_url = getFullUrlPath($this->url).$content_url;
				}
			}

			$urls[] = $content_url;
		}

		$this->content_urls = array_merge($this->content_urls,$urls);

		// Replacement values, index-aligned with content_urls.
		$this->reset_content_urls = array_map(function($url){ return str_replace('http://', '../', $url); }, $this->content_urls);
	}

	/**
	 * @return array  Absolute URLs referenced by the fetched content.
	 */
	public function getHtmlContentSrc(){
		return $this->content_urls;
	}

	/**
	 * Replace collected URLs in the content with their rewritten form.
	 * The search strings are the normalised (absolute) URLs, so only
	 * occurrences that literally appear in that form are replaced.
	 */
	private function resetContentSrc(){
		$this->content = str_replace($this->content_urls,$this->reset_content_urls,$this->content);
	}
}

/**
 * Fetches a URL with a hand-written HTTP GET over fsockopen() and exposes
 * the response headers and body.
 */
class CaptureTool{
	private $tool = 'fopenSockCapture';
	private $url = '';
	private $content = '';
	private $headers = array();

	/**
	 * @param string $url  URL to fetch. With an empty URL nothing is fetched
	 *                     and the getters return their empty defaults.
	 */
	public function __construct($url = ''){
		if(empty($url)){
			return;
		}
		$this->url = $url;

		$this->fopenSockCapture();
	}

	/**
	 * Send the request, read the whole response and split it into headers
	 * and body. Terminates the script with 'fail' if the connection cannot
	 * be opened.
	 */
	private function fopenSockCapture(){
		$url_info = getUrlInfo($this->url);
		$fp = fsockopen($url_info['host'],$url_info['port'],$error_no,$error_str,30);

		if(empty($fp)){
			exit('fail');
		}

		$out = 'GET '.$url_info['header_url'].' HTTP/1.1'."\r\n";
		$out .= 'Host: '.$url_info['host']."\r\n";
		$out .= 'Connection: close'."\r\n";
		$out .= 'Pragma: no-cache'."\r\n";
		$out .= 'Cache-Control: no-cache'."\r\n";
		$out .= 'User-Agent: Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/46.0.2490.86 Safari/537.36'."\r\n";
		$out .= 'Accept-Language: zh-CN,zh;q=0.8'."\r\n";
		$out .= "\r\n";

		fwrite($fp, $out);

		$result = '';
		while(!feof($fp)){
			$chunk = fread($fp, 1024);
			if($chunk === FALSE){
				// Read error: stop instead of looping forever.
				break;
			}
			$result .= $chunk;

			// A timed-out stream never reports EOF; leave the loop explicitly.
			$meta = stream_get_meta_data($fp);
			if(!empty($meta['timed_out'])){
				break;
			}
		}
		fclose($fp);

		// A truncated response may contain no header/body separator.
		$parts = explode("\r\n\r\n", $result,2);
		$headers_str = $parts[0];
		$body = isset($parts[1]) ? $parts[1] : '';

		$this->headers = $this->parseHearderStrToArr($headers_str);

		// An HTTP/1.1 request may be answered with a chunked body, which has
		// to be reassembled before it can be stored.
		if(isset($this->headers['transfer-encoding']) && stripos($this->headers['transfer-encoding'],'chunked') !== FALSE){
			$body = $this->decodeChunkedBody($body);
		}

		$this->content = $body;
	}

	/**
	 * Parse a raw header block into an array keyed by lower-cased header name.
	 * The status line is stored under its lower-cased protocol token
	 * (e.g. 'http/1.1' => '200 OK').
	 *
	 * @param string $headers_str  Header block without the terminating blank line.
	 * @return array
	 */
	private function parseHearderStrToArr($headers_str = ''){
		if(empty($headers_str)){
			return array();
		}

		$headers = explode("\r\n",$headers_str);

		$headers_arr = array();
		foreach($headers as $index => $header){
			if($header === ''){
				continue;
			}

			if($index === 0 && stripos($header,'HTTP/') === 0){
				// Status line: "<protocol> <code> <reason>".
				$parts = explode(' ',$header,2);
				$headers_arr[strtolower($parts[0])] = isset($parts[1]) ? $parts[1] : '';
				continue;
			}

			// Split on the colon; the space after it is optional.
			$parts = explode(':',$header,2);
			if(count($parts) < 2){
				continue;
			}
			$headers_arr[strtolower(trim($parts[0]))] = trim($parts[1]);
		}

		return $headers_arr;
	}

	/**
	 * Reassemble a body sent with "Transfer-Encoding: chunked".
	 * Returns the input unchanged if it does not look like chunked data.
	 *
	 * @param string $body  Raw body as read from the socket.
	 * @return string
	 */
	private function decodeChunkedBody($body){
		$decoded = '';
		$offset = 0;
		$length = strlen($body);

		while($offset < $length){
			$line_end = strpos($body,"\r\n",$offset);
			if($line_end === FALSE){
				break;
			}

			// The size line is "<hex size>[;extensions]".
			$size_parts = explode(';',substr($body,$offset,$line_end - $offset),2);
			$size_hex = trim($size_parts[0]);
			if(!preg_match('/^[0-9a-fA-F]+$/',$size_hex)){
				return $body;
			}

			$size = hexdec($size_hex);
			if($size == 0){
				// Last chunk; anything after it is trailer data.
				break;
			}

			$decoded .= substr($body,$line_end + 2,$size);
			// Skip the chunk data and the CRLF that terminates it.
			$offset = $line_end + 2 + $size + 2;
		}

		return $decoded;
	}

	/**
	 * @return string  Response body.
	 */
	public function getContent(){
		return $this->content;
	}

	/**
	 * @return array  Response headers keyed by lower-cased name.
	 */
	public function getHeaders(){
		return $this->headers;
	}
}

/**
 * Writes content to a file below the capture directory, creating any
 * missing directories on the way.
 */
class CreateFile{
	private $dir = '';
	private $file_name = '';
	private $content = '';
	private $file = '';
	private $pre_create_dir = './capture/';

	/**
	 * @param string $file     Path of the file relative to the capture directory
	 *                         (e.g. '/example.com/css/site.css').
	 * @param string $content  Content to write.
	 */
	public function __construct($file,$content){
		// The path comes from crawled URLs, so it is reduced to plain segments
		// first: '.' and '..' must not be able to leave the capture directory.
		$segments = $this->safeSegments($file);

		$this->file_name = empty($segments) ? '' : array_pop($segments);
		$this->dir = rtrim($this->pre_create_dir.implode('/', $segments),'/');
		$this->file = $this->dir.'/'.$this->file_name;

		$this->content = $content;

		// Nothing can be written without a file name or a target directory.
		if($this->file_name !== '' && $this->createDir()){
			$this->setContent();
		}
	}

	/**
	 * Split a path on slashes/backslashes and drop empty, '.' and '..' segments.
	 *
	 * @param string $file
	 * @return array
	 */
	private function safeSegments($file){
		$segments = array();

		foreach(preg_split('#[/\\\\]+#', (string) $file) as $segment){
			if($segment === '' || $segment === '.' || $segment === '..'){
				continue;
			}
			$segments[] = $segment;
		}

		return $segments;
	}

	/**
	 * Create the target directory, including missing parents.
	 *
	 * @return bool  TRUE when the directory exists afterwards.
	 */
	private function createDir(){
		if(is_dir($this->dir)){
			return TRUE;
		}

		// The second is_dir() covers the directory appearing between the
		// first check and the mkdir() call.
		return mkdir($this->dir, 0777, TRUE) || is_dir($this->dir);
	}

	/**
	 * Write the content to the target file.
	 *
	 * @return int|false  Result of file_put_contents().
	 */
	private function setContent(){
		return file_put_contents($this->file, $this->content);
	}
}

// Breadth-first crawl: start from the seed URL and keep fetching every URL
// referenced by the pages captured so far until the queue is empty.
$arr = array('http://127.0.0.1/phpcms/install_package/');

// URLs already fetched. Pages commonly reference each other (and shared
// assets), so without this set the queue would never drain.
$visited = array();

while(!empty($arr)){
	$v = array_shift($arr);

	if(isset($visited[$v])){
		continue;
	}
	$visited[$v] = TRUE;

	$capture_obj = new Capture($v);
	$arr = array_merge($arr,$capture_obj->getHtmlContentSrc());
}

// The queue draining is the normal end of the crawl, so report success.
exit(0);

// var_dump(parse_url('http://www.baidu.com/aaa.css'));