1. 程式人生 > 其它 > php curl 抓一個頁面,包含 imgs、css、js

php curl 抓一個頁面,包含 imgs、css、js

宣告

本分享純屬為了技術分享,禁止商用!!!禁止商用!!!禁止商用!!!
未經本人允許,如有發現,違者必究!!!

呼叫

// Entry script: saves one page's HTML, then downloads the images, stylesheets
// and scripts it references into <host>/statics[_m]/<html_name>/{imgs,css,js}/.
ini_set('max_execution_time', 20);
$url='http://xxx.cn/';
$html_name='index'; // base name of the saved HTML file (no extension)
$type='pc'; // 'pc' = desktop assets, 'm' = mobile assets

$dir=dirname(__FILE__).DIRECTORY_SEPARATOR.parse_url($url)['host'].DIRECTORY_SEPARATOR;
$static=($type=='m'?'statics_m':'statics').DIRECTORY_SEPARATOR.$html_name.DIRECTORY_SEPARATOR;

// Fetch the page first; nothing is created on disk if it is empty or unreachable.
$fh=file_get_contents($url);
if (!$fh) die('內容為空');
if (!is_dir($dir)) mkdir($dir,0777,true);
// A failed write is reported separately instead of being mislabelled as "empty content".
if (file_put_contents($dir.$html_name.'.html',$fh)===false) die('寫入失敗');

$dir_imgs=$dir.$static.'imgs'.DIRECTORY_SEPARATOR;
$dir_css=$dir.$static.'css'.DIRECTORY_SEPARATOR;
$dir_js=$dir.$static.'js'.DIRECTORY_SEPARATOR;

// The extractors return a JSON list on success but a plain-text cURL error on
// failure; json_decode() turns the latter into null, so fall back to an empty
// list rather than handing null to the downloaders.
$re_imgs=json_decode(curl_img($url),true);
if (!is_array($re_imgs)) $re_imgs=array();

$re_css=json_decode(curl_css($url),true);
if (!is_array($re_css)) $re_css=array();

$re_js=json_decode(curl_js($url),true);
if (!is_array($re_js)) $re_js=array();

// Batch download; the last argument '1' keeps the original file names.
curl_downimg_multi($re_imgs,$dir_imgs,'GET','1');
curl_downcss_multi($re_css,$dir_css,'GET','1');
curl_downjs_multi($re_js,$dir_js,'GET','1');

方法

// ============ imgs ============

/**
 * Fetch a page and collect the image URLs found in its data-src="..." attributes.
 *
 * Each captured value is turned into an absolute URL:
 *   - "//host/x"  gets the page's scheme prepended,
 *   - "/x"        is resolved against the page's scheme://host[:port],
 *   - "scheme:x"  is kept unchanged,
 *   - anything else is resolved against the directory of the page URL.
 * Empty values and duplicates are dropped.
 *
 * NOTE: TLS peer/host verification is disabled, as in the original code.
 *
 * @param string $url Absolute URL of the page to scan.
 * @return string JSON-encoded list of image URLs on success; on a transfer
 *                failure, the plain-text cURL error message (not JSON).
 */
function curl_img($url='') {
    $ch=curl_init();
    $array=array(
        CURLOPT_URL => $url,
        CURLOPT_ENCODING => 'gzip,deflate',
        CURLOPT_SSL_VERIFYPEER => 0,
        CURLOPT_SSL_VERIFYHOST => 0,
        CURLOPT_RETURNTRANSFER => 1,
        CURLOPT_FOLLOWLOCATION => 1,
        CURLOPT_HTTPHEADER => array(
            'pragma: no-cache',
            'cache-control: no-cache',
            'sec-ch-ua: " Not A;Brand";v="99", "Chromium";v="96", "Google Chrome";v="96"',
            'accept: application/json, text/plain, */*',
            'content-type: application/json',
            'sec-ch-ua-mobile: ?0',
            'user-agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.93 Safari/537.36',
            'sec-ch-ua-platform: "Windows"',
            'sec-fetch-site: same-origin',
            'sec-fetch-mode: cors',
            'sec-fetch-dest: empty',
            'accept-language: zh-CN,zh;q=0.9'
        )
    );
    curl_setopt_array($ch,$array);
    $output=curl_exec($ch);
    if (curl_errno($ch)) {
        // Read the message before closing so the handle is not leaked on failure.
        $error=curl_error($ch);
        curl_close($ch);
        return $error;
    }
    curl_close($ch);

    // Adjust this rule to the target site. [^"]* stops at the closing quote, so
    // attributes that follow on the same line are not swallowed into the URL.
    preg_match_all('/data-src="([^"]*)"/',(string)$output,$arr);
    $items=isset($arr[1])?$arr[1]:array();

    $parts=parse_url($url);
    $scheme=isset($parts['scheme'])?$parts['scheme']:'http';
    $url_main=$scheme.'://'.(isset($parts['host'])?$parts['host']:'');
    if (isset($parts['port'])) {
        $url_main.=':'.$parts['port'];
    }
    // Directory of the page, used as the base for document-relative paths.
    $page_path=isset($parts['path'])?$parts['path']:'/';
    $slash=strrpos($page_path,'/');
    $base_dir=$slash===false?'/':substr($page_path,0,$slash+1);

    $resolve=function($item) use ($scheme,$url_main,$base_dir) {
        $item=trim($item);
        if ($item==='') {
            return '';
        }
        if (substr($item,0,2)==='//') {
            return $scheme.':'.$item;          // protocol-relative
        }
        if ($item[0]==='/') {
            return $url_main.$item;            // root-relative
        }
        if (preg_match('#^[a-z][a-z0-9+.\-]*:#i',$item)) {
            return $item;                      // already has a scheme
        }
        return $url_main.$base_dir.$item;      // document-relative
    };
    $items=array_map($resolve,$items);

    // array_values() re-indexes after filtering so the result always encodes as a JSON list.
    $items=array_values(array_unique(array_filter($items)));
    return json_encode($items,JSON_UNESCAPED_SLASHES|JSON_UNESCAPED_UNICODE);
}

/**
 * Download a single image into $dir.
 *
 * The file name is derived from the URL path only, so a query string never
 * ends up in the name. If the path has no extension, "jpg" is used.
 *
 * @param string $url    URL of the image.
 * @param string $dir    Target directory (with trailing separator); created if missing.
 * @param string $method HTTP method, e.g. 'GET'.
 * @param string $type   Naming rule: '1' = keep the original file name,
 *                       anything else = random, collision-resistant name.
 * @return string 'ok' on success; the cURL error message on a transfer error;
 *                '資料不完整' when the status is not 200 or the saved size differs
 *                from the downloaded size; a short message if the target file
 *                cannot be opened.
 */
function curl_downimg($url='',$dir='',$method='GET',$type='1') {
    if (!is_dir($dir)) {
        mkdir($dir,0777,true);
    }
    $path=parse_url($url,PHP_URL_PATH);
    $path=is_string($path)?$path:'';
    $ext=pathinfo($path,PATHINFO_EXTENSION);
    if ($ext==='') {
        $ext='jpg';
    }
    $name=pathinfo($path,PATHINFO_FILENAME);
    // Random name when requested, or when the URL offers no usable file name.
    if ($type!='1' || $name==='') {
        $name=sha1(md5(microtime(true).mt_rand(1,100000).mt_rand(1,100000)));
    }
    $file_path=$dir.$name.'.'.$ext;

    $fp=fopen($file_path,'w');
    if ($fp===false) {
        return 'unable to open file for writing: '.$file_path;
    }
    $ch=curl_init();
    $arr=array(
        CURLOPT_URL => $url,
        CURLOPT_CUSTOMREQUEST => strtoupper($method),
//        CURLOPT_PROGRESSFUNCTION => 'progressCallback',
//        CURLOPT_NOPROGRESS => 0,
        CURLOPT_HEADER => 0,
        CURLOPT_SSL_VERIFYPEER => 0,
        CURLOPT_SSL_VERIFYHOST => 0,
        CURLOPT_CONNECTTIMEOUT => 10,
        CURLOPT_FOLLOWLOCATION => 1,
        CURLOPT_FILE => $fp,
    );
    curl_setopt_array($ch,$arr);
    curl_exec($ch);
    $errno=curl_errno($ch);
    $error=curl_error($ch);
    $info=curl_getinfo($ch);
    curl_close($ch);

    // Close (and thereby flush) the file before measuring it; otherwise
    // filesize() can report a stale, smaller size.
    fclose($fp);
    clearstatcache(true,$file_path);
    $size=filesize($file_path);

    if ($errno) {
        unlink($file_path);
        return $error;
    }
    if ($info['http_code'] != '200' || $size != $info['size_download']) {
        unlink($file_path);
        return '資料不完整';
    }
    return 'ok';
}

/**
 * Download several images in parallel into $dir and print "ok: N", where N is
 * the number of files actually saved (transfer succeeded with HTTP 200).
 *
 * File names come from the URL path only (no query string); "jpg" is used when
 * the path has no extension. If two URLs in the same batch map to the same
 * file name, the later one gets "_<key>" appended so they do not overwrite
 * each other.
 *
 * @param array  $arrs   URLs to download; a non-array value is treated as empty.
 * @param string $dir    Target directory (with trailing separator); created if missing.
 * @param string $method HTTP method, e.g. 'GET'.
 * @param string $type   Naming rule: '1' = keep the original file name,
 *                       anything else = random, collision-resistant name.
 * @return void
 */
function curl_downimg_multi($arrs=array(),$dir='',$method='GET',$type='1') {
    if (!is_dir($dir)) {
        mkdir($dir,0777,true);
    }
    if (!is_array($arrs)) {
        $arrs=array();
    }
    $conn=array();
    $file_path=array();
    $fp=array();
    $ok=0; // plain local: a static counter would carry over between calls
    $mh=curl_multi_init();
    foreach ($arrs as $k=>$v) {
        if (!is_string($v) || $v==='') {
            continue;
        }
        $path=parse_url($v,PHP_URL_PATH);
        $path=is_string($path)?$path:'';
        $ext=pathinfo($path,PATHINFO_EXTENSION);
        if ($ext==='') {
            $ext='jpg';
        }
        $name=pathinfo($path,PATHINFO_FILENAME);
        if ($type!='1' || $name==='') {
            $name=sha1(md5(microtime(true).mt_rand(1,100000).mt_rand(1,100000)));
        }
        $target=$dir.$name.'.'.$ext;
        if (in_array($target,$file_path,true)) {
            $target=$dir.$name.'_'.$k.'.'.$ext;
        }
        $stream=fopen($target,'w');
        if ($stream===false) {
            continue; // cannot write this one; skip it rather than abort the batch
        }
        $file_path[$k]=$target;
        $fp[$k]=$stream;
        $conn[$k]=curl_init();
        $arr=array(
            CURLOPT_URL => $v,
            CURLOPT_CUSTOMREQUEST => strtoupper($method),
//            CURLOPT_PROGRESSFUNCTION => 'progressCallback',
//            CURLOPT_NOPROGRESS => 0,
            CURLOPT_HEADER => 0,
            CURLOPT_SSL_VERIFYPEER => 0,
            CURLOPT_SSL_VERIFYHOST => 0,
            CURLOPT_FOLLOWLOCATION => 1,
            CURLOPT_CONNECTTIMEOUT => 60,
            CURLOPT_TIMEOUT => 60,
            CURLOPT_FILE => $fp[$k],
        );
        curl_setopt_array($conn[$k],$arr);
        curl_multi_add_handle($mh,$conn[$k]);
    }
    $active = null;
    do {
        $status=curl_multi_exec($mh, $active);
        while ($done=curl_multi_info_read($mh)) {
            // Transfers finish in arbitrary order, so look up which entry this
            // handle belongs to instead of assuming a running index.
            $k=array_search($done['handle'],$conn,true);
            $info=curl_getinfo($done['handle']);
            curl_multi_remove_handle($mh,$done['handle']);
            curl_close($done['handle']);
            if ($k===false) {
                continue;
            }
            fclose($fp[$k]); // flush and release the file on every outcome
            if ($done['result']!==CURLE_OK || $info['http_code'] != '200') {
                unlink($file_path[$k]);
            } else {
                ++$ok;
            }
            unset($conn[$k],$fp[$k]);
        }
        if ($active > 0 && $status===CURLM_OK) {
            // Wait for socket activity instead of spinning; -1 means nothing to wait on yet.
            if (curl_multi_select($mh,1.0)===-1) {
                usleep(1000);
            }
        }
    } while ($active > 0 && ($status===CURLM_OK || $status===CURLM_CALL_MULTI_PERFORM));
    // If the loop stopped on a multi error, discard whatever did not finish.
    foreach ($conn as $k=>$handle) {
        curl_multi_remove_handle($mh,$handle);
        curl_close($handle);
        fclose($fp[$k]);
        unlink($file_path[$k]);
    }
    curl_multi_close($mh);
    echo 'ok: '.$ok;
}

// ============ css ============

/**
 * Fetch a page and collect the stylesheet URLs (href ending in ".css") of its
 * <link> tags.
 *
 * Each captured value is turned into an absolute URL:
 *   - "//host/x"  gets the page's scheme prepended,
 *   - "/x"        is resolved against the page's scheme://host[:port],
 *   - "scheme:x"  is kept unchanged,
 *   - anything else is resolved against the directory of the page URL.
 * Empty values and duplicates are dropped.
 *
 * NOTE: TLS peer/host verification is disabled, as in the original code.
 *
 * @param string $url Absolute URL of the page to scan.
 * @return string JSON-encoded list of stylesheet URLs on success; on a transfer
 *                failure, the plain-text cURL error message (not JSON).
 */
function curl_css($url='') {
    $ch=curl_init();
    $array=array(
        CURLOPT_URL => $url,
        CURLOPT_ENCODING => 'gzip,deflate',
        CURLOPT_SSL_VERIFYPEER => 0,
        CURLOPT_SSL_VERIFYHOST => 0,
        CURLOPT_RETURNTRANSFER => 1,
        CURLOPT_FOLLOWLOCATION => 1,
        CURLOPT_HTTPHEADER => array(
            'pragma: no-cache',
            'cache-control: no-cache',
            'sec-ch-ua: " Not A;Brand";v="99", "Chromium";v="96", "Google Chrome";v="96"',
            'accept: application/json, text/plain, */*',
            'content-type: application/json',
            'sec-ch-ua-mobile: ?0',
            'user-agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.93 Safari/537.36',
            'sec-ch-ua-platform: "Windows"',
            'sec-fetch-site: same-origin',
            'sec-fetch-mode: cors',
            'sec-fetch-dest: empty',
            'accept-language: zh-CN,zh;q=0.9'
        )
    );
    curl_setopt_array($ch,$array);
    $output=curl_exec($ch);
    if (curl_errno($ch)) {
        // Read the message before closing so the handle is not leaked on failure.
        $error=curl_error($ch);
        curl_close($ch);
        return $error;
    }
    curl_close($ch);

    // Adjust this rule to the target site. The lazy [^>]*? stays inside one tag,
    // so every <link> on a (possibly minified) line is found, not just the last.
    preg_match_all('/<link\b[^>]*?\shref="([^"\s]+\.css)"/i',(string)$output,$arr);
    $items=isset($arr[1])?$arr[1]:array();

    $parts=parse_url($url);
    $scheme=isset($parts['scheme'])?$parts['scheme']:'http';
    $url_main=$scheme.'://'.(isset($parts['host'])?$parts['host']:'');
    if (isset($parts['port'])) {
        $url_main.=':'.$parts['port'];
    }
    // Directory of the page, used as the base for document-relative paths.
    $page_path=isset($parts['path'])?$parts['path']:'/';
    $slash=strrpos($page_path,'/');
    $base_dir=$slash===false?'/':substr($page_path,0,$slash+1);

    $resolve=function($item) use ($scheme,$url_main,$base_dir) {
        $item=trim($item);
        if ($item==='') {
            return '';
        }
        if (substr($item,0,2)==='//') {
            return $scheme.':'.$item;          // protocol-relative
        }
        if ($item[0]==='/') {
            return $url_main.$item;            // root-relative
        }
        if (preg_match('#^[a-z][a-z0-9+.\-]*:#i',$item)) {
            return $item;                      // already has a scheme
        }
        return $url_main.$base_dir.$item;      // document-relative
    };
    $items=array_map($resolve,$items);

    // array_values() re-indexes after filtering so the result always encodes as a JSON list.
    $items=array_values(array_unique(array_filter($items)));
    return json_encode($items,JSON_UNESCAPED_SLASHES|JSON_UNESCAPED_UNICODE);
}

/**
 * Download a single stylesheet into $dir.
 *
 * The file name is derived from the URL path only, so a query string never
 * ends up in the name. If the path has no extension, "css" is used.
 *
 * @param string $url    URL of the stylesheet.
 * @param string $dir    Target directory (with trailing separator); created if missing.
 * @param string $method HTTP method, e.g. 'GET'.
 * @param string $type   Naming rule: '1' = keep the original file name,
 *                       anything else = random, collision-resistant name.
 * @return string 'ok' on success; the cURL error message on a transfer error;
 *                '資料不完整' when the status is not 200 or the saved size differs
 *                from the downloaded size; a short message if the target file
 *                cannot be opened.
 */
function curl_downcss($url='',$dir='',$method='GET',$type='1') {
    if (!is_dir($dir)) {
        mkdir($dir,0777,true);
    }
    $path=parse_url($url,PHP_URL_PATH);
    $path=is_string($path)?$path:'';
    $ext=pathinfo($path,PATHINFO_EXTENSION);
    if ($ext==='') {
        $ext='css';
    }
    $name=pathinfo($path,PATHINFO_FILENAME);
    // Random name when requested, or when the URL offers no usable file name.
    if ($type!='1' || $name==='') {
        $name=sha1(md5(microtime(true).mt_rand(1,100000).mt_rand(1,100000)));
    }
    $file_path=$dir.$name.'.'.$ext;

    $fp=fopen($file_path,'w');
    if ($fp===false) {
        return 'unable to open file for writing: '.$file_path;
    }
    $ch=curl_init();
    $arr=array(
        CURLOPT_URL => $url,
        CURLOPT_CUSTOMREQUEST => strtoupper($method),
//        CURLOPT_PROGRESSFUNCTION => 'progressCallback',
//        CURLOPT_NOPROGRESS => 0,
        CURLOPT_HEADER => 0,
        CURLOPT_SSL_VERIFYPEER => 0,
        CURLOPT_SSL_VERIFYHOST => 0,
        CURLOPT_CONNECTTIMEOUT => 10,
        CURLOPT_FOLLOWLOCATION => 1,
        CURLOPT_FILE => $fp,
    );
    curl_setopt_array($ch,$arr);
    curl_exec($ch);
    $errno=curl_errno($ch);
    $error=curl_error($ch);
    $info=curl_getinfo($ch);
    curl_close($ch);

    // Close (and thereby flush) the file before measuring it; otherwise
    // filesize() can report a stale, smaller size.
    fclose($fp);
    clearstatcache(true,$file_path);
    $size=filesize($file_path);

    if ($errno) {
        // Return the error like the sibling download functions do, instead of
        // dumping the error number and terminating the whole script.
        unlink($file_path);
        return $error;
    }
    if ($info['http_code'] != '200' || $size != $info['size_download']) {
        unlink($file_path);
        return '資料不完整';
    }
    return 'ok';
}

/**
 * Download several stylesheets in parallel into $dir and print "ok: N", where
 * N is the number of files actually saved (transfer succeeded with HTTP 200).
 *
 * File names come from the URL path only (no query string); "css" is used when
 * the path has no extension. If two URLs in the same batch map to the same
 * file name, the later one gets "_<key>" appended so they do not overwrite
 * each other.
 *
 * @param array  $arrs   URLs to download; a non-array value is treated as empty.
 * @param string $dir    Target directory (with trailing separator); created if missing.
 * @param string $method HTTP method, e.g. 'GET'.
 * @param string $type   Naming rule: '1' = keep the original file name,
 *                       anything else = random, collision-resistant name.
 * @return void
 */
function curl_downcss_multi($arrs=array(),$dir='',$method='GET',$type='1') {
    if (!is_dir($dir)) {
        mkdir($dir,0777,true);
    }
    if (!is_array($arrs)) {
        $arrs=array();
    }
    $conn=array();
    $file_path=array();
    $fp=array();
    $ok=0; // plain local: a static counter would carry over between calls
    $mh=curl_multi_init();
    foreach ($arrs as $k=>$v) {
        if (!is_string($v) || $v==='') {
            continue;
        }
        $path=parse_url($v,PHP_URL_PATH);
        $path=is_string($path)?$path:'';
        $ext=pathinfo($path,PATHINFO_EXTENSION);
        if ($ext==='') {
            $ext='css';
        }
        $name=pathinfo($path,PATHINFO_FILENAME);
        if ($type!='1' || $name==='') {
            $name=sha1(md5(microtime(true).mt_rand(1,100000).mt_rand(1,100000)));
        }
        $target=$dir.$name.'.'.$ext;
        if (in_array($target,$file_path,true)) {
            $target=$dir.$name.'_'.$k.'.'.$ext;
        }
        $stream=fopen($target,'w');
        if ($stream===false) {
            continue; // cannot write this one; skip it rather than abort the batch
        }
        $file_path[$k]=$target;
        $fp[$k]=$stream;
        $conn[$k]=curl_init();
        $arr=array(
            CURLOPT_URL => $v,
            CURLOPT_CUSTOMREQUEST => strtoupper($method),
//            CURLOPT_PROGRESSFUNCTION => 'progressCallback',
//            CURLOPT_NOPROGRESS => 0,
            CURLOPT_HEADER => 0,
            CURLOPT_SSL_VERIFYPEER => 0,
            CURLOPT_SSL_VERIFYHOST => 0,
            CURLOPT_FOLLOWLOCATION => 1,
            CURLOPT_CONNECTTIMEOUT => 60,
            CURLOPT_TIMEOUT => 60,
            CURLOPT_FILE => $fp[$k],
        );
        curl_setopt_array($conn[$k],$arr);
        curl_multi_add_handle($mh,$conn[$k]);
    }
    $active = null;
    do {
        $status=curl_multi_exec($mh, $active);
        while ($done=curl_multi_info_read($mh)) {
            // Transfers finish in arbitrary order, so look up which entry this
            // handle belongs to instead of assuming a running index.
            $k=array_search($done['handle'],$conn,true);
            $info=curl_getinfo($done['handle']);
            curl_multi_remove_handle($mh,$done['handle']);
            curl_close($done['handle']);
            if ($k===false) {
                continue;
            }
            fclose($fp[$k]); // flush and release the file on every outcome
            if ($done['result']!==CURLE_OK || $info['http_code'] != '200') {
                unlink($file_path[$k]);
            } else {
                ++$ok;
            }
            unset($conn[$k],$fp[$k]);
        }
        if ($active > 0 && $status===CURLM_OK) {
            // Wait for socket activity instead of spinning; -1 means nothing to wait on yet.
            if (curl_multi_select($mh,1.0)===-1) {
                usleep(1000);
            }
        }
    } while ($active > 0 && ($status===CURLM_OK || $status===CURLM_CALL_MULTI_PERFORM));
    // If the loop stopped on a multi error, discard whatever did not finish.
    foreach ($conn as $k=>$handle) {
        curl_multi_remove_handle($mh,$handle);
        curl_close($handle);
        fclose($fp[$k]);
        unlink($file_path[$k]);
    }
    curl_multi_close($mh);
    echo 'ok: '.$ok;
}

// ============ js ============

/**
 * Fetch a page and collect the src URLs of its <script> tags.
 *
 * Every non-empty src is collected, whether or not it ends in ".js" (the
 * original pattern's ".js" part was optional, so it did the same).
 *
 * Each captured value is turned into an absolute URL:
 *   - "//host/x"  gets the page's scheme prepended,
 *   - "/x"        is resolved against the page's scheme://host[:port],
 *   - "scheme:x"  is kept unchanged,
 *   - anything else is resolved against the directory of the page URL.
 * Empty values and duplicates are dropped.
 *
 * NOTE: TLS peer/host verification is disabled, as in the original code.
 *
 * @param string $url Absolute URL of the page to scan.
 * @return string JSON-encoded list of script URLs on success; on a transfer
 *                failure, the plain-text cURL error message (not JSON).
 */
function curl_js($url='') {
    $ch=curl_init();
    $array=array(
        CURLOPT_URL => $url,
        CURLOPT_ENCODING => 'gzip,deflate',
        CURLOPT_SSL_VERIFYPEER => 0,
        CURLOPT_SSL_VERIFYHOST => 0,
        CURLOPT_RETURNTRANSFER => 1,
        CURLOPT_FOLLOWLOCATION => 1,
        CURLOPT_HTTPHEADER => array(
            'pragma: no-cache',
            'cache-control: no-cache',
            'sec-ch-ua: " Not A;Brand";v="99", "Chromium";v="96", "Google Chrome";v="96"',
            'accept: application/json, text/plain, */*',
            'content-type: application/json',
            'sec-ch-ua-mobile: ?0',
            'user-agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.93 Safari/537.36',
            'sec-ch-ua-platform: "Windows"',
            'sec-fetch-site: same-origin',
            'sec-fetch-mode: cors',
            'sec-fetch-dest: empty',
            'accept-language: zh-CN,zh;q=0.9'
        )
    );
    curl_setopt_array($ch,$array);
    $output=curl_exec($ch);
    if (curl_errno($ch)) {
        // Read the message before closing so the handle is not leaked on failure.
        $error=curl_error($ch);
        curl_close($ch);
        return $error;
    }
    curl_close($ch);

    // Adjust this rule to the target site. The lazy [^>]*? stays inside one tag
    // and [^"\s]+ stops at the closing quote, so every <script> on a (possibly
    // minified) line yields exactly its own src.
    preg_match_all('/<script\b[^>]*?\ssrc="([^"\s]+)"/i',(string)$output,$arr);
    $items=isset($arr[1])?$arr[1]:array();

    $parts=parse_url($url);
    $scheme=isset($parts['scheme'])?$parts['scheme']:'http';
    $url_main=$scheme.'://'.(isset($parts['host'])?$parts['host']:'');
    if (isset($parts['port'])) {
        $url_main.=':'.$parts['port'];
    }
    // Directory of the page, used as the base for document-relative paths.
    $page_path=isset($parts['path'])?$parts['path']:'/';
    $slash=strrpos($page_path,'/');
    $base_dir=$slash===false?'/':substr($page_path,0,$slash+1);

    $resolve=function($item) use ($scheme,$url_main,$base_dir) {
        $item=trim($item);
        if ($item==='') {
            return '';
        }
        if (substr($item,0,2)==='//') {
            return $scheme.':'.$item;          // protocol-relative
        }
        if ($item[0]==='/') {
            return $url_main.$item;            // root-relative
        }
        if (preg_match('#^[a-z][a-z0-9+.\-]*:#i',$item)) {
            return $item;                      // already has a scheme
        }
        return $url_main.$base_dir.$item;      // document-relative
    };
    $items=array_map($resolve,$items);

    // array_values() re-indexes after filtering so the result always encodes as a JSON list.
    $items=array_values(array_unique(array_filter($items)));
    return json_encode($items,JSON_UNESCAPED_SLASHES|JSON_UNESCAPED_UNICODE);
}

/**
 * Download a single script into $dir.
 *
 * The file name is derived from the URL path only, so a query string never
 * ends up in the name. The saved file always ends in ".js": a path that
 * already ends in ".js" keeps its name, any other name gets ".js" appended
 * (the original produced "name." with a dangling dot in that case).
 *
 * @param string $url    URL of the script.
 * @param string $dir    Target directory (with trailing separator); created if missing.
 * @param string $method HTTP method, e.g. 'GET'.
 * @param string $type   Naming rule: '1' = keep the original file name,
 *                       anything else = random, collision-resistant name.
 * @return string 'ok' on success; the cURL error message on a transfer error;
 *                '資料不完整' when the status is not 200 or the saved size differs
 *                from the downloaded size; a short message if the target file
 *                cannot be opened.
 */
function curl_downjs($url='',$dir='',$method='GET',$type='1') {
    if (!is_dir($dir)) {
        mkdir($dir,0777,true);
    }
    $path=parse_url($url,PHP_URL_PATH);
    $base=is_string($path)?basename($path):'';
    if ($type!='1' || $base==='') {
        // Random name when requested, or when the URL offers no usable file name.
        $base=sha1(md5(microtime(true).mt_rand(1,100000).mt_rand(1,100000))).'.js';
    } elseif (strtolower(substr($base,-3))!=='.js') {
        $base.='.js';
    }
    $file_path=$dir.$base;

    $fp=fopen($file_path,'w');
    if ($fp===false) {
        return 'unable to open file for writing: '.$file_path;
    }
    $ch=curl_init();
    $arr=array(
        CURLOPT_URL => $url,
        CURLOPT_CUSTOMREQUEST => strtoupper($method),
//        CURLOPT_PROGRESSFUNCTION => 'progressCallback',
//        CURLOPT_NOPROGRESS => 0,
        CURLOPT_HEADER => 0,
        CURLOPT_SSL_VERIFYPEER => 0,
        CURLOPT_SSL_VERIFYHOST => 0,
        CURLOPT_CONNECTTIMEOUT => 10,
        CURLOPT_FOLLOWLOCATION => 1,
        CURLOPT_FILE => $fp,
    );
    curl_setopt_array($ch,$arr);
    // The body is streamed straight into $fp. A charset conversion (e.g.
    // GB2312 -> UTF-8 via mb_convert_encoding) would have to be applied to the
    // saved file afterwards, not to curl_exec()'s return value.
    curl_exec($ch);
    $errno=curl_errno($ch);
    $error=curl_error($ch);
    $info=curl_getinfo($ch);
    curl_close($ch);

    // Close (and thereby flush) the file before measuring it; otherwise
    // filesize() can report a stale, smaller size.
    fclose($fp);
    clearstatcache(true,$file_path);
    $size=filesize($file_path);

    if ($errno) {
        unlink($file_path);
        return $error;
    }
    if ($info['http_code'] != '200' || $size != $info['size_download']) {
        unlink($file_path);
        return '資料不完整';
    }
    return 'ok';
}

/**
 * Download several scripts in parallel into $dir and print "ok: N", where N is
 * the number of files actually saved (transfer succeeded with HTTP 200).
 *
 * File names come from the URL path only (no query string) and always end in
 * ".js": a path already ending in ".js" keeps its name, any other name gets
 * ".js" appended. If two URLs in the same batch map to the same file name, the
 * later one gets "<key>_" prepended so they do not overwrite each other.
 *
 * @param array  $arrs   URLs to download; a non-array value is treated as empty.
 * @param string $dir    Target directory (with trailing separator); created if missing.
 * @param string $method HTTP method, e.g. 'GET'.
 * @param string $type   Naming rule: '1' = keep the original file name,
 *                       anything else = random, collision-resistant name.
 * @return void
 */
function curl_downjs_multi($arrs=array(),$dir='',$method='GET',$type='1') {
    if (!is_dir($dir)) {
        mkdir($dir,0777,true);
    }
    if (!is_array($arrs)) {
        $arrs=array();
    }
    $conn=array();
    $file_path=array();
    $fp=array();
    $ok=0; // plain local: a static counter would carry over between calls
    $mh=curl_multi_init();
    foreach ($arrs as $k=>$v) {
        if (!is_string($v) || $v==='') {
            continue;
        }
        $path=parse_url($v,PHP_URL_PATH);
        $base=is_string($path)?basename($path):'';
        if ($type!='1' || $base==='') {
            $base=sha1(md5(microtime(true).mt_rand(1,100000).mt_rand(1,100000))).'.js';
        } elseif (strtolower(substr($base,-3))!=='.js') {
            $base.='.js';
        }
        $target=$dir.$base;
        if (in_array($target,$file_path,true)) {
            $target=$dir.$k.'_'.$base;
        }
        $stream=fopen($target,'w');
        if ($stream===false) {
            continue; // cannot write this one; skip it rather than abort the batch
        }
        $file_path[$k]=$target;
        $fp[$k]=$stream;
        $conn[$k]=curl_init();
        $arr=array(
            CURLOPT_URL => $v,
            CURLOPT_CUSTOMREQUEST => strtoupper($method),
//            CURLOPT_PROGRESSFUNCTION => 'progressCallback',
//            CURLOPT_NOPROGRESS => 0,
            CURLOPT_HEADER => 0,
            CURLOPT_SSL_VERIFYPEER => 0,
            CURLOPT_SSL_VERIFYHOST => 0,
            CURLOPT_FOLLOWLOCATION => 1,
            CURLOPT_CONNECTTIMEOUT => 60,
            CURLOPT_TIMEOUT => 60,
            CURLOPT_FILE => $fp[$k],
        );
        curl_setopt_array($conn[$k],$arr);
        curl_multi_add_handle($mh,$conn[$k]);
    }
    $active = null;
    do {
        $status=curl_multi_exec($mh, $active);
        while ($done=curl_multi_info_read($mh)) {
            // Transfers finish in arbitrary order, so look up which entry this
            // handle belongs to instead of assuming a running index.
            $k=array_search($done['handle'],$conn,true);
            $info=curl_getinfo($done['handle']);
            curl_multi_remove_handle($mh,$done['handle']);
            curl_close($done['handle']);
            if ($k===false) {
                continue;
            }
            fclose($fp[$k]); // flush and release the file on every outcome
            if ($done['result']!==CURLE_OK || $info['http_code'] != '200') {
                unlink($file_path[$k]);
            } else {
                ++$ok;
            }
            unset($conn[$k],$fp[$k]);
        }
        if ($active > 0 && $status===CURLM_OK) {
            // Wait for socket activity instead of spinning; -1 means nothing to wait on yet.
            if (curl_multi_select($mh,1.0)===-1) {
                usleep(1000);
            }
        }
    } while ($active > 0 && ($status===CURLM_OK || $status===CURLM_CALL_MULTI_PERFORM));
    // If the loop stopped on a multi error, discard whatever did not finish.
    foreach ($conn as $k=>$handle) {
        curl_multi_remove_handle($mh,$handle);
        curl_close($handle);
        fclose($fp[$k]);
        unlink($file_path[$k]);
    }
    curl_multi_close($mh);
    echo 'ok: '.$ok;
}