php curl 抓一個頁面,包含 imgs、css、js
阿新 • • 發佈:2022-04-11
宣告
本分享純屬為了技術分享,禁止商用!!!禁止商用!!!禁止商用!!!
未經本人允許,如有發現,違者必究!!!
呼叫
// Driver script: saves the HTML of $url and then downloads the images,
// stylesheets and scripts referenced by that page into per-type folders.
ini_set('max_execution_time', 20);

$url = 'http://xxx.cn/';
$html_name = 'index'; // name of the saved HTML file (without extension)
$type = 'pc';         // 'pc' = desktop version, 'm' = mobile version

// Everything is stored below <script dir>/<host>/.
$dir = dirname(__FILE__).DIRECTORY_SEPARATOR.parse_url($url)['host'].DIRECTORY_SEPARATOR;
$static_root = ($type == 'm') ? 'statics_m' : 'statics';
$static = $static_root.DIRECTORY_SEPARATOR.$html_name.DIRECTORY_SEPARATOR;

// Save the raw page first; nothing else makes sense without it.
$fh = file_get_contents($url);
if (!$fh) {
    die('內容為空');
}
if (!is_dir($dir) && !mkdir($dir, 0777, true) && !is_dir($dir)) {
    // Same message as the original "or die" chain used for every failure.
    die('內容為空');
}
if (!file_put_contents($dir.$html_name.'.html', $fh)) {
    die('內容為空');
}

$dir_imgs = $dir.$static.'imgs'.DIRECTORY_SEPARATOR;
$dir_css = $dir.$static.'css'.DIRECTORY_SEPARATOR;
$dir_js = $dir.$static.'js'.DIRECTORY_SEPARATOR;

// The extractors return a JSON list on success but a plain cURL error string
// on failure; json_decode() turns the latter into null. Normalise to an array
// so the batch downloaders never receive a non-iterable value.
$decode_list = function ($json) {
    $list = json_decode($json, true);
    return is_array($list) ? $list : array();
};

$re_imgs = $decode_list(curl_img($url));
$re_css = $decode_list(curl_css($url));
$re_js = $decode_list(curl_js($url));

// Batch download
curl_downimg_multi($re_imgs, $dir_imgs, 'GET', '1');
curl_downcss_multi($re_css, $dir_css, 'GET', '1');
curl_downjs_multi($re_js, $dir_js, 'GET', '1');
方法
// ============ shared helpers ============

/**
 * Browser-like request headers sent when fetching the page that is scanned
 * for asset URLs.
 *
 * @return array list of raw "name: value" header lines
 */
function _curl_page_headers()
{
    return [
        'pragma: no-cache',
        'cache-control: no-cache',
        'sec-ch-ua: " Not A;Brand";v="99", "Chromium";v="96", "Google Chrome";v="96"',
        'accept: application/json, text/plain, */*',
        'content-type: application/json',
        'sec-ch-ua-mobile: ?0',
        'user-agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.93 Safari/537.36',
        'sec-ch-ua-platform: "Windows"',
        'sec-fetch-site: same-origin',
        'sec-fetch-mode: cors',
        'sec-fetch-dest: empty',
        'accept-language: zh-CN,zh;q=0.9',
    ];
}

/**
 * Fetch the body of a page with cURL.
 *
 * @param string $url
 * @return array [body, error]; error is null on success, otherwise the curl_error() text
 */
function _curl_fetch_page($url)
{
    $ch = curl_init();
    curl_setopt_array($ch, [
        CURLOPT_URL => $url,
        CURLOPT_ENCODING => 'gzip,deflate',
        // NOTE(review): TLS verification is switched off, exactly as before.
        CURLOPT_SSL_VERIFYPEER => 0,
        CURLOPT_SSL_VERIFYHOST => 0,
        CURLOPT_RETURNTRANSFER => 1,
        CURLOPT_FOLLOWLOCATION => 1,
        CURLOPT_HTTPHEADER => _curl_page_headers(),
    ]);
    $body = curl_exec($ch);
    $error = curl_errno($ch) ? curl_error($ch) : null;
    // Close on every path; the handle used to leak when the transfer failed.
    curl_close($ch);

    return [is_string($body) ? $body : '', $error];
}

/**
 * Split a page URL into the pieces needed to resolve references found in it.
 *
 * @param string $url
 * @return array keys: scheme, origin ("scheme://host[:port]" or '' without host), dir (path up to the last "/")
 */
function _curl_url_base($url)
{
    $parts = parse_url($url);
    if (!is_array($parts)) {
        $parts = [];
    }
    $scheme = isset($parts['scheme']) ? $parts['scheme'] : 'http';
    $origin = '';
    if (isset($parts['host'])) {
        $origin = $scheme.'://'.$parts['host'].(isset($parts['port']) ? ':'.$parts['port'] : '');
    }
    $path = isset($parts['path']) ? $parts['path'] : '/';
    $slash = strrpos($path, '/');

    return [
        'scheme' => $scheme,
        'origin' => $origin,
        'dir' => $slash === false ? '/' : substr($path, 0, $slash + 1),
    ];
}

/**
 * Turn a reference found in the page into an absolute URL.
 *
 * @param string $item raw attribute value
 * @param array  $base result of _curl_url_base()
 * @return string absolute URL, or '' for an empty reference
 */
function _curl_absolute_url($item, array $base)
{
    $item = trim($item);
    if ($item === '') {
        return '';
    }
    if (substr($item, 0, 2) === '//') {
        // Protocol-relative: inherit the page's scheme instead of dropping it.
        return $base['scheme'].':'.$item;
    }
    if (preg_match('#^[a-z][a-z0-9+.\-]*:#i', $item)) {
        return $item; // already carries a scheme
    }
    if ($base['origin'] === '') {
        return $item; // no host to resolve against
    }
    if ($item[0] === '/') {
        return $base['origin'].$item;
    }

    // Path relative to the directory of the page.
    return $base['origin'].$base['dir'].$item;
}

/**
 * Fetch a page and return the unique absolute URLs captured by group 1 of $pattern.
 *
 * @param string $url
 * @param string $pattern PCRE pattern whose first capture group is the URL
 * @return string JSON list of URLs, or the cURL error text when the fetch failed
 */
function _curl_collect_urls($url, $pattern)
{
    list($html, $error) = _curl_fetch_page($url);
    if ($error !== null) {
        return $error;
    }

    $items = [];
    if (preg_match_all($pattern, $html, $matches)) {
        $base = _curl_url_base($url);
        foreach ($matches[1] as $raw) {
            $absolute = _curl_absolute_url($raw, $base);
            if ($absolute !== '') {
                $items[] = $absolute;
            }
        }
    }

    // array_values() keeps the result a JSON list even after duplicates are removed.
    return json_encode(array_values(array_unique($items)), JSON_UNESCAPED_SLASHES | JSON_UNESCAPED_UNICODE);
}

/**
 * Create $dir (recursively) when it does not exist yet.
 *
 * @param string $dir '' means "current directory" and is left alone
 * @return bool whether the directory is usable
 */
function _curl_ensure_dir($dir)
{
    if ($dir === '' || is_dir($dir)) {
        return true;
    }

    return mkdir($dir, 0777, true) || is_dir($dir);
}

/**
 * Build the local file path for a download.
 *
 * The name and extension come from the URL *path*, so query strings never end
 * up in file names.
 *
 * @param string      $url
 * @param string      $dir        target directory, including the trailing separator
 * @param string      $type       naming rule: 1 = original file name, anything else = random name
 * @param string      $defaultExt extension used when the URL has none
 * @param string|null $forceExt   when not null, always use this extension
 * @return string
 */
function _curl_target_path($url, $dir, $type, $defaultExt, $forceExt)
{
    $path = parse_url($url, PHP_URL_PATH);
    $info = pathinfo(is_string($path) ? $path : '');

    $ext = (isset($info['extension']) && $info['extension'] !== '') ? $info['extension'] : $defaultExt;
    if ($forceExt !== null) {
        $ext = $forceExt;
    }

    $name = isset($info['filename']) ? $info['filename'] : '';
    if ($type != '1' || $name === '') {
        $name = sha1(md5(microtime(true).mt_rand(1, 100000).mt_rand(1, 100000)));
    }

    return $dir.$name.'.'.$ext;
}

/**
 * cURL options shared by the single and the batch downloaders.
 *
 * @param string   $url
 * @param string   $method         HTTP verb
 * @param resource $fp             open, writable file handle receiving the body
 * @param int      $connectTimeout seconds
 * @param int|null $timeout        total transfer timeout in seconds, null = none
 * @return array
 */
function _curl_download_options($url, $method, $fp, $connectTimeout, $timeout)
{
    $options = [
        CURLOPT_URL => $url,
        CURLOPT_CUSTOMREQUEST => strtoupper($method),
        CURLOPT_HEADER => 0,
        // NOTE(review): TLS verification is switched off, exactly as before.
        CURLOPT_SSL_VERIFYPEER => 0,
        CURLOPT_SSL_VERIFYHOST => 0,
        CURLOPT_CONNECTTIMEOUT => $connectTimeout,
        CURLOPT_FOLLOWLOCATION => 1,
        CURLOPT_FILE => $fp,
    ];
    if ($timeout !== null) {
        $options[CURLOPT_TIMEOUT] = $timeout;
    }

    return $options;
}

/**
 * Download one URL into $dir.
 *
 * @param string      $url
 * @param string      $dir
 * @param string      $method
 * @param string      $type       naming rule: 1 = original file name, 2 = random name
 * @param string      $defaultExt
 * @param string|null $forceExt
 * @return string 'ok', a cURL error text, or '資料不完整' for a non-200 / truncated response
 */
function _curl_download_one($url, $dir, $method, $type, $defaultExt, $forceExt)
{
    _curl_ensure_dir($dir);
    $file_path = _curl_target_path($url, $dir, $type, $defaultExt, $forceExt);

    $fp = fopen($file_path, 'w');
    if ($fp === false) {
        return 'cannot open '.$file_path.' for writing';
    }

    $ch = curl_init();
    curl_setopt_array($ch, _curl_download_options($url, $method, $fp, 10, null));
    curl_exec($ch);
    $errno = curl_errno($ch);
    $error = curl_error($ch);
    $info = curl_getinfo($ch);
    curl_close($ch);

    // Close before measuring so the size on disk is final, and drop any cached stat.
    fclose($fp);
    clearstatcache(true, $file_path);

    if ($errno) {
        if (is_file($file_path)) {
            unlink($file_path);
        }
        return $error;
    }
    if ($info['http_code'] != '200' || filesize($file_path) != $info['size_download']) {
        if (is_file($file_path)) {
            unlink($file_path);
        }
        return '資料不完整';
    }

    return 'ok';
}

/**
 * Download many URLs in parallel into $dir and print "ok: <n>", where <n> is
 * the number of files that were fetched with HTTP 200 during THIS call.
 *
 * @param array       $arrs       list of URLs; anything that is not an array counts as empty
 * @param string      $dir
 * @param string      $method
 * @param string      $type       naming rule: 1 = original file name, 2 = random name
 * @param string      $defaultExt
 * @param string|null $forceExt
 * @return void
 */
function _curl_download_many($arrs, $dir, $method, $type, $defaultExt, $forceExt)
{
    _curl_ensure_dir($dir);

    if (!is_array($arrs) || count($arrs) === 0) {
        echo 'ok: 0';
        return;
    }

    $mh = curl_multi_init();
    $jobs = [];
    foreach ($arrs as $v) {
        if (!is_string($v) || trim($v) === '') {
            continue;
        }
        $v = trim($v);
        $path = _curl_target_path($v, $dir, $type, $defaultExt, $forceExt);
        $fp = fopen($path, 'w');
        if ($fp === false) {
            continue;
        }

        $id = (string) count($jobs);
        $ch = curl_init();
        curl_setopt_array($ch, _curl_download_options($v, $method, $fp, 60, 60));
        // Tag the handle so a finished transfer can be matched to its file:
        // completions arrive in arbitrary order, not in the order they were added.
        curl_setopt($ch, CURLOPT_PRIVATE, $id);
        curl_multi_add_handle($mh, $ch);
        $jobs[$id] = ['handle' => $ch, 'fp' => $fp, 'path' => $path];
    }

    $ok = 0;
    $active = null;
    do {
        $status = curl_multi_exec($mh, $active);

        while ($done = curl_multi_info_read($mh)) {
            $handle = $done['handle'];
            $id = curl_getinfo($handle, CURLINFO_PRIVATE);
            $http_code = curl_getinfo($handle, CURLINFO_HTTP_CODE);
            curl_multi_remove_handle($mh, $handle);
            curl_close($handle);

            if (!isset($jobs[$id])) {
                continue;
            }
            $job = $jobs[$id];
            unset($jobs[$id]);
            fclose($job['fp']);

            if ($done['result'] === CURLE_OK && $http_code == 200) {
                ++$ok;
            } elseif (is_file($job['path'])) {
                unlink($job['path']);
            }
        }

        // Wait for socket activity instead of spinning; -1 means "nothing to wait on".
        if ($active > 0 && curl_multi_select($mh, 1.0) === -1) {
            usleep(1000);
        }
    } while ($active > 0 && ($status === CURLM_OK || $status === CURLM_CALL_MULTI_PERFORM));

    // Only reached with leftovers when the multi loop aborted early: treat them as failed.
    foreach ($jobs as $job) {
        curl_multi_remove_handle($mh, $job['handle']);
        curl_close($job['handle']);
        fclose($job['fp']);
        if (is_file($job['path'])) {
            unlink($job['path']);
        }
    }
    curl_multi_close($mh);

    echo 'ok: '.$ok;
}

// ============ imgs ============

/**
 * Collect the image URLs (data-src="..." attributes) of a page.
 *
 * @param string $url page to scan
 * @return string JSON list of absolute URLs, or the cURL error text (not JSON) when the page could not be fetched
 */
function curl_img($url='')
{
    // Adjust the rule to the markup of the page being scanned.
    return _curl_collect_urls($url, '/data-src="([^"]*)"/');
}

/**
 * Download one image.
 *
 * @param $url
 * @param $dir
 * @param $method
 * @param $type naming rule: 1 = original file name, 2 = random name
 * @return string 'ok', a cURL error text, or '資料不完整'
 */
function curl_downimg($url='',$dir='',$method='GET',$type='1')
{
    return _curl_download_one($url, $dir, $method, $type, 'jpg', null);
}

/**
 * Download many images in parallel; prints "ok: <n>".
 *
 * @param $arrs
 * @param $dir
 * @param $method
 * @param $type naming rule: 1 = original file name, 2 = random name
 * @return void
 */
function curl_downimg_multi($arrs=array(),$dir='',$method='GET',$type='1')
{
    _curl_download_many($arrs, $dir, $method, $type, 'jpg', null);
}

// ============ css ============

/**
 * Collect the stylesheet URLs (<link ... href="*.css">) of a page.
 *
 * @param string $url page to scan
 * @return string JSON list of absolute URLs, or the cURL error text (not JSON) when the page could not be fetched
 */
function curl_css($url='')
{
    // Adjust the rule to the markup of the page being scanned.
    return _curl_collect_urls($url, '/<link\b[^>]*?\shref="([^"\s]+\.css(?:[?#][^"\s]*)?)"/i');
}

/**
 * Download one stylesheet.
 *
 * @param $url
 * @param $dir
 * @param $method
 * @param $type naming rule: 1 = original file name, 2 = random name
 * @return string 'ok', a cURL error text, or '資料不完整'
 */
function curl_downcss($url='',$dir='',$method='GET',$type='1')
{
    return _curl_download_one($url, $dir, $method, $type, 'css', null);
}

/**
 * Download many stylesheets in parallel; prints "ok: <n>".
 *
 * @param $arrs
 * @param $dir
 * @param $method
 * @param $type naming rule: 1 = original file name, 2 = random name
 * @return void
 */
function curl_downcss_multi($arrs=array(),$dir='',$method='GET',$type='1')
{
    _curl_download_many($arrs, $dir, $method, $type, 'css', null);
}

// ============ js ============

/**
 * Collect the script URLs (<script ... src="...">) of a page.
 *
 * @param string $url page to scan
 * @return string JSON list of absolute URLs, or the cURL error text (not JSON) when the page could not be fetched
 */
function curl_js($url='')
{
    // Adjust the rule to the markup of the page being scanned.
    return _curl_collect_urls($url, '/<script\b[^>]*?\ssrc="([^"\s]+)"/i');
}

/**
 * Download one script; the local file always gets a ".js" extension.
 *
 * @param $url
 * @param $dir
 * @param $method
 * @param $type naming rule: 1 = original file name, 2 = random name
 * @return string 'ok', a cURL error text, or '資料不完整'
 */
function curl_downjs($url='',$dir='',$method='GET',$type='1')
{
    return _curl_download_one($url, $dir, $method, $type, 'js', 'js');
}

/**
 * Download many scripts in parallel; prints "ok: <n>". Local files always get
 * a ".js" extension.
 *
 * @param $arrs
 * @param $dir
 * @param $method
 * @param $type naming rule: 1 = original file name, 2 = random name
 * @return void
 */
function curl_downjs_multi($arrs=array(),$dir='',$method='GET',$type='1')
{
    _curl_download_many($arrs, $dir, $method, $type, 'js', 'js');
}