使用 PHP QueryList 爬取 CSDN 文章
阿新 • • 發佈:2018-11-30
1、使用 Composer 安裝 QueryList 4
在專案目錄下建立composer.json檔案:
{
"require": {
"jaeger/querylist": "^4.0"
}
}
命令列執行命令:
composer install
就可以得到vendor資料夾,檔案結構如下圖:
2、建立方法,採集、儲存資料
<?php
/**
 * @Filename: index.php
 * @desc: Data collection. Searches CSDN for a keyword, follows the blog
 *        article links in the result list and stores each article that is
 *        not yet present in the `ay_content` table.
 */
require('autoload.php');

use QL\QueryList;

set_time_limit(0);
header("Content-type:text/html;charset=utf-8");

class catchInfo
{
    /** Maximum number of new articles stored by one searchdata() run. */
    const BATCH_SIZE = 10;

    /** Upper bound on result pages visited by one searchdata() run. */
    const MAX_PAGES = 50;

    private $conn;
    private $host = '127.0.0.1';
    private $password = 'root';
    private $username = 'root';
    private $dbname = 'test';
    private $url = 'http://so.csdn.net/so/search/s.do?';
    // Not referenced by any method in this file.
    private $path = '../images/';

    /**
     * Opens the MySQL connection used by every other method.
     *
     * @throws RuntimeException when the connection cannot be established.
     */
    public function __construct()
    {
        $this->conn = mysqli_connect($this->host, $this->username, $this->password, $this->dbname);
        if (!$this->conn) {
            throw new RuntimeException('Database connection failed: ' . mysqli_connect_error());
        }
    }

    /**
     * Collects up to BATCH_SIZE new articles, starting at the given result page
     * and moving on to the following pages while the batch is not full.
     *
     * @param string     $keywords Search keyword; falls back to
     *                             $_REQUEST['keywords'], then to 'php'.
     * @param int|string $page     First result page; falls back to
     *                             $_REQUEST['page'], then to 1.
     * @return void
     */
    public function searchdata($keywords, $page)
    {
        if (empty($keywords)) {
            $keywords = !empty($_REQUEST['keywords']) ? $_REQUEST['keywords'] : 'php';
        }
        if (empty($page)) {
            $page = !empty($_REQUEST['page']) ? $_REQUEST['page'] : 1;
        }
        $page = max(1, (int) $page);

        phpQuery::$defaultCharset = "utf-8";

        // A loop with a running total replaces the original recursion, whose
        // counter restarted on every page and which never terminated once the
        // search ran out of results.
        $saved = 0;
        for ($visited = 0; $visited < self::MAX_PAGES && $saved < self::BATCH_SIZE; $visited++, $page++) {
            $rows = $this->fetchSearchRows($keywords, $page);
            if (empty($rows)) {
                // No links at all on this page: stop instead of paging forever.
                break;
            }

            foreach ($rows as $row) {
                $articleUrl = isset($row['url']) ? (string) $row['url'] : '';
                if (!$this->isBlogArticle($articleUrl) || $this->isStored($articleUrl)) {
                    continue;
                }
                if ($this->storeArticle($articleUrl, $page)) {
                    $saved++;
                }
                if ($saved >= self::BATCH_SIZE) {
                    // Return normally (not exit) so the page below still renders.
                    return;
                }
            }
        }
    }

    /**
     * Echoes the stored content of the row with id = 1, HTML-decoded.
     *
     * @return void
     */
    public function show_aritcle()
    {
        $sql = "select content from ay_content where id = 1";
        $res = mysqli_query($this->conn, $sql);
        $row = $res ? mysqli_fetch_row($res) : null;
        // Dump an empty string when the row is missing instead of indexing null.
        var_dump(htmlspecialchars_decode($row ? (string) $row[0] : ''));
    }

    /**
     * Downloads one search result page and extracts the result links.
     *
     * @param string $keywords Search keyword.
     * @param int    $page     Result page number.
     * @return array Rows of the form ['url' => href].
     */
    private function fetchSearchRows($keywords, $page)
    {
        // http_build_query() URL-encodes the keyword (spaces, '&', non-ASCII).
        $searchUrl = $this->url . http_build_query(array('p' => $page, 'q' => $keywords));
        $rule = array(
            "url" => array('.search-link a', 'href'),
        );

        $rows = QueryList::get($searchUrl)->rules($rule)->queryData();

        return is_array($rows) ? $rows : array();
    }

    /**
     * Tells whether a link points at a CSDN blog article.
     *
     * @param string $url Link taken from the search result list.
     * @return bool
     */
    private function isBlogArticle($url)
    {
        // strpos() returns 0 for a match at the start, so compare against false.
        return strpos($url, 'blog.csdn') !== false
            && strpos($url, 'article/details') !== false;
    }

    /**
     * Tells whether an article URL has already been stored.
     *
     * @param string $articleUrl Article URL.
     * @return bool
     * @throws RuntimeException when the statement cannot be prepared.
     */
    private function isStored($articleUrl)
    {
        $stmt = mysqli_prepare($this->conn, 'select id from ay_content where outlink = ?');
        if (!$stmt) {
            throw new RuntimeException('Cannot prepare lookup: ' . mysqli_error($this->conn));
        }
        mysqli_stmt_bind_param($stmt, 's', $articleUrl);
        mysqli_stmt_execute($stmt);
        mysqli_stmt_store_result($stmt);
        $exists = mysqli_stmt_num_rows($stmt) > 0;
        mysqli_stmt_close($stmt);

        return $exists;
    }

    /**
     * Downloads one article and inserts it into ay_content.
     *
     * @param string $articleUrl Article URL, stored in `outlink`.
     * @param int    $page       Result page it was found on, stored in `sorting`.
     * @return bool True when the row was inserted.
     * @throws RuntimeException when the statement cannot be prepared.
     */
    private function storeArticle($articleUrl, $page)
    {
        $source = QueryList::get($articleUrl);
        $title = (string) $source->find('.article-title-box h1')->text();
        $author = (string) $source->find('.article-bar-top a')->text();
        // Stored HTML-encoded; show_aritcle() decodes it again. Bound parameters
        // replace addslashes(), so the stored value is unchanged.
        $content = htmlspecialchars((string) $source->find('.blog-content-box article')->html());

        $stmt = mysqli_prepare(
            $this->conn,
            'insert into ay_content(title,author,content,outlink,sorting) values(?,?,?,?,?)'
        );
        if (!$stmt) {
            throw new RuntimeException('Cannot prepare insert: ' . mysqli_error($this->conn));
        }
        mysqli_stmt_bind_param($stmt, 'ssssi', $title, $author, $content, $articleUrl, $page);
        $inserted = mysqli_stmt_execute($stmt);
        mysqli_stmt_close($stmt);

        return (bool) $inserted;
    }
}

$type = $_POST['type'] ?? '';
$page = $_POST['page'] ?? '';
$keywords = $_POST['keywords'] ?? '';

// Only these public actions may be selected by the request; an unchecked
// $obj->$type() would let the client invoke any method of the class.
$allowedActions = array('searchdata', 'show_aritcle');
if (is_string($type) && in_array($type, $allowedActions, true)) {
    $obj = new catchInfo();
    $obj->$type($keywords, $page);
}
?>
<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <title>資料採集</title>
</head>
<body>
<form action="" method="post">
    <input type="hidden" name="type" value="searchdata">
    關鍵詞:<input type="text" name="keywords">
    頁碼:<input type="number" name="page">
    <input type="submit" value="提交">
</form>
</body>
</html>
3、頁面效果
4、採集結果
-
宣告:本人所採集CSDN文章僅為學習用途,並未用於任何盈利性商業目的。
-
說明:採集過程中仍有一些不盡如人意的地方。我曾嘗試多種辦法把文章中的圖片下載下來,再以本地圖片位址替換原連結,但都失敗了:先是用 str_replace() 函式,將文章中的圖片連結替換為本地圖片儲存的相對路徑,沒有成功;後來改用正則匹配圖片連結,卻只能匹配到部分連結,因為文章中的圖片也可能來自第三方網址。若有朋友能解釋 str_replace() 無法替換的原因,或者提供一個合適的正則表示式,在下就先行謝過了!