nodejs+cheerio 爬蟲入門
阿新 • • 發佈:2019-02-17
/**
 * Introductory Node.js + cheerio crawler.
 *
 * Fetches a course page through an HTTP proxy, transparently decompresses the
 * response body (gzip / deflate) and prints the resulting HTML. Helpers for
 * extracting and printing the chapter/video outline are provided below.
 */
const http = require('http');
const cheerio = require('cheerio');
const url = require('url');
const zlib = require('zlib');
const fs = require('fs'); // Not used by this script; kept from the original dependency list.

// Both www.imooc.com and www.zngirls.com support gzip. When gzip is listed in
// Accept-Encoding the body arrives compressed, which is why the received data
// looked "wrong" before it was decompressed.
const learn_url = 'http://www.imooc.com/learn/348';
// Alternative targets tried by the original author:
//   http://jquery.com/download/
//   http://www.zngirls.com/girl/18071/album/

// Request options. The request is sent to the proxy (host/port) and the full
// target URL is passed as the path.
const opt = {
    host: 'proxy3.bj.petrochina',
    port: 8080,
    path: learn_url,
    headers: {
        // Author's notes:
        // - Sending too many headers seemed to produce incorrect data; adding
        //   only User-Agent (+ Host) worked.
        // - Without the Host header the URL answered 301, possibly because of
        //   the proxy; the Host header is required when going through the proxy.
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        // Advertising gzip makes the server return a compressed stream that the
        // client must decompress itself (done in decodeBody below). Only
        // encodings this script can actually decode are advertised; 'sdch' was
        // removed because zlib cannot decode it.
        'Accept-Encoding': 'gzip, deflate',
        'Accept-Language': 'zh-CN,zh;q=0.8,en;q=0.6',
        'Cache-Control': 'max-age=0',
        Host: url.parse(learn_url).host,
    },
};

/**
 * Prints the course outline to the console.
 *
 * Expected data structure:
 *   [{ chapterTitle: '', videos: [{ title: '', id: '' }] }]
 *
 * @param {Array<{chapterTitle: string, videos: Array<{title: string, id: *}>}>} courseData
 */
function printCourseInfo(courseData) {
    console.log(courseData);
    for (const chapter of courseData) {
        console.log(`${chapter.chapterTitle}\n`);
        for (const video of chapter.videos) {
            console.log(` [${video.id}]${video.title}\n`);
        }
    }
}

/**
 * Extracts the chapter/video outline from the fetched page.
 *
 * @param {string} html - HTML source of the course page.
 * @returns {Array<{chapterTitle: string, videos: Array<{title: string, id: number}>}>}
 *   One entry per `.chapter` element, in document order.
 */
function filterChapter(html) {
    const $ = cheerio.load(html);
    const courseData = [];

    $('.chapter').each(function (_, chapterElement) {
        const chapter = $(chapterElement);
        const chapterData = {
            chapterTitle: chapter.find('strong').text(), // chapter heading
            videos: [],
        };

        chapter.find('.video').children('li').each(function (__, videoElement) {
            const video = $(videoElement).find('.studyvideo');
            // The id is a fixed placeholder, as in the original script.
            chapterData.videos.push({ title: video.text(), id: 0 });
        });

        courseData.push(chapterData);
    });

    return courseData;
}

/**
 * Decodes a raw HTTP response body into text.
 *
 * @param {Buffer} body - Concatenated raw response bytes.
 * @param {string|undefined} contentEncoding - Value of the Content-Encoding header.
 * @returns {string} The body decompressed (when gzip/deflate) and decoded as UTF-8.
 * @throws {Error} If the body is not valid data for the declared encoding.
 */
function decodeBody(body, contentEncoding) {
    const encoding = String(contentEncoding || '').trim().toLowerCase();
    let raw = body;
    if (encoding === 'gzip' || encoding === 'x-gzip') {
        raw = zlib.gunzipSync(body);
    } else if (encoding === 'deflate') {
        raw = zlib.inflateSync(body);
    }
    return raw.toString('utf-8');
}

http.get(opt, function (res) {
    // Collect raw Buffers: converting each chunk to a string would corrupt
    // compressed data and could split multi-byte characters across chunks.
    const chunks = [];
    console.log(`re.status_code=${res.statusCode}`);

    res.on('data', function (chunk) {
        chunks.push(chunk);
    });

    res.on('end', function () {
        let html;
        try {
            html = decodeBody(Buffer.concat(chunks), res.headers['content-encoding']);
        } catch (err) {
            console.log('獲取課程資料出錯', err.message);
            return;
        }

        console.log(`html=${html}`);
        // Outline parsing is left disabled, as in the original script.
        // Uncomment to print the course outline:
        // const courseData = filterChapter(html);
        // printCourseInfo(courseData);
    });
}).on('error', function (err) {
    console.log('獲取課程資料出錯', err.message);
});