1. 程式人生 > nodejs+cheerio 爬蟲入門

nodejs+cheerio 爬蟲入門

var http = require('http');
var cheerio = require('cheerio');
var url = require('url');
var zlib = require('zlib');
var fs = require('fs');

//www.imooc.com和www.zngirls.com網站都支援gzip格式
//所以導致認為接收的資料是錯誤的,在頭中Accept-Encoding中設定的gzip
var learn_url = 'http://www.imooc.com/learn/348';
//var learn_url = 'http://jquery.com/download/';
//var learn_url = 'http://www.zngirls.com/girl/18071/album/';

var opt = {
    host: 'proxy3.bj.petrochina',
    port: 8080,
    path: learn_url,
    headers: {
        //頭資訊寫入太多了好像接收到的資料不對
        //僅增加User-Agent頭沒有問題 +Host頭
        //如果沒有Host的頭'http://www.imooc.com/learn/348',網址返回301,可能是因為使用了代理的緣故
        //使用代理必須使用Host的頭選項
        //Accept-Encoding影響了,估計是返回的是gzip壓縮的html而導致的資料不正確
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        //就是這個問題,如果去掉gzip就沒問題,保持gzip就會自動返回壓縮格式的資料流,由客戶端瀏覽器來進行解壓縮
        'Accept-Encoding': 'gzip, deflate, sdch',
        //'Accept-Encoding': 'deflate, sdch',
        'Accept-Language': 'zh-CN,zh;q=0.8,en;q=0.6',
        'Cache-Control': 'max-age=0',
        Host: url.parse(learn_url).host,
        //'Proxy-Connection': 'keep-alive',
        //'Upgrade-Insecure-Requests': '1',
        //Referer: 'www.zngirls.com',
        //'Cookie':'PHPSESSID=jmlqem3eh4me74m8ommfuekb74; imooc_uuid=2410bb30-858b-4212-96eb-20abca48cb80; imooc_isnew=1; imooc_isnew_ct=1474595643; IMCDNS=0; Hm_lvt_f0cfcccd7b1393990c78efdeebff3968=1474595648; Hm_lpvt_f0cfcccd7b1393990c78efdeebff3968=1474595770; cvde=57e48b3b517cd-3'
    },
}
/****************************
 列印得到的資料結構
 [{
chapterTitle:'',
videos:[{
title:'',
id:''
}]
}]
 ********************************/
function printCourseInfo(courseData) {
    console.log(courseData);
    courseData.forEach(function (item) {
        var chapterTitle = item.chapterTitle;
        console.log(chapterTitle + '\n');
        item.videos.forEach(function (video) {
            console.log(' [' + video.id + ']' + video.title + '\n');
        })
    });
}


/*************
 分析從網頁裡抓取到的資料
 **************/
function filterChapter(html) {
    var courseData = [];

    var $ = cheerio.load(html);
    var chapters = $('.chapter');
    chapters.each(function (item) {
        var chapter = $(this);
        var chapterTitle = chapter.find('strong').text(); //找到章節標題
        var videos = chapter.find('.video').children('li');

        var chapterData = {
            chapterTitle: chapterTitle,
            videos: []
        };

        videos.each(function (item) {
            var video = $(this).find('.studyvideo');
            var title = video.text();
            var id = 0;
            //var id = video.attr('href').split('/video')[1];

            chapterData.videos.push({
                title: title,
                id: id
            })
        })

        courseData.push(chapterData);
    });

    return courseData;
}

http.get(opt, function (res) {
    var html = '';
    console.log(`re.status_code=${res.statusCode}`);
    res.on('data',
        function (data) {
            html += data.toString('utf-8');
            //console.log(`html=${html}`);
            //console.log(html.length);
        });

    res.on('end',
        function () {
            if(res.headers['content-encoding'] === 'gzip'){
               //解壓縮資料
                var file = fs.createWriteStream('test.html');
                var buffer = new Buffer(html, 'utf-8');
                var gunzipStream = zlib.createGzip();
                //buffer.pipe(gunzipStream).pipe(file);
                //file.pipe(gzipStream).pipe(res);
                //html = zlib.inflate(buffer);
           }

            console.log(`html=${html}`);
            //var courseData = filterChapter(html);
            //printCourseInfo(courseData);
        });
}).on('error', function () {
    console.log('獲取課程資料出錯');
});