nodejs + request + cheerio 抓取頁面指定的資訊
專案是建立在 express 框架下,express 可參考 http://www.expressjs.com.cn/starter/hello-world.html
1.引入cheerio模組 npm install --save cheerio
2. 引入iconv-lite模組 中文轉碼用 npm install --save iconv-lite
var express = require('express');
const cheerio = require('cheerio');
const request = require('request');
const iconv = require('iconv-lite');
var router = express.Router();
// Scrape target example: https://detail.tmall.com/item.htm?id=525432847421
// The handler extracts the product / SKU configuration embedded in the page.

/**
 * Extracts and parses the item configuration JSON embedded in an inline script.
 *
 * The configuration is located with a regular expression that spans from
 * `{"api"` through `"valTimeLeft":<digits>}`.
 *
 * @param {?string} scriptText - Inner text of the script element, or null
 *   when the element was not found.
 * @returns {?Object} The parsed configuration, or null when the text is
 *   missing, the pattern does not match, or the match is not valid JSON.
 */
function parseItemConfig(scriptText) {
  if (typeof scriptText !== 'string') {
    return null;
  }
  // No `g` flag: a single (greedy) match is wanted, and match()[0] is then
  // the matched string itself instead of an array coerced to a string.
  const matched = scriptText.match(/\{\"api\"[\s\S]*\"valTimeLeft\"\:[0-9]*\}/);
  if (!matched) {
    return null;
  }
  try {
    return JSON.parse(matched[0]);
  } catch (err) {
    console.error('Failed to parse item configuration JSON:', err.message);
    return null;
  }
}

/**
 * POST /addUrl
 *
 * Reads `goods_url` from the POST body, downloads that page, decodes it from
 * gb2312, and logs the `itemDO` section of the embedded item configuration.
 *
 * Responses:
 *   200 { code: '1', msg: '操作成功' }  - page fetched and configuration parsed
 *   400 { code: '0', ... }             - `goods_url` missing or not http(s)
 *   502 { code: '0', ... }             - fetch failed, non-200 upstream status,
 *                                        or configuration not found in the page
 */
router.post('/addUrl', function(req, res, next) {
  // POST parameters arrive in req.body (req.query / req.params carry GET ones).
  const param = req.body || {};
  const url = param.goods_url;

  if (typeof url !== 'string' || !/^https?:\/\//i.test(url)) {
    return res.status(400).json({
      code: '0',
      msg: '缺少有效的 goods_url 參數'
    });
  }

  // NOTE(review): the server fetches a caller-supplied URL (SSRF risk).
  // Consider restricting the host to an allow-list before deploying.

  // encoding: null asks `request` for the raw body so that iconv can decode it.
  request({url: url, encoding: null}, function(error, response, body) {
    if (error || !response || response.statusCode !== 200) {
      // Always answer the client; otherwise the HTTP request would hang.
      return res.status(502).json({
        code: '0',
        msg: '抓取頁面失敗'
      });
    }

    // Decode the downloaded bytes as gb2312 before parsing the markup.
    const pageHtml = iconv.decode(body, 'gb2312');
    const $ = cheerio.load(pageHtml);

    // The configuration script is the third sibling after #J_FrmBid.
    const script = $('#J_FrmBid').next().next().next();
    const itemConfig = parseItemConfig(script.html());

    if (!itemConfig) {
      return res.status(502).json({
        code: '0',
        msg: '無法解析商品資訊'
      });
    }

    console.log(itemConfig.itemDO); // print the product information

    res.json({
      code: '1',
      msg: '操作成功'
    });
  });
});
module.exports = router;