node學習(四)——讀取頁面資料(簡單例子)
阿新 • • 發佈:2019-01-11
簡單的 Node 爬蟲,這裡用到 cheerio 模組,它能夠很方便地操作 DOM;我這裡讀取了百度搜索結果頁的內容。
test1.js
const http = require('http');
const cheerio = require('cheerio');
const express = require('express');
const path = require('path');

// Port the demo server listens on.
const PORT = 9999;
// Maximum number of entries collected from each result column.
const MAX_ITEMS = 3;
// Page fetched when the client does not supply its own `url` query parameter.
const DEFAULT_URL = 'http://www.baidu.com/s?wd=word&rsv_spt=1&rsv_iqid=0xde0f6c3c000402ed&issp=1&f=8&rsv_bp=1&rsv_idx=2&ie=utf-8&rqlang=cn&tn=baiduhome_pg&rsv_enter=1&oq=aa&inputT=547&rsv_t=94d3Z7tMbG%2BpuEnOPrHr1UamFC8HB4%2FDItvkDQxqcgl9VqrcHCHrYr1wf1iLMCQxg5II&rsv_pq=f47fcffb00042e7a&rsv_sug3=7&rsv_sug1=6&rsv_sug7=100&rsv_sug2=0&rsv_sug4=547&rsv_sug=2';

const app = express();

// Serve the files next to this script (index.html, jquery, ...).
app.use(express.static(path.join(__dirname)));

app.get('/', function (req, res, next) {
  res.json({ 'name': 'test' });
});

// Fetches the target page, extracts the result lists and returns them as JSON.
app.get('/getData', function (req, res) {
  const params = req.query || {};
  console.log('params', params);
  getData(params, function (data, err) {
    if (err) {
      // Answer the client instead of leaving the request hanging.
      res.status(502).json({ 'ok': false, 'name': 'test', 'error': '獲取資料出錯' });
      return;
    }
    res.json({ 'ok': true, 'name': 'test', 'data': data });
  });
});

app.listen(PORT, function () {
  console.log('伺服器啟動 監聽9999埠');
});

/**
 * Downloads a page over HTTP and extracts the search-result data from it.
 *
 * The result is delivered only through `callback`; the function itself
 * returns nothing because the request is asynchronous.
 *
 * @param {Object} params - Request options; `params.url` overrides the default page.
 * @param {function(Object=, Error=)} callback - Called exactly once with the
 *   extracted data, or with `(null, err)` when the page could not be fetched
 *   or parsed.
 */
function getData(params, callback) {
  // NOTE(review): `params.url` comes straight from the query string, so any
  // client can make this server fetch an arbitrary http:// address (SSRF).
  // Restrict it to an allow-list before exposing this beyond a local demo.
  const url = (params && params.url) || DEFAULT_URL;

  // Guard so the callback fires once even if several error events arrive.
  let finished = false;
  function finish(data, err) {
    if (finished) {
      return;
    }
    finished = true;
    if (callback) {
      callback(data, err);
    }
  }

  function fail(err) {
    console.log('獲取資料出錯');
    finish(null, err || new Error('獲取資料出錯'));
  }

  let request;
  try {
    request = http.get(url, function (res) {
      let html = '';
      // Decode as UTF-8 up front so multibyte characters that straddle two
      // chunks are not corrupted by per-chunk Buffer-to-string conversion.
      res.setEncoding('utf8');
      res.on('data', function (chunk) {
        html += chunk;
      });
      res.on('error', fail);
      res.on('end', function () {
        let listData;
        try {
          listData = filter(html, url);
        } catch (err) {
          fail(err);
          return;
        }
        finish(listData);
      });
    });
  } catch (err) {
    // http.get throws synchronously for malformed or non-http URLs.
    fail(err);
    return;
  }
  request.on('error', fail);
}

/**
 * Extracts the left/right result lists and the logo image from the page HTML.
 *
 * @param {string} html - Raw page markup.
 * @param {string} pageUrl - URL the markup was fetched from; used to resolve links.
 * @returns {{left: Array, right: Array, img: string}|undefined} The extracted
 *   data, or `undefined` when no markup was supplied.
 */
function filter(html, pageUrl) {
  if (!html) {
    console.log('無資料傳入!');
    return undefined;
  }

  // jQuery-style handle onto the parsed document.
  const $ = cheerio.load(html);
  const listData = { left: [], right: [] };

  $('#content_left').find('.c-container').slice(0, MAX_ITEMS).each(function () {
    const $link = $(this).find('.t a');
    const text = $link.text();
    listData.left.push({
      text: text ? text.replace(/[\r\n]+/g, '') : '',
      src: filterSpecialChars($link.attr('href'))
    });
  });

  $('#content_right').find('.FYB_RD tbody').eq(0).find('tr').slice(0, MAX_ITEMS).each(function () {
    const $row = $(this);
    const $link = $row.find('span a');
    const href = $link.attr('href');
    listData.right.push({
      index: $row.find('span .c-index').text(),
      desc: $link.text(),
      // Only build a link when the row actually has one.
      src: href ? filterSpecialChars(resolveUrl(href, pageUrl)) : ''
    });
  });

  listData['img'] = filterSpecialChars($('#lg').find('img').attr('src'));
  return listData;
}

/**
 * Resolves a possibly relative link against the page it was found on.
 *
 * @param {string} href - Link taken from the page.
 * @param {string} base - URL of the page.
 * @returns {string} The absolute URL, or `href` unchanged if it cannot be resolved.
 */
function resolveUrl(href, base) {
  try {
    return new URL(href, base).href;
  } catch (err) {
    return href;
  }
}

/**
 * Removes line breaks, carriage returns and spaces from a string.
 *
 * @param {*} str - Value to clean; anything that is not a string yields ''.
 * @returns {string} The cleaned string.
 */
function filterSpecialChars(str) {
  return typeof str === 'string' ? str.replace(/\s+/g, '') : '';
}

index.html
<!DOCTYPE html>
<html lang="en">
<head>
  <meta charset="UTF-8">
  <title>Title</title>
</head>
<body>
test
<script src="jquery-3.0.0.min.js"></script>
<script>
  // Requests the scraped data from the local Node server and logs the outcome.
  function getData() {
    $.ajax({
      url: 'http://127.0.0.1:9999/getData',
      data: {},
      success: function (data) {
        console.log('data', data);
      },
      // Surface failures (server down, non-2xx reply) instead of ignoring them.
      error: function (xhr, status, err) {
        console.log('error', status, err);
      }
    });
  }

  getData();
</script>
</body>
</html>
檔案目錄是: