1. 程式人生 > 其它 >Node爬取網站資料

Node爬取網站資料

npm安裝cheerio和axios

npm install cheerio axios

利用cheerio抓取對應網站中的標籤,再根據連結使用axios獲取對應頁面資料

const cheerio = require('cheerio');
//獲取HTML文件的內容
const request = require('request');
const {writeFile,mkDir} = require('../lcf');
const fs = require('fs');
const url = require('url');
const path = require("path");
const axios = require("axios");
const { title } = require('process');
const httpUrl = "https://www.pkdoutu.com/article/list";

/**
 * Pause for a given amount of time.
 *
 * @param {number} millSeconds - Delay, in milliseconds, before the promise settles.
 * @returns {Promise<string>} Resolves with the string "success" once the timer fires.
 */
async function wait(millSeconds) {
    const timer = new Promise((done) => {
        // Extra setTimeout arguments are handed to the callback, so `done("success")` runs.
        setTimeout(done, millSeconds, "success");
    });
    return timer;
}
/**
 * Promise wrapper around the callback-style `request.get`.
 *
 * @param {string|object} options - URL string or options object, forwarded to `request.get` unchanged.
 * @returns {Promise<{response: object, body: *}>} Resolves with the `response` and `body`
 *     passed to the callback; rejects with the error passed to the callback.
 */
function req(options) {
    return new Promise((resolve,reject) => {
        // Forward the caller's argument. Passing the `url` module and an undeclared
        // `headers` identifier here made every call reject with a ReferenceError.
        request.get(options,function(err,response,body) {
            if(err) {
                reject(err);
            } else {
                resolve({response,body});
            }
        })
    })
}

/**
 * Fetch the list page at `httpUrl` and read a page number from its pagination links.
 *
 * @returns {Promise<string>} Text of the second-to-last `.page-link` element
 *     (presumably the last page number; depends on the site's markup).
 */
async function getPageNum() {
    const page = await req(httpUrl);
    const $ = cheerio.load(page.body);
    const links = $('.page-link');
    // The second-to-last pagination link is the one whose text is returned.
    return links.eq(links.length - 2).text();
}

/**
 * Crawl the list pages one after another, starting at page 1.
 *
 * At most 10 pages are visited. When `getPageNum()` yields a positive number
 * smaller than that, crawling stops at that page instead of requesting pages
 * past it. A page that fails is logged and skipped so the remaining pages are
 * still processed.
 *
 * @returns {Promise<void>} Resolves when every page has been handed to `parsePage`.
 *     Rejects only if `getPageNum()` rejects.
 */
async function getImgList() {
    const maxPages = 10;
    // getPageNum() returns link text; anything that is not a positive integer falls back to the cap.
    const detected = Number.parseInt(await getPageNum(), 10);
    const lastPage = detected > 0 ? Math.min(detected, maxPages) : maxPages;
    for(let i=1;i<=lastPage;i++) {
        if(i!==1) {
            // Growing delay between pages (3000 ms * page index) to throttle requests.
            await wait(3000*i);
        }
        const pageUrl = httpUrl+"?page="+i;
        try {
            // Awaited so a rejection is handled here rather than becoming an unhandled rejection.
            await parsePage(pageUrl);
        } catch (err) {
            console.error("頁面解析失敗:" + pageUrl, err);
        }
    }
}
/**
 * Parse one list page and start downloading every album that has no local directory yet.
 *
 * For each `#home .col-sm-9>a` link the album title is read from `.random_title`,
 * cut at the first digit, and stripped of characters that are invalid in file
 * names. If `img/<title>` does not exist it is created and `getImg` is called
 * for the album. Failures of a single album are logged and do not affect the others.
 *
 * @param {string} url - Address of the list page to fetch.
 * @returns {Promise<void>} Resolves once every album on the page has been handled.
 *     Rejects if the list page itself cannot be fetched.
 */
async function parsePage(url) {
    const { body } = await req(url);
    const $ = cheerio.load(body);
    // Captures the text in front of the first digit of the title.
    const reg = /(.*?)\d/i;
    // Characters that must not appear in a directory name.
    const reg2 = /[/*?:<>|\"\\\\]+/g;

    const handleAlbum = async (ele) => {
        const pageUrl = $(ele).attr('href');
        const rawTitle = $(ele).find('.random_title').text();
        // exec() returns null for a title without digits; keep the whole title in that case.
        const match = reg.exec(rawTitle);
        const title = (match ? match[1] : rawTitle).replace(reg2,"");
        console.log(title);
        if(!pageUrl || title === "") {
            return;
        }
        const dir = "img/"+title;
        if(fs.existsSync(dir)) {
            return;
        }
        try {
            // mkDir comes from '../lcf' (not visible here); awaiting is harmless if it is
            // synchronous and required if it returns a promise.
            await mkDir(dir);
            console.log("建立目錄成功:",title);
            await getImg(pageUrl,title);
        } catch (err) {
            console.error("相簿處理失敗:" + title, err);
        }
    };

    const tasks = [];
    $('#home .col-sm-9>a').each((index,ele) => {
        tasks.push(handleAlbum(ele));
    });
    // Albums are processed concurrently; wait for all so callers can observe completion.
    await Promise.all(tasks);
}

/**
 * Download every image of one album page into `img/<title>/`.
 *
 * Each `.pic-content img` element is saved as `<title>-<index><ext>`, where the
 * extension is taken from the image URL. Requests are staggered by 50 ms per
 * image index. Images without a `src` are skipped, and a failed download is
 * logged without aborting the others.
 *
 * @param {string} pageUrl - Address of the album page.
 * @param {string} title - Album title; used as directory name and file-name prefix.
 * @returns {Promise<void>} Resolves once every image has been written or reported as failed.
 *     Rejects if the album page itself cannot be fetched.
 */
async function getImg(pageUrl,title) {
    const { body } = await req(pageUrl);
    const $ = cheerio.load(body);

    const saveImage = async (index,imgUrl) => {
        if(!imgUrl) {
            // path.extname() throws on a non-string, so skip images without a src.
            return;
        }
        await wait(50*index);
        // Drop any query string or fragment so it does not end up in the file extension.
        const extName = path.extname(imgUrl.split(/[?#]/)[0]);
        const imgPath = `img/${title}/${title}-${index}${extName}`;
        try {
            const res = await axios.get(imgUrl,{responseType: 'stream'});
            await new Promise((resolve,reject) => {
                // Open the file only after the request succeeded, so a failed request leaves no empty file.
                const ws = fs.createWriteStream(imgPath);
                ws.on('finish',resolve);
                ws.on('error',reject);
                res.data.on('error',reject);
                res.data.pipe(ws);
            });
            // Logged after 'finish', i.e. once the data has been flushed to the file.
            console.log("圖片載入完成:" + imgPath);
        } catch (err) {
            console.error("圖片下載失敗:" + imgPath, err);
        }
    };

    const downloads = [];
    $('.pic-content img').each((index,ele) => {
        downloads.push(saveImage(index,$(ele).attr('src')));
    });
    await Promise.all(downloads);
}

// Start the crawl; report a failure instead of leaving the rejection unhandled.
getImgList().catch((err) => {
    console.error("爬取失敗:", err);
    process.exitCode = 1;
});

如果對應網站有SSL證書,可以利用puppeteer模擬瀏覽器進行操作

npm install puppeteer

let puppeteer = require('puppeteer');

/**
 * Puppeteer demo: open taobao.com in a visible browser, take a screenshot,
 * collect the service links and run a search.
 *
 * NOTE(review): the browser is never closed, so the process keeps running with
 * the window open; add `await browser.close()` if the script should exit on its own.
 *
 * @returns {Promise<Array<{href: string, text: string}>>} The `.service-bd a`
 *     links whose `href` is not "#".
 * @throws {Error} If the search input or the search button is not found.
 */
async function test() {
    const options = {
        defaultViewport: {
            width: 1400,
            height: 800
        },
        headless: false
    };

    const browser = await puppeteer.launch(options);
    const page = await browser.newPage();

    // Registered before navigating so console messages emitted while the page loads are not missed.
    page.on('console',function(eventMsg){
        console.log(eventMsg.text());
    });

    await page.goto('https://www.taobao.com/');
    await page.screenshot({path: "screenshot.png"});

    // page.$(selector)           resolves to the first matching ElementHandle, or null
    // page.$$(selector)          resolves to an array of ElementHandles
    // page.$$eval(selector, fn)  runs fn in the page with the matched elements and
    //                            resolves to its return value
    // $ and $$ take no callback, so $$eval is what runs the extraction below.
    const elementArr = await page.$$eval(".service-bd a",(elements) => {
        const eles = [];
        elements.forEach((item) => {
            if(item.getAttribute("href")!=="#") {
                eles.push({
                    href: item.getAttribute("href"),
                    text: item.innerHTML
                });
            }
        });
        return eles;
    });
    console.log(elementArr);

    const searchInput = await page.$('#q');
    if(!searchInput) {
        throw new Error('search input "#q" not found');
    }
    await searchInput.focus();
    await page.keyboard.type("檯燈");

    const searchBtn = await page.$(".btn-search");
    if(!searchBtn) {
        throw new Error('search button ".btn-search" not found');
    }
    await searchBtn.click();

    return elementArr;
}

// Run the demo; report a failure instead of leaving the rejection unhandled.
test().catch((err) => {
    console.error("puppeteer 示例執行失敗:", err);
    process.exitCode = 1;
});