Node爬取網站資料
阿新 • • 發佈:2022-03-13
npm安裝cheerio和axios
npm install cheerio
npm install axios
利用cheerio抓取對應網站中的標籤,再根據標籤中的連結使用axios獲取對應頁面資料
const cheerio = require('cheerio'); // parses the fetched HTML documents
const request = require('request');
const { writeFile, mkDir } = require('../lcf');
const fs = require('fs');
const url = require('url');
const path = require("path");
const axios = require("axios");
const { title } = require('process');

const httpUrl = "https://www.pkdoutu.com/article/list";

// Upper bound on how many list pages a single run will crawl.
const MAX_PAGES = 10;

// Captures the text leading up to the first digit of a gallery title.
const TITLE_PREFIX_PATTERN = /(.*?)\d/i;

// Characters stripped from a title before it is used as a directory name.
const ILLEGAL_PATH_CHARS = /[/*?:<>|"\\]+/g;

/**
 * Resolves with "success" after the given delay.
 *
 * @param {number} millSeconds - delay in milliseconds
 * @returns {Promise<string>}
 */
async function wait(millSeconds) {
  return new Promise((resolve) => {
    setTimeout(() => {
      resolve("success");
    }, millSeconds);
  });
}

/**
 * Promise wrapper around `request.get`.
 *
 * @param {string|object} options - URL string or `request` options object
 * @returns {Promise<{response: object, body: string}>}
 *   rejects with the error reported by `request`
 */
function req(options) {
  return new Promise((resolve, reject) => {
    // Pass the caller's argument through; the original referenced the `url`
    // module and an undeclared `headers` here, so no request could be sent.
    request.get(options, (err, response, body) => {
      if (err) {
        reject(err);
      } else {
        resolve({ response, body });
      }
    });
  });
}

/**
 * Reads the text of the second-to-last `.page-link` element on the list page.
 * NOTE(review): presumably that element holds the last page number — confirm
 * against the site's pagination markup.
 *
 * @returns {Promise<string>} text of that element ('' when it is absent)
 */
async function getPageNum() {
  const { body } = await req(httpUrl);
  const $ = cheerio.load(body);
  const pageLinks = $('.page-link');
  return pageLinks.eq(pageLinks.length - 2).text();
}

/**
 * Crawls the list pages one by one, waiting `3000 * page` ms before each page
 * after the first. Crawls at most MAX_PAGES pages, and fewer when the site
 * reports a smaller page count.
 *
 * @returns {Promise<void>}
 */
async function getImgList() {
  const reportedPages = Number.parseInt(await getPageNum(), 10);
  // Fall back to the fixed cap when the page count could not be parsed.
  const pageCount = Number.isNaN(reportedPages)
    ? MAX_PAGES
    : Math.min(reportedPages, MAX_PAGES);

  for (let i = 1; i <= pageCount; i++) {
    if (i !== 1) {
      await wait(3000 * i);
    }
    try {
      await parsePage(`${httpUrl}?page=${i}`);
    } catch (err) {
      // One failing page must not abort the remaining pages.
      console.error(`Failed to parse page ${i}:`, err);
    }
  }
}

/**
 * Parses one list page: for every gallery link it derives a directory name
 * from the gallery title, creates `img/<title>` when it does not exist yet
 * and downloads the gallery's images into it.
 *
 * @param {string} url - address of the list page
 * @returns {Promise<void>}
 */
async function parsePage(url) {
  const { body } = await req(url);
  const $ = cheerio.load(body);

  // Use a plain array with for...of: `.each` does not await async callbacks.
  const links = $('#home .col-sm-9>a').toArray();
  for (const ele of links) {
    const pageUrl = $(ele).attr('href');
    const rawTitle = $(ele).find('.random_title').text();

    // Titles without a digit used to crash on `exec(...)[1]`; keep them whole.
    const match = TITLE_PREFIX_PATTERN.exec(rawTitle);
    const title = (match ? match[1] : rawTitle)
      .replace(ILLEGAL_PATH_CHARS, "")
      .trim();
    console.log(title);

    if (!pageUrl || title === "") {
      continue;
    }
    if (fs.existsSync("img/" + title)) {
      continue; // already downloaded on an earlier run
    }

    try {
      // NOTE(review): mkDir comes from ../lcf; awaiting it is harmless if it
      // is synchronous and required if it returns a promise — confirm.
      await mkDir("img/" + title);
      console.log("建立目錄成功:", title);
      await getImg(pageUrl, title);
    } catch (err) {
      console.error(`Failed to download gallery "${title}":`, err);
    }
  }
}

/**
 * Writes a readable stream to a file.
 * Resolves once the file has been written completely.
 *
 * @param {import('stream').Readable} readable - source stream
 * @param {string} filePath - destination file
 * @returns {Promise<void>}
 */
function saveStream(readable, filePath) {
  return new Promise((resolve, reject) => {
    const ws = fs.createWriteStream(filePath);
    readable.on('error', reject);
    ws.on('error', reject);
    ws.on('finish', resolve);
    readable.pipe(ws);
  });
}

/**
 * Downloads every `.pic-content img` of a gallery page to
 * `img/<title>/<title>-<index><ext>`.
 *
 * @param {string} pageUrl - address of the gallery page
 * @param {string} title - sanitized gallery title, used as directory name
 * @returns {Promise<void>}
 */
async function getImg(pageUrl, title) {
  const { body } = await req(pageUrl);
  const $ = cheerio.load(body);

  const images = $('.pic-content img').toArray();
  for (const [index, ele] of images.entries()) {
    const imgUrl = $(ele).attr('src');
    if (!imgUrl) {
      continue; // nothing to download without a src attribute
    }

    await wait(50); // small pause between consecutive downloads

    // Ignore a query string so it does not end up in the file extension.
    const extName = path.extname(imgUrl.split('?')[0]);
    const imgPath = `img/${title}/${title}-${index}${extName}`;

    try {
      const res = await axios.get(imgUrl, { responseType: 'stream' });
      // The file is only created after the request succeeded.
      await saveStream(res.data, imgPath);
      console.log("圖片載入完成:" + imgPath);
    } catch (err) {
      console.error(`Failed to download ${imgUrl}:`, err);
    }
  }
}

getImgList().catch((err) => {
  console.error(err);
  process.exitCode = 1;
});
如果對應網站有SSL證書,可以利用puppeteer模擬瀏覽器進行操作
npm install puppeteer
const puppeteer = require('puppeteer');

/**
 * Puppeteer demo: opens taobao.com in a visible browser window, takes a
 * screenshot, collects the links inside `.service-bd` and submits a search
 * through the page's search box.
 *
 * API notes:
 *  - page.$(selector)  resolves to the first matching ElementHandle, or null
 *  - page.$$(selector) resolves to an array of ElementHandles
 *  - page.$$eval(selector, fn) runs fn inside the page on all matching
 *    elements and resolves to fn's (serializable) return value
 *
 * @returns {Promise<void>}
 * @throws {Error} when the search input or the search button is not found
 */
async function test() {
  const options = {
    defaultViewport: {
      width: 1400,
      height: 800,
    },
    headless: false,
  };

  const browser = await puppeteer.launch(options);
  const page = await browser.newPage();

  // Register before navigating so that no page console message is missed.
  page.on('console', (eventMsg) => {
    console.log(eventMsg.text());
  });

  await page.goto('https://www.taobao.com/');
  await page.screenshot({ path: "screenshot.png" });

  // $$eval is required here: page.$$ ignores a callback argument.
  const serviceLinks = await page.$$eval(".service-bd a", (elements) =>
    elements
      .filter((item) => item.getAttribute("href") !== "#")
      .map((item) => ({
        href: item.getAttribute("href"),
        text: item.innerHTML,
      }))
  );
  console.log(serviceLinks);

  const searchInput = await page.$('#q');
  if (!searchInput) {
    throw new Error('Search input "#q" not found on the page');
  }
  await searchInput.focus();
  await page.keyboard.type("檯燈");

  const searchBtn = await page.$(".btn-search");
  if (!searchBtn) {
    throw new Error('Search button ".btn-search" not found on the page');
  }
  await searchBtn.click();
}

test().catch((err) => {
  console.error(err);
  process.exitCode = 1;
});