1. 程式人生 > 實用技巧 >spring boot+vue實現爬取各大平臺每日熱榜資料功能

spring boot+vue實現爬取各大平臺每日熱榜資料功能

案例功能效果圖
爬取資料的平臺頁面

這個案例能爬取的平臺太多了,我沒有全部截圖出來,想看的你們自己下載原始碼自己跑起來!
爬取的熱榜資料效果圖

環境介紹
前端:vue+h5
後端:springboot+webMagic
jdk:1.8及以上
資料庫:mysql

完整原始碼獲取方式
原始碼獲取方式:點選這裡,暗號部落格園!

核心程式碼介紹
pom.xml

<!-- WebMagic crawler core: https://mvnrepository.com/artifact/us.codecraft/webmagic-core -->
        <dependency>
            <groupId>us.codecraft</groupId>
            <artifactId>webmagic-core</artifactId>
            <version>0.7.3</version>
            <exclusions>
                <!-- Drop the transitive slf4j-log4j12 binding
                     (presumably to avoid clashing with Spring Boot's own logging binding; confirm). -->
                <exclusion>
                    <groupId>org.slf4j</groupId>
                    <artifactId>slf4j-log4j12</artifactId>
                </exclusion>
            </exclusions>
        </dependency>

        <!-- WebMagic extensions: https://mvnrepository.com/artifact/us.codecraft/webmagic-extension -->
        <dependency>
            <groupId>us.codecraft</groupId>
            <artifactId>webmagic-extension</artifactId>
            <version>0.7.3</version>
        </dependency>

        <!-- Guava (needed by the BloomFilterDuplicateRemover used in NodeController):
             https://mvnrepository.com/artifact/com.google.guava/guava -->
        <dependency>
            <groupId>com.google.guava</groupId>
            <artifactId>guava</artifactId>
            <version>18.0</version>
        </dependency>

        <!-- https://mvnrepository.com/artifact/org.apache.commons/commons-lang3 -->
        <dependency>
            <groupId>org.apache.commons</groupId>
            <artifactId>commons-lang3</artifactId>
            <version>3.4</version>
        </dependency>

        <!-- https://mvnrepository.com/artifact/commons-io/commons-io -->
        <dependency>
            <groupId>commons-io</groupId>
            <artifactId>commons-io</artifactId>
            <version>2.4</version>
        </dependency>

        <!-- Lombok, a boilerplate-reduction tool (compile-time only, hence "provided"):
             https://mvnrepository.com/artifact/org.projectlombok/lombok -->
        <dependency>
            <groupId>org.projectlombok</groupId>
            <artifactId>lombok</artifactId>
            <version>1.18.8</version>
            <scope>provided</scope>
        </dependency>

        <!-- JUnit 4, test scope only: https://mvnrepository.com/artifact/junit/junit -->
        <dependency>
            <groupId>junit</groupId>
            <artifactId>junit</artifactId>
            <version>4.12</version>
            <scope>test</scope>
        </dependency>

        <!-- Swagger 2 API description (supplies the io.swagger annotations used in NodeController) -->
        <dependency>
            <groupId>io.springfox</groupId>
            <artifactId>springfox-swagger2</artifactId>
            <version>2.9.1</version>
        </dependency>

        <!-- Swagger UI; keep its version in step with springfox-swagger2 above -->
        <dependency>
            <groupId>io.springfox</groupId>
            <artifactId>springfox-swagger-ui</artifactId>
            <version>2.9.1</version>
        </dependency>

application.yml

# HTTP port the application listens on.
server:
  port: 9004
spring:
  jackson:
    serialization:
      # Dates are serialized as numeric timestamps; the Vue pages divide createDate by 1000 before use.
      write-dates-as-timestamps: true
  datasource:
    driverClassName: com.mysql.cj.jdbc.Driver
    # NOTE(review): host, schema, username and password all read "feimeidehuoji", which looks like a
    # placeholder. Replace with real values and keep real credentials out of version control.
    url: jdbc:mysql://feimeidehuoji:3306/feimeidehuoji?autoReconnect=true&useUnicode=true&characterEncoding=UTF-8&useSSL=false&useLegacyDatetimeCode=false&serverTimezone=UTC
    username: feimeidehuoji
    password: feimeidehuoji
  jpa:
    database: MySQL
    # Print every generated SQL statement.
    show-sql: true
    hibernate:
      # Hibernate adjusts the schema to match the entities at startup.
      ddl-auto: update
    database-platform: org.hibernate.dialect.MySQL5InnoDBDialect
# Crawl entry URL, injected into NodeController through @Value("${spiderUrl}").
spiderUrl: https://tophub.today
# HTTP proxy host and port that the scheduled crawl in NodeController routes its requests through.
proxyUrl: 61.160.210.234
proxyPort: 808

NodeController.java

package cn.cesi.webMagic.webMagic;
import cn.cesi.webMagic.pieline.SpringPieline;
import cn.cesi.webMagic.pojo.Node;
import cn.cesi.webMagic.service.NodeService;
import cn.cesi.webMagic.util.Result;
import cn.cesi.webMagic.util.StatusCode;
import io.swagger.annotations.Api;
import io.swagger.annotations.ApiOperation;
import io.swagger.annotations.ApiParam;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.beans.factory.annotation.Value;
import org.springframework.data.domain.Page;
import org.springframework.scheduling.annotation.Scheduled;
import org.springframework.web.bind.annotation.CrossOrigin;
import org.springframework.web.bind.annotation.RequestMapping;
import org.springframework.web.bind.annotation.RequestParam;
import org.springframework.web.bind.annotation.RestController;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.downloader.HttpClientDownloader;
import us.codecraft.webmagic.proxy.Proxy;
import us.codecraft.webmagic.proxy.SimpleProxyProvider;
import us.codecraft.webmagic.scheduler.BloomFilterDuplicateRemover;
import us.codecraft.webmagic.scheduler.QueueScheduler;
import javax.annotation.Resource;
import java.util.List;
import java.util.Map;

/**
 * REST controller that serves the stored hot-list entries and hosts the scheduled crawl job.
 */
@RestController
@CrossOrigin
@RequestMapping("/node")
@Api(value = "獲取資料介面",tags={"使用者登入介面"})
public class NodeController {

    /** Crawl entry URL, bound from the {@code spiderUrl} property. */
    @Value("${spiderUrl}")
    private String url;

    /** Proxy host, bound from the {@code proxyUrl} property. */
    @Value("${proxyUrl}")
    private String proxyUrl;

    /** Proxy port, bound from the {@code proxyPort} property. */
    @Value("${proxyPort}")
    private Integer proxyPort;

    @Resource
    NodeService nodeService;

    @Autowired
    SpringPieline springPieline;

    /**
     * Queries stored entries by delegating to {@code NodeService.searchData}.
     *
     * @param typeName    category name filter (optional)
     * @param secondTitle entry title filter (optional)
     * @param page        requested page (optional)
     * @param size        page size (optional)
     * @return a success {@code Result} whose data is the page returned by the service
     */
    @RequestMapping("")
    @ApiOperation(value = "查詢資料介面")
    public Result getData(
            @ApiParam(value = "分類名稱", required = false) String typeName,
            @ApiParam(value = "分類名稱", required = false) String secondTitle,
            @ApiParam(value = "當前頁", required = false) Integer page,
            @ApiParam(value = "每頁資料條數", required = false) Integer size) {
        Page<Node> found = nodeService.searchData(typeName, secondTitle, page, size);
        return success(found);
    }

    /**
     * Lists all categories as returned by {@code NodeService.findType}.
     *
     * @return a success {@code Result} whose data is the category list
     */
    @RequestMapping("/getType")
    @ApiOperation(value = "查詢全部分類列表")
    public Result getData() {
        List<Map<String, String>> types = nodeService.findType();
        return success(types);
    }

    /**
     * Runs one crawl through the configured proxy and blocks until the spider finishes.
     * The fixed delay is 480000 ms (1000 * 60 * 8), i.e. 8 minutes after the previous run ends.
     */
    @Scheduled(fixedDelay = 480000)
    public void tasks() {
        System.out.println("定時任務開始——————————————————————————————————");

        // Route every request through the configured proxy server.
        Proxy proxy = new Proxy(proxyUrl, proxyPort);
        HttpClientDownloader proxiedDownloader = new HttpClientDownloader();
        proxiedDownloader.setProxyProvider(SimpleProxyProvider.from(proxy));

        // Queue scheduler de-duplicating URLs with a bloom filter sized for 100000 entries.
        QueueScheduler queue = new QueueScheduler();
        queue.setDuplicateRemover(new BloomFilterDuplicateRemover(10000 * 10));

        // Same call order as a single fluent chain: url, downloader, threads, pipeline, scheduler.
        Spider spider = Spider.create(new WebProcess());
        spider.addUrl(url);
        spider.setDownloader(proxiedDownloader);
        spider.thread(2);
        spider.addPipeline(springPieline);
        spider.setScheduler(queue);
        spider.run();

        System.out.println("定時任務結束——————————————————————————————————");
    }

    /** Wraps {@code data} in the success envelope shared by both query endpoints. */
    private static Result success(Object data) {
        Result envelope = new Result();
        envelope.setFlag(true);
        envelope.setCode(StatusCode.OK);
        envelope.setMsg("查詢成功!");
        envelope.setData(data);
        return envelope;
    }
}

WebProcess.java

package cn.cesi.webMagic.webMagic;
import cn.cesi.webMagic.pieline.SpringPieline;
import cn.cesi.webMagic.util.NodeEntity;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.beans.factory.annotation.Value;
import org.springframework.scheduling.annotation.Scheduled;
import org.springframework.stereotype.Component;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.downloader.HttpClientDownloader;
import us.codecraft.webmagic.processor.PageProcessor;
import us.codecraft.webmagic.proxy.Proxy;
import us.codecraft.webmagic.proxy.SimpleProxyProvider;
import us.codecraft.webmagic.scheduler.BloomFilterDuplicateRemover;
import us.codecraft.webmagic.scheduler.QueueScheduler;
import us.codecraft.webmagic.selector.Selectable;

import org.jsoup.select.Elements;
import java.util.*;

/**
 * WebMagic {@link PageProcessor} that extracts the hot-list cards from the crawled page.
 *
 * <p>For every element matched by the card selector it collects the card title, the logo image
 * URL and the list of entries (link, entry title, heat text), and stores the resulting
 * {@code List<NodeEntity>} on the page under the field name {@code "nodes"}.
 */
@Component
public class WebProcess implements PageProcessor {

    private Site site = Site.me()
            // Pause between page downloads, meant to soften anti-crawling limits.
            // NOTE(review): WebMagic takes this value in milliseconds, so this is 2 ms; confirm
            // whether 2000 was intended.
            .setSleepTime(2)
            .setRetryTimes(3)          // number of retries
            .setRetrySleepTime(10000)  // wait between retries
            .setTimeOut(60000)         // timeout: 1000 * 60 = 1 minute
            .setCharset("utf8");

    /**
     * Parses the downloaded page into {@code NodeEntity} objects and attaches them to the page.
     *
     * @param page the downloaded page
     */
    @Override
    public void process(Page page) {
        List<Selectable> cards = page.getHtml().css("div.bc div#Sortable div.cc-cd div").nodes();

        List<NodeEntity> nodes = new ArrayList<>();
        for (Selectable card : cards) {
            NodeEntity node = new NodeEntity();
            node.setTitle(textOf(card.css("div.cc-cd-ih div a div span")));
            node.setLogo(imageSrcOf(card.css("div.cc-cd-ih div a div img")));
            node.setMaps(extractEntries(card));
            nodes.add(node);
        }

        // Expose the parsed cards to the pipelines.
        page.putField("nodes", nodes);
    }

    @Override
    public Site getSite() {
        return site;
    }

    /** Collects one map (keys {@code url}, {@code secondTitle}, {@code hot}) per link in the card body. */
    private static List<Map<String, String>> extractEntries(Selectable card) {
        List<Map<String, String>> entries = new ArrayList<>();
        for (Selectable link : card.css("div.cc-cd-cb div a").nodes()) {
            Map<String, String> entry = new HashMap<>();
            entry.put("url", link.links().toString());
            entry.put("secondTitle", textOf(link.css("div span.t")));
            entry.put("hot", textOf(link.css("div span.e")));
            entries.add(entry);
        }
        return entries;
    }

    /**
     * Returns the text content of the first matched element, or {@code ""} when nothing matched.
     * The null check has to be on the selection's string value: {@code css(...)} itself always
     * returns a selection object, so testing that object against null protects nothing.
     */
    private static String textOf(Selectable selection) {
        String html = selection.toString();
        return html == null ? "" : Jsoup.parse(html).text();
    }

    /** Returns the {@code src} of the first matched {@code <img>}, or {@code ""} when nothing matched. */
    private static String imageSrcOf(Selectable selection) {
        String html = selection.toString();
        if (html == null) {
            return "";
        }
        Document fragment = Jsoup.parse(html);
        Elements images = fragment.select("img[src]");
        return images.attr("src");
    }
}

SpringPieline.java

package cn.cesi.webMagic.pieline;
import cn.cesi.webMagic.pojo.Node;
import cn.cesi.webMagic.service.NodeService;
import cn.cesi.webMagic.util.IdWorker;
import cn.cesi.webMagic.util.NodeEntity;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.stereotype.Component;
import us.codecraft.webmagic.ResultItems;
import us.codecraft.webmagic.Task;
import us.codecraft.webmagic.pipeline.Pipeline;

import java.util.*;

/**
 * WebMagic {@link Pipeline} that stores crawled entries in the database.
 *
 * <p>Reads the {@code "nodes"} result field and saves one {@code Node} per entry, skipping
 * entries with an empty title and entries for which
 * {@code NodeService.findByTitleAndSecondTitle} already returns a record.
 */
@Component
public class SpringPieline implements Pipeline {
    @Autowired
    NodeService nodeService;

    @Autowired
    IdWorker idWorker;

    /**
     * Persists every new entry found in {@code resultItems}.
     *
     * @param resultItems results of one processed page; expected to carry a {@code "nodes"} list
     * @param task        the running crawl task (unused)
     */
    @Override
    public void process(ResultItems resultItems, Task task) {
        List<NodeEntity> nodes = resultItems.get("nodes");
        if (nodes == null) {
            // Nothing was extracted for this page.
            return;
        }
        try {
            for (NodeEntity entity : nodes) {
                String title = entity.getTitle();
                String logo = entity.getLogo();
                List<Map<String, String>> entries = entity.getMaps();
                if (entries == null) {
                    continue;
                }
                for (Map<String, String> entry : entries) {
                    String rawSecondTitle = entry.get("secondTitle");
                    String secondTitle = rawSecondTitle == null ? "" : rawSecondTitle.trim();
                    System.out.println(secondTitle);

                    if (secondTitle.isEmpty() || title == null || title.isEmpty()) {
                        continue;
                    }
                    List<Node> existing = nodeService.findByTitleAndSecondTitle(title, secondTitle);
                    if (existing != null && !existing.isEmpty()) {
                        continue;
                    }

                    // A fresh entity per row: re-using one instance and changing its id after a
                    // save would mutate an object the persistence layer may still be tracking.
                    Node node = new Node();
                    node.setId(idWorker.nextId() + "");
                    node.setTitle(title);
                    node.setLogo(logo);
                    node.setSecondTitle(secondTitle);
                    node.setUrl(entry.get("url"));
                    node.setCreateDate(new Date());
                    node.setHot(entry.get("hot"));
                    nodeService.save(node);
                }
            }
        } catch (Exception e) {
            // Best effort by design: a storage failure must not abort the crawl itself.
            System.out.println(e);
        }
    }
}

index.vue

<template>
  <div class="tab__content">
    <h1 class="page__title">摸魚熱榜</h1>
    <van-search
      v-model="value"
      placeholder="請輸入搜尋關鍵詞"
      @search="onSearch"
      @clear="onClear"
    />

    <!-- Category list: shown until a search has completed -->
    <div v-if="!searched">
      <div class="tab__tips">
        仿今日熱榜!,關注java專案開發,學習更多案例!
      </div>
      <div class="cells-block">
        <div>
          <div class="cells__title">全部熱榜</div>
          <div class="cells">
            <div
              v-for="(item, index) in typeList"
              :key="index"
              class="cell-row"
            >
              <div class="cell" @click="goDateils(item)">
                <div class="cell__hd">
                  <img
                    :src="item.logo"
                    :alt="item.title"
                    @error="imgError(item)"
                  />
                </div>
                <div class="cell__bd">{{ item.title }}</div>
                <div class="cell__ft">
                  <svg-icon
                    iconClass="index_right"
                    className="icon_search"
                  ></svg-icon>
                </div>
              </div>
            </div>
          </div>
        </div>
      </div>
    </div>

    <!-- Search results, or the empty state when the search matched nothing -->
    <div v-else>
      <search-list v-if="listData.length" :list="listData" />
      <van-empty v-else description="暫無相關內容!" />
    </div>
  </div>
</template>

<script>
import SvgIcon from '@/components/icon/SvgIcon';
import searchList from '@/components/searchList/list';
export default {
  components: {
    SvgIcon,
    searchList
  },
  data() {
    return {
      value: '', // search keyword
      listData: [], // search results
      searched: false, // true once a search has completed; switches the view to the results
      typeList: [], // all hot-list categories
      defaultUrl: 'https://file.ipadown.com/tophub/assets/images/logo.png' // fallback category logo
    };
  },
  computed: {},
  created() {
    this.getAllType();
  },
  mounted() {},
  methods: {
    // Load every hot-list category.
    getAllType() {
      const that = this;
      this.$api.getAllType().then(res => {
        if (res.code === 0) {
          that.typeList = res.data;
        }
      });
    },

    // Open the detail page of a category; the category travels as a JSON query parameter.
    goDateils(item) {
      this.$router.push({
        name: 'details',
        query: {
          item: JSON.stringify(item)
        }
      });
    },

    // Search entries by title across all categories.
    onSearch(e) {
      const that = this;
      let params = {
        typeName: '全部',
        size: 10000,
        secondTitle: e
      };
      this.$api.getAllInfoGzip(params).then(res => {
        if (res.code == 0) {
          const results = res.data.content;
          // Add the derived fields before handing the list to Vue, so they exist
          // on the items from the moment the list becomes reactive.
          that.handleData(results);
          that.listData = results;
          that.searched = true;
        }
      });
    },

    // Search box cleared: drop the results and return to the category list.
    onClear() {
      this.listData = [];
      this.searched = false;
    },

    // Add the display fields derived from createDate to every entry.
    handleData(data) {
      data.forEach(item => {
        const diff = this.util.getDateDiff(item.createDate / 1000);
        item.new = diff.new; // whether the entry counts as new
        item.CreateTime = diff.Time;
      });
    },

    // A logo failed to load: fall back to the default image.
    imgError(item) {
      item.logo = this.defaultUrl;
    }
  }
};
</script>

details.vue

<template>
  <div class="topic-list">
    <div class="info-top">
      <img class="info-bg" :src="details.logo" @error="imgError" alt="" />
      <div class="info-content">
        <div class="top-column">
          <p @click="$router.push('/')">摸魚熱榜</p>
        </div>
        <img class="pic-icon" :src="details.logo" @error="imgError" alt="" />
        <h1 class="info-title">{{ details.title }}</h1>
      </div>
    </div>
    <div class="divider">
      <van-pull-refresh v-model="refreshing" @refresh="onRefresh">
        <van-list
          v-model="loading"
          :finished="finished"
          @load="onLoad"
          :immediate-check="false"
        >
          <div class="panel_bd">
            <a
              v-for="(item, index) in listData"
              :key="item.id"
              :href="item.url"
              class="media-box van-hairline--bottom"
            >
              <div class="media-box__bd">
                <h4 class="media-box__title">
                  {{ index + 1 }}、{{ item.secondTitle }}
                </h4>
                <div class="dec-row">
                  <span class="tag" v-if="item.hot">
                    <span>{{ item.hot }}</span>
                  </span>
                  <span class="time">
                    <span>{{ item.CreateTime }}</span>
                  </span>
                  <span class="new" v-if="item.new">新</span>
                </div>
              </div>
            </a>
          </div>
        </van-list>
      </van-pull-refresh>
    </div>
    <div class="footer-flag flex-center" v-if="finished">
      <p class="flex-center">我是有底線的</p>
    </div>
  </div>
</template>

<script>
export default {
  data() {
    return {
      page: 1, // current page number
      refreshing: false, // pull-to-refresh in progress
      loading: false, // load-more in progress
      finished: false, // no more pages to load
      listData: [], // entries loaded so far
      details: {}, // category being shown
      defaultUrl: 'https://file.ipadown.com/tophub/assets/images/logo.png' // fallback category logo
    };
  },
  computed: {},
  created() {},
  mounted() {
    // The category is passed from the index page as a JSON string in the query.
    this.details = JSON.parse(this.$route.query.item);
    this.getList(this.details, this.page);
  },
  methods: {
    // Load one page of entries for the category and append it to the list.
    getList(item, page, loading = true) {
      const that = this;
      let list = that.listData;
      let params = {
        typeName: item.title,
        size: 50,
        page
      };
      this.$api.getAllInfoGzip(params, loading).then(res => {
        if (res.code == 0) {
          const merged = list.concat(res.data.content);
          // Add the derived fields before handing the list to Vue.
          that.handleData(merged);
          that.listData = merged;
          // Load-more finished.
          if (that.loading) {
            that.loading = false;
          }
          // Pull-to-refresh finished.
          if (that.refreshing) {
            that.refreshing = false;
          }
          // Last page reached.
          if (that.page >= res.data.totalPages) {
            that.finished = true;
          }
        }
      });
    },

    // Scrolled to the bottom: load the next page.
    onLoad() {
      this.loading = true;
      this.getList(this.details, ++this.page, false);
    },

    // Pulled down: reset the list and reload from the first page.
    onRefresh() {
      this.finished = false;
      this.loading = true;
      this.listData = [];
      this.page = 1;
      this.getList(this.details, 1, false);
    },

    // Add the display fields derived from createDate to every entry.
    handleData(data) {
      data.forEach(item => {
        const diff = this.util.getDateDiff(item.createDate / 1000);
        item.new = diff.new; // whether the entry counts as new
        item.CreateTime = diff.Time;
      });
    },

    // The category logo failed to load: fall back to the default image.
    imgError() {
      // Both <img> elements report the error; the fallback only needs applying once.
      if (this.details.logo === this.defaultUrl) {
        return;
      }
      // The template binds details.logo, so that is the field to replace. Assigning a new
      // object makes the change visible even if the category arrived without a logo field.
      this.details = Object.assign({}, this.details, { logo: this.defaultUrl });
    }
  }
};
</script>

xxx.sql

SET NAMES utf8mb4;
-- Foreign key checks are switched off while the table is dropped and recreated;
-- they are switched back on at the end of this script.
SET FOREIGN_KEY_CHECKS = 0;

-- ----------------------------
-- Table structure for node
-- One row per crawled hot-list entry, written by SpringPieline:
--   id            application-generated id, stored as a string
--   create_date   time the row was saved
--   hot           heat text shown next to the entry
--   second_title  entry title
--   title         name of the hot list the entry belongs to
--   url           link of the entry
--   logo          logo image URL of the hot list
-- NOTE: the utf8mb4_0900_ai_ci collation requires MySQL 8.0 or later.
-- ----------------------------
DROP TABLE IF EXISTS `node`;
CREATE TABLE `node` (
  `id` varchar(255) NOT NULL,
  `create_date` datetime DEFAULT NULL,
  `hot` varchar(1024) DEFAULT NULL,
  `second_title` longtext,
  `title` varchar(1024) DEFAULT NULL,
  `url` longtext,
  `logo` varchar(1024) DEFAULT NULL,
  PRIMARY KEY (`id`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_0900_ai_ci;

-- Restore foreign key checks so later statements in the same session are validated again.
SET FOREIGN_KEY_CHECKS = 1;

作者:Java開發專案
連結:https://mp.weixin.qq.com/s/z9J1gL7orSL90ngSQeRRhg
歡迎大家關注:有故事的程式設計師,每天更新Java技術知識點,還可以領取Java進階學習資料哦~
資料包含的模組分為19個模組,分別是: Java 基礎、容器、多執行緒、反射、物件拷貝、Java Web 、異常、網路、設計模式、Spring/Spring MVC、Spring Boot/Spring Cloud、Hibernate、MyBatis、RabbitMQ、Kafka、Zookeeper、MySQL、Redis、JVM 。