spring boot+vue實現爬取各大平臺每日熱榜資料功能
阿新 • • 發佈:2020-12-23
案例功能效果圖
爬取資料的平臺頁面
這個案例能爬取的平臺太多了,我沒有全部截圖出來,想看的你們自己下載原始碼自己跑起來!
爬取的熱榜資料效果圖
環境介紹
前端:vue+h5
後端:springboot+webMagic
jdk:1.8及以上
資料庫:mysql
完整原始碼獲取方式
原始碼獲取方式:點選這裡,暗號部落格園!
核心程式碼介紹
pom.xml
<!-- WebMagic crawler core: https://mvnrepository.com/artifact/us.codecraft/webmagic-core -->
<dependency>
    <groupId>us.codecraft</groupId>
    <artifactId>webmagic-core</artifactId>
    <version>0.7.3</version>
    <exclusions>
        <!-- NOTE(review): presumably excluded to avoid a second SLF4J binding on the classpath; confirm. -->
        <exclusion>
            <groupId>org.slf4j</groupId>
            <artifactId>slf4j-log4j12</artifactId>
        </exclusion>
    </exclusions>
</dependency>

<!-- WebMagic extensions: https://mvnrepository.com/artifact/us.codecraft/webmagic-extension -->
<dependency>
    <groupId>us.codecraft</groupId>
    <artifactId>webmagic-extension</artifactId>
    <version>0.7.3</version>
</dependency>

<!-- Guava: https://mvnrepository.com/artifact/com.google.guava/guava -->
<dependency>
    <groupId>com.google.guava</groupId>
    <artifactId>guava</artifactId>
    <version>18.0</version>
</dependency>

<!-- Commons Lang: https://mvnrepository.com/artifact/org.apache.commons/commons-lang3 -->
<dependency>
    <groupId>org.apache.commons</groupId>
    <artifactId>commons-lang3</artifactId>
    <version>3.4</version>
</dependency>

<!-- Commons IO: https://mvnrepository.com/artifact/commons-io/commons-io -->
<dependency>
    <groupId>commons-io</groupId>
    <artifactId>commons-io</artifactId>
    <version>2.4</version>
</dependency>

<!-- Lombok (boilerplate reduction): https://mvnrepository.com/artifact/org.projectlombok/lombok -->
<dependency>
    <groupId>org.projectlombok</groupId>
    <artifactId>lombok</artifactId>
    <version>1.18.8</version>
    <scope>provided</scope>
</dependency>

<!-- JUnit: https://mvnrepository.com/artifact/junit/junit -->
<dependency>
    <groupId>junit</groupId>
    <artifactId>junit</artifactId>
    <version>4.12</version>
    <scope>test</scope>
</dependency>

<!-- Swagger 2 (API description and UI) -->
<dependency>
    <groupId>io.springfox</groupId>
    <artifactId>springfox-swagger2</artifactId>
    <version>2.9.1</version>
</dependency>
<dependency>
    <groupId>io.springfox</groupId>
    <artifactId>springfox-swagger-ui</artifactId>
    <version>2.9.1</version>
</dependency>
application.yml
# Port the embedded web server listens on.
server:
  port: 9004

spring:
  jackson:
    serialization:
      # Dates are written as numeric timestamps; the Vue pages divide createDate by 1000.
      write-dates-as-timestamps: true
  datasource:
    driverClassName: com.mysql.cj.jdbc.Driver
    url: jdbc:mysql://feimeidehuoji:3306/feimeidehuoji?autoReconnect=true&useUnicode=true&characterEncoding=UTF-8&useSSL=false&useLegacyDatetimeCode=false&serverTimezone=UTC
    # Placeholder credentials from the article; replace them for a real deployment.
    username: feimeidehuoji
    password: feimeidehuoji
  jpa:
    database: MySQL
    show-sql: true
    hibernate:
      ddl-auto: update
    database-platform: org.hibernate.dialect.MySQL5InnoDBDialect

# Custom top-level properties injected into NodeController through @Value.
# Start page handed to the crawler.
spiderUrl: https://tophub.today
# HTTP proxy (host and port) the crawler downloads through.
proxyUrl: 61.160.210.234
proxyPort: 808
NodeController.java
package cn.cesi.webMagic.webMagic;

import cn.cesi.webMagic.pieline.SpringPieline;
import cn.cesi.webMagic.pojo.Node;
import cn.cesi.webMagic.service.NodeService;
import cn.cesi.webMagic.util.Result;
import cn.cesi.webMagic.util.StatusCode;
import io.swagger.annotations.Api;
import io.swagger.annotations.ApiOperation;
import io.swagger.annotations.ApiParam;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.beans.factory.annotation.Value;
import org.springframework.data.domain.Page;
import org.springframework.scheduling.annotation.Scheduled;
import org.springframework.web.bind.annotation.CrossOrigin;
import org.springframework.web.bind.annotation.RequestMapping;
import org.springframework.web.bind.annotation.RequestParam;
import org.springframework.web.bind.annotation.RestController;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.downloader.HttpClientDownloader;
import us.codecraft.webmagic.proxy.Proxy;
import us.codecraft.webmagic.proxy.SimpleProxyProvider;
import us.codecraft.webmagic.scheduler.BloomFilterDuplicateRemover;
import us.codecraft.webmagic.scheduler.QueueScheduler;

import javax.annotation.Resource;
import java.util.List;
import java.util.Map;

/**
 * REST endpoints under {@code /node} that expose the stored hot-list rows,
 * plus the scheduled job that crawls the configured start page.
 *
 * <p>NOTE(review): the Swagger tag below reads "user login interface", which
 * looks copied from another controller; it is left unchanged here.
 */
@RestController
@CrossOrigin
@RequestMapping("/node")
@Api(value = "獲取資料介面",tags={"使用者登入介面"})
public class NodeController {

    /** Pause between the end of one crawl and the start of the next: 8 minutes, in milliseconds. */
    private static final long CRAWL_DELAY_MS = 8 * 60 * 1000L;

    /** Expected number of distinct URLs handed to the Bloom-filter duplicate remover. */
    private static final int EXPECTED_URLS = 100_000;

    /** Start page of the crawl, read from the {@code spiderUrl} property. */
    @Value("${spiderUrl}")
    private String url;

    /** Proxy host, read from the {@code proxyUrl} property. */
    @Value("${proxyUrl}")
    private String proxyUrl;

    /** Proxy port, read from the {@code proxyPort} property. */
    @Value("${proxyPort}")
    private Integer proxyPort;

    @Resource
    NodeService nodeService;

    @Autowired
    SpringPieline springPieline;

    /**
     * Searches stored rows by delegating to {@code NodeService.searchData}.
     *
     * @param typeName    category name filter (optional)
     * @param secondTitle second-level title filter (optional)
     * @param page        requested page (optional)
     * @param size        rows per page (optional)
     * @return a success {@link Result} carrying the page returned by the service
     */
    @RequestMapping("")
    @ApiOperation(value = "查詢資料介面")
    public Result getData(
            @ApiParam(value = "分類名稱", required = false) String typeName,
            @ApiParam(value = "二級標題", required = false) String secondTitle,
            @ApiParam(value = "當前頁", required = false) Integer page,
            @ApiParam(value = "每頁資料條數", required = false) Integer size) {
        Page<Node> nodes = nodeService.searchData(typeName, secondTitle, page, size);
        return success(nodes);
    }

    /**
     * Lists all categories as returned by {@code NodeService.findType}.
     *
     * @return a success {@link Result} carrying the category list
     */
    @RequestMapping("/getType")
    @ApiOperation(value = "查詢全部分類列表")
    public Result getData() {
        List<Map<String, String>> list = nodeService.findType();
        return success(list);
    }

    /**
     * Runs one crawl of the configured start page through the configured
     * proxy and hands the extracted data to {@link SpringPieline}.
     *
     * <p>With {@code fixedDelay} the next run starts 8 minutes after the
     * previous one has finished; {@code Spider.run()} is called synchronously.
     */
    @Scheduled(fixedDelay = CRAWL_DELAY_MS)
    public void tasks() {
        System.out.println("定時任務開始——————————————————————————————————");

        // Route all downloads through the configured proxy server.
        HttpClientDownloader downloader = new HttpClientDownloader();
        downloader.setProxyProvider(SimpleProxyProvider.from(new Proxy(proxyUrl, proxyPort)));

        Spider.create(new WebProcess())
                .addUrl(url)
                .setDownloader(downloader)
                .thread(2) // number of crawler threads
                .addPipeline(springPieline) // pipeline that receives the extracted data
                .setScheduler(new QueueScheduler()
                        .setDuplicateRemover(new BloomFilterDuplicateRemover(EXPECTED_URLS)))
                .run();

        System.out.println("定時任務結束——————————————————————————————————");
    }

    /** Builds the success envelope shared by both query endpoints. */
    private Result success(Object data) {
        Result result = new Result();
        result.setFlag(true);
        result.setCode(StatusCode.OK);
        result.setMsg("查詢成功!");
        result.setData(data);
        return result;
    }
}
WebProcess.java
package cn.cesi.webMagic.webMagic;
import cn.cesi.webMagic.pieline.SpringPieline;
import cn.cesi.webMagic.util.NodeEntity;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.beans.factory.annotation.Value;
import org.springframework.scheduling.annotation.Scheduled;
import org.springframework.stereotype.Component;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.downloader.HttpClientDownloader;
import us.codecraft.webmagic.processor.PageProcessor;
import us.codecraft.webmagic.proxy.Proxy;
import us.codecraft.webmagic.proxy.SimpleProxyProvider;
import us.codecraft.webmagic.scheduler.BloomFilterDuplicateRemover;
import us.codecraft.webmagic.scheduler.QueueScheduler;
import us.codecraft.webmagic.selector.Selectable;
import org.jsoup.select.Elements;
import java.util.*;
/**
 * WebMagic page processor for the hot-list start page.
 *
 * <p>For every element matched by {@code div.bc div#Sortable div.cc-cd div}
 * it extracts a board title, the board logo URL and the board's entries
 * (link, second-level title, hotness), and publishes the collected
 * {@link NodeEntity} list on the page under the field name {@code "nodes"}.
 */
@Component
public class WebProcess implements PageProcessor {

    /** Crawl settings returned by {@link #getSite()}. */
    private final Site site = Site.me()
            // NOTE(review): WebMagic documents this value in milliseconds, so 2 is
            // practically no throttling; confirm whether 2000 was intended.
            .setSleepTime(2)
            .setRetryTimes(3) // number of retries
            .setRetrySleepTime(10000) // wait between retries
            .setTimeOut(60000) // download timeout: 1000 * 60
            .setCharset("utf8");

    @Override
    public void process(Page page) {
        List<Selectable> boards = page.getHtml().css("div.bc div#Sortable div.cc-cd div").nodes();
        List<NodeEntity> nodes = new ArrayList<>();
        for (Selectable board : boards) {
            NodeEntity node = new NodeEntity();
            node.setTitle(textOf(board.css("div.cc-cd-ih div a div span")));
            node.setLogo(logoOf(board));
            node.setMaps(entriesOf(board));
            nodes.add(node);
        }
        // Attach the extracted data to the page so the pipeline can read it.
        page.putField("nodes", nodes);
    }

    @Override
    public Site getSite() {
        return site;
    }

    /** Collects link, second-level title and hotness for every entry anchor of a board. */
    private static List<Map<String, String>> entriesOf(Selectable board) {
        List<Map<String, String>> entries = new ArrayList<>();
        for (Selectable anchor : board.css("div.cc-cd-cb div a").nodes()) {
            Map<String, String> entry = new HashMap<>();
            entry.put("url", anchor.links().toString());
            entry.put("secondTitle", textOf(anchor.css("div span.t")));
            entry.put("hot", textOf(anchor.css("div span.e")));
            entries.add(entry);
            // Entry links are recorded only; they are not queued for crawling.
        }
        return entries;
    }

    /** Returns the {@code src} of the board's logo image, or an empty string when there is none. */
    private static String logoOf(Selectable board) {
        String imgHtml = board.css("div.cc-cd-ih div a div img").toString();
        if (imgHtml == null) {
            return "";
        }
        Document document = Jsoup.parse(imgHtml);
        Elements imgTags = document.select("img[src]");
        return imgTags.attr("src");
    }

    /**
     * Returns the plain text of a selection, or an empty string when it matched nothing.
     *
     * <p>The null check is on the extracted HTML string, not on the
     * {@link Selectable}: an empty selection is still a non-null object, so
     * checking the object does not protect the {@code Jsoup.parse} call.
     */
    private static String textOf(Selectable selection) {
        String html = selection.toString();
        return html == null ? "" : Jsoup.parse(html).text();
    }
}
SpringPieline.java
package cn.cesi.webMagic.pieline;
import cn.cesi.webMagic.pojo.Node;
import cn.cesi.webMagic.service.NodeService;
import cn.cesi.webMagic.util.IdWorker;
import cn.cesi.webMagic.util.NodeEntity;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.stereotype.Component;
import us.codecraft.webmagic.ResultItems;
import us.codecraft.webmagic.Task;
import us.codecraft.webmagic.pipeline.Pipeline;
import java.util.*;
/**
 * WebMagic pipeline that stores crawled hot-list entries in the database.
 *
 * <p>Reads the {@code "nodes"} field from the result items and saves one
 * {@link Node} per entry whose board title and second-level title are both
 * non-empty and for which {@code findByTitleAndSecondTitle} finds no
 * existing row.
 */
@Component
public class SpringPieline implements Pipeline {
    @Autowired
    NodeService nodeService;
    @Autowired
    IdWorker idWorker;

    @Override
    public void process(ResultItems resultItems, Task task) {
        List<NodeEntity> nodes = resultItems.get("nodes");
        if (nodes == null) {
            // The processor published nothing for this page.
            return;
        }
        for (NodeEntity entity : nodes) {
            String title = entity.getTitle();
            List<Map<String, String>> entries = entity.getMaps();
            if (title == null || title.equals("") || entries == null) {
                continue;
            }
            for (Map<String, String> entry : entries) {
                // Best effort per row: one failing row must not discard the rest of the batch.
                try {
                    saveIfAbsent(title, entity.getLogo(), entry);
                } catch (Exception e) {
                    System.err.println("Failed to save hot-list entry for " + title + ": " + e);
                }
            }
        }
    }

    /** Saves a single entry unless its second-level title is empty or the row already exists. */
    private void saveIfAbsent(String title, String logo, Map<String, String> entry) {
        String rawSecondTitle = entry.get("secondTitle");
        String secondTitle = rawSecondTitle == null ? "" : rawSecondTitle.trim();
        System.out.println(secondTitle);
        if (secondTitle.equals("")) {
            return;
        }
        List<Node> existing = nodeService.findByTitleAndSecondTitle(title, secondTitle);
        if (!existing.isEmpty()) {
            return;
        }
        // A fresh entity per row; an id is only generated for rows that are actually saved.
        Node node = new Node();
        node.setId(idWorker.nextId() + "");
        node.setTitle(title);
        node.setLogo(logo);
        node.setSecondTitle(secondTitle);
        node.setUrl(entry.get("url"));
        node.setCreateDate(new Date());
        node.setHot(entry.get("hot"));
        nodeService.save(node);
    }
}
index.vue
<template>
  <div class="tab__content">
    <h1 class="page__title">摸魚熱榜</h1>
    <van-search
      v-model="value"
      placeholder="請輸入搜尋關鍵詞"
      @search="onSearch"
      @clear="onClear"
    />
    <!-- Category list: shown until a search has completed -->
    <div v-if="!searched">
      <div class="tab__tips">
        仿今日熱榜!,關注java專案開發,學習更多案例!
      </div>
      <div class="cells-block">
        <div>
          <div class="cells__title">全部熱榜</div>
          <div class="cells">
            <div
              v-for="(item, index) in typeList"
              :key="index"
              class="cell-row"
            >
              <div class="cell" @click="goDateils(item)">
                <div class="cell__hd">
                  <img
                    :src="item.logo"
                    :alt="item.title"
                    @error="imgError(item)"
                  />
                </div>
                <div class="cell__bd">{{ item.title }}</div>
                <div class="cell__ft">
                  <svg-icon
                    iconClass="index_right"
                    className="icon_search"
                  ></svg-icon>
                </div>
              </div>
            </div>
          </div>
        </div>
      </div>
    </div>
    <!-- Search results: the list, or the empty state when nothing matched -->
    <div v-if="searched">
      <search-list v-if="listData.length" :list="listData" />
      <van-empty v-else description="暫無相關內容!" />
    </div>
  </div>
</template>
<script>
import SvgIcon from '@/components/icon/SvgIcon';
import searchList from '@/components/searchList/list';

// Home page: lists every hot-list category and searches across all of them.
export default {
  components: {
    SvgIcon,
    searchList
  },
  data() {
    return {
      value: '', // search keyword
      listData: [], // search results
      searched: false, // true once a search has returned; switches list <-> results
      typeList: [], // all hot-list categories
      defaultUrl: 'https://file.ipadown.com/tophub/assets/images/logo.png' // fallback category image
    };
  },
  computed: {},
  created() {
    this.getAllType();
  },
  mounted() {},
  methods: {
    // Load all hot-list categories.
    getAllType() {
      this.$api.getAllType().then(res => {
        if (res.code === 0) {
          this.typeList = res.data;
        }
      });
    },
    // Open the details page of a category; the item travels as a JSON query parameter.
    goDateils(item) {
      this.$router.push({
        name: 'details',
        query: {
          item: JSON.stringify(item)
        }
      });
    },
    // Search every category for the keyword.
    onSearch(e) {
      const params = {
        typeName: '全部',
        size: 10000,
        secondTitle: e
      };
      this.$api.getAllInfoGzip(params).then(res => {
        if (res.code == 0) {
          const rows = res.data.content;
          // Add the display fields before the rows are handed to Vue.
          this.handleData(rows);
          this.listData = rows;
          this.searched = true;
        }
      });
    },
    // Search box cleared: go back to the category list.
    onClear() {
      this.listData = [];
      this.searched = false;
    },
    // Derive the "new" flag and the display time from createDate.
    handleData(data) {
      data.forEach(item => {
        const diff = this.util.getDateDiff(item.createDate / 1000);
        item.new = diff.new; // whether the entry counts as new
        item.CreateTime = diff.Time;
      });
    },
    // Image failed to load: fall back to the default image.
    imgError(item) {
      item.logo = this.defaultUrl;
    }
  }
};
</script>
details.vue
<template>
  <div class="topic-list">
    <div class="info-top">
      <img class="info-bg" :src="details.logo" @error="imgError" alt="" />
      <div class="info-content">
        <div class="top-column">
          <p @click="$router.push('/')">摸魚熱榜</p>
        </div>
        <img class="pic-icon" :src="details.logo" @error="imgError" alt="" />
        <h1 class="info-title">{{ details.title }}</h1>
      </div>
    </div>
    <div class="divider">
      <van-pull-refresh v-model="refreshing" @refresh="onRefresh">
        <van-list
          v-model="loading"
          :finished="finished"
          @load="onLoad"
          :immediate-check="false"
        >
          <div class="panel_bd">
            <a
              v-for="(item, index) in listData"
              :key="item.id"
              :href="item.url"
              class="media-box van-hairline--bottom"
            >
              <div class="media-box__bd">
                <h4 class="media-box__title">
                  {{ index + 1 }}、{{ item.secondTitle }}
                </h4>
                <div class="dec-row">
                  <span class="tag" v-if="item.hot">
                    <span>{{ item.hot }}</span>
                  </span>
                  <span class="time">
                    <span>{{ item.CreateTime }}</span>
                  </span>
                  <span class="new" v-if="item.new">新</span>
                </div>
              </div>
            </a>
          </div>
        </van-list>
      </van-pull-refresh>
    </div>
    <div class="footer-flag flex-center" v-if="finished">
      <p class="flex-center">我是有底線的</p>
    </div>
  </div>
</template>
<script>
// Details page: paged list of the entries of one hot-list category.
export default {
  data() {
    return {
      page: 1, // current page
      refreshing: false, // pull-to-refresh in progress
      loading: false, // load-more in progress
      finished: false, // no more data to load
      listData: [], // entries shown in the list
      details: {}, // category passed in through the route query
      defaultUrl: 'https://file.ipadown.com/tophub/assets/images/logo.png' // fallback category image
    };
  },
  computed: {},
  created() {},
  mounted() {
    // The category arrives as a JSON string; without it the page has nothing to show.
    let details = null;
    try {
      details = JSON.parse(this.$route.query.item);
    } catch (e) {
      details = null;
    }
    if (!details) {
      this.$router.replace('/');
      return;
    }
    this.details = details;
    this.getList(this.details, this.page);
  },
  methods: {
    // Fetch one page of the category and append it to the list.
    getList(item, page, loading = true) {
      const previous = this.listData;
      const params = {
        typeName: item.title,
        size: 50,
        page
      };
      this.$api
        .getAllInfoGzip(params, loading)
        .then(res => {
          if (res.code != 0) {
            this.stopOnFailure();
            return;
          }
          const merged = previous.concat(res.data.content);
          // Add the display fields before the rows are handed to Vue.
          this.handleData(merged);
          this.listData = merged;
          // End the load-more and pull-to-refresh states.
          this.loading = false;
          this.refreshing = false;
          // No more pages.
          if (this.page >= res.data.totalPages) {
            this.finished = true;
          }
        })
        .catch(err => {
          console.error(err);
          this.stopOnFailure();
        });
    },
    // Request failed: release both spinners. `finished` is set so the list
    // does not immediately fire another load; pull-to-refresh resets it.
    stopOnFailure() {
      this.loading = false;
      this.refreshing = false;
      this.finished = true;
    },
    // Load more (scrolled to the bottom).
    onLoad() {
      this.loading = true;
      this.getList(this.details, ++this.page, false);
    },
    // Pull to refresh: reset the state and reload from the first page.
    onRefresh() {
      this.finished = false;
      this.loading = true;
      this.listData = [];
      this.page = 1;
      this.getList(this.details, 1, false);
    },
    // Derive the "new" flag and the display time from createDate.
    handleData(data) {
      data.forEach(item => {
        const diff = this.util.getDateDiff(item.createDate / 1000);
        item.new = diff.new; // whether the entry counts as new
        item.CreateTime = diff.Time;
      });
    },
    // Image failed to load: fall back to the default image.
    // The template binds `details.logo`, so that is the property to replace.
    imgError() {
      if (this.details.logo === this.defaultUrl) {
        return;
      }
      this.details = Object.assign({}, this.details, { logo: this.defaultUrl });
    }
  }
};
</script>
xxx.sql
SET NAMES utf8mb4;
-- Foreign-key checks are switched off for the import and restored at the end of the script.
SET FOREIGN_KEY_CHECKS = 0;

-- ----------------------------
-- Table structure for node
-- One row per crawled hot-list entry.
-- ----------------------------
DROP TABLE IF EXISTS `node`;
CREATE TABLE `node` (
  `id` varchar(255) NOT NULL,
  `create_date` datetime DEFAULT NULL,
  `hot` varchar(1024) DEFAULT NULL,
  `second_title` longtext,
  `title` varchar(1024) DEFAULT NULL,
  `url` longtext,
  `logo` varchar(1024) DEFAULT NULL,
  PRIMARY KEY (`id`),
  -- Prefix index for the (title, second_title) existence check the crawler runs before every insert.
  KEY `idx_node_title_second_title` (`title`(64), `second_title`(128))
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_0900_ai_ci;

SET FOREIGN_KEY_CHECKS = 1;
作者:Java開發專案
連結:https://mp.weixin.qq.com/s/z9J1gL7orSL90ngSQeRRhg
歡迎大家關注:有故事的程式設計師,每天更新Java技術知識點,還可以領取Java進階學習資料哦~
資料包含的模組分為19個模組,分別是: Java 基礎、容器、多執行緒、反射、物件拷貝、Java Web 、異常、網路、設計模式、Spring/Spring MVC、Spring Boot/Spring Cloud、Hibernate、MyBatis、RabbitMQ、Kafka、Zookeeper、MySQL、Redis、JVM 。