Jsoup爬取CSDN部落格
阿新 • • 發佈:2018-11-25
個人 Jsoup 練習之作,僅供參考。以下依序為 Maven 依賴與完整程式碼:
<!-- Maven dependency declaration for the jsoup library (org.jsoup:jsoup, version 1.10.3),
     which provides the org.jsoup.* classes imported by the Java code below. -->
<dependency>
<groupId>org.jsoup</groupId>
<artifactId>jsoup</artifactId>
<version>1.10.3</version>
</dependency>
package CSDN;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import java.io.*;
import java.text.DateFormat;
import java.text.SimpleDateFormat;
import java.util.Date;
/**
 * Crawls the article list pages of one CSDN blog with jsoup and writes a short
 * summary of every matching article to a timestamped UTF-8 text file.
 *
 * <p>The number of list pages is derived from the {@code pageSize} and
 * {@code listTotal} values found in an inline script of the first list page.
 *
 * @Author: BaiDing
 * @Date: 2018/9/24 19:14
 */
public class JsopDemo {

    /** Prefix of every list page URL; the page number is appended to it. */
    private static final String BASE_URL = "https://blog.csdn.net/liujun03/article/list/";

    /** Running sequence number of the articles written so far (first article is 1). */
    private static int ARTICLE_SORT = 0;

    /**
     * Fetches {@link #BASE_URL} and computes how many list pages exist.
     *
     * <p>Every {@code <script>} element whose content mentions {@code getAllUrl} is split
     * on {@code ';'}; statements mentioning {@code pageSize} / {@code listTotal} supply the
     * two numbers, and the page count is {@code listTotal / pageSize} rounded up.
     *
     * @return the page count, or 0 when the page cannot be fetched or no usable
     *         (positive) {@code pageSize} is found
     */
    private int getAllPageCount() {
        int count = 0;
        try {
            Document doc = Jsoup.connect(BASE_URL).get();
            for (Element script : doc.select("script")) {
                String text = script.data();
                if (!text.contains("getAllUrl")) {
                    continue;
                }
                int pageSize = 0;
                int listTotal = 0;
                for (String statement : text.split(";")) {
                    if (statement.contains("pageSize")) {
                        pageSize = parseAssignedInt(statement, pageSize);
                    }
                    if (statement.contains("listTotal")) {
                        listTotal = parseAssignedInt(statement, listTotal);
                    }
                }
                // Guard against a missing or non-positive pageSize: dividing by it would
                // throw ArithmeticException (or produce a meaningless count).
                if (pageSize > 0) {
                    count = listTotal % pageSize == 0
                            ? listTotal / pageSize
                            : listTotal / pageSize + 1;
                }
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
        return count;
    }

    /**
     * Parses the integer on the right-hand side of a {@code name = value} statement.
     *
     * @param statement one script statement, already known to mention the wanted name
     * @param fallback  value to return when the statement is not a plain integer assignment
     * @return the text between the first and second {@code '='}, trimmed and parsed as an
     *         int, or {@code fallback} when there is no {@code '='} or the text is not an int
     */
    private static int parseAssignedInt(String statement, int fallback) {
        String[] parts = statement.split("=");
        if (parts.length < 2) {
            return fallback;
        }
        try {
            return Integer.parseInt(parts[1].trim());
        } catch (NumberFormatException ignored) {
            // The name also appears in statements that are not simple integer
            // assignments; those must not abort the whole crawl.
            return fallback;
        }
    }

    /**
     * Fetches one list page and writes a two-line summary for each article on it.
     *
     * <p>Only entries whose first link contains {@code "liujun"} are written. Entries
     * without a link, or with fewer than three {@code p span} elements, are skipped so
     * that one malformed entry does not abort the run. I/O failures are printed and the
     * page is abandoned.
     *
     * @param page number appended to {@link #BASE_URL} to form the page URL
     * @param bw   writer that receives the summaries; it is not closed here
     */
    private void testJsop(int page, BufferedWriter bw) {
        String url = BASE_URL + page;
        try {
            Document doc = Jsoup.connect(url).get();
            Elements articleList = doc.select("div.article-list")
                    .select("div.csdn-tracking-statistics");
            for (Element article : articleList) {
                // Elements.first() yields null when the entry contains no anchor.
                Element link = article.select("a").first();
                if (link == null) {
                    continue;
                }
                String linkUrl = link.attr("href");
                if (!linkUrl.contains("liujun")) {
                    continue;
                }
                Elements num = article.select("p span");
                if (num.size() < 3) {
                    continue;
                }
                // Count only entries that are actually written, keeping numbering gap-free.
                ARTICLE_SORT++;
                String linkTitle = link.text();
                String date = num.get(0).text();
                String readNum = num.get(1).text();
                String reviewNum = num.get(2).text();
                StringBuilder data = new StringBuilder()
                        .append(ARTICLE_SORT)
                        .append(" 標題: ")
                        .append(linkTitle)
                        .append(" , 連結: ")
                        .append(linkUrl)
                        .append("\t\n")
                        .append("建立時間: ")
                        .append(date)
                        .append(" , ")
                        .append(readNum)
                        .append(" , ")
                        .append(reviewNum)
                        .append("\t\n");
                bw.write(data.toString());
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /**
     * Entry point: creates {@code D:/CSDN/<yyyyMMddHHmmss>.txt}, crawls every list page
     * and writes the article summaries to that file.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        JsopDemo jsopDemo = new JsopDemo();
        try {
            DateFormat bf = new SimpleDateFormat("yyyyMMddHHmmss");
            String fileName = bf.format(new Date()) + ".txt";
            File file = new File("D:/CSDN/" + fileName);

            // Opening the file fails when the target directory does not exist yet.
            File parent = file.getParentFile();
            if (parent != null && !parent.isDirectory() && !parent.mkdirs()) {
                throw new IOException("Cannot create output directory: " + parent);
            }

            // try-with-resources closes (and flushes) the writers even when the crawl throws.
            try (FileOutputStream fos = new FileOutputStream(file);
                 OutputStreamWriter osw = new OutputStreamWriter(fos, "UTF-8");
                 BufferedWriter bw = new BufferedWriter(osw)) {
                int count = jsopDemo.getAllPageCount();
                for (int i = 1; i <= count; i++) {
                    jsopDemo.testJsop(i, bw);
                }
            }
        } catch (Exception e) {
            // Top-level boundary: report any failure instead of letting it escape main.
            e.printStackTrace();
        }
    }
}