1. 程式人生 > Jsoup爬取CSDN部落格

Jsoup爬取CSDN部落格

個人 Jsoup 練習之作,僅供參考:

<!-- Maven dependency declaring the jsoup library (org.jsoup:jsoup, version 1.10.3),
     whose classes are imported by the Java code that follows. -->
<dependency>
    <groupId>org.jsoup</groupId>
    <artifactId>jsoup</artifactId>
    <version>1.10.3</version>
</dependency>
package CSDN;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import java.io.*;
import java.text.DateFormat;
import java.text.SimpleDateFormat;
import java.util.Date;

/**
 * Crawls the article list pages under {@link #BASE_URL} with jsoup and writes one
 * summary entry per article (sequence number, title, link, creation date, read count
 * and comment count) to a timestamped UTF-8 text file.
 *
 * @Author: BaiDing
 * @Date: 2018/9/24 19:14
 */
public class JsopDemo {

    /** Prefix of the paginated article list; the 1-based page number is appended to it. */
    private static final String BASE_URL = "https://blog.csdn.net/liujun03/article/list/";

    /** Running sequence number of the articles written so far by this instance. */
    private int articleSeq = 0;

    /**
     * Fetches {@link #BASE_URL} and derives the number of list pages from the inline
     * script that contains {@code getAllUrl}, using its {@code pageSize} and
     * {@code listTotal} assignments.
     *
     * @return the page count (rounded up), or 0 when the page cannot be fetched or the
     *         script does not yield a positive {@code pageSize}
     */
    private int getAllPageCount() {
        int count = 0;
        try {
            Document doc = Jsoup.connect(BASE_URL).get();
            for (Element script : doc.select("script")) {
                String text = script.data();
                if (!text.contains("getAllUrl")) {
                    continue;
                }
                int pageSize = 0;
                int listTotal = 0;
                for (String statement : text.split(";")) {
                    if (statement.contains("pageSize")) {
                        pageSize = parseAssignedInt(statement, pageSize);
                    }
                    if (statement.contains("listTotal")) {
                        listTotal = parseAssignedInt(statement, listTotal);
                    }
                }
                // Without a positive page size the division below would throw.
                if (pageSize > 0) {
                    count = listTotal % pageSize == 0
                            ? listTotal / pageSize
                            : listTotal / pageSize + 1;
                }
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
        return count;
    }

    /**
     * Parses the integer that follows the first {@code =} in a script statement.
     *
     * @param statement a single statement taken from the script text
     * @param fallback  value to return when the statement has no parsable integer
     * @return the parsed integer, or {@code fallback} when parsing is not possible
     */
    private static int parseAssignedInt(String statement, int fallback) {
        String[] parts = statement.split("=");
        if (parts.length < 2) {
            return fallback;
        }
        try {
            return Integer.parseInt(parts[1].trim());
        } catch (NumberFormatException e) {
            System.err.println("Skipping non-numeric assignment: " + statement.trim());
            return fallback;
        }
    }

    /**
     * Fetches one list page and appends an entry for every article whose link
     * contains {@code "liujun"}. Entries without a link or with fewer than three
     * {@code p span} elements are skipped.
     *
     * @param page 1-based page number appended to {@link #BASE_URL}
     * @param bw   writer that receives the formatted entries; not closed here
     */
    private void testJsop(int page, BufferedWriter bw) {
        String url = BASE_URL + page;
        try {
            Document doc = Jsoup.connect(url).get();
            Elements articleList = doc.select("div.article-list")
                    .select("div.csdn-tracking-statistics");
            for (Element article : articleList) {
                // first() returns null when the entry holds no anchor at all.
                Element link = article.select("a").first();
                if (link == null) {
                    continue;
                }
                String linkUrl = link.attr("href");
                if (!linkUrl.contains("liujun")) {
                    continue;
                }
                Elements num = article.select("p span");
                if (num.size() < 3) {
                    continue;
                }
                // Count only entries that are actually written, so numbering stays contiguous.
                articleSeq++;
                StringBuilder data = new StringBuilder()
                        .append(articleSeq)
                        .append(" 標題: ")
                        .append(link.text())
                        .append(" , 連結: ")
                        .append(linkUrl)
                        .append("\t\n")
                        .append("建立時間: ")
                        .append(num.get(0).text())
                        .append(" , ")
                        .append(num.get(1).text())
                        .append(" , ")
                        .append(num.get(2).text())
                        .append("\t\n");
                bw.write(data.toString());
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /**
     * Entry point: creates {@code D:/CSDN/<yyyyMMddHHmmss>.txt} (creating the directory
     * when it is missing) and writes the entries of every list page to it.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        JsopDemo jsopDemo = new JsopDemo();
        DateFormat bf = new SimpleDateFormat("yyyyMMddHHmmss");
        String fileName = bf.format(new Date()) + ".txt";
        File file = new File("D:/CSDN/" + fileName);

        // FileOutputStream creates the file itself but never its parent directory.
        File dir = file.getParentFile();
        if (dir != null && !dir.isDirectory() && !dir.mkdirs()) {
            System.err.println("Cannot create output directory: " + dir);
            return;
        }

        // try-with-resources closes all three streams even when crawling fails midway.
        try (FileOutputStream fos = new FileOutputStream(file);
             OutputStreamWriter osw = new OutputStreamWriter(fos, "UTF-8");
             BufferedWriter bw = new BufferedWriter(osw)) {
            int count = jsopDemo.getAllPageCount();
            for (int i = 1; i <= count; i++) {
                jsopDemo.testJsop(i, bw);
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}