Java 爬蟲+遞迴分頁爬取
需要的 jar:
/**
* jsoup-1.11.3.jar
* jsoup-1.11.3-javadoc.jar
* jsoup-1.11.3-sources.jar
* jsoup-proxy.jar
*/
package com.service;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import com.jdbc.MySQLDemo;
/**
 * Crawler for the question-and-answer listings of www.xinli001.com, built on jsoup.
 *
 * <p>For every configured category the crawler walks the listing pages, opens each question,
 * turns every answer into one INSERT statement and finally hands the statements of that
 * category to {@link MySQLDemo#insertMethod(List)}.
 */
public class JsoupTest {
    // Number of the listing page being crawled; main resets it for every category.
    static int n = 0;
    // Category currently being crawled; it doubles as the name of the target table.
    static String str = "";
    // INSERT statements gathered for the category currently being crawled.
    static List<String> sqlList = new ArrayList<>();
    // Pause after each question request, in milliseconds.
    private static long time = 1000L;

    // User agent sent with every request (an Internet Explorer 9 identification string).
    private static final String USER_AGENT =
            "Mozilla/4.0 (compatible; MSIE 9.0; Windows NT 6.1;Trident/5.0)";
    // Listing page every category starts from; matches the "/p71" suffix of the URLs in main.
    private static final int START_PAGE = 71;
    // A run stops after scraping a page whose number is a multiple of this value.
    private static final int PAGE_BATCH = 10;
    // Categories that have a table to insert into.
    private static final String[] CATEGORIES = {"health", "science", "children"};

    /**
     * Crawls every configured category and stores its answers.
     *
     * @param args unused
     * @throws IOException if a page cannot be fetched
     * @throws InterruptedException if the pause between requests is interrupted
     */
    public static void main(String[] args) throws IOException, InterruptedException {
        // Listing URLs to crawl. The site also has the categories emotion, love, behavior,
        // marriage, career, communication and self; they are not part of this run.
        List<String> listURL = new ArrayList<String>();
        listURL.add("http://www.xinli001.com/qa/health/p71"); // mental health
        listURL.add("http://www.xinli001.com/qa/science/p71"); // popular science
        listURL.add("http://www.xinli001.com/qa/children/p71"); // parents and children

        for (String strURL : listURL) {
            n = START_PAGE;
            // An unknown category yields "" instead of silently reusing the previous one.
            str = categoryOf(strURL);
            // Start from an empty batch, otherwise the rows of the categories crawled
            // earlier would be inserted again together with this category's rows.
            sqlList.clear();
            first_Category(strURL);
            MySQLDemo.insertMethod(sqlList);
        }
    }

    /**
     * Crawls the listing pages of one category, starting at {@code strURL}.
     *
     * <p>Pages are scraped one after another until a page has no question list or until the
     * page whose number ({@link #n}) is a multiple of {@value #PAGE_BATCH} has been scraped.
     * The stop check runs after scraping, so the page reported as finished really has been
     * processed.
     *
     * @param strURL address of the first listing page of the category
     * @throws IOException if a page cannot be fetched
     * @throws InterruptedException if the pause between requests is interrupted
     */
    private static void first_Category(String strURL) throws IOException, InterruptedException {
        String baseURL = stripPageSuffix(strURL);
        String pageURL = strURL;
        while (true) {
            Document document = Jsoup.connect(pageURL).userAgent(USER_AGENT).get();
            Elements select = document.select("div.ask_lists");
            if ("".equals(select.text().trim())) {
                System.out.println(pageURL + "--->>>isPageEnd......");
                return;
            }
            logInfo(pageURL, "開始");
            int i = 0;
            for (Element element : document.getElementsByClass("ask_show")) {
                Elements links = element.getElementsByTag("a");
                if (links.isEmpty()) {
                    // Entry without a link: nothing to follow.
                    continue;
                }
                // absUrl resolves relative links against the page address; it returns ""
                // when no absolute URL can be built.
                String linkHref = links.get(0).absUrl("href");
                if (linkHref.isEmpty()) {
                    continue;
                }
                second_Category(linkHref, i + 1);
                // Pause after every question so the site is not flooded with requests.
                Thread.sleep(time);
                i++;
            }
            System.out.println("數量為:【" + i + "】個");
            logInfo(pageURL, "結束");
            if (n % PAGE_BATCH == 0) {
                System.out.println("當前頁數為:"+n+",已爬完,下次從"+(n+1)+"頁開始爬++++++++++");
                return;
            }
            pageURL = baseURL + "/p" + (++n);
        }
    }

    /**
     * Crawls one question page and queues one INSERT statement per answer in {@link #sqlList}.
     *
     * @param linkHref address of the question page
     * @param num position of the question on its listing page, used for log output only
     * @throws IOException if the page cannot be fetched
     */
    private static void second_Category(String linkHref, int num) throws IOException {
        String table = str;
        // The table name is concatenated into the SQL text, so only known names are accepted.
        if (!isSupportedCategory(table)) {
            System.out.println("程式執行錯誤********************");
            return;
        }
        Document document1 = Jsoup.connect(linkHref).userAgent(USER_AGENT).get();
        // Question title.
        String titleText = document1.select("p.title").select("span").text();
        System.out.println("===============【標題" + num + "】===============" + titleText);
        titleText = stripForSql(titleText);
        // Question body.
        String text = stripForSql(document1.select("p.text").text());
        System.out.println("【詳細問題" + num + "】" + text);
        // Answers.
        for (Element answerList : document1.select("ul.content-ans")) {
            int a = 1;
            for (Element item : answerList.select("li")) {
                String username = item.select("span.username").text();
                String answer = item.select("p.text").text();
                System.out.println("【回答者" + a + "】" + username);
                System.out.println("【回答者" + a + "答覆】" + answer);
                // NOTE(review): values are still concatenated into the statement because
                // MySQLDemo.insertMethod only accepts finished SQL strings; a
                // PreparedStatement-based API would be the safer design.
                sqlList.add("INSERT INTO " + table
                        + "(title,detail,answerUserName,answerMessage) VALUES('" + titleText + "','"
                        + text + "','" + stripForSql(username) + "','" + stripForSql(answer) + "')");
                a++;
            }
        }
    }

    /**
     * Prints a banner marking the start or the end of a listing page.
     *
     * @param strURL address of the listing page; the category label is derived from it
     * @param flag text placed after the label in the banner
     */
    private static void logInfo(String strURL, String flag) {
        // Checked in this order; the first keyword contained in the URL wins.
        String[][] labels = {
            {"emotion", "情緒"},
            {"love", "戀愛"},
            {"behavior", "行為"},
            {"marriage", "婚姻"},
            {"career", "職業"},
            {"communication", "人際"},
            {"self", "成長"},
            {"health", "心理健康"},
            {"science", "科普"},
            {"children", "父母子女"},
        };
        String name = "";
        for (String[] label : labels) {
            if (strURL.contains(label[0])) {
                name = label[1];
                break;
            }
        }
        System.out.println("+++++++++++++++++++++++++++++【" + name + "】爬" + flag + ",地址【" + strURL
                + "】+++++++++++++++++++++++++++++");
    }

    /** Returns the first supported category contained in {@code url}, or "" if there is none. */
    private static String categoryOf(String url) {
        for (String category : CATEGORIES) {
            if (url.contains(category)) {
                return category;
            }
        }
        return "";
    }

    /** Tells whether {@code category} is one of the supported table names. */
    private static boolean isSupportedCategory(String category) {
        for (String known : CATEGORIES) {
            if (known.equals(category)) {
                return true;
            }
        }
        return false;
    }

    /** Removes trailing slashes and a trailing "/p&lt;number&gt;" segment from a listing URL. */
    private static String stripPageSuffix(String url) {
        String base = url;
        while (base.endsWith("/")) {
            base = base.substring(0, base.length() - 1);
        }
        int slash = base.lastIndexOf('/');
        if (slash >= 0 && base.substring(slash + 1).matches("p\\d+")) {
            base = base.substring(0, slash);
        }
        return base;
    }

    /**
     * Drops the characters that could end or escape a quoted SQL string literal: the single
     * quote and the backslash.
     */
    private static String stripForSql(String value) {
        return value.replace("'", "").replace("\\", "");
    }
}
package com.jdbc;
import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.ResultSet;
import java.sql.SQLException;
import java.sql.Statement;
import java.util.ArrayList;
import java.util.List;
/**
 * Minimal JDBC helper that runs a list of SQL statements against a local MySQL database as
 * one batch inside a single transaction.
 */
public class MySQLDemo {
    // JDBC driver class name and database URL.
    static final String JDBC_DRIVER = "com.mysql.jdbc.Driver";
    static final String DB_URL = "jdbc:mysql://localhost:3306/thisdingblog";
    // Database user name and password; adjust them to the local installation.
    static final String USER = "root";
    static final String PASS = "root";

    /**
     * Manual smoke test: inserts two sample rows into the {@code emotion} table.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        List<String> sqlList = new ArrayList<String>();
        String str = "nihao";
        String text = "text";
        String username = "username";
        String answer = "answer";
        String titleText = "titleText";
        String sql2 = "INSERT INTO emotion(title,detail,answerUserName,answerMessage)VALUES('" + str + "',2,3,4)";
        String sql1 = "INSERT INTO emotion(title,detail,answerUserName,answerMessage) VALUES('" + titleText + "','"
                + text + "','" + username + "','" + answer + "')";
        sqlList.add(sql1);
        sqlList.add(sql2);
        insertMethod(sqlList);
    }

    /**
     * Executes all statements of {@code sqlList} as one batch and commits them together.
     *
     * <p>A {@code null} or empty list is a no-op: no driver is loaded and no connection is
     * opened. If the batch fails the transaction is rolled back. Statement and connection are
     * always closed, even when closing one of them fails.
     *
     * @param sqlList complete SQL statements to execute
     * @throws RuntimeException wrapping the {@link SQLException} or
     *         {@link ClassNotFoundException} that made the insert fail
     */
    public static void insertMethod(List<String> sqlList) {
        if (sqlList == null || sqlList.isEmpty()) {
            return;
        }
        try {
            // Register the JDBC driver.
            Class.forName(JDBC_DRIVER);
        } catch (ClassNotFoundException e) {
            throw new RuntimeException(e);
        }
        // try-with-resources closes the statement first, then the connection, and keeps a
        // failure of close() from hiding the exception that caused it.
        try (Connection conn = DriverManager.getConnection(DB_URL, USER, PASS);
                Statement stmt = conn.createStatement()) {
            conn.setAutoCommit(false);
            try {
                long start = System.currentTimeMillis();
                for (String sql : sqlList) {
                    stmt.addBatch(sql);
                }
                stmt.executeBatch();
                long end1 = System.currentTimeMillis();
                System.out.println("sql全部測試耗時:" + (end1 - start) / 1000f + "秒!");
                conn.commit();
            } catch (SQLException | RuntimeException e) {
                // Undo the partial batch; a rollback failure must not replace the real cause.
                try {
                    conn.rollback();
                } catch (SQLException rollbackFailure) {
                    e.addSuppressed(rollbackFailure);
                }
                throw e;
            }
        } catch (SQLException se) {
            throw new RuntimeException(se);
        }
    }
}