Java 使用 HttpClient 和 jsoup 爬取新聞網站的資料
阿新 • • 發佈:2018-12-21
建立maven工程目錄:
pom.xml
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
  <modelVersion>4.0.0</modelVersion>

  <!-- Project coordinates -->
  <groupId>maven</groupId>
  <artifactId>maven</artifactId>
  <version>1.0-SNAPSHOT</version>
  <packaging>war</packaging>

  <name>maven Maven Webapp</name>
  <!-- FIXME change it to the project's website -->
  <url>http://www.example.com</url>

  <!-- Source encoding and Java language level -->
  <properties>
    <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
    <maven.compiler.source>1.7</maven.compiler.source>
    <maven.compiler.target>1.7</maven.compiler.target>
  </properties>

  <dependencies>
    <dependency>
      <groupId>junit</groupId>
      <artifactId>junit</artifactId>
      <version>4.11</version>
      <scope>test</scope>
    </dependency>

    <!-- HTTP client used to download pages -->
    <!-- https://mvnrepository.com/artifact/org.apache.httpcomponents/httpclient -->
    <dependency>
      <groupId>org.apache.httpcomponents</groupId>
      <artifactId>httpclient</artifactId>
      <version>4.5.3</version>
    </dependency>

    <!-- MySQL JDBC driver -->
    <!-- https://mvnrepository.com/artifact/mysql/mysql-connector-java -->
    <dependency>
      <groupId>mysql</groupId>
      <artifactId>mysql-connector-java</artifactId>
      <version>5.1.18</version>
    </dependency>

    <!-- HTML parser used to select elements from the downloaded page -->
    <!-- https://mvnrepository.com/artifact/org.jsoup/jsoup -->
    <dependency>
      <groupId>org.jsoup</groupId>
      <artifactId>jsoup</artifactId>
      <version>1.8.3</version>
    </dependency>
  </dependencies>

  <build>
    <finalName>maven</finalName>
    <pluginManagement><!-- lock down plugins versions to avoid using Maven defaults (may be moved to parent pom) -->
      <plugins>
        <plugin>
          <artifactId>maven-clean-plugin</artifactId>
          <version>3.0.0</version>
        </plugin>
        <!-- see http://maven.apache.org/ref/current/maven-core/default-bindings.html#Plugin_bindings_for_war_packaging -->
        <plugin>
          <artifactId>maven-resources-plugin</artifactId>
          <version>3.0.2</version>
        </plugin>
        <plugin>
          <artifactId>maven-compiler-plugin</artifactId>
          <version>3.7.0</version>
        </plugin>
        <plugin>
          <artifactId>maven-surefire-plugin</artifactId>
          <version>2.20.1</version>
        </plugin>
        <plugin>
          <artifactId>maven-war-plugin</artifactId>
          <version>3.2.0</version>
        </plugin>
        <plugin>
          <artifactId>maven-install-plugin</artifactId>
          <version>2.5.2</version>
        </plugin>
        <plugin>
          <artifactId>maven-deploy-plugin</artifactId>
          <version>2.8.2</version>
        </plugin>
      </plugins>
    </pluginManagement>
  </build>
</project>
jdbc.properties:
# JDBC settings read through PropertiesUtil by StockTest.insert.
# Fully-qualified JDBC driver class name, loaded with Class.forName.
driver=com.mysql.jdbc.Driver
# Connection URL handed to DriverManager.getConnection (database db_database18 on localhost).
url=jdbc:mysql://localhost/db_database18?characterEncoding=utf-8
# Database login passed to DriverManager.getConnection.
# NOTE(review): these look like sample credentials - replace before using outside a local setup.
username=root
password=root
PropertiesUtil.java:
package cn.clay.httpclient.utils.test;

import java.io.IOException;
import java.io.InputStream;
import java.util.Properties;

/**
 * Small helper that loads a properties file from the classpath into one shared
 * {@link Properties} instance and exposes its values by key.
 *
 * <p>Every successful {@link #loadFile(String)} call adds to (and may overwrite
 * entries of) the same shared instance.
 *
 * @author
 * @date 2018/12/20 - 21:37
 */
public class PropertiesUtil {

    /** Shared store filled by {@link #loadFile(String)}. */
    static Properties properties = new Properties();

    public PropertiesUtil() {
    }

    /**
     * Loads the named classpath resource into the shared properties.
     *
     * @param fileName resource name relative to the classpath root, e.g. {@code "jdbc.properties"}
     * @return {@code true} if the resource was found and read; {@code false} if the name is
     *         {@code null}, the resource does not exist, or reading it failed
     */
    public static boolean loadFile(String fileName) {
        if (fileName == null) {
            return false;
        }
        // try-with-resources closes the stream; the original left it open.
        try (InputStream in = PropertiesUtil.class.getClassLoader().getResourceAsStream(fileName)) {
            if (in == null) {
                // getResourceAsStream signals "not found" with null; loading null would throw
                // a NullPointerException instead of reporting failure to the caller.
                System.err.println("Properties file not found on classpath: " + fileName);
                return false;
            }
            properties.load(in);
            return true;
        } catch (IOException e) {
            e.printStackTrace();
            return false;
        }
    }

    /**
     * Looks up a value previously loaded with {@link #loadFile(String)}.
     *
     * @param key property name
     * @return the property value, or {@code null} if the key is not present
     */
    public static String getPropertyValue(String key) {
        return properties.getProperty(key);
    }
}
StockUtils.java:
package cn.clay.httpclient.utils.test;

import org.apache.http.HttpEntity;
import org.apache.http.HttpResponse;
import org.apache.http.client.ClientProtocolException;
import org.apache.http.client.ResponseHandler;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;

import java.io.IOException;

/**
 * HTTP helper that downloads the body of a page as text with Apache HttpClient.
 *
 * @author ClayZhang
 */
public class StockUtils {

    /** How many times a request is attempted before the last failure is reported to the caller. */
    private static final int MAX_ATTEMPTS = 3;

    /**
     * Performs a GET request and returns the response body as a string.
     *
     * <p>A failed attempt (I/O error or a status other than 200) is retried, up to
     * {@code MAX_ATTEMPTS} attempts in total. The original retried by unbounded recursion,
     * which never terminated normally for a URL that keeps failing.
     *
     * @param url     address to fetch
     * @param charset charset used to decode the body; when {@code null} the charset declared
     *                by the response (or HttpClient's default) is used
     * @return the decoded body, or {@code null} if the response carried no entity
     * @throws IOException the failure of the last attempt, once all attempts are used up
     */
    public static String getHtmlByUrl(final String url, final String charset) throws IOException {
        ResponseHandler<String> responseHandler = new ResponseHandler<String>() {
            @Override
            public String handleResponse(final HttpResponse response) throws ClientProtocolException, IOException {
                int status = response.getStatusLine().getStatusCode();
                if (status != 200) {
                    throw new ClientProtocolException("Unexpected response status: " + status);
                }
                HttpEntity entity = response.getEntity();
                if (entity == null) {
                    System.out.println("========entity is null:" + status + " " + url);
                    return null;
                }
                if (charset == null) {
                    return EntityUtils.toString(entity);
                }
                // Decode the raw bytes directly. Decoding to text first and then round-tripping
                // through ISO-8859-1 loses every non-Latin-1 character whenever the response
                // declares its own charset.
                return new String(EntityUtils.toByteArray(entity), charset);
            }
        };

        IOException lastFailure = null;
        try (CloseableHttpClient httpclient = HttpClients.createDefault()) {
            for (int attempt = 1; attempt <= MAX_ATTEMPTS; attempt++) {
                try {
                    return httpclient.execute(new HttpGet(url), responseHandler);
                } catch (ClientProtocolException e) {
                    System.out.println("========ClientProtocolException====" + e.getMessage() + " " + url);
                    lastFailure = e;
                } catch (IOException e) {
                    System.out.println("========IOException====" + e.getMessage() + " " + url);
                    lastFailure = e;
                }
            }
        }
        // Reached only after every attempt failed, so lastFailure is set.
        throw lastFailure;
    }
}
接下來用 jsoup 的 select 選擇器篩選出我們學校官網的新聞資訊,並存入資料庫(可參考 jsoup 的 select 選擇器用法)。
StockTest.java:
package cn.clay.httpclient.utils.test;
import java.io.IOException;
import java.sql.*;
import org.apache.http.ParseException;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
/**
*
* @author ClayZhang
*
*/
public class StockTest {

    /** News list page that is downloaded; also the base for resolving relative article links. */
    private static final String NEWS_PAGE_URL = "https://www.zut.edu.cn/index/xwdt.htm";

    /** Site root, used as a fallback prefix when a link cannot be resolved to an absolute URL. */
    private static final String SITE_ROOT = "https://www.zut.edu.cn";

    /** Author value stored with every inserted row. */
    private static final String AUTHOR = "畢明理";

    private static final String INSERT_SQL =
            "insert into tb_news(title,datetime,author,newsurl) values (?,?,?,?)";

    /**
     * Downloads the news list page and stores every news row found in it.
     *
     * @param args unused
     * @throws IOException if the page cannot be downloaded
     */
    public static void main(String[] args) throws ParseException, IOException {
        String content = StockUtils.getHtmlByUrl(NEWS_PAGE_URL, "utf-8");
        parserHtml(content);
    }

    /**
     * Extracts title, link and publish time from each news row of the page, prints them and
     * inserts them into the database.
     *
     * @param content HTML of the news list page; {@code null} is ignored
     */
    public static void parserHtml(String content) throws ParseException, IOException {
        if (content == null) {
            // StockUtils.getHtmlByUrl returns null for a response without an entity.
            System.out.println("========content is null, nothing to parse");
            return;
        }
        // The base URI lets jsoup turn relative hrefs into absolute URLs.
        Document doc = Jsoup.parse(content, NEWS_PAGE_URL);
        // News rows carry ids such as line67214_0.
        Elements rows = doc.getElementsByClass("winstyle67214").select("tr[id^=line67]");
        for (Element row : rows) {
            Elements anchors = row.select("a");
            String title = anchors.text();
            System.out.println("新聞標題:" + title);
            // Link of the article page
            String url = resolveLink(anchors);
            System.out.println("新聞連結:" + url);
            // Publish time text
            String time = row.select("span[class=timestyle67214]").text();
            System.out.println("釋出時間:" + time);
            insert(title, url, time);
        }
    }

    /**
     * Returns the absolute URL of the first anchor that has an href. Plain concatenation with the
     * site root breaks for relative links such as "../info/..."; it is kept only as a fallback
     * for hrefs jsoup cannot resolve.
     */
    private static String resolveLink(Elements anchors) {
        for (Element anchor : anchors) {
            if (anchor.hasAttr("href")) {
                String absolute = anchor.absUrl("href");
                return absolute.isEmpty() ? SITE_ROOT + anchor.attr("href") : absolute;
            }
        }
        return SITE_ROOT;
    }

    /**
     * Inserts one news row into tb_news using the connection settings from jdbc.properties.
     * Failures are reported on the console and do not stop processing of the remaining rows.
     */
    private static void insert(String title, String newsurl, String time) {
        if (!PropertiesUtil.loadFile("jdbc.properties")) {
            System.out.println("========jdbc.properties could not be loaded, row not stored");
            return;
        }
        String driver = PropertiesUtil.getPropertyValue("driver");
        String url = PropertiesUtil.getPropertyValue("url");
        String username = PropertiesUtil.getPropertyValue("username");
        String password = PropertiesUtil.getPropertyValue("password");
        if (driver == null || url == null) {
            System.out.println("========driver/url missing in jdbc.properties, row not stored");
            return;
        }
        try {
            Class.forName(driver);
        } catch (ClassNotFoundException e) {
            e.printStackTrace();
            return;
        }
        // try-with-resources closes the statement before the connection, also on failure.
        try (Connection con = DriverManager.getConnection(url, username, password);
             PreparedStatement pstm = con.prepareStatement(INSERT_SQL)) {
            pstm.setString(1, title);
            pstm.setString(2, time);
            pstm.setString(3, AUTHOR);
            pstm.setString(4, newsurl);
            pstm.executeUpdate();
        } catch (SQLException e) {
            e.printStackTrace();
        }
    }
}
輸出結果:
存入資料庫: