java爬蟲 京東商品頁 簡單案例
阿新 • • 發佈:2019-02-11
要爬的資料
資料庫表結構
資料庫建表語句
專案的包結構
pom.xml 檔案中的jar包依賴
編寫實體類
spider類
Downloadable介面類
DownloadImpl實現類
PageUtil頁面工具類
Processable.java
ProcessImpl.java
ProcessImpl.java程式碼中的幾個注意點:
獲取商品名稱、圖片URL的xpath路徑
在京東商品頁面獲取商品價格的方式
得到如下的連線地址:
商品引數規格的Xpath
Storeable.java
StoreImpl.java
MyDBUtils.java
在src/test/java資料夾下面的包中新建test類 TestSpider.java
執行test測試方法,在資料庫中插入了資料
資料庫表結構
資料庫建表語句
-- Schema for the crawler's single result table.
-- Each statement sits on its own line so that the "--" comments no longer
-- swallow the DROP/CREATE statements that follow them.
SET FOREIGN_KEY_CHECKS=0;

-- ----------------------------
-- Table structure for `spider`
-- ----------------------------
DROP TABLE IF EXISTS `spider`;
CREATE TABLE `spider` (
  `id` int(10) NOT NULL AUTO_INCREMENT,
  `goods_id` varchar(20) DEFAULT NULL,
  `data_url` varchar(300) DEFAULT NULL,
  `pic_url` varchar(300) DEFAULT NULL,
  `title` varchar(300) DEFAULT NULL,
  `price` varchar(10) DEFAULT NULL,
  `param` text,
  `current_time` datetime DEFAULT NULL,
  PRIMARY KEY (`id`)
) ENGINE=InnoDB AUTO_INCREMENT=0 DEFAULT CHARSET=utf8;
專案的包結構
pom.xml 檔案中的jar包依賴
<dependencies>
    <!-- Unit testing (test scope only) -->
    <dependency>
        <groupId>junit</groupId>
        <artifactId>junit</artifactId>
        <version>4.12</version>
        <scope>test</scope>
    </dependency>
    <!-- HTTP client used to download pages -->
    <dependency>
        <groupId>org.apache.httpcomponents</groupId>
        <artifactId>httpclient</artifactId>
        <version>4.4</version>
    </dependency>
    <!-- HTML parsing and XPath evaluation -->
    <dependency>
        <groupId>net.sourceforge.htmlcleaner</groupId>
        <artifactId>htmlcleaner</artifactId>
        <version>2.16</version>
    </dependency>
    <!-- JSON parsing and serialization -->
    <dependency>
        <groupId>org.json</groupId>
        <artifactId>json</artifactId>
        <version>20160212</version>
    </dependency>
    <!-- MySQL JDBC driver -->
    <dependency>
        <groupId>mysql</groupId>
        <artifactId>mysql-connector-java</artifactId>
        <version>5.1.38</version>
    </dependency>
    <!-- JDBC helper (QueryRunner) -->
    <dependency>
        <groupId>commons-dbutils</groupId>
        <artifactId>commons-dbutils</artifactId>
        <version>1.6</version>
    </dependency>
</dependencies>
編寫實體類
import java.util.HashMap;
import java.util.Map;

/**
 * Page entity: holds the raw content of a downloaded product page together
 * with the fields extracted from it.
 */
public class Page {

    /** Product id. */
    private String goodId;
    /** Product name. */
    private String goodName;
    /** URL of the product page. */
    private String dataUrl;
    /** URL of the product image. */
    private String picUrl;
    /** Product price. */
    private String price;
    /** Product specification parameters, keyed by name. */
    private Map<String, String> param = new HashMap<String, String>();
    /** Raw source of the page. */
    private String content;

    public String getGoodId() {
        return goodId;
    }

    public void setGoodId(String goodId) {
        this.goodId = goodId;
    }

    public String getGoodName() {
        return goodName;
    }

    public void setGoodName(String goodName) {
        this.goodName = goodName;
    }

    public String getDataUrl() {
        return dataUrl;
    }

    public void setDataUrl(String dataUrl) {
        this.dataUrl = dataUrl;
    }

    /**
     * Returns the live parameter map (not a copy).
     *
     * @return the map that {@link #setParam(String, String)} writes into
     */
    public Map<String, String> getParam() {
        return param;
    }

    /**
     * Adds or replaces a single parameter entry.
     *
     * @param key   parameter name
     * @param value parameter value
     */
    public void setParam(String key, String value) {
        this.param.put(key, value);
    }

    public String getContent() {
        return content;
    }

    public void setContent(String content) {
        this.content = content;
    }

    public String getPicUrl() {
        return picUrl;
    }

    public void setPicUrl(String picUrl) {
        this.picUrl = picUrl;
    }

    public String getPrice() {
        return price;
    }

    public void setPrice(String price) {
        this.price = price;
    }
}
spider類
import cn.crxy.maven.Spider.domain.Page;
import cn.crxy.maven.Spider.download.Downloadable;
import cn.crxy.maven.Spider.process.Processable;
import cn.crxy.maven.Spider.store.Storeable;
/**
 * Crawler facade that delegates each stage (download, process, store) to a
 * pluggable strategy object. The strategies must be injected through the
 * setters before the corresponding stage method is called.
 */
public class Spider {

    private Downloadable downloadable;
    private Processable processable;
    private Storeable storeable;

    // ---- strategy accessors ----

    public Downloadable getDownloadable() {
        return this.downloadable;
    }

    public void setDownloadable(Downloadable downloadable) {
        this.downloadable = downloadable;
    }

    public Processable getProcessable() {
        return this.processable;
    }

    public void setProcessable(Processable processable) {
        this.processable = processable;
    }

    public Storeable getStoreable() {
        return this.storeable;
    }

    public void setStoreable(Storeable storeable) {
        this.storeable = storeable;
    }

    // ---- pipeline stages ----

    /**
     * Downloads the page source for the given URL.
     *
     * @param url address of the page
     * @return whatever the configured {@link Downloadable} returns
     */
    public Page download(String url) {
        return this.downloadable.download(url);
    }

    /**
     * Parses the page source via the configured {@link Processable}.
     *
     * @param page page to parse
     */
    public void process(Page page) {
        this.processable.process(page);
    }

    /**
     * Persists the parsed data via the configured {@link Storeable}.
     *
     * @param page page to persist
     */
    public void store(Page page) {
        this.storeable.store(page);
    }
}
Downloadable介面類
import cn.crxy.maven.Spider.domain.Page;
/**
 * Download stage of the crawler.
 */
public interface Downloadable {

    /**
     * Produces a {@link Page} for the given URL.
     *
     * @param url address of the page
     * @return the resulting page object
     */
    Page download(String url);
}
DownloadImpl實現類
import cn.crxy.maven.Spider.domain.Page;
import cn.crxy.maven.Spider.utils.PageUtil;
public class DownloadImpl implements Downloadable {
public Page download(String url) {
Page page = new Page();
String content=PageUtil.getContent(url);//根據url得到內容
page.setContent(content);
page.setDataUrl(url);
return page;
}
}
PageUtil頁面工具類
import java.io.IOException;
import org.apache.http.HttpEntity;
import org.apache.http.client.ClientProtocolException;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClientBuilder;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;
/**
* 根據URL獲取url對應的內容
*/
public class PageUtil {

    /**
     * Issues an HTTP GET for {@code url} and returns the response body as text.
     *
     * <p>The client and the response are always closed, so connections are
     * released even when reading the body fails.
     *
     * @param url address to fetch
     * @return the response body, or {@code null} when the request fails or the
     *         response carries no entity
     */
    public static String getContent(String url) {
        // Build a client; conceptually this is the "browser" issuing the request.
        CloseableHttpClient client = HttpClients.custom().build();
        String content = null;
        try {
            // Wrap the URL in a GET request and execute it.
            HttpGet httpGet = new HttpGet(url);
            CloseableHttpResponse response = client.execute(httpGet);
            try {
                HttpEntity entity = response.getEntity();
                // A response without a body has a null entity; EntityUtils rejects null.
                if (entity != null) {
                    content = EntityUtils.toString(entity);
                }
            } finally {
                response.close();
            }
        } catch (ClientProtocolException e) {
            e.printStackTrace();
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            try {
                client.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
        return content;
    }
}
Processable.java
import cn.crxy.maven.Spider.domain.Page;
/**
 * Parsing stage of the crawler.
 */
public interface Processable {

    /**
     * Parses the given page.
     *
     * @param page page to process
     */
    void process(Page page);
}
ProcessImpl.java
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.htmlcleaner.HtmlCleaner;
import org.htmlcleaner.TagNode;
import org.htmlcleaner.XPatherException;
import org.json.JSONArray;
import org.json.JSONObject;
import cn.crxy.maven.Spider.domain.Page;
import cn.crxy.maven.Spider.utils.HtmlUtil;
import cn.crxy.maven.Spider.utils.PageUtil;
public class ProcessImpl implements Processable {
public void process(Page page) {
HtmlCleaner htmlCleaner = new HtmlCleaner();
TagNode rootNode = htmlCleaner.clean(page.getContent());
try {
String goodName = HtmlUtil.getText(rootNode, "//*[@id='name']/h1");// 得到商品名稱
page.setGoodName(goodName);
String picUrl = HtmlUtil.getAttributeByName(rootNode, "//*[@id='spec-n1']/img","src");// 獲取商品圖片url
page.setPicUrl("http:"+picUrl);
// 獲取商品號
String url = page.getDataUrl();
Pattern compile = Pattern.compile("http://item.jd.com/([0-9]+).html");
Matcher matcher = compile.matcher(url);
String goodid = null;
if (matcher.find()) {
goodid = matcher.group(1);
page.setGoodId(goodid);
}
// 獲取商品價格
// 得到價格的json格式[{"id":"J_1593512","p":"17988.00","m":"17989.00"}]
String pricejson = PageUtil
.getContent("http://p.3.cn/prices/get?skuid=J_" + goodid);
JSONArray jsonArray = new JSONArray(pricejson);
JSONObject jsonObject = jsonArray.getJSONObject(0);
String price = jsonObject.getString("p");
page.setPrice(price);
// 獲取規格引數
// *[@id="product-detail-2"]
// *[@id="product-detail-2"]/table/tbody/tr[1]/th
Object[] evaluateXPath = rootNode
.evaluateXPath("//*[@id='product-detail-2']/table/tbody/tr");
JSONArray jsonArray2 = new JSONArray();
if(evaluateXPath != null && evaluateXPath.length > 0){
for(Object object : evaluateXPath){
TagNode tagnode = (TagNode) object;
if(!"".equals(tagnode.getText().toString().trim())){//有資料
Object[] evaluateXPath2 = tagnode.evaluateXPath("/th");
JSONObject jsonObject2 = new JSONObject();
if(evaluateXPath2.length>0){
TagNode tagNode2 = (TagNode) evaluateXPath2[0];
jsonObject2.put("name", tagNode2.getText().toString());
jsonObject2.put("value", "");
}else {
Object[] evaluateXPath3 = tagnode.evaluateXPath("/td");
TagNode tagNode1 = (TagNode) evaluateXPath3[0];
TagNode tagNode2 = (TagNode) evaluateXPath3[1];
jsonObject2.put("name", tagNode1.getText().toString());
jsonObject2.put("value", tagNode2.getText().toString());
}
jsonArray2.put(jsonObject2);
}
}
}
page.setParam("spec",jsonArray2.toString());
} catch (XPatherException e) {
e.printStackTrace();
}
}
}
ProcessImpl.java程式碼中的幾個注意點:
獲取商品名稱、圖片URL的xpath路徑
在京東商品頁面獲取商品價格的方式
得到如下的連線地址:
http://p.3.cn/prices/get?type=1&area=1_72_4137&pdtk=&pduid=1112434089&pdpin=&pdbp=0&skuid=J_1593512&callback=cnp
對連線進行處理後得到如下結果
商品引數規格的Xpath
Storeable.java
package cn.crxy.maven.Spider.store;
import cn.crxy.maven.Spider.domain.Page;
/**
 * Persistence stage of the crawler.
 */
public interface Storeable {

    /**
     * Stores the given page.
     *
     * @param page page to store
     */
    void store(Page page);
}
StoreImpl.java
package cn.crxy.maven.Spider.store;
import java.text.SimpleDateFormat;
import java.util.Date;
import java.util.Map;
import cn.crxy.maven.Spider.domain.Page;
import cn.crxy.maven.Spider.utils.MyDBUtils;
public class StoreImpl implements Storeable {
public void store(Page page) {
String dataUrl = page.getDataUrl();
String goodid = page.getGoodId();
String goodname = page.getGoodName();
String picUrl = page.getPicUrl();
String price = page.getPrice();
Map<String, String> values = page.getParam();
String param = values.get("spec");
SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
String currtime = sdf.format(new Date());
MyDBUtils.update(MyDBUtils.INSERT_LOG, goodid,dataUrl,picUrl,goodname,price,param,currtime);
}
}
MyDBUtils.java
package cn.crxy.maven.Spider.utils;
import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.ResultSet;
import java.sql.SQLException;
import java.util.ArrayList;
import java.util.List;
import org.apache.commons.dbutils.BasicRowProcessor;
import org.apache.commons.dbutils.QueryRunner;
import org.apache.commons.dbutils.handlers.ArrayListHandler;
/**
 * Minimal JDBC helper around Commons DbUtils' {@link QueryRunner}.
 * Each call opens a fresh connection and always closes it afterwards.
 *
 * <p>NOTE(review): the connection settings, including the password, are
 * hard-coded; consider moving them to configuration.
 */
public class MyDBUtils {

    private static final String className = "com.mysql.jdbc.Driver";
    private static final String url = "jdbc:mysql://localhost:3306/spider?"
            + "useUnicode=true&characterEncoding=utf-8";
    private static final String user = "root";
    private static final String password = "1234";
    private static final QueryRunner queryRunner = new QueryRunner();

    /**
     * Insert statement for one crawled product. The table and column names
     * follow the `spider` table definition (goods_id, title), which the
     * previous good_id / good_name columns did not match.
     */
    public static final String INSERT_LOG = "INSERT INTO spider(goods_id,"
            + "data_url,pic_url,title,price,param,`current_time`) "
            + "VALUES(?,?,?,?,?,?,?)";

    // Utility class: no instances.
    private MyDBUtils() {
    }

    static { // Register the JDBC driver when the class is first used.
        try {
            Class.forName(className);
        } catch (Exception e) {
            e.printStackTrace();
            // Keep the cause so the real reason for the failure is not lost.
            throw new RuntimeException("Cannot load JDBC driver " + className, e);
        }
    }

    /**
     * Runs a query and returns the first column of every row as text.
     *
     * @param sql query to execute
     * @return first-column values in row order ({@code null} for SQL NULL);
     *         empty or partial when the query fails
     */
    public static List<String> executeQuerySql(String sql) {
        List<String> result = new ArrayList<String>();
        Connection connection = null;
        try {
            connection = getConnection();
            List<Object[]> rows = queryRunner.query(connection, sql, new ArrayListHandler());
            for (Object[] row : rows) {
                Object first = row.length > 0 ? row[0] : null;
                result.add(first == null ? null : first.toString());
            }
        } catch (SQLException e) {
            e.printStackTrace();
        } finally {
            closeQuietly(connection);
        }
        return result;
    }

    /**
     * Executes an INSERT, UPDATE or DELETE statement.
     *
     * @param sql    statement with {@code ?} placeholders
     * @param params values bound to the placeholders, in order
     */
    public static void update(String sql, Object... params) {
        Connection connection = null;
        try {
            connection = getConnection();
            queryRunner.update(connection, sql, params);
        } catch (SQLException e) {
            e.printStackTrace();
        } finally {
            closeQuietly(connection);
        }
    }

    /** Opens a new connection with the configured settings. */
    private static Connection getConnection() throws SQLException {
        return DriverManager.getConnection(url, user, password);
    }

    /** Closes the connection if one was opened, reporting but not propagating failures. */
    private static void closeQuietly(Connection connection) {
        if (connection == null) {
            return;
        }
        try {
            connection.close();
        } catch (SQLException e) {
            e.printStackTrace();
        }
    }
}
HtmlUtils.java
import org.htmlcleaner.TagNode;
import org.htmlcleaner.XPatherException;
/**
 * XPath convenience helpers on top of HtmlCleaner's {@link TagNode}.
 */
public class HtmlUtils {

    /**
     * Returns the text of the first element matched by the XPath.
     *
     * @param tagNode node the XPath is evaluated against
     * @param xpath   XPath expression
     * @return the element text, or {@code null} when nothing matches, the first
     *         match is not an element, or evaluation fails
     */
    public static String getText(TagNode tagNode, String xpath) {
        TagNode node = firstTagNode(tagNode, xpath);
        return node == null ? null : node.getText().toString();
    }

    /**
     * Returns an attribute value of the first element matched by the XPath.
     *
     * @param tagNode node the XPath is evaluated against
     * @param xpath   XPath expression
     * @param attr    attribute name
     * @return the attribute value, or {@code null} when nothing matches, the
     *         first match is not an element, or evaluation fails
     */
    public static String getAttributeByName(TagNode tagNode, String xpath, String attr) {
        TagNode node = firstTagNode(tagNode, xpath);
        return node == null ? null : node.getAttributeByName(attr);
    }

    /** Evaluates the XPath and returns the first match if it is a TagNode, else null. */
    private static TagNode firstTagNode(TagNode tagNode, String xpath) {
        if (tagNode == null || xpath == null) {
            return null;
        }
        try {
            Object[] matches = tagNode.evaluateXPath(xpath);
            // Check the type before casting: a match need not be an element node.
            if (matches != null && matches.length > 0 && matches[0] instanceof TagNode) {
                return (TagNode) matches[0];
            }
        } catch (XPatherException e) {
            e.printStackTrace();
        }
        return null;
    }
}
在src/test/java資料夾下面的包中新建test類 TestSpider.java
package cn.crxy.maven.Spider;
import org.junit.Test;
import cn.crxy.maven.Spider.domain.Page;
import cn.crxy.maven.Spider.download.DownloadImpl;
import cn.crxy.maven.Spider.process.ProcessImpl;
import cn.crxy.maven.Spider.store.StoreImpl;
/**
 * End-to-end run of the crawler against a single JD product page:
 * download, parse, then store.
 */
public class TestSpider {

    @Test
    public void test1() throws Exception {
        // Inject an implementation for each stage.
        Spider crawler = new Spider();
        crawler.setDownloadable(new DownloadImpl());
        crawler.setProcessable(new ProcessImpl());
        crawler.setStoreable(new StoreImpl());

        // Run the three stages in order on one product page.
        Page fetched = crawler.download("http://item.jd.com/1593512.html");
        crawler.process(fetched);
        crawler.store(fetched);
    }
}
執行test測試方法,在資料庫中插入了資料