(11)Java爬蟲框架webmagic實戰
阿新 • • 發佈:2018-12-16
Java爬蟲框架webmagic實戰
本文是我關於webmagic爬蟲框架的實戰——爬取古詩詞網站的詩詞資料。此程式碼只用於爬蟲學習,勿用於商業用途。
安裝webmagic
webmagic使用maven管理依賴,在專案中新增對應的依賴即可使用webmagic:
<dependency> <groupId>us.codecraft</groupId> <artifactId>webmagic-core</artifactId> <version>0.7.3</version> </dependency> <dependency> <groupId>us.codecraft</groupId> <artifactId>webmagic-extension</artifactId> <version>0.7.3</version> </dependency>
定義Article類儲存詩詞資料
package edu.nlp.model; public class Article { private int articleId; /** * 型別 **/ private String type; /** * 作者 **/ private String author; /** * 朝代 **/ private String dynasty; /** * 作者簡介 **/ private String authorInfo; /** * 標題 **/ private String title; /** * 原文 **/ private String content; /** * 譯文 **/ private String translation; /** * 註釋 **/ private String comment; /** * 賞析 **/ private String appreciation; /** * UUID **/ private String id; /** * 匹配度 **/ private float score; public int getArticleId() { return articleId; } public void setArticleId(int articleId) { this.articleId = articleId; } public String getType() { return type; } public void setType(String type) { this.type = type; } public String getAuthor() { return author; } public void setAuthor(String author) { this.author = author; } public String getDynasty() { return dynasty; } public void setDynasty(String dynasty) { this.dynasty = dynasty; } public String getAuthorInfo() { return authorInfo; } public void setAuthorInfo(String authorInfo) { this.authorInfo = authorInfo; } public String getTitle() { return title; } public void setTitle(String title) { this.title = title; } public String getContent() { return content; } public void setContent(String content) { this.content = content; } public String getTranslation() { return translation; } public void setTranslation(String translation) { this.translation = translation; } public String getComment() { return comment; } public void setComment(String comment) { this.comment = comment; } public String getAppreciation() { return appreciation; } public void setAppreciation(String appreciation) { this.appreciation = appreciation; } public String toString() { return "Article:{id=" + id + ",score=" + score + ",type=" + type + ",dynasty=" + dynasty + ",author=" + author + ",authorInfo=" + authorInfo + ",title=" + title + ",content=" + content + ",translation=" + translation + ",comment=" + comment + ",appreciation=" + appreciation + "}"; } public String getId() { 
return id; } public void setId(String id) { this.id = id; } public float getScore() { return score; } public void setScore(float score) { this.score = score; } }
爬取中國詩詞網的資料
以各個朝代為初始連結,爬取中國詩詞網中每條詩詞的所屬朝代、作者資訊、原文、翻譯、賞析,儲存每條詩詞資料為json文字。
package edu.nlp.processer; import java.util.HashMap; import java.util.List; import java.util.Map; import edu.nlp.model.Article; import us.codecraft.webmagic.Page; import us.codecraft.webmagic.Site; import us.codecraft.webmagic.Spider; import us.codecraft.webmagic.pipeline.JsonFilePipeline; import us.codecraft.webmagic.processor.PageProcessor; import us.codecraft.webmagic.selector.Html; public class ShiWenPageProcessor implements PageProcessor { /** * 匹配朝代 **/ private final static String PATTER_DYNASTY = "(xianqin|hanchao|weijin|nanbeichao|suichao|tangshi|wudai|" + "songci|jinchao|yuanchao|mingchao|qingchao)"; /** * 朝代連結 **/ private final static String URL_DYNASTY = "http://www\\.shici\\.net/" + PATTER_DYNASTY + "/$"; /** * 作者連結 **/ private final static String URL_AUTHOR = "http://www\\.shici\\.net/shiren/[a-z]{5}\\.html"; /** * 詩詞連結 **/ private final static String URL_ARTICLE = "http://www\\.shici\\.net/" + PATTER_DYNASTY + "/[a-z]{5}\\.html"; /** * 翻譯連結 **/ private final static String URL_TRANSLATION = "/fanyi/[a-z]{5}\\.html";//http://www\\.shici\\.net /** * 賞析連結 **/ private final static String URL_APPRECIATION = "/shangxi/[a-z]{5}\\.html"; /** * 文章Map,暫存Article **/ private static Map<String, Article> articleMap = new HashMap<String, Article>(); /** * 儲存Article **/ private void saveArticle(Article article, Page page) { System.out.println("詩歌:" + article); page.putField("dynasty", article.getDynasty()); page.putField("author", article.getAuthor()); page.putField("authorInfo", article.getAuthorInfo()); page.putField("title", article.getTitle()); page.putField("content", article.getContent()); page.putField("translation", article.getTranslation()); page.putField("comment", article.getComment()); page.putField("appreciation", article.getAppreciation()); } private Site site = Site.me().setCycleRetryTimes(5) .setRetryTimes(5).setSleepTime(1000) .setUserAgent("Mo zilla/5.0 (Windows NT 6.1; WOW64; rv:38.0) Gecko/20100101 Firefox/38.0") .addHeader("Accept", 
"text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8") .addHeader("Accept-Language", "zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3") .setCharset("UTF-8"); public void process(Page page) { if (page.getUrl().regex(URL_DYNASTY).match()) { //System.out.println("朝代:"+page.getUrl()); //作者列表 List<String> authorUrl = page.getHtml() .xpath("//div[@class='shirenlist']") .links().all(); page.addTargetRequests(authorUrl); //古詩文列表 List<String> essayUrl = page.getHtml() .xpath("//div[@id='related']/ul") .links().all(); page.addTargetRequests(essayUrl); page.setSkip(true);//跳過這個頁面 } else if (page.getUrl().regex(URL_AUTHOR).match()) { //System.out.println("作者:"+page.getUrl()); //詩詞列表 List<String> poemUrl = page.getHtml() .xpath("//div[@id='related']/ul/li/a/@href") .all(); //System.out.println(poemUrl); page.addTargetRequests(poemUrl); page.setSkip(true);//跳過這個頁面 } else if (page.getUrl().regex(URL_ARTICLE).match()) { //System.out.println("詩詞:"+page.getUrl()); Html html = page.getHtml(); Article article = new Article(); //朝代 String dynasty = html .xpath("//div[@id='article']/div[@class='info']") .regex("<span>朝代:</span>(.*?)</p>").toString(); //System.out.println(dynasty); article.setDynasty(dynasty); //作者 String author = html .xpath("//div[@id='article']/div[@class='info']") .regex("<span>作者:</span><.*>(.*?)</a>").toString(); //System.out.println(author); article.setAuthor(author); if (!author.equals("佚名")) { //作者簡介 String authorInfo = html .xpath("//div[@class='authorinfo']") .regex("<br>(.*)</div>").toString(); //System.out.println(authorInfo); article.setAuthorInfo(authorInfo); } //標題 String title = html.xpath("div[@id='article']/h1/text()") .toString(); //System.out.println(title); article.setTitle(title); //原文 String content = html .xpath("div[@id='article']/div[@class='content']") .regex("<div class=\"content\">(.*)</div>") .toString(); //System.out.println(content); article.setContent(content); //譯文連結 String translationUrl = html 
.xpath("div[@id='related']/ul/li/h3/a/@href") .regex(URL_TRANSLATION) .toString(); //賞析連結 String appreciateUrl = html .xpath("div[@id='related']/ul/li/h3/a/@href") .regex(URL_APPRECIATION) .toString(); //System.out.println("翻譯:" + translationUrl); //System.out.println("賞析:" + appreciateUrl); if (translationUrl == null && appreciateUrl == null) { saveArticle(article, page); } else { if (translationUrl != null) { article.setTranslation("http://www.shici.net" + translationUrl); page.addTargetRequest("http://www.shici.net" + translationUrl); } if (appreciateUrl != null) { article.setAppreciation("http://www.shici.net" + appreciateUrl); page.addTargetRequest("http://www.shici.net" + appreciateUrl); } articleMap.put(page.getUrl().toString(), article); page.setSkip(true);//跳過這個頁面 } } else if (page.getUrl().regex(URL_TRANSLATION).match()) { Html html = page.getHtml(); String articleUrl = "http://www.shici.net" + html .xpath("//div[@class='relatedshici']/h2/a/@href") .toString(); System.out.println(articleUrl); String title = html.xpath("//div[@id='article']/h1/text()").toString(); String translation = null; String comment = null; //處理譯文與註釋 if (title.endsWith("譯文及註釋")) { translation = html .xpath("//div[@id='article']/div[@class='content']") .regex("<p><strong>譯文</strong><br>(.*?)</p>") .toString(); comment = html .xpath("//div[@id='article']/div[@class='content']") .regex("<p><strong>註釋</strong><br>(.*?)</p>") .toString(); } else { if (title.endsWith("譯文")) { translation = html .xpath("//div[@id='article']") .regex("<div class=\"content\">(.*?)</div>") .toString(); } if (title.endsWith("註釋")) { comment = html .xpath("//div[@id='article']") .regex("<div class=\"content\">(.*?)</div>") .toString(); } } System.out.println("註釋:" + comment); System.out.println("翻譯:" + translation); Article article = articleMap.get(articleUrl); article.setTranslation(translation); article.setComment(comment); String appreciation = article.getAppreciation(); if (appreciation != null && 
appreciation.startsWith("http")) { page.setSkip(true);//跳過這個頁面 } else { saveArticle(article, page); articleMap.remove(articleUrl); } } else if (page.getUrl().regex(URL_APPRECIATION).match()) { Html html = page.getHtml(); String articleUrl = "http://www.shici.net" + html .xpath("//div[@class='relatedshici']/h2/a/@href") .toString(); System.out.println(articleUrl); String title = html.xpath("//div[@id='article']/h1").toString(); String appreciation = html .xpath("//div[@id='article']") .regex("<div class=\"content\">(.*?)</div>") .toString(); System.out.println("賞析:" + title + appreciation); Article article = articleMap.get(articleUrl); article.setAppreciation(title + appreciation); String translation = article.getTranslation(); if (translation != null && translation.startsWith("http")) { page.setSkip(true);//跳過這個頁面 } else { saveArticle(article, page); articleMap.remove(articleUrl); } } } public Site getSite() { return site; } private final static String[] intiUrls = { "http://www.shici.net/xianqin/", "http://www.shici.net/hanchao/", "http://www.shici.net/weijin/", "http://www.shici.net/nanbeichao/", "http://www.shici.net/suichao/", "http://www.shici.net/tangshi/", "http://www.shici.net/wudai/", "http://www.shici.net/songci/", "http://www.shici.net/jinchao/", "http://www.shici.net/yuanchao/", "http://www.shici.net/mingchao/", "http://www.shici.net/qingchao/", }; public static void main(String[] args) { Spider.create(new ShiWenPageProcessor()) // .addUrl("http://www.shici.net/xianqin/") .addUrl(intiUrls) .addPipeline(new JsonFilePipeline("/Users/liaoxuyi/Desktop/data")) .thread(5) .run(); System.out.println("執行結束"); } }
執行程式後,生成的資料如下:
爬取好詩文網的資料
以各個朝代下各種詩文型別為初始連結(總共55個連結),爬取好詩文網中每條詩文的所屬朝代、作者資訊、原文、翻譯、賞析,儲存每條詩文資料為json文字。
package edu.nlp.processer;
import edu.nlp.model.Article;
import edu.nlp.pipeline.JsonFilePipeline;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Spider;
//import us.codecraft.webmagic.pipeline.JsonFilePipeline;
import us.codecraft.webmagic.processor.PageProcessor;
import us.codecraft.webmagic.selector.Html;
import us.codecraft.webmagic.selector.Selectable;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.concurrent.ConcurrentHashMap;
/**
 * Crawls www.haoshiwen.org: 55 start pages (11 dynasties x 5 types) lead to
 * paged article lists, article pages, and optional translation/appreciation
 * pages whose content is merged back into the pending {@link Article}.
 */
public class HaoShiWenPageProcessor implements PageProcessor {
    /** Start page: type listing, first page. */
    private final static String URL_START = "http://www\\.haoshiwen\\.org/type\\.php\\?c=\\d+&x=[1-5]$";
    /** Paged list page. */
    private final static String URL_LIST = "http://www\\.haoshiwen\\.org/type\\.php\\?c=\\d+&x=[1-5]&page=\\d+";
    /** Article page (relative path). */
    private final static String URL_ARTICLE = "/view\\.php\\?id=\\d+";
    /** Translation page. */
    private final static String URL_TRANSLATION = "http://www\\.haoshiwen\\.org/show\\.php\\?t=2&id=\\d+";
    /** Appreciation page. */
    private final static String URL_APPRECIATION = "http://www\\.haoshiwen\\.org/show\\.php\\?t=1&id=\\d+";
    /**
     * Articles still waiting for their translation/appreciation pages, keyed
     * by relative article URL. Fixed: the Spider runs with 5 threads, so both
     * shared maps must be ConcurrentHashMap, not HashMap.
     */
    private static Map<String, Article> articleMap =
            new ConcurrentHashMap<String, Article>();
    /** Relative article URL -> type label, shared across crawler threads. */
    private static Map<String, String> articleType =
            new ConcurrentHashMap<String, String>();

    /**
     * Builds the 55 seed URLs (c = 1..11 dynasties, x = 1..5 types).
     * Renamed from the misspelled "intiUrls".
     */
    private static String[] initUrls() {
        String[] urls = new String[55];
        int count = 0;
        for (int c = 1; c <= 11; c++) {
            for (int x = 1; x <= 5; x++) {
                urls[count++] = "http://www.haoshiwen.org/type.php?c=" + c + "&x=" + x;
            }
        }
        return urls;
    }

    /**
     * Extracts the article type label from a start/list URL.
     *
     * @param url start or list URL containing the "x" type number
     * @return the type label, or null when the URL does not carry one
     */
    private static String getType(Selectable url) {
        // Fixed: the character class was "[1-5)]" (stray ')'); also guard
        // against a non-matching URL instead of NPE/NumberFormatException.
        String typeStr = url.regex("c=\\d+&x=([1-5])").toString();
        if (typeStr == null) {
            return null;
        }
        switch (Integer.parseInt(typeStr)) {
            case 1:
                return "詩";
            case 2:
                return "詞";
            case 3:
                return "曲";
            case 4:
                return "文言文";
            case 5:
                return "辭賦";
            default:
                return null;
        }
    }

    /** Copies all Article fields into the page's result items for the pipeline. */
    private void saveArticle(Article article, Page page) {
        page.putField("articleId", article.getArticleId());
        page.putField("type", article.getType());
        page.putField("dynasty", article.getDynasty());
        page.putField("author", article.getAuthor());
        page.putField("authorInfo", article.getAuthorInfo());
        page.putField("title", article.getTitle());
        page.putField("content", article.getContent());
        page.putField("translation", article.getTranslation());
        page.putField("comment", article.getComment());
        page.putField("appreciation", article.getAppreciation());
    }

    /**
     * Site configuration.
     * Fixed: User-Agent read "Mo zilla" (typo) in the original.
     */
    private Site site = Site.me()
            .setCycleRetryTimes(3)   // cycle retry count
            .setRetryTimes(3)        // retry count
            .setSleepTime(100)       // delay between pages, ms
            .setTimeOut(3000)        // request timeout, ms
            .setUserAgent("Mozilla/5.0 (Windows NT 6.1; WOW64; rv:38.0) Gecko/20100101 Firefox/38.0")
            .addHeader("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8")
            .addHeader("Accept-Language", "zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3")
            .setCharset("UTF-8");

    public Site getSite() {
        return site;
    }

    /** Dispatches each fetched page to the matching extraction branch. */
    public void process(Page page) {
        if (page.getUrl().regex(URL_START).match()) {
            // Read the total page count from the "last page" pager link.
            String pageStr = page.getHtml()
                    .xpath("//div[@class='pages']")
                    .regex("/type.php\\?c=\\d+&x=[1-5]&page=(\\d+)\">尾頁</a>")
                    .toString();
            if (pageStr != null) {
                int pageNum = Integer.parseInt(pageStr);
                List<String> pageUrl = new ArrayList<String>();
                // Enqueue the remaining list pages (page 1 is this page).
                for (int i = 2; i <= pageNum; i++) {
                    pageUrl.add(page.getUrl() + "&page=" + i);
                }
                page.addTargetRequests(pageUrl);
            }
            // Article links on the start page itself.
            List<String> articleUrl = page.getHtml()
                    .xpath("//div[@class='typeleft']/div[@class='sons']")
                    .regex(URL_ARTICLE)
                    .all();
            page.addTargetRequests(articleUrl);
            page.setSkip(true); // listing pages produce no output
            // Remember each article's type for later.
            String type = getType(page.getUrl());
            if (type != null) { // ConcurrentHashMap rejects null values
                for (String url : articleUrl) {
                    articleType.put(url, type);
                }
            }
        }
        if (page.getUrl().regex(URL_LIST).match()) {
            List<String> articleUrl = page.getHtml()
                    .xpath("//div[@class='typeleft']/div[@class='sons']")
                    .regex(URL_ARTICLE)
                    .all();
            page.addTargetRequests(articleUrl);
            page.setSkip(true);
            String type = getType(page.getUrl());
            if (type != null) { // ConcurrentHashMap rejects null values
                for (String url : articleUrl) {
                    articleType.put(url, type);
                }
            }
        } else if (page.getUrl().regex(URL_ARTICLE).match()) {
            System.out.println("詩詞:" + page.getUrl());
            // Key by the relative URL, matching the keys stored from lists.
            String articleUrl = page.getUrl().toString().replace("http://www.haoshiwen.org", "");
            Html html = page.getHtml();
            Article article = new Article();
            article.setArticleId(Integer.parseInt(articleUrl.replace("/view.php?id=", "")));
            article.setType(articleType.get(articleUrl));
            String dynasty = html
                    .xpath("//div[@class='son2']")
                    .regex("<span>朝代:</span>(.*?)</p>").toString();
            article.setDynasty(dynasty);
            // Fixed: null-guard before replaceAll — the regex may not match.
            String author = html
                    .xpath("//div[@class='son2']")
                    .regex("<span>作者:</span>(.*?)</p>")
                    .toString();
            if (author != null) {
                author = author.replaceAll("</?a.*?>", "");
            }
            article.setAuthor(author);
            if (author != null && !author.equals("佚名")) {
                String authorInfo = html
                        .regex("<div class=\"son5\" style=\"overflow:auto;\">" +
                                ".*<img.*></a>(.*)<a.*?>\\.▶</a>")
                        .toString();
                // Fixed: the original used reference comparison (!= "0"),
                // which is always true; compare content and skip nulls.
                if (authorInfo != null && !authorInfo.equals("0")) {
                    article.setAuthorInfo(authorInfo);
                }
            }
            String title = html.xpath("div[@class='son1']/h1/text()")
                    .toString();
            article.setTitle(title);
            String content = html
                    .xpath("//div[@class='shileft']/div[@class='son2']")
                    .regex("<p style=\"margin\\-top:0px;\">\\ </p>\\s+(.*?)<br>\\s+" +
                            "<strong><span style=\"color:#FFFFFF;background-color:#E53333;\">精彩推薦</span></strong>")
                    .toString();
            article.setContent(content);
            String translateUrl = html
                    .xpath("div[@class='son5']").links()
                    .regex(URL_TRANSLATION).toString();
            String appreciationUrl = html
                    .xpath("div[@class='son5']").links()
                    .regex(URL_APPRECIATION).toString();
            if (translateUrl == null && appreciationUrl == null) {
                // No translation/appreciation pages: save immediately.
                saveArticle(article, page);
            } else {
                // Stash the pending URL as a marker; the show.php handlers
                // overwrite it and perform the final save.
                if (translateUrl != null) {
                    article.setTranslation(translateUrl);
                    page.addTargetRequest(translateUrl);
                }
                if (appreciationUrl != null) {
                    article.setAppreciation(appreciationUrl);
                    page.addTargetRequest(appreciationUrl);
                }
                articleMap.put(articleUrl, article);
                page.setSkip(true);
            }
        } else if (page.getUrl().regex(URL_TRANSLATION).match()) {
            Html html = page.getHtml();
            String articleUrl = html
                    .xpath("//div[@class='sontitle']/span/a/@href")
                    .toString();
            String translationTitle = html
                    .xpath("//div[@class='shileft']/div[@class='son1']/h1/text()")
                    .toString();
            // Fixed: null-guard — endsWith() NPE'd when the xpath missed.
            if (translationTitle == null) {
                translationTitle = "";
            }
            String translation = null;
            String comment = null;
            if (translationTitle.endsWith("譯文及註釋")) {
                translation = html
                        .xpath("//div[@class='shangxicont']")
                        .regex("<p><strong>譯文.*?</strong>(.*?)</p>")
                        .toString();
                if (translation != null) { // strip anchor tags
                    translation = translation.replaceAll("</?a.*?>", "");
                }
                comment = html
                        .xpath("//div[@class='shangxicont']")
                        .regex("<p><strong>註釋.*?</strong>(.*?)</p>")
                        .toString();
                if (comment != null) {
                    comment = comment.replaceAll("</?a.*?>", "");
                }
                if (translation == null && comment == null) {
                    // Translation and notes merged in one body.
                    translation = html
                            .xpath("//div[@class='shangxicont']")
                            .regex("<p>\\s?作者[::]佚名\\s?</p>(.*?)<p style=")
                            .toString();
                    if (translation != null) {
                        translation = translation.replaceAll("</?a.*?>", "");
                    }
                }
            } else {
                // Translation only.
                if (translationTitle.endsWith("譯文")) {
                    translation = html
                            .xpath("//div[@class='shangxicont']")
                            .regex("<p>\\s?作者[::]佚名\\s?</p>(.*?)<p style=")
                            .toString();
                    if (translation != null) {
                        translation = translation.replaceAll("</?a.*?>", "");
                    }
                }
                // Notes only.
                if (translationTitle.endsWith("註釋")) {
                    comment = html
                            .xpath("//div[@class='shangxicont']")
                            .regex("<p>\\s?作者[::]佚名\\s?</p>(.*?)<p style=")
                            .toString();
                    if (comment != null) {
                        comment = comment.replaceAll("</?a.*?>", "");
                    }
                }
            }
            Article article = articleMap.get(articleUrl);
            // Fixed: guard against a missing pending article (otherwise NPE).
            if (article == null) {
                page.setSkip(true);
                return;
            }
            article.setTranslation(translation);
            article.setComment(comment);
            String appreciation = article.getAppreciation();
            if (appreciation != null && appreciation.startsWith("http")) {
                // Appreciation page still pending; it will do the final save.
                page.setSkip(true);
            } else {
                saveArticle(article, page);
                articleMap.remove(articleUrl);
            }
        } else if (page.getUrl().regex(URL_APPRECIATION).match()) {
            Html html = page.getHtml();
            String articleUrl = html
                    .xpath("//div[@class='sontitle']/span/a/@href")
                    .toString();
            String appreciateTitle = html
                    .xpath("//div[@class='shileft']/div[@class='son1']/h1")
                    .toString();
            String appreciation = html
                    .xpath("//div[@class='shangxicont']")
                    .regex("<p.*>\\s?作者[::]佚名\\s?</p>(.*?)<p style=")
                    .toString();
            if (appreciation != null) {
                appreciation = appreciation.replaceAll("</?a.*?>", "");
            }
            Article article = articleMap.get(articleUrl);
            // Fixed: guard against a missing pending article (otherwise NPE).
            if (article == null) {
                page.setSkip(true);
                return;
            }
            article.setAppreciation(appreciateTitle + appreciation);
            String translation = article.getTranslation();
            if (translation != null && translation.startsWith("http")) {
                // Translation page still pending; it will do the final save.
                page.setSkip(true);
            } else {
                saveArticle(article, page);
                articleMap.remove(articleUrl);
            }
        }
    }

    /** Total number of distinct articles seen in listings. */
    public static int articleCount() {
        return articleType.size();
    }

    public static void main(String[] args) {
        HaoShiWenPageProcessor processor = new HaoShiWenPageProcessor();
        // Custom JsonFilePipeline names each JSON file by article id instead
        // of webmagic's default MD5-of-url, so re-crawls don't duplicate files.
        // NOTE(review): output path is machine-specific — parameterize for reuse.
        Spider.create(processor)
                .addUrl(initUrls())
                .addPipeline(new JsonFilePipeline("/Users/liaoxuyi/Desktop/data"))
                .thread(5)
                .run();
        // Fixed: call the static counter via the class, not an instance.
        System.out.println("詩詞總數有:" + HaoShiWenPageProcessor.articleCount());
        System.out.println("執行結束");
    }
}
自定義JsonFilePipeline儲存json資料
由於使用webmagic預設的JsonFilePipeline生成的json檔案的檔名是使用MD5對檔案進行命名的,生成的json檔案無法從檔名上和網站的詩文連結進行一一對應,所以這裡自定義JsonFilePipeline,設定儲存的json檔名為詩文的ID,方便查詢原始的詩文內容。
package edu.nlp.pipeline;
import com.alibaba.fastjson.JSON;
import java.io.FileWriter;
import java.io.IOException;
import java.io.PrintWriter;
import org.apache.commons.codec.digest.DigestUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import us.codecraft.webmagic.ResultItems;
import us.codecraft.webmagic.Task;
import us.codecraft.webmagic.pipeline.Pipeline;
import us.codecraft.webmagic.utils.FilePersistentBase;
/**
 * Pipeline that writes each page's result items as a JSON file named after
 * the "articleId" field, instead of webmagic's default MD5-of-url file name,
 * so output files map one-to-one to site article ids.
 */
public class JsonFilePipeline extends FilePersistentBase implements Pipeline {
    private Logger logger = LoggerFactory.getLogger(this.getClass());

    /** Uses webmagic's default output directory. */
    public JsonFilePipeline() {
        this.setPath("/data/webmagic");
    }

    /**
     * @param path root directory under which per-task folders are created
     */
    public JsonFilePipeline(String path) {
        this.setPath(path);
    }

    /**
     * Serializes all result items of one page to {@code <path>/<taskUUID>/<articleId>.json}.
     * I/O errors are logged and swallowed so a single failed write does not
     * stop the crawl.
     */
    public void process(ResultItems resultItems, Task task) {
        String path = this.path + PATH_SEPERATOR + task.getUUID() + PATH_SEPERATOR;
        // Fixed: try-with-resources — the original leaked the writer when
        // serialization or the write threw before close().
        try (PrintWriter printWriter = new PrintWriter(
                new FileWriter(this.getFile(path + resultItems.get("articleId") + ".json")))) {
            printWriter.write(JSON.toJSONString(resultItems.getAll()));
        } catch (IOException e) {
            this.logger.warn("write file error", e);
        }
    }
}
執行程式後,生成的資料如下: