程式人生 > (11)Java爬蟲框架webmagic實戰

(11)Java爬蟲框架webmagic實戰

Java爬蟲框架webmagic實戰

本文是我關於webmagic爬蟲框架的實戰——爬取古詩詞網站的詩詞資料。此程式碼只用於爬蟲學習,勿用於商業用途。

安裝webmagic

webmagic使用maven管理依賴,在專案中新增對應的依賴即可使用webmagic:

<!-- webmagic-core artifact (group us.codecraft), pinned to version 0.7.3. -->
<dependency>
    <groupId>us.codecraft</groupId>
    <artifactId>webmagic-core</artifactId>
    <version>0.7.3</version>
</dependency>
<!-- webmagic-extension artifact, kept on the same 0.7.3 version as webmagic-core. -->
<dependency>
    <groupId>us.codecraft</groupId>
    <artifactId>webmagic-extension</artifactId>
    <version>0.7.3</version>
</dependency>

定義Article類儲存詩詞資料

package edu.nlp.model;

public class Article {

    private int articleId;
    /**
     * 型別
     **/
    private String type;
    /**
     * 作者
     **/
    private String author;
    /**
     * 朝代
     **/
    private String dynasty;
    /**
     * 作者簡介
     **/
    private String authorInfo;
    /**
     * 標題
     **/
    private String title;
    /**
     * 原文
     **/
    private String content;
    /**
     * 譯文
     **/
    private String translation;
    /**
     * 註釋
     **/
    private String comment;
    /**
     * 賞析
     **/
    private String appreciation;
    /**
     * UUID
     **/
    private String id;
    /**
     * 匹配度
     **/
    private float score;

    public int getArticleId() {
        return articleId;
    }

    public void setArticleId(int articleId) {
        this.articleId = articleId;
    }

    public String getType() {
        return type;
    }

    public void setType(String type) {
        this.type = type;
    }

    public String getAuthor() {
        return author;
    }

    public void setAuthor(String author) {
        this.author = author;
    }

    public String getDynasty() {
        return dynasty;
    }

    public void setDynasty(String dynasty) {
        this.dynasty = dynasty;
    }

    public String getAuthorInfo() {
        return authorInfo;
    }

    public void setAuthorInfo(String authorInfo) {
        this.authorInfo = authorInfo;
    }

    public String getTitle() {
        return title;
    }

    public void setTitle(String title) {
        this.title = title;
    }

    public String getContent() {
        return content;
    }

    public void setContent(String content) {
        this.content = content;
    }

    public String getTranslation() {
        return translation;
    }

    public void setTranslation(String translation) {
        this.translation = translation;
    }

    public String getComment() {
        return comment;
    }

    public void setComment(String comment) {
        this.comment = comment;
    }

    public String getAppreciation() {
        return appreciation;
    }

    public void setAppreciation(String appreciation) {
        this.appreciation = appreciation;
    }

    public String toString() {
        return "Article:{id=" + id + ",score=" + score + ",type=" + type
                + ",dynasty=" + dynasty + ",author=" + author
                + ",authorInfo=" + authorInfo + ",title=" + title + ",content="
                + content + ",translation=" + translation + ",comment=" + comment
                + ",appreciation=" + appreciation + "}";
    }

    public String getId() {
        return id;
    }

    public void setId(String id) {
        this.id = id;
    }

    public float getScore() {
        return score;
    }

    public void setScore(float score) {
        this.score = score;
    }

}

爬取中國詩詞網的資料

以各個朝代為初始連結,爬取中國詩詞網中每條詩詞的所屬朝代、作者資訊、原文、翻譯、賞析,儲存每條詩詞資料為json文字。

package edu.nlp.processer;

import java.util.HashMap;
import java.util.List;
import java.util.Map;

import edu.nlp.model.Article;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.pipeline.JsonFilePipeline;
import us.codecraft.webmagic.processor.PageProcessor;
import us.codecraft.webmagic.selector.Html;

public class ShiWenPageProcessor implements PageProcessor {

    /**
     * 匹配朝代
     **/
    private final static String PATTER_DYNASTY =
            "(xianqin|hanchao|weijin|nanbeichao|suichao|tangshi|wudai|"
                    + "songci|jinchao|yuanchao|mingchao|qingchao)";
    /**
     * 朝代連結
     **/
    private final static String URL_DYNASTY =
            "http://www\\.shici\\.net/" + PATTER_DYNASTY + "/$";
    /**
     * 作者連結
     **/
    private final static String URL_AUTHOR =
            "http://www\\.shici\\.net/shiren/[a-z]{5}\\.html";
    /**
     * 詩詞連結
     **/
    private final static String URL_ARTICLE =
            "http://www\\.shici\\.net/" + PATTER_DYNASTY + "/[a-z]{5}\\.html";
    /**
     * 翻譯連結
     **/
    private final static String URL_TRANSLATION =
            "/fanyi/[a-z]{5}\\.html";//http://www\\.shici\\.net
    /**
     * 賞析連結
     **/
    private final static String URL_APPRECIATION =
            "/shangxi/[a-z]{5}\\.html";
    /**
     * 文章Map,暫存Article
     **/
    private static Map<String, Article> articleMap =
            new HashMap<String, Article>();

    /**
     * 儲存Article
     **/
    private void saveArticle(Article article, Page page) {
        System.out.println("詩歌:" + article);
        page.putField("dynasty", article.getDynasty());
        page.putField("author", article.getAuthor());
        page.putField("authorInfo", article.getAuthorInfo());
        page.putField("title", article.getTitle());
        page.putField("content", article.getContent());
        page.putField("translation", article.getTranslation());
        page.putField("comment", article.getComment());
        page.putField("appreciation", article.getAppreciation());
    }

    private Site site = Site.me().setCycleRetryTimes(5)
            .setRetryTimes(5).setSleepTime(1000)
            .setUserAgent("Mo zilla/5.0 (Windows NT 6.1; WOW64; rv:38.0) Gecko/20100101 Firefox/38.0")
            .addHeader("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8")
            .addHeader("Accept-Language", "zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3")
            .setCharset("UTF-8");


    public void process(Page page) {
        if (page.getUrl().regex(URL_DYNASTY).match()) {
            //System.out.println("朝代:"+page.getUrl());
            //作者列表
            List<String> authorUrl = page.getHtml()
                    .xpath("//div[@class='shirenlist']")
                    .links().all();
            page.addTargetRequests(authorUrl);
            //古詩文列表
            List<String> essayUrl = page.getHtml()
                    .xpath("//div[@id='related']/ul")
                    .links().all();
            page.addTargetRequests(essayUrl);
            page.setSkip(true);//跳過這個頁面
        } else if (page.getUrl().regex(URL_AUTHOR).match()) {
            //System.out.println("作者:"+page.getUrl());
            //詩詞列表
            List<String> poemUrl = page.getHtml()
                    .xpath("//div[@id='related']/ul/li/a/@href")
                    .all();
            //System.out.println(poemUrl);
            page.addTargetRequests(poemUrl);
            page.setSkip(true);//跳過這個頁面
        } else if (page.getUrl().regex(URL_ARTICLE).match()) {
            //System.out.println("詩詞:"+page.getUrl());
            Html html = page.getHtml();
            Article article = new Article();
            //朝代
            String dynasty = html
                    .xpath("//div[@id='article']/div[@class='info']")
                    .regex("<span>朝代:</span>(.*?)</p>").toString();
            //System.out.println(dynasty);
            article.setDynasty(dynasty);
            //作者
            String author = html
                    .xpath("//div[@id='article']/div[@class='info']")
                    .regex("<span>作者:</span><.*>(.*?)</a>").toString();
            //System.out.println(author);
            article.setAuthor(author);
            if (!author.equals("佚名")) {
                //作者簡介
                String authorInfo = html
                        .xpath("//div[@class='authorinfo']")
                        .regex("<br>(.*)</div>").toString();
                //System.out.println(authorInfo);
                article.setAuthorInfo(authorInfo);
            }
            //標題
            String title = html.xpath("div[@id='article']/h1/text()")
                    .toString();
            //System.out.println(title);
            article.setTitle(title);
            //原文
            String content = html
                    .xpath("div[@id='article']/div[@class='content']")
                    .regex("<div class=\"content\">(.*)</div>")
                    .toString();
            //System.out.println(content);
            article.setContent(content);
            //譯文連結
            String translationUrl = html
                    .xpath("div[@id='related']/ul/li/h3/a/@href")
                    .regex(URL_TRANSLATION)
                    .toString();
            //賞析連結
            String appreciateUrl = html
                    .xpath("div[@id='related']/ul/li/h3/a/@href")
                    .regex(URL_APPRECIATION)
                    .toString();
            //System.out.println("翻譯:" + translationUrl);
            //System.out.println("賞析:" + appreciateUrl);
            if (translationUrl == null && appreciateUrl == null) {
                saveArticle(article, page);
            } else {
                if (translationUrl != null) {
                    article.setTranslation("http://www.shici.net" + translationUrl);
                    page.addTargetRequest("http://www.shici.net" + translationUrl);
                }
                if (appreciateUrl != null) {
                    article.setAppreciation("http://www.shici.net" + appreciateUrl);
                    page.addTargetRequest("http://www.shici.net" + appreciateUrl);
                }
                articleMap.put(page.getUrl().toString(), article);
                page.setSkip(true);//跳過這個頁面
            }
        } else if (page.getUrl().regex(URL_TRANSLATION).match()) {
            Html html = page.getHtml();
            String articleUrl = "http://www.shici.net" + html
                    .xpath("//div[@class='relatedshici']/h2/a/@href")
                    .toString();
            System.out.println(articleUrl);
            String title = html.xpath("//div[@id='article']/h1/text()").toString();
            String translation = null;
            String comment = null;
            //處理譯文與註釋
            if (title.endsWith("譯文及註釋")) {
                translation = html
                        .xpath("//div[@id='article']/div[@class='content']")
                        .regex("<p><strong>譯文</strong><br>(.*?)</p>")
                        .toString();
                comment = html
                        .xpath("//div[@id='article']/div[@class='content']")
                        .regex("<p><strong>註釋</strong><br>(.*?)</p>")
                        .toString();
            } else {
                if (title.endsWith("譯文")) {
                    translation = html
                            .xpath("//div[@id='article']")
                            .regex("<div class=\"content\">(.*?)</div>")
                            .toString();
                }
                if (title.endsWith("註釋")) {
                    comment = html
                            .xpath("//div[@id='article']")
                            .regex("<div class=\"content\">(.*?)</div>")
                            .toString();
                }
            }
            System.out.println("註釋:" + comment);
            System.out.println("翻譯:" + translation);
            Article article = articleMap.get(articleUrl);
            article.setTranslation(translation);
            article.setComment(comment);
            String appreciation = article.getAppreciation();
            if (appreciation != null && appreciation.startsWith("http")) {
                page.setSkip(true);//跳過這個頁面
            } else {
                saveArticle(article, page);
                articleMap.remove(articleUrl);
            }
        } else if (page.getUrl().regex(URL_APPRECIATION).match()) {
            Html html = page.getHtml();
            String articleUrl = "http://www.shici.net" + html
                    .xpath("//div[@class='relatedshici']/h2/a/@href")
                    .toString();
            System.out.println(articleUrl);
            String title = html.xpath("//div[@id='article']/h1").toString();
            String appreciation = html
                    .xpath("//div[@id='article']")
                    .regex("<div class=\"content\">(.*?)</div>")
                    .toString();
            System.out.println("賞析:" + title + appreciation);
            Article article = articleMap.get(articleUrl);
            article.setAppreciation(title + appreciation);
            String translation = article.getTranslation();
            if (translation != null && translation.startsWith("http")) {
                page.setSkip(true);//跳過這個頁面
            } else {
                saveArticle(article, page);
                articleMap.remove(articleUrl);
            }
        }
    }

    public Site getSite() {
        return site;
    }

    private final static String[] intiUrls = {
            "http://www.shici.net/xianqin/",
            "http://www.shici.net/hanchao/",
            "http://www.shici.net/weijin/",
            "http://www.shici.net/nanbeichao/",
            "http://www.shici.net/suichao/",
            "http://www.shici.net/tangshi/",
            "http://www.shici.net/wudai/",
            "http://www.shici.net/songci/",
            "http://www.shici.net/jinchao/",
            "http://www.shici.net/yuanchao/",
            "http://www.shici.net/mingchao/",
            "http://www.shici.net/qingchao/",
    };

    public static void main(String[] args) {
        Spider.create(new ShiWenPageProcessor())
//                .addUrl("http://www.shici.net/xianqin/")
                .addUrl(intiUrls)
                .addPipeline(new JsonFilePipeline("/Users/liaoxuyi/Desktop/data"))
                .thread(5)
                .run();
        System.out.println("執行結束");
    }

}

執行程式後,生成的資料如下:

(此處原為執行結果的截圖,圖片未能保留。)

爬取好詩文網的資料

以各個朝代下各種詩文型別為初始連結(總共55個連結),爬取好詩文網中每條詩文的所屬朝代、作者資訊、原文、翻譯、賞析,儲存每條詩文資料為json文字。

package edu.nlp.processer;

import edu.nlp.model.Article;
import edu.nlp.pipeline.JsonFilePipeline;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Spider;
//import us.codecraft.webmagic.pipeline.JsonFilePipeline;
import us.codecraft.webmagic.processor.PageProcessor;
import us.codecraft.webmagic.selector.Html;
import us.codecraft.webmagic.selector.Selectable;

import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

public class HaoShiWenPageProcessor implements PageProcessor {

    /**
     * 開始連結
     **/
    private final static String URL_START = "http://www\\.haoshiwen\\.org/type\\.php\\?c=\\d+&x=[1-5]$";
    /**
     * 列表連結
     **/
    private final static String URL_LIST = "http://www\\.haoshiwen\\.org/type\\.php\\?c=\\d+&x=[1-5]&page=\\d+";
    /**
     * 詩詞連結
     **/
    private final static String URL_ARTICLE = "/view\\.php\\?id=\\d+";
    /**
     * 翻譯連結
     **/
    private final static String URL_TRANSLATION = "http://www\\.haoshiwen\\.org/show\\.php\\?t=2&id=\\d+";
    /**
     * 賞析連結
     **/
    private final static String URL_APPRECIATION = "http://www\\.haoshiwen\\.org/show\\.php\\?t=1&id=\\d+";

    /**
     * 暫存Article
     **/
    private static Map<String, Article> articleMap =
            new HashMap<String, Article>();
    /**
     * 暫存article的型別
     **/
    private static Map<String, String> articleType =
            new HashMap<String, String>();

    /**
     * 初始化開始爬取的連結
     **/
    private static String[] intiUrls() {
        String[] urls = new String[55];
        int count = 0;
        for (int i = 1; i <= 11; i++) {
            for (int j = 1; j <= 5; j++) {
                urls[count++] = "http://www.haoshiwen.org/type.php?c=" + i + "&x=" + j;
            }
        }
        return urls;
    }

    /**
     * 獲取article的型別
     *
     * @param url 開始連結或列表連結,從中提取出article型別Num
     * @return
     */
    private static String getType(Selectable url) {
        String type = null;
        int typeNum = Integer.parseInt(url.regex("c=\\d+&x=([1-5)])").toString());
        switch (typeNum) {
            case 1:
                type = "詩";
                break;
            case 2:
                type = "詞";
                break;
            case 3:
                type = "曲";
                break;
            case 4:
                type = "文言文";
                break;
            case 5:
                type = "辭賦";
                break;
        }
        return type;
    }

    /**
     * 儲存Article
     **/
    private void saveArticle(Article article, Page page) {
        //System.out.println("詩歌:" + article);
        page.putField("articleId", article.getArticleId());
        page.putField("type", article.getType());
        page.putField("dynasty", article.getDynasty());
        page.putField("author", article.getAuthor());
        page.putField("authorInfo", article.getAuthorInfo());
        page.putField("title", article.getTitle());
        page.putField("content", article.getContent());
        page.putField("translation", article.getTranslation());
        page.putField("comment", article.getComment());
        page.putField("appreciation", article.getAppreciation());
    }

    /**
     * 配置Site
     **/
    private Site site = Site.me()
            .setCycleRetryTimes(3)// 設定迴圈重試次數
            .setRetryTimes(3)// 設定重試次數
            .setSleepTime(100)// 設定處理page的間隔時間,單位毫秒
            .setTimeOut(3000)// 設定訪問url的超時時間,單位毫秒
            // 設定 userAgent
            .setUserAgent("Mo zilla/5.0 (Windows NT 6.1; WOW64; rv:38.0) Gecko/20100101 Firefox/38.0")
            // 設定 header資訊
            .addHeader("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8")
            .addHeader("Accept-Language", "zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3")
            .setCharset("UTF-8");// 設定編碼

    public Site getSite() {
        return site;
    }

    /**
     * 提取想要的資訊
     **/
    public void process(Page page) {
        if (page.getUrl().regex(URL_START).match()) {
            //獲取頁數OK
            String pageStr = page.getHtml()
                    .xpath("//div[@class='pages']")
                    .regex("/type.php\\?c=\\d+&amp;x=[1-5]&amp;page=(\\d+)\">尾頁</a>")
                    .toString();
            //System.out.println("開始:" + page.getUrl() + " 頁數" + pageStr);
            if (pageStr != null) {
                int pageNum = Integer.parseInt(pageStr);
                //System.out.println(pageNum);
                List<String> pageUrl = new ArrayList<String>();
                //把其餘頁的url新增到Request佇列中
                for (int i = 2; i <= pageNum; i++) {
                    pageUrl.add(page.getUrl() + "&page=" + i);
                }
                page.addTargetRequests(pageUrl);
            }
            //新增起始頁的古詩文列表
            List<String> articleUrl = page.getHtml()
                    .xpath("//div[@class='typeleft']/div[@class='sons']")
                    .regex(URL_ARTICLE)
                    .all();
            //System.out.println("詩歌列表:" + articleUrl);
            page.addTargetRequests(articleUrl);
            page.setSkip(true);//跳過這個頁面
            //獲取型別
            String type = getType(page.getUrl());
            for (String url : articleUrl) {
                //System.out.println("詩歌連結:" + url + ", " + type);
                articleType.put(url, type);
            }
        }
        if (page.getUrl().regex(URL_LIST).match()) {
            //System.out.println("列表:" + page.getUrl());
            //古詩文列表
            List<String> articleUrl = page.getHtml()
                    .xpath("//div[@class='typeleft']/div[@class='sons']")
                    .regex(URL_ARTICLE)
                    .all();
            //System.out.println("詩歌列表:" + articleUrl);
            page.addTargetRequests(articleUrl);
            page.setSkip(true);//跳過這個頁面
            //獲取型別
            String type = getType(page.getUrl());
            for (String url : articleUrl) {
                articleType.put(url, type);
            }
        } else if (page.getUrl().regex(URL_ARTICLE).match()) {
            System.out.println("詩詞:" + page.getUrl());
            // remove the prefix of article url
            String articleUrl = page.getUrl().toString().replace("http://www.haoshiwen.org", "");
            Html html = page.getHtml();
            Article article = new Article();
            //詩歌ID
            article.setArticleId(Integer.parseInt(articleUrl.replace("/view.php?id=", "")));
            //型別
            article.setType(articleType.get(articleUrl));
            //朝代
            String dynasty = html
                    .xpath("//div[@class='son2']")
                    .regex("<span>朝代:</span>(.*?)</p>").toString();
            //System.out.println("朝代:" + dynasty);
            article.setDynasty(dynasty);
            //作者
            String author = html
                    .xpath("//div[@class='son2']")
                    .regex("<span>作者:</span>(.*?)</p>")
                    .toString().replaceAll("</?a.*?>", "");
            //System.out.println("作者" + author);
            article.setAuthor(author);
            if (!author.equals("佚名")) {
                //作者簡介
                String authorInfo = html
                        .regex("<div class=\"son5\" style=\"overflow:auto;\">" +
                                ".*<img.*></a>(.*)<a.*?>\\.▶</a>")
                        .toString();
                //System.out.println("作者簡介:"+authorInfo);
                if (authorInfo != "0") {
                    article.setAuthorInfo(authorInfo);
                }
            }
            //標題
            String title = html.xpath("div[@class='son1']/h1/text()")
                    .toString();
            //System.out.println("標題:"+title);
            article.setTitle(title);
            //原文
            String content = html
                    .xpath("//div[@class='shileft']/div[@class='son2']")
                    .regex("<p style=\"margin\\-top:0px;\">\\&nbsp;</p>\\s+(.*?)<br>\\s+" +
                            "<strong><span style=\"color:#FFFFFF;background-color:#E53333;\">精彩推薦</span></strong>")
                    .toString();
            //System.out.println("原文:" + content);
            article.setContent(content);
            //譯文連結
            String translateUrl = html
                    .xpath("div[@class='son5']").links()
                    .regex(URL_TRANSLATION).toString();
            //賞析連結
            String appreciationUrl = html
                    .xpath("div[@class='son5']").links()
                    .regex(URL_APPRECIATION).toString();
            //System.out.println("翻譯:" + translateUrl);
            //System.out.println("賞析:" + appreciationUrl);
            if (translateUrl == null && appreciationUrl == null) {
                //如果沒有譯文和賞析,則直接儲存該Article物件
                saveArticle(article, page);
            } else {
                //否則,則把Article存在articleMap,等待資訊被補齊才儲存
                if (translateUrl != null) {
                    article.setTranslation(translateUrl);
                    page.addTargetRequest(translateUrl);
                }
                if (appreciationUrl != null) {
                    article.setAppreciation(appreciationUrl);
                    page.addTargetRequest(appreciationUrl);
                }
                articleMap.put(articleUrl, article);
                page.setSkip(true);//跳過這個頁面
            }
        } else if (page.getUrl().regex(URL_TRANSLATION).match()) {
            Html html = page.getHtml();
            String articleUrl = html
                    .xpath("//div[@class='sontitle']/span/a/@href")
                    .toString();
            //System.out.println("詩詞的連結"+articleUrl);
            //翻譯標題
            String translationTitle = html
                    .xpath("//div[@class='shileft']/div[@class='son1']/h1/text()")
                    .toString();
            //System.out.println("翻譯標題:" + translationTitle);
            String translation = null;
            String comment = null;
            if (translationTitle.endsWith("譯文及註釋")) {
                translation = html
                        .xpath("//div[@class='shangxicont']")
                        .regex("<p><strong>譯文.*?</strong>(.*?)</p>")
                        .toString();
                if (translation != null)//去掉無關內容
                    translation = translation.replaceAll("</?a.*?>", "");
                comment = html
                        .xpath("//div[@class='shangxicont']")
                        .regex("<p><strong>註釋.*?</strong>(.*?)</p>")
                        .toString();
                if (comment != null)
                    comment = comment.replaceAll("</?a.*?>", "");
                if (translation == null && comment == null) {
                    //譯文和註釋被合併在了一起
                    translation = html
                            .xpath("//div[@class='shangxicont']")
                            .regex("<p>\\s?作者[::]佚名\\s?</p>(.*?)<p style=")
                            .toString();
                    if (translation != null)
                        translation = translation.replaceAll("</?a.*?>", "");
                }
            } else {
                //只有譯文
                if (translationTitle.endsWith("譯文")) {
                    translation = html
                            .xpath("//div[@class='shangxicont']")
                            .regex("<p>\\s?作者[::]佚名\\s?</p>(.*?)<p style=")
                            .toString();
                    if (translation != null)//去掉無關內容
                        translation = translation.replaceAll("</?a.*?>", "");
                }
                //只有註釋
                if (translationTitle.endsWith("註釋")) {
                    comment = html
                            .xpath("//div[@class='shangxicont']")
                            .regex("<p>\\s?作者[::]佚名\\s?</p>(.*?)<p style=")
                            .toString();
                    if (comment != null) {
                        comment = comment.replaceAll("</?a.*?>", "");
                    }
                }
            }
            //System.out.println("翻譯:" + translation);
            //System.out.println("註釋:" + comment);
            Article article = articleMap.get(articleUrl);
            article.setTranslation(translation);
            article.setComment(comment);
            String appreciation = article.getAppreciation();
            if (appreciation != null && appreciation.startsWith("http")) {
                page.setSkip(true);//跳過這個頁面
            } else {
                saveArticle(article, page);
                articleMap.remove(articleUrl);
            }
        } else if (page.getUrl().regex(URL_APPRECIATION).match()) {
            Html html = page.getHtml();
            String articleUrl = html
                    .xpath("//div[@class='sontitle']/span/a/@href")
                    .toString();
            //System.out.println("詩詞的連結" + articleUrl);
            String appreciateTitle = html
                    .xpath("//div[@class='shileft']/div[@class='son1']/h1")
                    .toString();
            //System.out.println(appreciateTitle);
            String appreciation = html
                    .xpath("//div[@class='shangxicont']")
                    .regex("<p.*>\\s?作者[::]佚名\\s?</p>(.*?)<p style=")
                    .toString();
            if (appreciation != null)
                appreciation = appreciation.replaceAll("</?a.*?>", "");
            //System.out.println("賞析:" + appreciation);
            Article article = articleMap.get(articleUrl);
            article.setAppreciation(appreciateTitle + appreciation);
            String translation = article.getTranslation();
            if (translation != null && translation.startsWith("http")) {
                page.setSkip(true);//跳過這個頁面
            } else {
                saveArticle(article, page);
                articleMap.remove(articleUrl);
            }
        }
    }

    /**
     * 獲取article總數
     **/
    public static int articleCount() {
        return articleType.size();
    }

    public static void main(String[] args) {
        HaoShiWenPageProcessor processor = new HaoShiWenPageProcessor();
        Spider.create(processor)//指定PageProcessor頁面處理器
                .addUrl(intiUrls())//新增爬取連結
                //.addUrl("http://www.haoshiwen.org/view.php?id=47834")
                //指定Pipeline結果處理物件,這裡把結果儲存成JSON檔案
                // 預設儲存到/data/webmagic,這裡儲存資料到/data下
                // 使用自定義的jsonFilePipeline來儲存json資料,以詩歌ID來命名json檔案
                // 預設的jsonFilePipeline是以連結的Url經過md5加密後來命名json檔案的,如果重複下載的話,會出現重複的檔案
                .addPipeline(new JsonFilePipeline("/Users/liaoxuyi/Desktop/data"))
                .thread(5)//指定執行緒數
                .run();//開始爬蟲
        System.out.println("詩詞總數有:" + processor.articleCount());//75604
        System.out.println("執行結束");
    }

}

自定義JsonFilePipeline儲存json資料

webmagic預設的JsonFilePipeline會以頁面連結URL的MD5值作為json檔案的檔名,因此無法從檔名直接對應到網站上的詩文連結。所以這裡自定義JsonFilePipeline,把json檔名設定為詩文的ID,方便查詢原始的詩文內容。

package edu.nlp.pipeline;

import com.alibaba.fastjson.JSON;

import java.io.FileWriter;
import java.io.IOException;
import java.io.PrintWriter;

import org.apache.commons.codec.digest.DigestUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import us.codecraft.webmagic.ResultItems;
import us.codecraft.webmagic.Task;
import us.codecraft.webmagic.pipeline.Pipeline;
import us.codecraft.webmagic.utils.FilePersistentBase;

public class JsonFilePipeline extends FilePersistentBase implements Pipeline {
    private Logger logger = LoggerFactory.getLogger(this.getClass());

    public JsonFilePipeline() {
        this.setPath("/data/webmagic");
    }

    public JsonFilePipeline(String path) {
        this.setPath(path);
    }

    public void process(ResultItems resultItems, Task task) {
        String path = this.path + PATH_SEPERATOR + task.getUUID() + PATH_SEPERATOR;
        try {
            // 用articleId來命名檔名
            PrintWriter printWriter = new PrintWriter(new FileWriter(this.getFile(path + resultItems.get("articleId") + ".json")));
            //PrintWriter printWriter = new PrintWriter(new FileWriter(this.getFile(path + DigestUtils.md5Hex(resultItems.getRequest().getUrl()) + ".json")));
            printWriter.write(JSON.toJSONString(resultItems.getAll()));
            printWriter.close();
        } catch (IOException var5) {
            this.logger.warn("write file error", var5);
        }
    }
}

執行程式後,生成的資料如下:

(此處原為執行結果的截圖,圖片未能保留。)