A Java-Based Web Crawler for Scraping Web Novels (Part 1)
阿新 • Published: 2019-01-24
```java
package novel.spider.impl;

import java.util.ArrayList;
import java.util.List;

import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClientBuilder;
import org.apache.http.util.EntityUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import novel.spider.entity.Chapter;
import novel.spider.interfaces.IChapterInter;

/**
 * Implementation class of the novel URL (chapter list) interface.
 *
 * @author lilonghua
 * @date 2017-06-22
 */
public class IChapterInterImpl implements IChapterInter {

    /**
     * Fetches the raw HTML of the given URL using Apache HttpClient.
     */
    protected String crawl(String url) {
        try (CloseableHttpClient httpClient = HttpClientBuilder.create().build();
             CloseableHttpResponse httpResponse = httpClient.execute(new HttpGet(url))) {
            return EntityUtils.toString(httpResponse.getEntity());
        } catch (Exception e) {
            throw new RuntimeException(e);
        }
    }

    @Override
    public List<Chapter> getChapter(String url) {
        try {
            // Download the chapter index page and parse it with Jsoup
            String result = crawl(url);
            Document doc = Jsoup.parse(result);
            // Each chapter link sits inside "#list dd a" on the index page
            Elements as = doc.select("#list dd a");
            List<Chapter> chapters = new ArrayList<>();
            for (Element a : as) {
                Chapter chapter = new Chapter();
                chapter.setTitle(a.text());
                // The href is relative, so prepend the site root.
                // NOTE: this root is hard-coded and must match the site being crawled.
                chapter.setUrl("http://www.bxwx8.org" + a.attr("href"));
                chapters.add(chapter);
            }
            return chapters;
        } catch (Exception e) {
            throw new RuntimeException(e);
        }
    }
}
```
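The implementation above depends on a `Chapter` entity and an `IChapterInter` interface that are not shown in this part. Below is a minimal sketch of what they might look like, inferred only from how they are used here; the field names, the `getChapter` signature, and the `toString` format (used when the test prints each chapter) are assumptions, not the original code.

```java
package novel.spider.entity;

// Hypothetical sketch of the Chapter entity, inferred from the setters used above.
public class Chapter {
    private String title;
    private String url;

    public String getTitle() { return title; }
    public void setTitle(String title) { this.title = title; }
    public String getUrl() { return url; }
    public void setUrl(String url) { this.url = url; }

    @Override
    public String toString() {
        // Output format is an assumption; the test simply prints each Chapter.
        return "Chapter [title=" + title + ", url=" + url + "]";
    }
}
```

```java
package novel.spider.interfaces;

import java.util.List;
import novel.spider.entity.Chapter;

// Hypothetical sketch of the crawler interface implemented above.
public interface IChapterInter {
    List<Chapter> getChapter(String url);
}
```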
- Finally, the test: create a test class TestOne.java under the test source directory.
```java
package novel.spider.test;

import java.util.List;

import org.junit.Test;

import novel.spider.entity.Chapter;
import novel.spider.impl.IChapterInterImpl;

/**
 * Test for the chapter-list crawler.
 *
 * @author lilonghua
 * @date 2017-06-22
 */
public class TestOne {

    @Test
    public void test1() {
        IChapterInterImpl chapterInterImpl = new IChapterInterImpl();
        // Crawl the chapter index of a sample novel and print each chapter
        List<Chapter> chapterList = chapterInterImpl.getChapter("http://www.biquge.tw/0_5/");
        for (Chapter chapter : chapterList) {
            System.out.println(chapter);
        }
    }
}
```
- Test result: the chapter list is crawled and printed as expected.
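One caveat worth noting: the implementation hard-codes `http://www.bxwx8.org` as the site root, while the test crawls `http://www.biquge.tw/0_5/`, so the generated chapter URLs are only correct when that prefix matches the site being crawled. A possible refinement (a sketch, not part of the original code) is to hand the page URL to Jsoup as the base URI and let it resolve relative links itself, reusing the imports and `crawl` method from `IChapterInterImpl` above:

```java
// Sketch of an alternative getChapter that resolves relative hrefs against the
// crawled page's own URL instead of a hard-coded site root.
@Override
public List<Chapter> getChapter(String url) {
    String result = crawl(url);
    // Passing the page URL as the base URI lets Jsoup resolve relative links.
    Document doc = Jsoup.parse(result, url);
    List<Chapter> chapters = new ArrayList<>();
    for (Element a : doc.select("#list dd a")) {
        Chapter chapter = new Chapter();
        chapter.setTitle(a.text());
        chapter.setUrl(a.attr("abs:href")); // absolute URL resolved from the base URI
        chapters.add(chapter);
    }
    return chapters;
}
```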