1. 程式人生 > 基於Java的網路爬蟲實現抓取網路小說(一)

基於Java的網路爬蟲實現抓取網路小說(一)

package novel.spider.impl;

import java.util.ArrayList;
import java.util.List;

import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClientBuilder;
import org.apache.http.util.EntityUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import novel.spider.entity.Chapter;
import novel.spider.interfaces.IChapterInter;

/**
 * Implementation of the novel chapter-URL interface.
 * <p>
 * Downloads a chapter-index page over HTTP and builds one {@link Chapter}
 * (title and URL) for every link matched by the {@code #list dd a} selector.
 * 
 * @author lilonghua
 * @date: 2017-06-22
 */
public class IChapterInterImpl implements IChapterInter {

	/**
	 * Fetches the given URL with Apache HttpClient and returns the response body as text.
	 *
	 * @param url address of the page to download
	 * @return the response entity converted to a string by {@code EntityUtils.toString}
	 * @throws Exception kept in the signature for compatibility with overriding
	 *         subclasses; this implementation reports every failure as a
	 *         {@link RuntimeException} carrying the original cause
	 */
	protected String crawl(String url) throws Exception {
		// Both the client and the response are closed automatically, even on failure.
		try (CloseableHttpClient httpClient = HttpClientBuilder.create().build();
			 CloseableHttpResponse httpResponse = httpClient.execute(new HttpGet(url))) {
			// NOTE(review): no charset is passed, so decoding relies on the response's
			// Content-Type or the library default - confirm this suits the target site.
			return EntityUtils.toString(httpResponse.getEntity());
		} catch (Exception e) {
			throw new RuntimeException("Failed to fetch " + url, e);
		}
	}

	/**
	 * Downloads the chapter-index page at {@code url} and extracts its chapter links.
	 * <p>
	 * Each link's {@code href} is resolved against {@code url} itself, so the
	 * returned chapter URLs point at the site that was actually crawled rather
	 * than at one fixed host.
	 *
	 * @param url address of the chapter-index page
	 * @return one {@link Chapter} per {@code #list dd a} link, in document order;
	 *         empty when the page contains no such links
	 * @throws RuntimeException if the page cannot be downloaded or processed
	 */
	@Override
	public List<Chapter> getChapter(String url) {
		try {
			String html = crawl(url);
			// The page URL is supplied as base URI so relative hrefs can be made absolute.
			Document doc = Jsoup.parse(html, url);
			Elements links = doc.select("#list dd a");
			List<Chapter> chapters = new ArrayList<>(links.size());
			for (Element link : links) {
				Chapter chapter = new Chapter();
				chapter.setTitle(link.text());
				chapter.setUrl(resolveHref(link));
				chapters.add(chapter);
			}
			return chapters;
		} catch (RuntimeException e) {
			// Already unchecked (e.g. thrown by crawl); avoid wrapping it a second time.
			throw e;
		} catch (Exception e) {
			throw new RuntimeException("Failed to read chapter list from " + url, e);
		}
	}

	/**
	 * Returns the absolute form of the link's {@code href}, or the raw attribute
	 * value when jsoup cannot resolve it against the document's base URI.
	 */
	private static String resolveHref(Element link) {
		String absolute = link.absUrl("href");
		return absolute.isEmpty() ? link.attr("href") : absolute;
	}

}
  • 最後是測試,我們需要在test下建立測試類TestOne.java
package novel.spider.test;

import java.util.List;

import org.junit.Test;

import novel.spider.entity.Chapter;
import novel.spider.impl.IChapterInterImpl;

/**
 * Smoke test for chapter-link extraction.
 * @author lilonghua
 * @date: 2017-06-22
 */
public class TestOne {

	/**
	 * Requests the chapter list for a fixed index page and prints every
	 * returned chapter to standard output.
	 */
	@Test
	public void test1(){
		final String indexUrl = "http://www.biquge.tw/0_5/";
		IChapterInterImpl chapterSpider = new IChapterInterImpl();
		for (Chapter entry : chapterSpider.getChapter(indexUrl)) {
			System.out.println(entry);
		}
	}
}
  • 測試結果,完美實現