java爬蟲的幾個例項
阿新 • 發佈:2018-12-25
單個網頁爬取
package redis.list; import org.jsoup.Jsoup; import org.jsoup.nodes.Document; import org.jsoup.select.Elements; /** * 訪問單個新聞頁: * http://www.huxiu.com/article/102062/1.html 需要:標題,內容 */ public class SingleArticle { public static void main(String[] args) throws Exception { // 第一步:訪問頁面 String url = "http://www.huxiu.com/article/102062/1.html"; Document document = Jsoup.connect(url).get(); // 第二步:解析頁面 Elements titleElements = document.getElementsByTag("title"); String title = titleElements.get(0).text(); Elements elements = document.select("div #article_content"); String content = elements.text(); // 第三步:列印 System.out.println("title:" + title); System.out.println("content:" + content); } }
爬網站上的文章
package redis.list;

import org.jsoup.Jsoup;
import org.jsoup.helper.StringUtil;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import redis.clients.jedis.Jedis;

import java.util.Date;
import java.util.List;

/**
 * Crawls article links from the huxiu.com front page into a Redis work
 * queue (a list), then drains the queue, downloading and parsing each
 * article page into an {@code Article}.
 */
public class Crawler {

    // Redis list of URLs still waiting to be crawled.
    // To de-duplicate, a Redis set of already-crawled URLs could be added.
    private static final String redisUrlsWillKey = "crawler:urls:will";

    public static void main(String[] args) throws Exception {
        // Seed page to scan for article links.
        String startUrl = "https://www.huxiu.com";
        String domain = "http://www.huxiu.com/";
        // Collect article URLs into the Redis queue.
        getUrls(startUrl, domain);
        // Drain the queue: download, parse and print each article.
        parserUrl();
    }

    /**
     * Pops URLs from the Redis queue until it is empty, parsing each one.
     *
     * <p>BUG FIX: the original looped forever — {@code rpop} returns null
     * once the list is empty, and the resulting exception was swallowed by
     * an empty catch block, producing an infinite busy loop. We now stop
     * when the queue is drained, and log failures instead of hiding them.
     */
    private static void parserUrl() throws Exception {
        // try-with-resources closes the Redis connection (the original
        // leaked it).
        try (Jedis jedis = new Jedis("127.0.0.1", 6379)) {
            while (true) {
                String url = jedis.rpop(redisUrlsWillKey);
                if (url == null) {
                    break; // queue drained
                }
                try {
                    Article article = parser(url);
                    System.out.println("++++++++" + article);
                } catch (Exception e) {
                    // Log and drop (matches the original's drop behavior,
                    // but no longer silently). Re-queueing with lpush is
                    // possible but would loop forever on a permanently
                    // broken URL without a retry limit.
                    System.err.println("failed to parse " + url + ": " + e);
                }
            }
        }
    }

    /**
     * Downloads one article page and extracts author, date, title, id and
     * body text into an {@code Article}.
     *
     * @param url absolute article URL, e.g.
     *            http://www.huxiu.com/article/124698/1.html
     */
    private static Article parser(String url) throws Exception {
        Document articleDocument = Jsoup.connect(url).get();
        Article article = new Article();

        // Author; fall back to a default name when the page has none.
        Elements author = articleDocument.getElementsByClass("author-name");
        article.setAuthor(StringUtil.isBlank(author.text()) ? "jingzhongyue" : author.text());

        // Publication date; fall back to "now" when absent.
        Elements date = articleDocument.getElementsByClass("article-time");
        article.setDate(StringUtil.isBlank(date.text()) ? new Date() : DateUtil.getDate(date.text()));

        // Page title.
        Elements title = articleDocument.getElementsByTag("title");
        article.setTitle(title.text());

        // Article id: the path segment after "article/", e.g.
        // http://www.huxiu.com/article/124698/1.html -> "124698".
        // BUG FIX: the original hard-coded url.substring(29), which only
        // works for one exact scheme/host prefix length (breaks on https).
        String id = url.substring(url.indexOf("article/") + "article/".length());
        int slash = id.indexOf("/");
        if (slash >= 0) {
            id = id.substring(0, slash);
        }
        article.setId(id);

        // Body text: concatenate the <p> descendants of #article_content.
        // BUG FIX: the original called children().tagName("p"), but
        // Elements.tagName(String) is a MUTATOR that renames every child
        // element to <p>; it does not select <p> elements.
        StringBuffer stringBuffer = new StringBuffer();
        Element contentDiv = articleDocument.getElementById("article_content");
        if (contentDiv != null) {
            for (Element paragraph : contentDiv.getElementsByTag("p")) {
                stringBuffer.append(paragraph.text());
            }
        }
        // NOTE(review): the original computed the body text but never stored
        // it on the Article. Presumably Article has a content setter — enable
        // the next line once that is confirmed:
        // article.setContent(stringBuffer.toString());

        return article;
    }

    /**
     * Scans the start page for links containing "article" and pushes the
     * absolute URLs onto the Redis work queue.
     *
     * @param startUrl page to scan for links
     * @param domain   prefix prepended to each relative href
     */
    private static void getUrls(String startUrl, String domain) throws Exception {
        // try-with-resources closes the Redis connection (the original
        // leaked it).
        try (Jedis jedis = new Jedis("127.0.0.1", 6379)) {
            Document document = Jsoup.connect(startUrl).get();
            Elements elements = document.getElementsByAttribute("href");
            for (Element element : elements) {
                String endUrl = element.attr("href");
                if (endUrl.contains("article")) {
                    String url = domain + endUrl;
                    System.out.println(url);
                    jedis.lpush(redisUrlsWillKey, url);
                }
            }
        }
    }
}