java.net.*爬取網頁,Jsoup解析網頁內容
阿新 • • 發佈:2020-07-31
java.net.* 建立網路連線
Jsoup解析網頁內容
package com.sun.util;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.io.PrintWriter;
import java.net.MalformedURLException;
import java.net.URL;
import java.net.URLConnection;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

/**
 * Downloads pages with {@code java.net}, extracts the comment items from them
 * with Jsoup, and saves the extracted text to a file.
 */
public class DataDownUtil {

    /** Maximum time to wait for the connection to be established, in milliseconds. */
    private static final int CONNECT_TIMEOUT_MS = 10000;

    /** Maximum time to wait for data once connected, in milliseconds. */
    private static final int READ_TIMEOUT_MS = 30000;

    /** Number of comment items requested per page. */
    private static final int PAGE_SIZE = 20;

    /** Value of the {@code start} query parameter for the last page fetched. */
    private static final int LAST_PAGE_START = 60;

    /**
     * Downloads the resource at {@code url} and returns its text.
     *
     * <p>The content is read line by line and the lines are concatenated
     * <em>without</em> their line terminators. Failures are reported on the
     * console and are not thrown; whatever was read before the failure is
     * returned (an empty string if nothing was read).
     *
     * <p>Example pages:
     * <a href="http://www.baidu.com">Baidu</a>,
     * <a href="https://movie.douban.com/subject/3168101/comments?start=0&limit=20&sort=new_score&status=P">the crawled page</a>.
     *
     * @author UPO
     * @param url      address of the page to download
     * @param encoding name of the charset used to decode the response
     * @return the page source with line terminators removed
     */
    public static String getHtmlResourceByUrl(String url, String encoding) {
        StringBuilder buffer = new StringBuilder();
        try {
            URLConnection connection = new URL(url).openConnection();
            // Without timeouts a stalled server would block this call forever.
            connection.setConnectTimeout(CONNECT_TIMEOUT_MS);
            connection.setReadTimeout(READ_TIMEOUT_MS);

            // The raw stream is declared as its own resource so that it is
            // closed even when the reader cannot be created (bad encoding).
            try (InputStream in = connection.getInputStream();
                    BufferedReader reader =
                            new BufferedReader(new InputStreamReader(in, encoding))) {
                String line;
                while ((line = reader.readLine()) != null) {
                    buffer.append(line);
                }
            }
        } catch (MalformedURLException e) {
            e.printStackTrace();
            System.out.println("網路連線不可用");
        } catch (IOException e) {
            e.printStackTrace();
            System.out.println("網路連線失敗");
        }
        return buffer.toString();
    }

    /**
     * Fetches the comment pages ({@code start} = 0, 20, 40, 60) of the
     * hard-coded Douban subject and collects name, text, time and vote count
     * of every comment item found.
     *
     * <p>A page that could not be downloaded, or that has no element with id
     * {@code comments}, contributes nothing instead of aborting the run.
     * The collected text is also printed to standard output.
     *
     * @return the collected comments, one
     *         {@code name:/desc:/time:/votes:} group per comment
     */
    public static String getContext() {
        StringBuilder context = new StringBuilder();
        String encoding = "utf-8";
        // Each page holds PAGE_SIZE items, so the start offset advances by that amount.
        for (int start = 0; start <= LAST_PAGE_START; start += PAGE_SIZE) {
            String url = "https://movie.douban.com/subject/26266893/comments?start=" + start
                    + "&limit=" + PAGE_SIZE + "&sort=new_score&status=P";
            // 1. Download the page source.
            String html = getHtmlResourceByUrl(url, encoding);
            // 2. Parse it and append every comment item it contains.
            appendComments(Jsoup.parse(html), context);
        }
        System.out.println(context);
        return context.toString();
    }

    /** Appends the fields of every {@code comment-item} in {@code document} to {@code out}. */
    private static void appendComments(Document document, StringBuilder out) {
        // The outermost container of the comment list has id "comments".
        Element container = document.getElementById("comments");
        if (container == null) {
            // Empty download or unexpected page layout: nothing to extract.
            return;
        }
        // Every comment inside the container carries the class "comment-item".
        Elements items = container.getElementsByClass("comment-item");
        for (Element item : items) {
            // The last anchor of an item is used as the commenter name.
            Element lastAnchor = item.getElementsByTag("a").last();
            String name = lastAnchor == null ? "" : lastAnchor.text();
            String desc = item.getElementsByClass("short").text();
            String time = item.getElementsByClass("comment-time").text();
            String votes = item.getElementsByClass("votes").text();
            out.append("\n");
            out.append("name:" + name + "\ndesc:" + desc + "\ntime:" + time + "\nvotes:" + votes);
            out.append("\n");
        }
    }

    /**
     * Writes {@code context} to the file at {@code filePath} as UTF-8,
     * replacing any existing content. Missing parent directories are created.
     * Failures are reported on the console and are not thrown.
     *
     * @author 孫敬欽
     * @version 1.0
     * @param context  the text to store
     * @param filePath path of the file to write
     */
    public static void writeFileByLine(String context, String filePath) {
        File file = new File(filePath);
        File parent = file.getAbsoluteFile().getParentFile();
        if (parent != null && !parent.exists()) {
            // Result deliberately not checked: if the directory could not be
            // created, opening the file below fails and is reported there.
            parent.mkdirs();
        }
        try (PrintWriter printWriter =
                new PrintWriter(new OutputStreamWriter(new FileOutputStream(file), "utf-8"))) {
            printWriter.print(context);
            // PrintWriter never throws on write errors; checkError() flushes
            // and reveals whether one occurred.
            if (printWriter.checkError()) {
                System.err.println("Failed to write file: " + filePath);
            }
        } catch (FileNotFoundException e) {
            e.printStackTrace();
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /**
     * Crawls the comments, prints them, and saves them to a text file.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        System.out.println("你好阿泡");
        // 1. Fetch and parse the page data.
        String context = getContext();
        System.out.println(context);
        // 2. Save it to a txt file.
        String filePath = "D:/movie/bigdata.txt";
        writeFileByLine(context, filePath);
        // 3. Save to the HDFS file system (not implemented).
    }
}