Java根據網頁URL獲取正文資訊,並調整正文格式為段落顯示(兩種方式)
阿新 • • 發佈:2019-02-15
WebCollector的正文抽取API都被封裝為ContentExtractor類的靜態方法。可以抽取結構化新聞,也可以只抽取網頁的正文(或正文所在Element)。
需要了解的兩個類 :
- ContentExtractor : 封裝了正文抽取演算法和正文抽取的API,正文抽取API都被封裝為ContentExtractor類的靜態方法
- News : 結構化新聞對應的模型
package spiderWorker.testWebCollector;

import java.io.BufferedReader;
import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.UnsupportedEncodingException;
import java.net.HttpURLConnection;
import java.net.URL;
import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;

import cn.edu.hfut.dmic.contentextractor.ContentExtractor;
import cn.edu.hfut.dmic.contentextractor.News;

/**
 * Demo: obtains the HTML of a news page, extracts the structured article with
 * WebCollector's {@link ContentExtractor} and prints its title, time and content,
 * with the content reformatted for paragraph-style display.
 *
 * <p>Two ways of obtaining the HTML are shown: downloading it from a URL
 * ({@link #getURLSource(URL)}) and reading a previously saved file
 * ({@code readHtmlFile(File)}).
 *
 * <p>Requires the WebCollector dependency:
 * <pre>
 * &lt;dependency&gt;
 *   &lt;groupId&gt;cn.edu.hfut.dmic.webcollector&lt;/groupId&gt;
 *   &lt;artifactId&gt;WebCollector&lt;/artifactId&gt;
 *   &lt;version&gt;2.52&lt;/version&gt;
 * &lt;/dependency&gt;
 * </pre>
 */
public class testdemo1 {

    /** Maximum time to wait for the TCP connection to be established. */
    private static final int CONNECT_TIMEOUT_MS = 5 * 1000;

    /** Maximum time to wait for data on an established connection. */
    private static final int READ_TIMEOUT_MS = 10 * 1000;

    /** Size of the chunk buffer used when draining a stream. */
    private static final int BUFFER_SIZE = 4096;

    /** Prefix of the charset parameter inside a Content-Type header value. */
    private static final String CHARSET_PARAM = "charset=";

    /**
     * Downloads the HTML source of the page at the given URL.
     *
     * <p>The response bytes are decoded with the charset announced in the
     * {@code Content-Type} response header; when the header is missing or names an
     * unknown charset, UTF-8 is used. (Decoding with the platform default charset
     * garbles non-ASCII pages on machines whose default is not the page's encoding.)
     *
     * @param url an HTTP(S) URL
     * @return the page source as text
     * @throws Exception if the connection cannot be opened or the response cannot be read
     */
    public static String getURLSource(URL url) throws Exception {
        HttpURLConnection conn = (HttpURLConnection) url.openConnection();
        try {
            conn.setRequestMethod("GET");
            conn.setConnectTimeout(CONNECT_TIMEOUT_MS);
            // Without a read timeout a stalled server would block this call forever.
            conn.setReadTimeout(READ_TIMEOUT_MS);

            // readInputStream drains and closes the response stream.
            byte[] data = readInputStream(conn.getInputStream());
            return new String(data, charsetOf(conn.getContentType()));
        } finally {
            conn.disconnect();
        }
    }

    /**
     * Reads the given stream to its end and returns everything read as a byte array.
     * The stream is always closed, even when reading fails.
     *
     * @param instream the stream to drain
     * @return all bytes read from the stream
     * @throws Exception if reading from or closing the stream fails
     */
    public static byte[] readInputStream(InputStream instream) throws Exception {
        try (InputStream in = instream) {
            ByteArrayOutputStream outStream = new ByteArrayOutputStream();
            byte[] buffer = new byte[BUFFER_SIZE];
            int len;
            while ((len = in.read(buffer)) != -1) {
                outStream.write(buffer, 0, len);
            }
            return outStream.toByteArray();
        }
    }

    /**
     * Picks the charset named by the {@code charset=} parameter of a Content-Type
     * header value, e.g. {@code "text/html; charset=GBK"}.
     *
     * @param contentType the header value, may be {@code null}
     * @return the named charset, or UTF-8 when none is given or the name is not supported
     */
    private static Charset charsetOf(String contentType) {
        if (contentType != null) {
            for (String param : contentType.split(";")) {
                String trimmed = param.trim();
                if (!trimmed.regionMatches(true, 0, CHARSET_PARAM, 0, CHARSET_PARAM.length())) {
                    continue;
                }
                String name = trimmed.substring(CHARSET_PARAM.length()).trim();
                if (name.length() >= 2 && name.startsWith("\"") && name.endsWith("\"")) {
                    name = name.substring(1, name.length() - 1);
                }
                try {
                    return Charset.forName(name);
                } catch (IllegalArgumentException ignored) {
                    // Malformed or unsupported charset name: use the default below.
                    break;
                }
            }
        }
        return StandardCharsets.UTF_8;
    }

    /**
     * Reads a saved HTML file as UTF-8 text, keeping its line breaks.
     *
     * @param file the HTML file to read
     * @return the file content
     * @throws IOException if the file cannot be opened or read
     */
    private static String readHtmlFile(File file) throws IOException {
        StringBuilder sb = new StringBuilder();
        try (BufferedReader reader = new BufferedReader(
                new InputStreamReader(new FileInputStream(file), StandardCharsets.UTF_8))) {
            String lineTxt;
            while ((lineTxt = reader.readLine()) != null) {
                // Keep the line break so text on adjacent lines is not fused together.
                sb.append(lineTxt).append('\n');
            }
        }
        return sb.toString();
    }

    /**
     * Extracts the structured news article from the HTML and prints its title,
     * time and content, one after the other.
     *
     * @param html the page source
     * @throws Exception if the extractor fails
     */
    private static void printNews(String html) throws Exception {
        News news = ContentExtractor.getNewsByHtml(html);

        // Prefix one space so the first paragraph is broken/indented like the rest.
        String content = " " + news.getContent();
        // Every single space becomes "line break + tab".
        // NOTE(review): this also splits on spaces inside a paragraph; confirm the
        // intended paragraph delimiter for the pages being processed.
        content = content.replace(" ", "\r\n\t");

        System.out.println(news.getTitle());
        System.out.println(news.getTime());
        System.out.println(content);
    }

    /**
     * Approach 1: fetch the page from the URL and reformat/print the article
     * directly, without saving the HTML anywhere.
     *
     * @param args unused
     * @throws Exception if downloading or extraction fails
     */
    public static void main(String[] args) throws Exception {
        URL url = new URL("http://www.sohu.com/a/154612018_555775");
        String urlsource = getURLSource(url);
        System.out.println(urlsource);
        printNews(urlsource);

        // Approach 2: save the HTML to a file first, then read the file and
        // reformat/print the article:
        // printNews(readHtmlFile(new File("C:\\Users\\admin\\Desktop\\test1.txt")));
    }
}