1. 程式人生 > java根據網頁URL獲取正文資訊,並調整正文格式為段落顯示---(兩種方式)

java根據網頁URL獲取正文資訊,並調整正文格式為段落顯示---(兩種方式)

WebCollector的正文抽取API都被封裝為ContentExtractor類的靜態方法。可以抽取結構化新聞,也可以只抽取網頁的正文(或正文所在Element)。

需要了解的兩個類 :

  • ContentExtractor : 封裝了正文抽取演算法和正文抽取的API,正文抽取API都被封裝為ContentExtractor類的靜態方法
  • News : 結構化新聞對應的模型
package spiderWorker.testWebCollector;

import java.io.BufferedReader;
import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.UnsupportedEncodingException;
import java.net.HttpURLConnection;
import java.net.URL;
import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;

import cn.edu.hfut.dmic.contentextractor.ContentExtractor;
import cn.edu.hfut.dmic.contentextractor.News;

/**
 * Demo: download a web page, extract its main article with WebCollector's
 * {@code ContentExtractor}, and print the title, time and body with the body
 * re-flowed into paragraphs.
 *
 * <p>Two ways of obtaining the HTML are supported: fetching it from a URL
 * ({@link #getURLSource(URL)}) or reading a previously saved file
 * ({@link #getFileSource(File)}).
 *
 * <p>Requires the WebCollector dependency
 * ({@code cn.edu.hfut.dmic.webcollector:WebCollector:2.52}).
 */
public class testdemo1 {

    /** Connect and read timeout applied to the HTTP request, in milliseconds. */
    private static final int TIMEOUT_MILLIS = 5 * 1000;

    /** Size of the scratch buffer used when draining a stream. */
    private static final int BUFFER_SIZE = 8192;

    /**
     * Downloads the document at the given URL and returns it as text.
     *
     * <p>The body is decoded with the charset announced in the response's
     * {@code Content-Type} header; when none is announced (or it is not
     * supported by this JVM) UTF-8 is used instead of the platform default,
     * so the result does not depend on the machine the code runs on.
     *
     * @param url the HTTP(S) address to fetch
     * @return the response body as a string
     * @throws Exception if the connection cannot be opened, times out, or the
     *         body cannot be read
     */
    public static String getURLSource(URL url) throws Exception {
        HttpURLConnection conn = (HttpURLConnection) url.openConnection();
        try {
            conn.setRequestMethod("GET");
            conn.setConnectTimeout(TIMEOUT_MILLIS);
            // Without a read timeout a stalled server would block forever.
            conn.setReadTimeout(TIMEOUT_MILLIS);
            byte[] data = readInputStream(conn.getInputStream());
            return new String(data, charsetOf(conn.getContentType()));
        } finally {
            conn.disconnect();
        }
    }

    /**
     * Reads a saved HTML file and returns its content as text, decoded as UTF-8.
     *
     * @param file the file holding the HTML source
     * @return the file content as a string
     * @throws Exception if the file does not exist or cannot be read
     */
    public static String getFileSource(File file) throws Exception {
        byte[] data = readInputStream(new FileInputStream(file));
        return new String(data, StandardCharsets.UTF_8);
    }

    /**
     * Reads the stream to its end and returns everything that was read.
     * The stream is always closed, whether reading succeeds or fails.
     *
     * @param instream the stream to drain; closed by this method
     * @return all bytes read from the stream
     * @throws Exception if reading from the stream fails
     */
    public static byte[] readInputStream(InputStream instream) throws Exception {
        try (InputStream in = instream) {
            ByteArrayOutputStream outStream = new ByteArrayOutputStream();
            byte[] buffer = new byte[BUFFER_SIZE];
            int len;
            while ((len = in.read(buffer)) != -1) {
                outStream.write(buffer, 0, len);
            }
            return outStream.toByteArray();
        }
    }

    /**
     * Picks the charset named by a {@code Content-Type} header value such as
     * {@code text/html; charset=GBK}, falling back to UTF-8.
     */
    private static Charset charsetOf(String contentType) {
        final String key = "charset=";
        if (contentType != null) {
            for (String part : contentType.split(";")) {
                String param = part.trim();
                if (!param.regionMatches(true, 0, key, 0, key.length())) {
                    continue;
                }
                String name = param.substring(key.length()).trim();
                if (name.length() >= 2 && name.startsWith("\"") && name.endsWith("\"")) {
                    name = name.substring(1, name.length() - 1);
                }
                try {
                    return Charset.forName(name);
                } catch (IllegalArgumentException ignored) {
                    // Illegal or unsupported charset name: use the default below.
                }
            }
        }
        return StandardCharsets.UTF_8;
    }

    /**
     * Re-flows extracted article text into paragraphs: every space becomes a
     * line break followed by a tab, and the first paragraph is indented the
     * same way. A {@code null} content is treated as empty.
     */
    private static String formatAsParagraphs(String content) {
        String text = (content == null) ? "" : content;
        // Plain replace(): the pattern is a literal space, no regex needed.
        return (" " + text).replace(" ", "\r\n\t");
    }

    /**
     * Fetches a sample news page, prints its raw HTML, then prints the
     * extracted title, time and paragraph-formatted body.
     *
     * @param args unused
     * @throws Exception if the page cannot be fetched or extraction fails
     */
    public static void main(String[] args) throws Exception {
        // Approach 1: fetch the HTML snapshot straight from the URL, without
        // saving it, and format it directly.
        URL url = new URL("http://www.sohu.com/a/154612018_555775");
        String urlsource = getURLSource(url);
        // Approach 2: read HTML previously saved to disk instead, e.g.
        //   String urlsource = getFileSource(new File("C:\\Users\\admin\\Desktop\\test1.txt"));
        System.out.println(urlsource);

        News news = ContentExtractor.getNewsByHtml(urlsource);
        String title = news.getTitle();
        String time = news.getTime();
        String content = formatAsParagraphs(news.getContent());
        System.out.println(title);
        System.out.println(time);
        System.out.println(content);
    }

}