1. 程式人生 > >無搜尋條件根據url獲取網頁資料(java爬取網頁資料)

無搜尋條件根據url獲取網頁資料(java爬取網頁資料)

jsoup 依賴（Maven 座標如下；程式碼中的 org.apache.http.* 另需引入 Apache HttpClient 4.3 以上版本的 httpclient 依賴）

<dependency>
 	<groupId>org.jsoup</groupId>
  	<artifactId>jsoup</artifactId>
  	<version>1.11.3</version>
</dependency>

 

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;

import org.apache.http.HttpStatus;
import org.apache.http.client.ClientProtocolException;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;


/**
 * Downloads the page at {@code url} with an HTTP GET and returns its body as text.
 *
 * <p>The body stream is converted to a string by {@link #getStreamString(InputStream)}
 * rather than by {@code EntityUtils.toString}, because the latter was observed to
 * produce garbled characters for the pages this was written for.
 *
 * @param url the URL to request
 * @return the response body, or {@code null} when the status code is not 200, the
 *         response carries no entity, or the body could not be read
 * @throws ClientProtocolException if executing the request fails with an HTTP protocol error
 * @throws IOException if executing the request fails, or closing the response or client fails
 */
	public static String getHtmlByUrl(String url) throws ClientProtocolException, IOException {
		// try-with-resources closes the response first and then the client, and it
		// still closes the client when execute() itself throws (the original leaked
		// the client in that case, and skipped closing it if response.close() threw).
		try (CloseableHttpClient httpClient = HttpClients.createDefault();
				CloseableHttpResponse response = httpClient.execute(new HttpGet(url))) {
			// Only 200 is treated as success; every other status yields null.
			if (response.getStatusLine().getStatusCode() != HttpStatus.SC_OK) {
				return null;
			}
			// A response may legitimately have no entity; check instead of relying on a caught NPE.
			if (response.getEntity() == null) {
				return null;
			}
			try {
				InputStream content = response.getEntity().getContent();
				return content == null ? null : getStreamString(content);
			} catch (IOException | RuntimeException e) {
				// Best-effort by design: a failure while reading the body is reported
				// and mapped to null, which callers already treat as "no page".
				e.printStackTrace();
				return null;
			}
		}
	}

    /**
     * Reads an input stream to the end and returns its content decoded as GB2312 text.
     *
     * <p>Equivalent to {@code getStreamString(tInputStream, "gb2312")}.
     *
     * @param tInputStream the stream to read; may be {@code null}
     * @return the decoded text with every line terminated by {@code '\n'}, or
     *         {@code null} if the stream is {@code null} or reading fails
     */
    public static String getStreamString(InputStream tInputStream) {
        return getStreamString(tInputStream, "gb2312");
    }

    /**
     * Reads an input stream to the end and returns its content decoded with the given charset.
     *
     * <p>The stream is read line by line, so every line terminator in the input is
     * normalised to {@code '\n'} and the result always ends with {@code '\n'} when the
     * stream contained at least one line. The stream is not closed here; the caller
     * keeps ownership of it.
     *
     * @param tInputStream the stream to read; may be {@code null}
     * @param charsetName  name of the charset used to decode the bytes, e.g. {@code "UTF-8"}
     * @return the decoded text, or {@code null} if the stream is {@code null}, the
     *         charset name is not supported, or reading fails
     */
    public static String getStreamString(InputStream tInputStream, String charsetName) {
        if (tInputStream == null) {
            return null;
        }
        try {
            BufferedReader reader = new BufferedReader(new InputStreamReader(tInputStream, charsetName));
            StringBuilder text = new StringBuilder();
            String line;
            while ((line = reader.readLine()) != null) {
                text.append(line).append('\n');
            }
            return text.toString();
        } catch (IOException ex) {
            // Covers read errors and UnsupportedEncodingException; reported and mapped
            // to null so callers can keep treating null as "nothing could be read".
            ex.printStackTrace();
            return null;
        }
    }


 /**
  * Demo entry point: downloads one page and parses it with jsoup.
  *
  * <p>NOTE(review): {@code url} is not declared in the visible part of this file;
  * presumably it is a static field defined elsewhere - confirm before running.
  *
  * @param args command-line arguments (unused)
  * @throws ClientProtocolException propagated from {@code getHtmlByUrl}
  * @throws IOException propagated from {@code getHtmlByUrl}
  */
 public static void main(String[] args) throws ClientProtocolException, IOException {
     String pageHtml = getHtmlByUrl(url);
     // Nothing to parse when the download produced no content.
     if (pageHtml == null || pageHtml.isEmpty()) {
         return;
     }
     // Parse the downloaded markup; further extraction would start from this document.
     Document doc = Jsoup.parse(pageHtml);
 }