無搜尋條件根據url獲取網頁資料(java爬取網頁資料)
阿新 • • 發佈:2018-11-30
所需依賴:jsoup jar包(下方程式碼另外使用了 org.apache.http.*,還需要引入 Apache HttpClient 4.3 以上版本)
<dependency>
<groupId>org.jsoup</groupId>
<artifactId>jsoup</artifactId>
<version>1.11.3</version>
</dependency>
import java.io.BufferedReader; import java.io.IOException; import java.io.InputStream; import java.io.InputStreamReader; import org.apache.http.HttpStatus; import org.apache.http.client.ClientProtocolException; import org.apache.http.client.methods.CloseableHttpResponse; import org.apache.http.client.methods.HttpGet; import org.apache.http.impl.client.CloseableHttpClient; import org.apache.http.impl.client.HttpClients; import org.jsoup.Jsoup; import org.jsoup.nodes.Document; import org.jsoup.nodes.Element; import org.jsoup.select.Elements; /** * 根據URL獲得所有的html資訊 * @param url * @return * @throws IOException * @throws ClientProtocolException */ public static String getHtmlByUrl(String url) throws ClientProtocolException, IOException{ String html = null; //建立httpClient物件 CloseableHttpClient httpClient = HttpClients.createDefault(); //以get方式請求該URL HttpGet httpget = new HttpGet(url); CloseableHttpResponse response = httpClient.execute(httpget); try { //得到responce物件 //HttpResponse responce = httpClient.execute(httpget); //返回碼 int resStatu = response.getStatusLine().getStatusCode(); if (resStatu==HttpStatus.SC_OK) {//200正常 其他就不對 //獲得輸入流 InputStream entity = response.getEntity().getContent(); if (entity!=null) { //通過輸入流轉為字串獲得html原始碼 注:可以獲得實體,然後通過 EntityUtils.toString方法獲得html //但是有可能出現亂碼,因此在這裡採用了這種方式 html=getStreamString(entity); // System.out.println(html); } } } catch (Exception e) { //System.out.println("訪問【"+url+"】出現異常!"); e.printStackTrace(); } finally { //httpClient.getConnectionManager().shutdown(); response.close(); try { httpClient.close(); } catch (IOException e) { e.printStackTrace(); } } return html; } /** * 將一個輸入流轉化為字串 */ public static String getStreamString(InputStream tInputStream){ if (tInputStream != null){ try{ BufferedReader tBufferedReader = new BufferedReader(new InputStreamReader(tInputStream,"gb2312")); StringBuffer tStringBuffer = new StringBuffer(); String sTempOneLine = new String(""); while ((sTempOneLine = tBufferedReader.readLine()) != null){ 
tStringBuffer.append(sTempOneLine+"\n"); } return tStringBuffer.toString(); }catch (Exception ex){ ex.printStackTrace(); } } return null; } public static void main(String[] args) throws ClientProtocolException, IOException { String htmlByUrl = getHtmlByUrl(url); if(htmlByUrl!=null&&!"".equals(htmlByUrl)) { //解析內容 Document doc = Jsoup.parse(htmlByUrl); } }