java爬蟲-0020,httpclient獲取原始碼
阿新 • • 發佈:2018-12-13
1、匯入httpclient依賴
<!-- Apache HttpComponents artifacts (groupId org.apache.httpcomponents), all pinned to version 4.3: httpclient, httpclient-cache and httpmime. -->
<dependency> <groupId>org.apache.httpcomponents</groupId> <artifactId>httpclient</artifactId> <version>4.3</version> </dependency> <dependency> <groupId>org.apache.httpcomponents</groupId> <artifactId>httpclient-cache</artifactId> <version>4.3</version> </dependency> <dependency> <groupId>org.apache.httpcomponents</groupId> <artifactId>httpmime</artifactId> <version>4.3</version> </dependency>
2、封裝請求物件
/** * Created by rongyaowen on 2018/10/4. * 請求封裝,get請求,post請求。 */ public class Request { private static CloseableHttpClient closeableHttpClient = HttpClientBuilder.create().build(); /** * get 請求 * * @param url * @param headerParams 請求頭 * @return */ public static Map<String, Object> get(String url, Map<String, Object> headerParams) { HttpGet httpGet = new HttpGet(url); Map<String, Object> logMap = new HashMap<>(); logMap.put("請求連結", url); return response(httpGet, headerParams, logMap); } /** * post 請求 * * @param url * @param headerParams 請求頭 * @param requestParams 請求資料 * @return */ public static Map<String, Object> post(String url, Map<String, Object> headerParams, Map<String, Object> requestParams) { HttpPost httpPost = new HttpPost(url); StringEntity entity = null; try { String requestParamsStr = null; if (!requestParams.isEmpty() && !StringUtils.isEmpty(requestParamsStr = requestParams.get(P.REQUEST.REQUEST_PARAMS).toString())) { entity = new StringEntity(requestParamsStr); } String contentTypeStr = null; if (!requestParams.isEmpty() && !StringUtils.isEmpty(contentTypeStr = requestParams.get(P.REQUEST.CONTENT_TYPE).toString())) { // 表單格式資料 entity.setContentType(contentTypeStr); } httpPost.setEntity(entity); } catch (UnsupportedEncodingException e) { e.printStackTrace(); } Map<String, Object> logMap = new HashMap<>(); logMap.put("親求連結", url); logMap.put("請求引數", requestParams); return response(httpPost, headerParams, logMap); } /** * 請求 * * @param httpRequestBase * @param headerParams 請求頭 * @param logMap 日誌map * @return */ private static Map<String, Object> response(HttpRequestBase httpRequestBase, Map<String, Object> headerParams, Map<String, Object> logMap) { Map<String, Object> resMap = new HashMap<>(); RequestConfig config = RequestConfig.custom().setConnectionRequestTimeout(5000).setConnectTimeout(5000) .setSocketTimeout(5000).build(); httpRequestBase.setConfig(config); // 拼裝請求頭 if (!headerParams.isEmpty()) { for (Map.Entry<String, Object> entry : 
headerParams.entrySet()) { httpRequestBase.addHeader(entry.getKey(), entry.getValue().toString()); } } try { HttpResponse httpResponse = closeableHttpClient.execute(httpRequestBase); // 狀態碼 int statusCode = httpResponse.getStatusLine().getStatusCode(); logMap.put("請求頭", headerParams); logMap.put("狀態碼", statusCode); logMap.put("請求方法", httpRequestBase.getMethod()); LogUtil.debug(LogUtil.mapToStr(logMap)); // 返回響應body資料 HttpEntity entity = httpResponse.getEntity(); String resBody = EntityUtils.toString(entity, "utf-8"); // 響應頭 Header[] headers = httpResponse.getAllHeaders(); // 組裝響應 resMap.put(P.REQUEST.RES_BODY, resBody); resMap.put(P.REQUEST.HEADERS, headers); } catch (IOException e) { e.printStackTrace(); } return resMap; } /** * 獲取請求流 * * @param url * @param headerParams * @return */ public static InputStream getAuthCode(String url, Map<String, Object> headerParams) { RequestConfig config = RequestConfig.custom().setConnectionRequestTimeout(5000).setConnectTimeout(5000) .setSocketTimeout(5000).build(); HttpGet httpGet = new HttpGet(url); httpGet.setConfig(config); // 拼裝請求頭 if (!headerParams.isEmpty()) { for (Map.Entry<String, Object> entry : headerParams.entrySet()) { httpGet.addHeader(entry.getKey(), entry.getValue().toString()); } } HttpResponse httpResponse = null; try { httpResponse = closeableHttpClient.execute(httpGet); int statusCode = httpResponse.getStatusLine().getStatusCode(); Map<String, Object> logMap = new HashMap<>(); logMap.put("請求連結", url); logMap.put("請求頭", headerParams); logMap.put("請求方法", httpGet.getMethod()); logMap.put("請求狀態", statusCode); LogUtil.debug(LogUtil.mapToStr(logMap)); if (statusCode == HttpStatus.SC_OK) { HttpEntity entity = httpResponse.getEntity(); return entity.getContent(); } } catch (IOException e) { e.printStackTrace(); } return null; } }
3、獲取豆瓣未登入主頁內容
首先在谷歌瀏覽器的開發者工具中,拿到 User-Agent 請求頭資訊(請求若沒有帶上這個資訊,會被伺服器判定為爬蟲)
4、模擬傳送請求獲取主頁內容
/**
 * First crawler example: fetches the page source of https://www.douban.com and prints it.
 * The request must carry a User-Agent header.
 */
@Test
public void crawlerClient_01() {
    Map<String, Object> headers = new HashMap<>();
    headers.put(P.REQUEST.USER_AGENT, P.USER_AGENT);

    Map<String, Object> result = Request.get("https://www.douban.com", headers);
    Object pageSource = result.get(P.REQUEST.RES_BODY);
    System.out.println(pageSource);
}
5、效果展示