訪問url獲取頁面內容工具類
阿新 • • 發佈:2018-12-12
package com.guanyong.fbimonitor.test; import java.io.BufferedReader; import java.io.ByteArrayInputStream; import java.io.EOFException; import java.io.File; import java.io.FileOutputStream; import java.io.IOException; import java.io.InputStream; import java.io.InputStreamReader; import java.io.ObjectInputStream; import java.net.SocketTimeoutException; import java.net.URI; import java.net.URISyntaxException; import java.nio.charset.Charset; import java.security.cert.CertificateException; import java.text.DateFormat; import java.text.SimpleDateFormat; import java.util.Arrays; import java.util.Date; import java.util.HashMap; import java.util.HashSet; import java.util.List; import java.util.Locale; import java.util.Map; import java.util.TimeZone; import java.util.concurrent.TimeUnit; import java.util.function.Consumer; import java.util.regex.Matcher; import java.util.regex.Pattern; import java.util.zip.GZIPInputStream; import java.util.zip.InflaterInputStream; import javax.net.ssl.SSLContext; import org.apache.http.Header; import org.apache.http.HttpEntity; import org.apache.http.HttpHost; import org.apache.http.HttpResponse; import org.apache.http.HttpStatus; import org.apache.http.NameValuePair; import org.apache.http.auth.AuthScope; import org.apache.http.auth.UsernamePasswordCredentials; import org.apache.http.client.config.CookieSpecs; import org.apache.http.client.config.RequestConfig; import org.apache.http.client.entity.UrlEncodedFormEntity; import org.apache.http.client.methods.CloseableHttpResponse; import org.apache.http.client.methods.HttpGet; import org.apache.http.client.methods.HttpPost; import org.apache.http.client.methods.HttpRequestBase; import org.apache.http.client.protocol.HttpClientContext; import org.apache.http.client.utils.URIUtils; import org.apache.http.config.ConnectionConfig; import org.apache.http.config.MessageConstraints; import org.apache.http.config.Registry; import org.apache.http.config.RegistryBuilder; import org.apache.http.conn.socket.ConnectionSocketFactory; import org.apache.http.conn.socket.PlainConnectionSocketFactory; import org.apache.http.conn.ssl.SSLConnectionSocketFactory; import org.apache.http.cookie.Cookie; import org.apache.http.entity.BasicHttpEntity; import org.apache.http.entity.ContentType; import org.apache.http.entity.StringEntity; import org.apache.http.impl.DefaultHttpResponseFactory; import org.apache.http.impl.auth.BasicScheme; import org.apache.http.impl.client.BasicAuthCache; import org.apache.http.impl.client.BasicCookieStore; import org.apache.http.impl.client.BasicCredentialsProvider; import org.apache.http.impl.client.CloseableHttpClient; import org.apache.http.impl.client.HttpClientBuilder; import org.apache.http.impl.conn.DefaultHttpClientConnectionOperator; import org.apache.http.impl.conn.DefaultHttpResponseParser; import org.apache.http.impl.conn.DefaultHttpResponseParserFactory; import org.apache.http.impl.conn.ManagedHttpClientConnectionFactory; import org.apache.http.impl.conn.PoolingHttpClientConnectionManager; import org.apache.http.io.HttpMessageParser; import org.apache.http.io.HttpMessageParserFactory; import org.apache.http.io.SessionInputBuffer; import org.apache.http.message.BasicLineParser; import org.apache.http.ssl.SSLContextBuilder; import org.apache.http.ssl.TrustStrategy; import org.apache.http.util.ByteArrayBuffer; import org.apache.http.util.CharArrayBuffer; import org.apache.http.util.EntityUtils; public class FetchWebData { public static final Charset gbkCharset = Charset.forName("GBK"); public static final Charset utf8Charset = Charset.forName("UTF-8"); public static final Charset iso8859Charset = Charset.forName("ISO8859-1"); // protected static final Logger LOGGER = LoggerFactory.getLogger(FetchWebData.class); // protected static final Logger LOGGER = null; // meant to be used by a single thread only! In particular httpCtx is not thread-safe private final BasicAuthCache authCache = new BasicAuthCache(); private final HttpClientContext httpCtx; // used across several requests sent with this fd private final BasicCredentialsProvider credProvider = new BasicCredentialsProvider(); private final BasicCookieStore cookieStore = new BasicCookieStore(); private final CloseableHttpClient client; public static final String defaultUserAgent = "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:27.0) Gecko/20100101 Firefox/27.0"; private String userAgent = defaultUserAgent; private Charset forceCharset = null; // if non-null, will be used in request and response entities; if null, reasonable defaults will be used. private String cookieSpec = CookieSpecs.DEFAULT; // FIXME: what about SINGLE_COOKIE_HEADER? private String strCookie; private int RetryNum; private int timeoutMs; // each FetchWebData instance can have its own timeout in milliseconds private Map<String,String> extraHeaders; protected boolean isVerboseUrl = true; // whether each url passed here is shown public void setVerboseUrl(boolean value) { this.isVerboseUrl = value; } private final Integer maxReadMillSeconds =4*60*1000;//一次http最多可以允許的毫秒數 private boolean isFilterApplicationResponse; private String redirectUrl; private String refererUrl; public String lastModified; private boolean shouldTryOtherProxy = true;//是否需要嘗試其他的代理,有些不需要 // private int maxPageSize= 3*1024*1024;//頁面最大大小 private String contentFileType ;//二進位制檔案型別 private Integer minTryNumber = 1; public String getContentFileType() { return contentFileType; } public void setContentFileType(String contentFileType) { this.contentFileType = contentFileType; } public void setReferer(String refererUrl){ this.refererUrl = refererUrl; } public String getRedirectUrl(){ return redirectUrl; } public String getLastRequestUrl(){ return lastRedirURL; } public boolean isFilterApplicationResponse() { return isFilterApplicationResponse; } public void setFilterApplicationResponse(boolean isFilterApplicationResponse) { this.isFilterApplicationResponse = isFilterApplicationResponse; } public boolean isShouldTryOtherProxy() { return shouldTryOtherProxy; } public void setShouldTryOtherProxy(boolean shouldTryOtherProxy) { this.shouldTryOtherProxy = shouldTryOtherProxy; } public void setRetryNum(int retryNum){ if(retryNum < minTryNumber){ retryNum = minTryNumber; } this.RetryNum = retryNum; } public void setMinRetryNum(int minTryNumber){ if(minTryNumber < 1){ minTryNumber = 1; } this.minTryNumber = minTryNumber; } private HashMap<Integer, Boolean> httpRetryModeMap = new HashMap<>(); public void setRetryMode(int code, boolean shouldRetry) { httpRetryModeMap.put(code, shouldRetry); } {setRetryMode(404, false); } private HashSet<Integer> acceptedHttpStatuses = new HashSet<>(); // http status codes in this set are considered ok, and the entity content is returned by getGzipContentAPI public void addAcceptedHttpStatus(int code) { acceptedHttpStatuses.add(code); } // NOTE: Although HttpState.getCookies() currently returns a newly created cookie array, the underlying cookies may still be modified. // Consequently, the cookie information should be consumed immediately after issuing a request, usually by using getCookiesTxtStr() instead. //public Cookie[] getCookies() { return cookies; } // meant to be passed to aria2c, etc. public String getCookiesTxtStr() { StringBuilder buf = new StringBuilder(); for (Cookie cookie: cookieStore.getCookies()) { // See http://blog.omnux.com/index.php/2008/03/25/cookiestxt-file-format/; escaping is handled by the application layer. buf.append(cookie.getDomain()); buf.append('\t'); buf.append("TRUE\t"); // whether the cookie is usable in subdomains buf.append(cookie.getPath()); buf.append('\t'); buf.append(cookie.isSecure() ? "TRUE" : "FALSE"); buf.append('\t'); Date expiryDate = cookie.getExpiryDate(); buf.append(expiryDate != null ? expiryDate.getTime() / 1000 : 0L); buf.append('\t'); buf.append(cookie.getName()); buf.append('\t'); buf.append(cookie.getValue()); buf.append('\n'); } return buf.toString(); } public List<Cookie> getCookies(){ return cookieStore.getCookies(); } public String getCookiesStr() { StringBuilder buf = new StringBuilder(); for (Cookie cookie: cookieStore.getCookies()) { buf.append(cookie.getName()); buf.append("="); buf.append(cookie.getValue()); buf.append("; "); } return buf.toString(); } // All connections from this manager are not affected by Heritrix's recording facilities, which allows // only one open connection (even free ones) per thread. // Now that we are no longer using Heritrix, MultiThreadedHttpConnectionManager should fit our needs. public static class CrawlerHttpClientConnectionManager extends PoolingHttpClientConnectionManager { private static Registry<ConnectionSocketFactory> getSocketFactoryRegistry() { // same as that used in the parent class //trust all host SSLConnectionSocketFactory sslsf =null; try { SSLContext sslContext = new SSLContextBuilder() .loadTrustMaterial(null, new TrustStrategy() { @Override public boolean isTrusted( java.security.cert.X509Certificate[] chain,String authType) throws CertificateException { return true; } }).build(); sslsf = new SSLConnectionSocketFactory(sslContext); } catch (Exception e) { sslsf = SSLConnectionSocketFactory.getSocketFactory(); } return RegistryBuilder.<ConnectionSocketFactory>create() .register("http", PlainConnectionSocketFactory.getSocketFactory()) .register("https", sslsf) .build(); } private static final HttpMessageParserFactory<HttpResponse> myParserFactory = new DefaultHttpResponseParserFactory() { @Override public HttpMessageParser<HttpResponse> create(final SessionInputBuffer buffer, final MessageConstraints constraints) { return new DefaultHttpResponseParser(buffer, BasicLineParser.INSTANCE, DefaultHttpResponseFactory.INSTANCE, constraints) { // when the http server returns garbage (e.g. http://livestream.freshfm.com.au:8004/;stream.mp3), we don't want to wait indefinitely private int ngarbage = 0; @Override protected boolean reject(CharArrayBuffer line, int count) { ngarbage += (line.length() + 1); // line does not include the line delimiter if (ngarbage >= 16384 || count >= 256) return true; // don't accept any more garbage return false; } }; } }; private static Long timeToLiveSeconds = 60L;//一個連結最多可以保留的有效秒數 private CrawlerHttpClientConnectionManager() { // The pooled connections are given 1-minute TTL just to be safe; httpclient already uses setValidateAfterInactivity() to validate the connection every 2 seconds, // so we don't need to worry about the NAT router forgetting about the connection. super(new DefaultHttpClientConnectionOperator(getSocketFactoryRegistry(), null, null), new ManagedHttpClientConnectionFactory(null, myParserFactory, null, null), timeToLiveSeconds, TimeUnit.SECONDS); this.setDefaultMaxPerRoute(2000); // essentially unlimited; we let upper levels handle the scheduling, and blocking here is bad this.setMaxTotal(2000); // must be larger than the total number of threads, but should not be too large (particularly when multiple crawler instances are being run), or we'd run out of port numbers // Connection/so timeouts are now set in CrawlerHttpClientBuilder new IdleHttpClientConnectionMonitor(this,timeToLiveSeconds); } private static class Helper { static final CrawlerHttpClientConnectionManager inst = new CrawlerHttpClientConnectionManager(); } public static CrawlerHttpClientConnectionManager getInstance() { return Helper.inst; } } private static final int globalDefaultTimeoutMs = 20000; // 20 seconds public static final RequestConfig globalDefaultReqCfg = RequestConfig.custom().setConnectTimeout(globalDefaultTimeoutMs).setSocketTimeout(globalDefaultTimeoutMs).build(); // The built HttpClient's have shared connections, thus no need to close them public static class CrawlerHttpClientBuilder extends HttpClientBuilder { public static CrawlerHttpClientBuilder create() { return new CrawlerHttpClientBuilder(); } protected CrawlerHttpClientBuilder() { super(); this.setConnectionManager(CrawlerHttpClientConnectionManager.getInstance()).setConnectionManagerShared(true); this.setDefaultConnectionConfig(ConnectionConfig.custom().setCharset(utf8Charset).build()); this.setDefaultRequestConfig(globalDefaultReqCfg); // to customize, copy from defaultReqCfg } } private boolean isNoCache = true; public void setNoCache(boolean isNoCache) { this.isNoCache = isNoCache; } public void setStrCookie(String strCookie) { this.strCookie = strCookie; } public void setEncoding(String encoding) { this.forceCharset = (encoding != null) ? Charset.forName(encoding) : null; } public void setUserAgent(String userAgent) { this.userAgent = userAgent; } public void setParams(Map<String, String> extraHeaders) { this.extraHeaders = extraHeaders; } public FetchWebData() { this(1, 3*3600*1000); } public FetchWebData(int retryNum, int timeoutMs) { // timeoutMs can be -1 to use the default // NOTE: SSL seems to be working just fine out-of-the-box // ProtocolSocketFactory fcty = new MySecureProtocolSocketFactory(); // Protocol.registerProtocol("https", new Protocol("https", fcty, 443)); this.RetryNum = retryNum; this.timeoutMs = timeoutMs; this.setRetryNum(retryNum); this.httpCtx = new HttpClientContext(); httpCtx.setAuthCache(this.authCache); this.client = CrawlerHttpClientBuilder.create().setDefaultCredentialsProvider(credProvider).setDefaultCookieStore(cookieStore).build(); } public void ignoreCookies() { this.cookieSpec = CookieSpecs.IGNORE_COOKIES; } public void acceptRfcCookies() { this.cookieSpec = CookieSpecs.STANDARD; } // public static final String proxyHost = "10.11.0.5"; // In east asia, sometimes fast but frequently unavailable public static final int defaultProxyPort = 3128; private HttpHost proxyHost = null; public void enableProxy(String ip, int port) { enableProxy(ip, port, null, null); } public void enableProxy(String ip, int port, String userName, String pwd) { proxyHost = new HttpHost(ip, port); if (userName != null && !userName.isEmpty() && pwd != null && !pwd.isEmpty()) { // proxy authentication required this.credProvider.setCredentials(new AuthScope(proxyHost), new UsernamePasswordCredentials(userName, pwd)); // enable preemptive authentication for the proxy (FIXME: need to test its usefulness?) this.authCache.put(proxyHost, new BasicScheme()); } } private RequestConfig.Builder newReqCfgB() { final RequestConfig.Builder reqCfgB = RequestConfig.copy(globalDefaultReqCfg); reqCfgB.setConnectTimeout(Math.max(timeoutMs,15*1000));//設定連線超時時間,單位毫秒 reqCfgB.setSocketTimeout(Math.max(timeoutMs,15*1000)); //請求獲取資料的超時時間,單位毫秒。 如果訪問一個介面,多少時間內無法返回資料,就直接放棄此次呼叫。 reqCfgB.setConnectionRequestTimeout(60*1000); //設定從connect Manager獲取Connection 超時時間,單位毫秒。這個屬性是新加的屬性,因為目前版本是可以共享連線池的。 // NOTE: content charset and http element charset default to UTF-8 for now, since this is used at most sites // HttpProtocolParams.setContentCharset(params, "UTF-8"); // FIXME reqCfgB.setCookieSpec(cookieSpec); if (proxyHost != null) { reqCfgB.setProxy(proxyHost); } return reqCfgB; } private void initRequestCommon(HttpRequestBase req, String referer, Consumer<RequestConfig.Builder> reqCfgC) { { final RequestConfig.Builder reqCfgB = newReqCfgB(); // req.getParams().setParameter(HttpMethodParams.RETRY_HANDLER, new DefaultHttpMethodRetryHandler(10,false)); // not enabled for now; didn't see the point if (reqCfgC != null) reqCfgC.accept(reqCfgB); req.setConfig(reqCfgB.build()); } req.setHeader("User-Agent", this.userAgent); req.setHeader("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8"); if(referer!=null&& !referer.equals("")){ req.setHeader("Referer", referer); } //req.setHeader("Accept-Charset", "GB2312,utf-8;q=0.7,*;q=0.7"); req.setHeader("Accept-Language", "zh-cn,zh;q=0.8,en-us;q=0.5,en;q=0.3"); if (isNoCache) { req.setHeader("Pragma","no-cache"); req.setHeader("Cache-Control", "max-age=0"); } if(this.strCookie!=null&& !this.strCookie.isEmpty()){ req.addHeader("Cookie", this.strCookie); } if(extraHeaders != null){ for (Map.Entry<String, String> mapEnt: extraHeaders.entrySet()) { req.setHeader(mapEnt.getKey(), mapEnt.getValue()); } } // req.setHeader("x-flash-version", "17,0,0,134"); } private HttpGet newGetRequest(String url, String referer, Consumer<RequestConfig.Builder> reqCfgC) { final HttpGet req = new HttpGet(url); initRequestCommon(req, referer, reqCfgC); req.setHeader("Accept-Encoding", "gzip"); // what is sdch? We are currently unable to handle deflate properly return req; } private HttpPost newPostRequest(String url,String referer, NameValuePair[] pairs,int flag,InputStream is,String body, Consumer<RequestConfig.Builder> reqCfgC) { final HttpPost req = new HttpPost(url); { final RequestConfig.Builder reqCfgB = newReqCfgB(); req.setConfig(reqCfgB.build()); } final Charset entityCharset = (this.forceCharset != null) ? this.forceCharset : utf8Charset; switch (flag) { case 2: { BasicHttpEntity entity = new BasicHttpEntity(); entity.setContent(is); entity.setContentType("application/binary"); req.setEntity(entity); break; } case 3: { StringEntity entity = new StringEntity(body, ContentType.create("application/octet-stream", entityCharset)); req.setEntity(entity); break; } case 1: { // NOTE: Arrays.asList doesn't make a copy; it is implemented by the internal class Arrays.ArrayList, which is not the ArrayList we are familiar with. UrlEncodedFormEntity entity = new UrlEncodedFormEntity(Arrays.asList(pairs), entityCharset); req.setEntity(entity); break; } default: throw new RuntimeException("Invalid flag value in newPostRequest(): " + flag); } initRequestCommon(req, referer, reqCfgC); if(flag != 3){ req.setHeader("Accept-Encoding", "gzip"); } return req; } //Get Gzip Content public String getGzipPostContent2(String URL,String referer,NameValuePair[] pairs,int flag,InputStream is,String body) throws Exception { String strRs = null; for(int k=1;k<=RetryNum;k++){ BufferedReader br=null; final HttpPost req = newPostRequest(URL, referer, pairs,flag,is,body, null); try (CloseableHttpResponse resp= FutureTaskHelper.getInstance().executeHttpUrgentTask( new FutureTaskHelper.Task<CloseableHttpResponse>(){ @Override public CloseableHttpResponse execute() throws Exception { return client.execute(req, httpCtx); } },maxReadMillSeconds)) { final int status = resp.getStatusLine().getStatusCode(); final HttpEntity entity = resp.getEntity(); if(status == HttpStatus.SC_OK && entity != null){ final Header hr = resp.getFirstHeader("Content-Encoding"); final InputStream ins = (hr!=null && hr.getValue() != null) ? new GZIPInputStream(entity.getContent()) : entity.getContent(); br = new BufferedReader(new InputStreamReader(ins, utf8Charset)); StringBuilder resBuffer = new StringBuilder(); String resTemp = ""; long startTime = System.currentTimeMillis(); while((resTemp = br.readLine()) != null){ if((System.currentTimeMillis() - startTime) >= this.maxReadMillSeconds){ throw new SocketTimeoutException("exception occurs after fetching "+resBuffer.length()+"b data"); } resBuffer.append(resTemp); } strRs = resBuffer.toString(); } break; }catch( IOException ex ) { ex.printStackTrace(); }finally{ if(req!=null){ try { req.abort(); } catch (Exception e) { }finally{ } } if(br!=null){ br.close(); br =null; } } } return strRs; } //Get Gzip Content public String getGzipPostContent(String URL,String referer,NameValuePair[] pairs) throws Exception{ String strRs = null; for(int k=1;k<=RetryNum;k++){ BufferedReader br =null; final HttpPost req = newPostRequest(URL, referer, pairs,1,null,null,null); try (CloseableHttpResponse resp= FutureTaskHelper.getInstance().executeHttpUrgentTask( new FutureTaskHelper.Task<CloseableHttpResponse>(){ @Override public CloseableHttpResponse execute() throws Exception { return client.execute(req, httpCtx); } },maxReadMillSeconds)) { final int status = resp.getStatusLine().getStatusCode(); final HttpEntity entity = resp.getEntity(); if((status == HttpStatus.SC_OK || status == 302) && entity != null){ final Header hr = resp.getFirstHeader("Content-Encoding"); final InputStream ins = (hr!=null && hr.getValue() != null) ? new GZIPInputStream(entity.getContent()) : entity.getContent(); final Charset entityCharset = (this.forceCharset != null) ? this.forceCharset : utf8Charset; br = new BufferedReader(new InputStreamReader(ins, entityCharset)); StringBuilder resBuffer = new StringBuilder(); String resTemp = ""; long startTime = System.currentTimeMillis(); while((resTemp = br.readLine()) != null){ if((System.currentTimeMillis() - startTime) >= this.maxReadMillSeconds){ throw new SocketTimeoutException("exception occurs after fetching "+resBuffer.length()+"b data"); } resBuffer.append(resTemp); } strRs = resBuffer.toString(); } break; }catch( IOException ex ) { ex.printStackTrace(); }finally{ if(req!=null){ try { req.abort(); } catch (Exception e) { }finally{ } } if(br!=null){ try { br.close(); } catch (Exception e) { }finally{ br = null; } } } } return strRs; } public byte[] getHttpBytes(String url,String referer) throws Exception{ final HttpGet req = newGetRequest(url,referer,null); byte[] data = null; contentFileType = null; try (CloseableHttpResponse resp= FutureTaskHelper.getInstance().executeHttpUrgentTask( new FutureTaskHelper.Task<CloseableHttpResponse>(){ @Override public CloseableHttpResponse execute() throws Exception { return client.execute(req, httpCtx); } },maxReadMillSeconds)) { Header header = resp.getFirstHeader("Content-Type"); if(header!=null){ String headValue = header.getValue(); if(headValue.indexOf("image")!=-1){ contentFileType = headValue.replaceAll("image/", ""); } } final int status = resp.getStatusLine().getStatusCode(); final HttpEntity entity = resp.getEntity(); if(status == HttpStatus.SC_OK && entity != null){ data = EntityUtils.toByteArray(entity); } }finally{ if(req!=null){ try { req.abort(); } catch (Exception e) { }finally{ } } } return data; } // if needContent is false, we only check that the content is available, but don't actually fetch it. Returns an empty string upon success. private static final Pattern charsetPat = Pattern.compile("charset=['\"]?(?<name>[-_A-Za-z0-9]+)['\"]?"); public int lastHttpStatus; public String lastRedirPath, lastRedirURL; public String lastResponseStr = null; public static interface ByteArrayContentHandler { public byte[] handleBytes(byte[] in); } public static enum WebContentCheckMode { NONE, NONEMPTY, NOT_HTML, FULL } public static class GetGzipContentOptions { public WebContentCheckMode defaultChkMode = WebContentCheckMode.FULL; // if not FULL, no content will be returned, but we will check whether the content is nonempty (NONEMPTY) or does not begin with <html (NOT_HTML), with "" upon success and null upon failure. public boolean allowRedirect = true; public ByteArrayContentHandler bytesHandler = null; // if non-null, the fetched bytes are passed through this filter public WebContentCheckMode getChkMode(String redirURL) { return defaultChkMode; } } public String getGzipContentAPI(String URL,String referer, GetGzipContentOptions opts) throws Exception{ lastHttpStatus = 0; lastRedirPath = null; lastRedirURL = null; String strRs = null; byte[] data; Charset httpCharset = null; for(int k=1;k<=RetryNum;k++){ final HttpGet req = newGetRequest(URL, referer, (reqCfgB) -> { reqCfgB.setRedirectsEnabled(opts.allowRedirect); // if opts.allowRedirect is true, we won't actually get 302's then }); InputStream ins = null; try( CloseableHttpResponse resp= FutureTaskHelper.getInstance().executeHttpUrgentTask( new FutureTaskHelper.Task<CloseableHttpResponse>(){ @Override public CloseableHttpResponse execute() throws Exception { return client.execute(req, httpCtx); } },maxReadMillSeconds) ) { lastRedirURL = null; int status = resp.getStatusLine().getStatusCode(); lastHttpStatus = status; // req.getRequestLine().getUri() or req.getURI() both gives the original, non-redirected URL. // See http://hc.apache.org/httpcomponents-client-4.5.x/tutorial/html/fundamentals.html#d5e334 for the proper approach final HttpHost targetHost = httpCtx.getTargetHost(); List<URI> redirLocs = httpCtx.getRedirectLocations(); URI redirUri = URIUtils.resolve(req.getURI(), targetHost, redirLocs); lastRedirPath = redirUri.getPath(); // this should be the path after following redirects lastRedirURL = redirUri.toString(); // should include the query string... if(URL.equalsIgnoreCase(lastRedirURL)){ redirectUrl = null; }else{ redirectUrl = lastRedirURL; } final HttpEntity entity = resp.getEntity(); if(! ((status == HttpStatus.SC_OK || (opts.allowRedirect && status == 302) || acceptedHttpStatuses.contains(status)) && entity != null)) { // if(URL==null){ // LOGGER.warn(status); return null; // } // LOGGER.warn("FetchWebData: " + URL + ": " + status); return null; } final ContentType httpContentType = ContentType.get(entity); // can be null httpCharset = (httpContentType != null) ? httpContentType.getCharset() : null; final InputStream rawIns = entity.getContent(); final Header hr = resp.getFirstHeader("Content-Encoding"); if(hr!=null){ // As of httpclient-4.5.2, decompression should already have been done by the httpclient library String encName = hr.getValue(); if (encName != null) { if (encName.equals("deflate")) ins = new InflaterInputStream(rawIns); // is this correct? else if (encName.equals("gzip")) ins = new GZIPInputStream(rawIns); else if (encName.toLowerCase().equals("utf-8")) {} // xiami's file servers return this // else LOGGER.warn("Unknown Content-Encoding: " + encName) ; } } lastModified = (null == resp.getFirstHeader("Last-Modified") ? null :resp.getFirstHeader("Last-Modified").getValue()); if(null != lastModified && lastModified.contains("GMT")){ try { SimpleDateFormat sdf = new SimpleDateFormat("EEE, d MMM yyyy HH:mm:ss 'GMT'", Locale.US); sdf.setTimeZone(TimeZone.getTimeZone("GMT")); Date ftime = null; DateFormat df = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss"); ftime = sdf.parse(lastModified); lastModified = df.format(ftime); } catch (Exception e) { } } if (ins == null){ ins = rawIns; } ByteArrayBuffer buf = new ByteArrayBuffer(16384); //final InputStream finalIns = ins; /* FutureTaskHelper.getInstance().executeHttpUrgentTask( new FutureTaskHelper.Task<String>(){ @Override public String execute() throws Exception { return ""; } },maxReadMillSeconds); */ //future task starts int totalInsSize = 0;//一共接收了多少個size final WebContentCheckMode chkMode = opts.getChkMode(lastRedirURL); long startTime = System.currentTimeMillis(); if (chkMode != WebContentCheckMode.FULL) { if (chkMode == WebContentCheckMode.NONE) return ""; final int nbyte; // number of bytes to probe; NOTE: if this number of bytes is not available, probe fails if (chkMode == WebContentCheckMode.NOT_HTML) nbyte = 4; else nbyte = 1; final byte[] probeBuf = new byte[nbyte]; try { int nread = 0; while (nread < nbyte) { if((System.currentTimeMillis() - startTime) >= this.maxReadMillSeconds){ throw new SocketTimeoutException("exception occurs after fetching "+nread+"b data"); } int result = ins.read(probeBuf, nread, nbyte-nread); if (result <= 0) return null; nread += result; } if (chkMode == WebContentCheckMode.NOT_HTML && probeBuf[0] == '<' && probeBuf[1] == 'h' && probeBuf[2] == 't' && probeBuf[3] == 'm') return null; return ""; } finally { req.abort(); // NOTE: not really necessary; closing resp without closing ins would abort the connection in any case } } // Fetch the entire response body; must not use EntityUtils.toByteArray() due to possible compression startTime = System.currentTimeMillis(); try { // EOFException can occur when fetching badly compressed pages, e.g. http://www.hunantv.com/v/2/53080/f/678703.html byte[] inBuf = new byte[1024]; while (true) { if((System.currentTimeMillis() - startTime) >= this.maxReadMillSeconds){ throw new SocketTimeoutException("exception occurs after fetching "+totalInsSize+"b data"); } int curLen = ins.read(inBuf); if (curLen == -1) break; // EOF buf.append(inBuf, 0, curLen); totalInsSize+=curLen; } // if(!BusinessUtil.isExcludeUrl(URL)){ // if(totalInsSize >= maxPageSize){ // throw new PageBiggerThanMaxSizeException(totalInsSize); // } // } } catch (EOFException ex) { // LOGGER.error(URL + ": exception after reading " + buf.length() + " bytes: " + ex.toString()); }finally{ } //future task ends data = buf.toByteArray(); if (opts.bytesHandler != null) { data = opts.bytesHandler.handleBytes(data); if (data == null) return null; } if (this.forceCharset != null) { strRs = new String(data, this.forceCharset); } String rawStr = new String(data, iso8859Charset); Charset charset = null; Matcher m; if ((m = charsetPat.matcher(rawStr)).find()) { String name = m.group("name").toLowerCase(); if (name.startsWith("gb")) charset = gbkCharset; else if (name.equals("utf-8") || name.equals("utf8")){ charset = utf8Charset; }else{ if(name!=null && !name.trim().equals("")){ try { charset = Charset.forName(name); } catch (Exception e) { } } } } if (charset == null && httpCharset != null){ charset = httpCharset; } if (charset == null) { charset = utf8Charset; } boolean shouldCheckCode = false; if(this.forceCharset==null ){ shouldCheckCode = true; }else if(httpCharset!=null && httpCharset.compareTo(forceCharset) != 0){ shouldCheckCode = true; } else if(charset!=null && charset.compareTo(forceCharset) != 0){ shouldCheckCode = true; } if(shouldCheckCode){ if (httpCharset!=null && (strRs==null || (MyStringUtils.isMessyCode(strRs) && httpCharset.compareTo(forceCharset) != 0))){ strRs = new String(data,httpCharset); } if(strRs==null || MyStringUtils.isMessyCode(strRs)) { // guess from html content strRs = new String(data, charset); } } break; }catch(IOException ex ) { ex.printStackTrace(); }finally { if(req!=null){ try { req.abort(); } catch (Exception e) { }finally{ } } if(ins!=null) { try { ins.close(); } catch (Exception e) { }finally{ ins = null; } } // in case of an abortion, the stream may have been closed already } } lastResponseStr = strRs; return strRs; } private boolean getShouldRetry(int code) { Boolean shouldRetry_ = httpRetryModeMap.get(code); return (shouldRetry_ != null) ? shouldRetry_.booleanValue() : true; } public int getStatus(String URL,String referer) throws Exception{ final HttpGet req = newGetRequest(URL, null, null); try (CloseableHttpResponse resp= FutureTaskHelper.getInstance().executeHttpUrgentTask( new FutureTaskHelper.Task<CloseableHttpResponse>(){ @Override public CloseableHttpResponse execute() throws Exception { return client.execute(req, httpCtx); } },maxReadMillSeconds)) { final int status = resp.getStatusLine().getStatusCode(); return status; }finally{ if(req!=null){ try { req.abort(); } catch (Exception e) { }finally{ } } } } public String getKXTPostContent(String url,String referer,String postContent) throws Exception{ final HttpPost req = newPostRequest(url, referer, null, 3, null,postContent, null); String strRs = null; try (CloseableHttpResponse resp= FutureTaskHelper.getInstance().executeHttpUrgentTask( new FutureTaskHelper.Task<CloseableHttpResponse>(){ @Override public CloseableHttpResponse execute() throws Exception { return client.execute(req, httpCtx); } },maxReadMillSeconds)) { final int status = resp.getStatusLine().getStatusCode(); final HttpEntity entity = resp.getEntity(); if((status == HttpStatus.SC_OK || status == 302) && entity != null) { final InputStream rawIns = entity.getContent(); final InputStream ins; final int val = rawIns.read(); if (val == 0) ins = rawIns; // not compressed; used e.g. when there is no result else if (val == 1) ins = new GZIPInputStream(rawIns); else throw new RuntimeException("Unexpected KXT type byte: " + val); final Charset entityCharset = (this.forceCharset != null) ? this.forceCharset : utf8Charset; BufferedReader br = new BufferedReader(new InputStreamReader(ins, entityCharset)); StringBuilder resBuffer = new StringBuilder(); String resTemp = ""; while((resTemp = br.readLine()) != null){ resBuffer.append(resTemp); } br.close(); strRs = resBuffer.toString(); } }finally{ if(req!=null){ try { req.abort(); } catch (Exception e) { }finally{ } } } return strRs; } public String getPPTVPostContent(String url,String referer,String postContent) throws Exception{ final HttpPost req = newPostRequest(url, referer, null,3,null,postContent, null); String strRs = null; try (CloseableHttpResponse resp= FutureTaskHelper.getInstance().executeHttpUrgentTask( new FutureTaskHelper.Task<CloseableHttpResponse>(){ @Override public CloseableHttpResponse execute() throws Exception { return client.execute(req, httpCtx); } },maxReadMillSeconds)) { final int status = resp.getStatusLine().getStatusCode(); final HttpEntity entity = resp.getEntity(); if((status == HttpStatus.SC_OK || status == 302) && entity != null) { final InputStream rawIns = entity.getContent(); final Charset entityCharset = (this.forceCharset != null) ? this.forceCharset : utf8Charset; BufferedReader br = new BufferedReader(new InputStreamReader(rawIns, entityCharset)); StringBuilder resBuffer = new StringBuilder(); String resTemp = ""; while((resTemp = br.readLine()) != null){ resBuffer.append(resTemp); } br.close(); strRs = resBuffer.toString(); } }finally{ if(req!=null){ try { req.abort(); } catch (Exception e) { }finally{ } } } return strRs; } //get gzip content public String getGzipContent(String URL,String referer) throws Exception{ String strRs = null; try { @SuppressWarnings("unused") URI uri = new URI(URL); } catch (URISyntaxException ex) { URL = URL.replace("|", "%7C"); } for(int k=1;k<=RetryNum;k++){ try { GetGzipContentOptions opts = new GetGzipContentOptions(); strRs = this.getGzipContentAPI(URL, referer, opts); if(strRs!=null){ break; // do not retry upon empty results, which can be valid } if (!getShouldRetry(lastHttpStatus)) { break; // the error is likely permanent } }catch( IOException ex ) { ex.printStackTrace(); } } return strRs; } // returns true if the content can be fetched properly public boolean checkGetContentMode(String URL, String referer, WebContentCheckMode chkMode, boolean allowRedirect) throws Exception { GetGzipContentOptions opts = new GetGzipContentOptions(); opts.defaultChkMode = chkMode; opts.allowRedirect = allowRedirect; for (int k = 1; k <= RetryNum; ++k) { if (getGzipContentAPI(URL, referer, opts) != null) { return true; } if (!getShouldRetry(lastHttpStatus)) { break; // the error is likely permanent } } return false; } public boolean checkGetContent(String URL, String referer, boolean allowRedirect) throws Exception{ return checkGetContentMode(URL, referer, WebContentCheckMode.NONEMPTY, allowRedirect); } //get page encoding public String getPageEncoding(String url) throws Exception{ String encoding = "gbk"; String content = this.getGzipContent(url, ""); if(content==null || content.equals("")) { return encoding; } //<meta http-equiv="Content-Type" content="text/html; charset=gb2312" /> String regX="(?is)<meta[^>]*?charset=[\"\']?([\\w-]+)[^>]*?/>"; Matcher m = Pattern.compile(regX).matcher(content); if(m.find()){ encoding = m.group(1); } if(encoding==null || encoding.equals("")) encoding="GBK"; return encoding; } //Get Location; returns null if this is not a redirect or if an error has occurred public String getLocation(String url) throws Exception { return getLocation(url, null); } public String getLocation(String url, String referer) throws Exception { this.lastHttpStatus = 0; String location = null; for(int k=1;k<=RetryNum;k++){ final HttpGet req = newGetRequest(url, referer, (reqCfgB) -> { reqCfgB.setRedirectsEnabled(false); }); try(CloseableHttpResponse resp= FutureTaskHelper.getInstance().executeHttpUrgentTask( new FutureTaskHelper.Task<CloseableHttpResponse>(){ @Override public CloseableHttpResponse execute() throws Exception { return client.execute(req, httpCtx); } },maxReadMillSeconds)) { final int statusCode = resp.getStatusLine().getStatusCode(); this.lastHttpStatus = statusCode; //Location 301 or 302 if (statusCode == HttpStatus.SC_MOVED_PERMANENTLY || statusCode == HttpStatus.SC_MOVED_TEMPORARILY){ //Get Location from header final Header locationHeader = resp.getFirstHeader("Location"); if (locationHeader != null) location = locationHeader.getValue(); } else if (statusCode == HttpStatus.SC_OK ){ }else if (statusCode >= 400){ return null; } // Would return null and abort the connection if we get 200 break; }catch( IOException ex ) { ex.printStackTrace(); }finally { if(req!=null){ try { req.abort(); } catch (Exception e) { }finally{ } } } } return location; } //Get Post Location; returns null if this is not a redirect or if an error has occurred public String getPostLocation(String url,NameValuePair[] pairs) throws Exception{ this.lastHttpStatus = 0; String location = null; for(int k=1;k<=RetryNum;k++){ final HttpPost req = newPostRequest(url,this.refererUrl,pairs,1,null,null,null); try (CloseableHttpResponse resp= FutureTaskHelper.getInstance().executeHttpUrgentTask( new FutureTaskHelper.Task<CloseableHttpResponse>(){ @Override public CloseableHttpResponse execute() throws Exception { return client.execute(req, httpCtx); } },maxReadMillSeconds)) { final int statusCode = resp.getStatusLine().getStatusCode(); this.lastHttpStatus = statusCode; if (statusCode == HttpStatus.SC_MOVED_PERMANENTLY || statusCode == HttpStatus.SC_MOVED_TEMPORARILY){ //Get Location from header final Header locationHeader = resp.getFirstHeader("Location"); if (locationHeader != null) { location = locationHeader.getValue(); return location; } } else if (statusCode >= 400){ return null; } break; }catch( IOException ex ) { ex.printStackTrace(); }finally { if(req!=null){ try { req.abort(); } catch (Exception e) { }finally{ } } } } return location; } public static Object bytesToObject(byte[] bytes) throws Exception{ Object result = null; ByteArrayInputStream byteInputStream = null; ObjectInputStream objectInputStream = null; try{ byteInputStream = new ByteArrayInputStream(bytes); objectInputStream = new ObjectInputStream(byteInputStream); result = objectInputStream.readObject(); }finally { if(null != objectInputStream){ try{ objectInputStream.close(); byteInputStream.close(); }catch(Exception e){ } } } return result; } public boolean downLoad(String remoteFileName, String localFileName) { final HttpGet req = newGetRequest(remoteFileName, null, null); try (CloseableHttpResponse resp= FutureTaskHelper.getInstance().executeHttpUrgentTask( new FutureTaskHelper.Task<CloseableHttpResponse>(){ @Override public CloseableHttpResponse execute() throws Exception { return client.execute(req, httpCtx); } },maxReadMillSeconds)) { final int status = resp.getStatusLine().getStatusCode(); final HttpEntity entity = resp.getEntity(); if (status == HttpStatus.SC_OK && entity != null) { final Header tokenHeader = resp.getFirstHeader("token"); if (tokenHeader != null) { // LOGGER.info("The response value of token:" + tokenHeader.getValue()); } File storeFile = new File(localFileName); try (final FileOutputStream output = new FileOutputStream(storeFile)) { output.write(EntityUtils.toByteArray(entity)); // FIXME: won't work with very large files } return true; } else { // LOGGER.info("DownLoad file occurs exception, the error code is :" + status); return false; } } catch (Exception e) { // LOGGER.error(e.getMessage(),e); return false; }finally{ if(req!=null){ try { req.abort(); } catch (Exception e) { }finally{ } } } } public static void main(String[] args) throws Exception{ // FetchWebData fetch = new FetchWebData(); // fetch.setEncoding("GBK"); // //fetch.setStrCookie("SUV=1469117584605270; IPLOC=CN3100; ssuid=8162140256; CXID=93F24933B1CF48682C40DA22E37CB8A2;
[email protected]@@@@@@@@@; SUID=94F455655FC00D0A000000005790F48F; [email protected]@@@@@@@@@; ABTEST=0|1509435669|v1; weixinIndexVisited=1; JSESSIONID=aaaU39RywHKY27cilGv8v; PHPSESSID=5mje3e3ttdmhcg177csj7pp1h2; SUIR=E193958732376EE1C097CD153376B26F; SNUID=384B4F5DEAEFB7325DA2765EEAB9CF43; sct=30"); // //呼叫代理 // Proxy proxy = ProxyUtil.getProxyFromPool(); // fetch.enableProxy(proxy.proxyHost, proxy.proxyPort,proxy.username,proxy.password); // //// String content = fetch.getGzipContent("http://weixin.sogou.com/weixin?type=1&s_from=input&query=%E6%89%8B%E6%9C%BA%E5%88%9B%E4%B8%9A&ie=utf8&_sug_=n&_sug_type_=",""); // System.exit(0); // //System.out.println(fetch.getRedirectUrl()); } }