Java網路爬蟲crawler4j學習筆記 PageFetchResult類
阿新 • • 發佈:2018-12-26
原始碼
package edu.uci.ics.crawler4j.fetcher;
import java.io.EOFException;
import java.io.IOException;
import org.apache.http.Header;
import org.apache.http.HttpEntity;
import org.apache.http.util.EntityUtils;
import edu.uci.ics.crawler4j.crawler.Page;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
 * Holds the outcome of fetching a single page over HTTP: the status code, the
 * response entity, the response headers and the URLs involved.
 *
 * @author Yasser Ganjisaffar [lastname at gmail dot com]
 */
public class PageFetchResult {

    protected static final Logger logger = LoggerFactory.getLogger(PageFetchResult.class);

    /** Status code of the response. */
    protected int statusCode;

    /** Response entity (body); {@code null} until set. */
    protected HttpEntity entity = null;

    /** Response headers; {@code null} until set. */
    protected Header[] responseHeaders = null;

    /** URL that was fetched; {@code null} until set. */
    protected String fetchedUrl = null;

    /** Presumably the redirect target when the page has moved (TODO confirm against callers). */
    protected String movedToUrl = null;

    /** @return the status code of the response */
    public int getStatusCode() {
        return statusCode;
    }

    /** @param statusCode the status code of the response */
    public void setStatusCode(int statusCode) {
        this.statusCode = statusCode;
    }

    /** @return the response entity, or {@code null} if none has been set */
    public HttpEntity getEntity() {
        return entity;
    }

    /** @param entity the response entity */
    public void setEntity(HttpEntity entity) {
        this.entity = entity;
    }

    /** @return the response headers (the stored array itself, not a copy), or {@code null} */
    public Header[] getResponseHeaders() {
        return responseHeaders;
    }

    /** @param responseHeaders the response headers; the array is stored as-is, not copied */
    public void setResponseHeaders(Header[] responseHeaders) {
        this.responseHeaders = responseHeaders;
    }

    /** @return the URL that was fetched, or {@code null} if none has been set */
    public String getFetchedUrl() {
        return fetchedUrl;
    }

    /** @param fetchedUrl the URL that was fetched */
    public void setFetchedUrl(String fetchedUrl) {
        this.fetchedUrl = fetchedUrl;
    }

    /**
     * Loads the fetched entity into the given page and copies the response
     * headers onto it.
     *
     * @param page the page to populate from this fetch result
     * @return {@code true} if the page was populated; {@code false} if any
     *         exception was raised while doing so (the failure is logged)
     */
    public boolean fetchContent(Page page) {
        try {
            page.load(entity);
            page.setFetchResponseHeaders(responseHeaders);
            return true;
        } catch (Exception e) {
            // Resolve the URL defensively: a null page (or a page without a
            // WebURL) may be the very reason we ended up here, and a second
            // NullPointerException thrown from this handler would escape the
            // method instead of producing the documented "false" result.
            Object url = fetchedUrl;
            if (page != null && page.getWebURL() != null) {
                url = page.getWebURL().getURL();
            }
            logger.info("Exception while fetching content for: {} [{}]", url, e.getMessage());
        }
        return false;
    }

    /**
     * Consumes the response entity, if there is one, without processing its
     * content. Failures are never propagated to the caller.
     */
    public void discardContentIfNotConsumed() {
        try {
            if (entity != null) {
                EntityUtils.consume(entity);
            }
        } catch (IOException ignored) {
            // Deliberately ignored. An EOFException (a subclass of IOException)
            // can occur on compressed streams, which are not repeatable, and an
            // IOException can also occur if the stream is already closed.
            // Either way there is nothing left to discard.
        } catch (Exception e) {
            logger.warn("Unexpected error occurred while trying to discard content", e);
        }
    }

    /** @return the moved-to URL, or {@code null} if none has been set */
    public String getMovedToUrl() {
        return movedToUrl;
    }

    /** @param movedToUrl the moved-to URL */
    public void setMovedToUrl(String movedToUrl) {
        this.movedToUrl = movedToUrl;
    }
}