Java網路爬蟲crawler4j學習筆記 Page 類
阿新 • • 發佈:2018-12-26
簡介
Page 類解析 HttpClient 包中的 HttpEntity 物件,獲取當前頁面的資訊,包括 url(轉換為 WebURL)、response 的資訊(status code、response header 等)、解析後的內容資訊等等。
原始碼
package edu.uci.ics.crawler4j.crawler;
import java.nio.charset.Charset;
import org.apache.http.Header;
import org.apache.http.HttpEntity;
import org.apache.http.entity.ContentType;
import org.apache.http.util.EntityUtils;
import edu.uci.ics.crawler4j.parser.ParseData;
import edu.uci.ics.crawler4j.url.WebURL;
/**
* This class contains the data for a fetched and parsed page.
*
* @author Yasser Ganjisaffar [lastname at gmail dot com]
*/
// 用來描述web頁面的類
public class Page {

    /**
     * The URL of this page.
     */
    protected WebURL url;

    /**
     * Redirection flag: whether this page is a redirect.
     */
    protected boolean redirect;

    /**
     * The URL to which this page will be redirected to.
     */
    protected String redirectedToUrl;

    /**
     * Status code of this page.
     */
    protected int statusCode;

    /**
     * The content of this page in binary format.
     */
    protected byte[] contentData;

    /**
     * The ContentType of this page.
     * For example: "text/html; charset=UTF-8"
     */
    protected String contentType;

    /**
     * The encoding of the content.
     * For example: "gzip"
     */
    protected String contentEncoding;

    /**
     * The charset of the content.
     * For example: "UTF-8"
     */
    protected String contentCharset;

    /**
     * Language of the content.
     */
    private String language;

    /**
     * Headers which were present in the response of the fetch request.
     */
    protected Header[] fetchResponseHeaders;

    /**
     * The parsed data populated by parsers.
     */
    protected ParseData parseData;

    /**
     * Creates a page for the given URL. All other fields keep their default values until
     * they are populated through {@link #load(HttpEntity)} or the setters.
     *
     * @param url the URL of this page
     */
    public Page(WebURL url) {
        this.url = url;
    }

    /**
     * Loads the content of this page from a fetched HttpEntity.
     *
     * <p>The content type, content encoding and charset are all cleared before the entity is
     * inspected, so each of them ends up {@code null} when the entity does not declare it;
     * values left over from an earlier call or from a setter never survive a new load.
     * The entity body is then read into {@link #getContentData()}.
     *
     * @param entity HttpEntity to read from; must not be {@code null}
     * @throws Exception when load fails, e.g. while reading the entity body; per the HttpClient
     *                   API documentation, resolving the content type may also raise an
     *                   unchecked exception for a malformed or unsupported charset declaration
     */
    public void load(HttpEntity entity) throws Exception {
        // Clear every entity-derived field up front. Previously only contentType and
        // contentEncoding were cleared, so a stale contentCharset could be paired with new data.
        contentType = null;
        contentEncoding = null;
        contentCharset = null;

        Header type = entity.getContentType();
        if (type != null) {
            contentType = type.getValue();
        }

        Header encoding = entity.getContentEncoding();
        if (encoding != null) {
            contentEncoding = encoding.getValue();
        }

        Charset charset = ContentType.getOrDefault(entity).getCharset();
        if (charset != null) {
            contentCharset = charset.displayName();
        }

        contentData = EntityUtils.toByteArray(entity);
    }

    /**
     * @return the URL of this page
     */
    public WebURL getWebURL() {
        return url;
    }

    /**
     * @param url the URL of this page
     */
    public void setWebURL(WebURL url) {
        this.url = url;
    }

    /**
     * @return the redirection flag of this page
     */
    public boolean isRedirect() {
        return redirect;
    }

    /**
     * @param redirect the redirection flag of this page
     */
    public void setRedirect(boolean redirect) {
        this.redirect = redirect;
    }

    /**
     * @return the URL to which this page will be redirected to
     */
    public String getRedirectedToUrl() {
        return redirectedToUrl;
    }

    /**
     * @param redirectedToUrl the URL to which this page will be redirected to
     */
    public void setRedirectedToUrl(String redirectedToUrl) {
        this.redirectedToUrl = redirectedToUrl;
    }

    /**
     * @return the status code of this page
     */
    public int getStatusCode() {
        return statusCode;
    }

    /**
     * @param statusCode the status code of this page
     */
    public void setStatusCode(int statusCode) {
        this.statusCode = statusCode;
    }

    /**
     * Returns headers which were present in the response of the fetch request.
     *
     * @return Header Array, the response headers (the stored array itself, not a copy)
     */
    public Header[] getFetchResponseHeaders() {
        return fetchResponseHeaders;
    }

    /**
     * @param headers the response headers of the fetch request; the array is stored as-is
     */
    public void setFetchResponseHeaders(Header[] headers) {
        fetchResponseHeaders = headers;
    }

    /**
     * @return parsed data generated for this page by parsers
     */
    public ParseData getParseData() {
        return parseData;
    }

    /**
     * @param parseData parsed data generated for this page by parsers
     */
    public void setParseData(ParseData parseData) {
        this.parseData = parseData;
    }

    /**
     * @return content of this page in binary format (the stored array itself, not a copy)
     */
    public byte[] getContentData() {
        return contentData;
    }

    /**
     * @param contentData content of this page in binary format; the array is stored as-is
     */
    public void setContentData(byte[] contentData) {
        this.contentData = contentData;
    }

    /**
     * @return ContentType of this page.
     * For example: "text/html; charset=UTF-8"
     */
    public String getContentType() {
        return contentType;
    }

    /**
     * @param contentType ContentType of this page
     */
    public void setContentType(String contentType) {
        this.contentType = contentType;
    }

    /**
     * @return encoding of the content.
     * For example: "gzip"
     */
    public String getContentEncoding() {
        return contentEncoding;
    }

    /**
     * @param contentEncoding encoding of the content
     */
    public void setContentEncoding(String contentEncoding) {
        this.contentEncoding = contentEncoding;
    }

    /**
     * @return charset of the content.
     * For example: "UTF-8"
     */
    public String getContentCharset() {
        return contentCharset;
    }

    /**
     * @param contentCharset charset of the content
     */
    public void setContentCharset(String contentCharset) {
        this.contentCharset = contentCharset;
    }

    /**
     * @return Language
     */
    public String getLanguage() {
        return language;
    }

    /**
     * @param language language of the content
     */
    public void setLanguage(String language) {
        this.language = language;
    }
}