1. 程式人生 > >Java網路爬蟲crawler4j學習筆記 網頁內容轉碼解析

Java網路爬蟲crawler4j學習筆記 網頁內容轉碼解析

簡介

網頁內容解析相關的類和介面位於包edu.uci.ics.crawler4j.parser中,用於拆分解析html網頁的各部分內容。下面的Parser的基本作用就是從各種各樣的資料(二進位制,文字)中抽取出我們需要的html頁面。

原始碼

ParseData介面

ParseData 介面宣告了 getOutgoingUrls 與 setOutgoingUrls 兩個方法,分別用於獲取和設定當前頁面的所有外鏈;此外還重新宣告了 toString 方法。

package edu.uci.ics.crawler4j.parser;

import edu.uci.ics.crawler4j.url.WebURL;

import java.util.Set;

/**
 * Abstract contract shared by the parsed representations of a page.
 *
 * <p>Every implementation exposes the outgoing links of the page it
 * represents and supplies its own string form.
 */
public interface ParseData {

  /**
   * Returns all outgoing links of the current page.
   *
   * @return the outgoing URLs held by this object
   */
  Set<WebURL> getOutgoingUrls();

  /**
   * Stores the outgoing links of the current page.
   *
   * @param outgoingUrls the outgoing URLs to hold
   */
  void setOutgoingUrls(Set<WebURL> outgoingUrls);

  /** Re-declares {@link Object#toString()} as part of this contract. */
  @Override
  String toString();
}

HtmlParseData類

package edu.uci.ics.crawler4j.parser;

import edu.uci.ics.crawler4j.url.WebURL;

import java.util.Map;
import java.util.Set;

/**
 * {@link ParseData} holder for an HTML page: the raw markup, the text,
 * the title, the meta tags and the outgoing links.
 *
 * <p>This is a plain mutable bean; every field is left unset until the
 * corresponding setter is called.
 */
public class HtmlParseData implements ParseData {

  // --- state -------------------------------------------------------------

  private String html;
  private String text;
  private String title;
  private Map<String, String> metaTags;
  private Set<WebURL> outgoingUrls;

  // --- page content ------------------------------------------------------

  /** @return the stored HTML, or {@code null} if none has been set */
  public String getHtml() {
    return this.html;
  }

  /** @param html the HTML to store */
  public void setHtml(String html) {
    this.html = html;
  }

  /** @return the stored text, or {@code null} if none has been set */
  public String getText() {
    return this.text;
  }

  /** @param text the text to store */
  public void setText(String text) {
    this.text = text;
  }

  /** @return the stored title, or {@code null} if none has been set */
  public String getTitle() {
    return this.title;
  }

  /** @param title the title to store */
  public void setTitle(String title) {
    this.title = title;
  }

  /** @return the stored meta tag map, or {@code null} if none has been set */
  public Map<String, String> getMetaTags() {
    return this.metaTags;
  }

  /** @param metaTags the meta tag map to store */
  public void setMetaTags(Map<String, String> metaTags) {
    this.metaTags = metaTags;
  }

  // --- ParseData ---------------------------------------------------------

  /** @return the stored outgoing URLs, or {@code null} if none have been set */
  @Override
  public Set<WebURL> getOutgoingUrls() {
    return this.outgoingUrls;
  }

  /** @param outgoingUrls the outgoing URLs to store */
  @Override
  public void setOutgoingUrls(Set<WebURL> outgoingUrls) {
    this.outgoingUrls = outgoingUrls;
  }

  /**
   * Returns the stored text (not the HTML).
   *
   * @return the same value as {@link #getText()}
   */
  @Override
  public String toString() {
    return this.text;
  }
}

TextParseData類

package edu.uci.ics.crawler4j.parser;

import edu.uci.ics.crawler4j.url.WebURL;

import java.util.HashSet;
import java.util.Set;

/**
 * {@link ParseData} holder for plain-text content.
 *
 * <p>Keeps the text itself plus a set of outgoing links, which starts out
 * as an empty {@link HashSet}.
 */
public class TextParseData implements ParseData {

  private Set<WebURL> outgoingUrls = new HashSet<>();
  private String textContent;

  /** @return the stored text, or {@code null} if none has been set */
  public String getTextContent() {
    return this.textContent;
  }

  /** @param textContent the text to store */
  public void setTextContent(String textContent) {
    this.textContent = textContent;
  }

  /** @return the currently stored outgoing URLs */
  @Override
  public Set<WebURL> getOutgoingUrls() {
    return this.outgoingUrls;
  }

  /** @param outgoingUrls the outgoing URLs to store, replacing the current set */
  @Override
  public void setOutgoingUrls(Set<WebURL> outgoingUrls) {
    this.outgoingUrls = outgoingUrls;
  }

  /**
   * Returns the stored text.
   *
   * @return the same value as {@link #getTextContent()}
   */
  @Override
  public String toString() {
    return this.textContent;
  }
}

BinaryParseData類

package edu.uci.ics.crawler4j.parser;

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.InputStream;
import java.io.OutputStream;
import java.io.PrintStream;
import java.io.UnsupportedEncodingException;
import java.util.HashSet;
import java.util.Set;

import javax.xml.transform.OutputKeys;
import javax.xml.transform.Transformer;
import javax.xml.transform.TransformerConfigurationException;
import javax.xml.transform.sax.SAXTransformerFactory;
import javax.xml.transform.sax.TransformerHandler;
import javax.xml.transform.stream.StreamResult;

import edu.uci.ics.crawler4j.url.WebURL;

// Tika是一個內容分析工具,自帶全面的parser工具類,
// 能解析基本所有常見格式的檔案,得到檔案的metadata,content等內容,返回格式化資訊
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * {@link ParseData} implementation for binary content.
 *
 * <p>The raw bytes are run through Tika's {@link AutoDetectParser}; the SAX
 * events it emits are serialized to HTML and kept as a string that can be
 * read back with {@link #getHtml()}.
 */
public class BinaryParseData implements ParseData {

  private static final Logger logger = LoggerFactory.getLogger(BinaryParseData.class);
  /** Encoding used both for serializing and for decoding the serialized bytes. */
  private static final String DEFAULT_ENCODING = "UTF-8";
  /** Output method handed to the transformer. */
  private static final String DEFAULT_OUTPUT_FORMAT = "html";

  // Auto-detecting parser instance using the default Tika configuration.
  private static final Parser AUTO_DETECT_PARSER = new AutoDetectParser();
  private static final SAXTransformerFactory SAX_TRANSFORMER_FACTORY =
      (SAXTransformerFactory) SAXTransformerFactory.newInstance();

  // Parse context, used to pass context information to Tika parsers.
  private final ParseContext context = new ParseContext();

  // All outgoing links of the page.
  private Set<WebURL> outgoingUrls = new HashSet<>();

  // HTML obtained from the binary data; stays null until a parse succeeds
  // or setHtml is called.
  private String html = null;

  /** Creates an instance and registers the shared parser in its parse context. */
  public BinaryParseData() {
    context.set(Parser.class, AUTO_DETECT_PARSER);
  }

  /**
   * Parses the given bytes and stores the resulting HTML.
   *
   * <p>Any exception raised while parsing or serializing is logged and not
   * rethrown; in that case the previously stored HTML is left untouched.
   *
   * @param data the raw binary content to parse
   */
  public void setBinaryContent(byte[] data) {
    InputStream inputStream = new ByteArrayInputStream(data);
    ByteArrayOutputStream outputStream = new ByteArrayOutputStream();

    try {
      TransformerHandler handler =
          getTransformerHandler(outputStream, DEFAULT_OUTPUT_FORMAT, DEFAULT_ENCODING);
      AUTO_DETECT_PARSER.parse(inputStream, handler, new Metadata(), context);

      // Hack kept from the original: strip every occurrence of the XHTML
      // namespace URI from the serialized output. Note that this removes the
      // URI string only, not a DOCTYPE declaration.
      String serialized = new String(outputStream.toByteArray(), DEFAULT_ENCODING);
      setHtml(serialized.replace("http://www.w3.org/1999/xhtml", ""));
    } catch (Exception e) {
      logger.error("Error parsing file", e);
    }
  }

  /**
   * Returns a transformer handler that serializes incoming SAX events to
   * XHTML or HTML (depending on the given method) into the given stream.
   *
   * @param out      stream that receives the serialized bytes
   * @param method   value for the transformer's output method property
   * @param encoding output encoding, or <code>null</code> to leave the
   *                 encoding property unset so the transformer's own default applies
   * @return a handler whose result is bound to {@code out}
   * @throws TransformerConfigurationException if the handler cannot be created
   */
  private static TransformerHandler getTransformerHandler(OutputStream out, String method, String encoding)
        throws TransformerConfigurationException {

    TransformerHandler transformerHandler = SAX_TRANSFORMER_FACTORY.newTransformerHandler();
    Transformer transformer = transformerHandler.getTransformer();
    transformer.setOutputProperty(OutputKeys.METHOD, method);
    // With INDENT enabled the transformer may add additional whitespace
    // when outputting the result tree.
    transformer.setOutputProperty(OutputKeys.INDENT, "yes");

    if (encoding != null) {
      transformer.setOutputProperty(OutputKeys.ENCODING, encoding);
    }

    // Hand the stream to the result directly. The former PrintStream wrapper
    // added nothing for byte output and, because PrintStream never throws
    // IOException, would have hidden write failures from the transformer.
    transformerHandler.setResult(new StreamResult(out));
    return transformerHandler;
  }

  /** @return Parsed binary content or null */
  public String getHtml() {
    return html;
  }

  /** @param html the HTML to store */
  public void setHtml(String html) {
    this.html = html;
  }

  /** @return the currently stored outgoing URLs */
  @Override
  public Set<WebURL> getOutgoingUrls() {
    return outgoingUrls;
  }

  /** @param outgoingUrls the outgoing URLs to store, replacing the current set */
  @Override
  public void setOutgoingUrls(Set<WebURL> outgoingUrls) {
    this.outgoingUrls = outgoingUrls;
  }

  /**
   * Returns the stored HTML, or a placeholder message when nothing has been
   * stored yet (the HTML is null or empty).
   */
  @Override
  public String toString() {
    if (html == null || html.isEmpty()) {
      return "No data parsed yet";
    }
    return getHtml();
  }
}