1. 程式人生 > 其它 ><經驗>使用Jsoup處理HTML的一個工具類

<經驗>使用Jsoup處理HTML的一個工具類

package cn.com.wind.utils;

import lombok.extern.slf4j.Slf4j;
import org.jsoup.Jsoup;
import org.jsoup.nodes.*;
import org.jsoup.select.Elements;

import java.io.*;
import java.util.*;

/**
 * @Author qymeng
 * @Date 2022/4/6
 * @Description
 */
@Slf4j
public class HtmlUtil {
    private final static List<String> controlTagList = Arrays.asList("th", "tr", "li", "p", "h1", "h2", "h3", "h4", "h5", "h6");

    
public static void main(String[] args) { File file = new File("D:\\qymeng\\SVN\\SVN-reverse\\dev\\src\\dataOriFile\\2020\\03\\03\\0400\\{316DE765-5CC5-11EA-A156-26D8B346C975}.html"); List<String> list = parseHtmlFile(file); // String html="<p style=\"text-align:justify;margin-bottom:0pt;margin-top:1pt;text-indent:5.24%;font-size:8pt;font-family:Times New Roman;font-weight:normal;font-style:normal;text-transform:none;font-variant: normal;\">Indicate by check mark if the registrant is a well-known seasoned issuer, as defined in Rule 405 of the Securities Act.&nbsp;&nbsp;&nbsp;&nbsp;<ix:nonnumeric id=\"F_000009\" name=\"dei:EntityWellKnownSeasonedIssuer\" contextref=\"C_0001674168_20190101_20191231\">Yes</ix:nonnumeric>&nbsp;&nbsp;<span style=\"font-family:MS Mincho;\">☒</span>&nbsp;&nbsp;&nbsp;No&nbsp;&nbsp;<span style=\"font-family:MS Mincho;\">☐</span>&nbsp;&nbsp;&nbsp;</p>";
// List<String> list = parseHtmlString(html); System.out.println(list.size()); for (String str : list) { System.out.println(str); } } /** * 解析html檔案 * @param file * @return */ public static List<String> parseHtmlFile(File file) { List
<String> resultList = new ArrayList<>(); Document doc = null; try { doc = Jsoup.parse(file, "utf-8"); } catch (IOException e) { e.printStackTrace(); } assert doc != null; List<String> list = deepTravalTag(doc.body()); if (list.size() != 0) { resultList.addAll(list); } removeSpaceFromList(resultList); return resultList; } /** * 解析byte陣列 * @param fileContent * @return */ public static List<String> parseHtmlBytes(byte[] fileContent){ InputStream inputStream = new ByteArrayInputStream(fileContent); List<String> resultList = new ArrayList<>(); Document doc = null; try { doc = Jsoup.parse(inputStream, "utf-8", ""); } catch (IOException e) { e.printStackTrace(); } List<String> list = deepTravalTag(doc.body()); if (list.size() != 0) { resultList.addAll(list); } removeSpaceFromList(resultList); return resultList; } /** * 解析html字串 * @param html * @return */ public static List<String> parseHtmlString(String html) { List<String> resultList = new ArrayList<>(); Document doc = Jsoup.parse(html, "utf-8"); List<String> list = deepTravalTag(doc.body()); if (list.size() != 0) { resultList.addAll(list); } removeSpaceFromList(resultList); return resultList; } /** * 深度遍歷找文字(遞迴) * * @param element * @return */ public static String getElementValue(Element element) { StringBuilder res = new StringBuilder(); if (element.childrenSize() == 0) { String childrenStr = (element.text().matches("\\s*")) ? "" : element.text().replaceAll("\u200B", ""); if (!childrenStr.matches("\\s*")) { return childrenStr; } } else { for (Node node : element.childNodes()) { res.append(getNodeValue(node, !element.tagName().toLowerCase(Locale.ROOT).equals("tr"))); } } return res.toString(); } /** * 深度遍歷結點(遞迴) * * @param node * @return */ public static String getNodeValue(Node node, boolean needBr) { StringBuilder result = new StringBuilder(); if (node.childNodes().size() == 0) { String text = ""; if (node.nodeName().equals("#text")) { text = ((TextNode) node).text(); } else if (node.nodeName().equals("br") && needBr) { text = "\r\n"; } else { text = ((Element) node).text(); } String childrenStr = (text.matches("\\s*")) ? "" : text.replaceAll("\u200B", ""); if (text.equals("\r\n")) { result.append("\r\n") ; } if (!childrenStr.matches("\\s*")) { return childrenStr; } } else { for (int i = 0; i < node.childNodes().size(); i++) { if (node.nodeName().toLowerCase(Locale.ROOT).equals("br") && needBr) { if (!result.toString().matches("\\s*")) { result.append("\r\n"); } } else { result.append(getNodeValue(node.childNode(i), needBr)); } } } return result.toString(); } /** * 深度遍歷找標籤 * * @param element * @return */ public static List<String> deepTravalTag(Element element) { List<String> resultList = new ArrayList<>(); if (element.childrenSize() == 0) { resultList.add(element.text()); return resultList; } if (element.attr("style") != null && element.attr("style").contains("display:none")) { return resultList; } int lastIndex = -1; for (int i = 0; i < element.childrenSize(); i++) { Element child = element.children().get(i); if (child.text() == null && child.text().matches("\\s*") && !child.nodeName().equals("br")) { log.info(child.nodeName() + "中文字為空"); continue; } if (child.attr("style") != null && child.attr("style").contains("display:none")) { continue; } //判斷標籤的子標籤包不包含分段標籤 boolean needFind = needFindChildren(child); //自己不是分段標籤,並且所有子標籤也不是 if (!controlTagList.contains(child.nodeName()) && !needFind) { //直接獲得文字 String eleStr = getElementValue(child); if (eleStr.matches("\\s*")) { continue; } if (lastIndex != -1) { String s = resultList.get(lastIndex) + eleStr; resultList.set(lastIndex, s); } else { resultList.add(eleStr); lastIndex = resultList.size() - 1; } } else if (controlTagList.contains(child.nodeName()) && !needFind) { //自己是分段標籤,子標籤不是 lastIndex = -1; String line = getElementValue(child); resultList.add(line); } else { lastIndex = -1; List<String> list = deepTravalTag(child); resultList.addAll(list); } } return resultList; } public static boolean needFindChildren(Element element) { if (element.nodeName().equals("tr") || element.nodeName().equals("li")) { return false; } boolean contains = false; for (String tag : controlTagList) { Elements children = element.children(); for (Element ele : children) { if (ele.getElementsByTag(tag).size() != 0) { contains = true; break; } } } return contains; } /** * 去空格 * * @param list */ public static void removeSpaceFromList(List<String> list) { list.removeIf(s -> s == null || s.isEmpty()); } }