網站白名單過濾Java程式碼
阿新 • • 發佈:2019-02-16
文字編輯器儲存資料時，出於安全考慮，需要過濾掉一些標記，如超連結、JavaScript、iframe 等等。
package com.test;

import java.util.ArrayList;
import java.util.HashMap;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.ConcurrentMap;
import java.util.logging.Logger;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Whitelist-based HTML filter for text submitted through a rich-text editor.
 *
 * <p>The input is processed in five passes: comment bodies are escaped, unbalanced
 * angle brackets are repaired, every tag is checked against the whitelist (unknown
 * tags are dropped, unknown attributes are dropped, {@code style} declarations are
 * validated one by one, URL attributes are protocol-checked), empty inline tags are
 * removed, and finally entities and quotes in text are normalised.
 *
 * <p>Typical use is the static {@link #filter(String)} helper, which is safe to call
 * from several threads.
 *
 * @author WangShanlin
 * @version Sep 26, 2011 5:52:02 PM
 */
public class WebWhiteNameFilter {

    /** Regex flag union representing the /si modifiers in PHP. */
    private static final int REGEX_FLAGS_SI = Pattern.CASE_INSENSITIVE | Pattern.DOTALL;

    private static final Pattern P_COMMENTS = Pattern.compile("<!--(.*?)-->", Pattern.DOTALL);
    private static final Pattern P_COMMENT = Pattern.compile("^!--(.*)--$", REGEX_FLAGS_SI);
    private static final Pattern P_TAGS = Pattern.compile("<(.*?)>", Pattern.DOTALL);
    private static final Pattern P_END_TAG = Pattern.compile("^/([a-z0-9]+)", REGEX_FLAGS_SI);
    private static final Pattern P_START_TAG = Pattern.compile("^([a-z0-9]+)(.*?)(/?)$", REGEX_FLAGS_SI);
    /**
     * One attribute, either quoted (groups 2 and 3) or unquoted (group 4). Scanning
     * with a single pattern means a quoted value is consumed as a whole, so text such
     * as {@code x=y} inside a quoted value is never mistaken for another attribute.
     */
    private static final Pattern P_ATTRIBUTES =
            Pattern.compile("([a-z0-9]+)=(?:([\"'])(.*?)\\2|([^\"\\s']+))", REGEX_FLAGS_SI);
    private static final Pattern P_PROTOCOL = Pattern.compile("^([^:]+):", REGEX_FLAGS_SI);
    private static final Pattern P_ENTITY = Pattern.compile("&#(\\d+);?");
    private static final Pattern P_ENTITY_UNICODE = Pattern.compile("&#x([0-9a-f]+);?");
    private static final Pattern P_ENCODE = Pattern.compile("%([0-9a-f]{2});?");
    private static final Pattern P_VALID_ENTITIES = Pattern.compile("&([^&;]*)(?=(;|&|$))");
    private static final Pattern P_VALID_QUOTES = Pattern.compile("(>|^)([^<]+?)(<|$)", Pattern.DOTALL);
    private static final Pattern P_END_ARROW = Pattern.compile("^>");
    private static final Pattern P_BODY_TO_END = Pattern.compile("<([^>]*?)(?=<|$)");
    private static final Pattern P_XML_CONTENT = Pattern.compile("(^|>)([^<]*?)(?=>)");
    private static final Pattern P_STRAY_LEFT_ARROW = Pattern.compile("<([^>]*?)(?=<|$)");
    private static final Pattern P_STRAY_RIGHT_ARROW = Pattern.compile("(^|>)([^<]*?)(?=>)");
    private static final Pattern P_AMP = Pattern.compile("&");
    private static final Pattern P_QUOTE = Pattern.compile("\"");
    private static final Pattern P_LEFT_ARROW = Pattern.compile("<");
    private static final Pattern P_RIGHT_ARROW = Pattern.compile(">");
    private static final Pattern P_BOTH_ARROWS = Pattern.compile("<>");

    // Value rules shared by the style whitelists below.
    private static final Pattern V_COLOR = Pattern.compile("(#([0-9a-fA-F]{6}|[0-9a-fA-F]{3}))");
    // NOTE(review): V_ANY accepts every value, including url(...) / expression(...);
    // tightening it is a policy decision and is left to the owner of the whitelist.
    private static final Pattern V_ANY = Pattern.compile("([\\s\\S]*)");
    private static final Pattern V_SIDE = Pattern.compile("(top|right|bottom|left)");
    private static final Pattern V_TEXT_ALIGN = Pattern.compile("(left|right|center|justify)");
    private static final Pattern V_FONT_WEIGHT =
            Pattern.compile("(normal|bold|bolder|lighter|100|200|300|400|500|600|700|800|900)");
    private static final Pattern V_FONT_STYLE = Pattern.compile("(normal|italic|oblique)");
    private static final Pattern V_TEXT_DECORATION =
            Pattern.compile("(none|underline|overline|line-through|blink)");
    private static final Pattern V_VERTICAL_ALIGN =
            Pattern.compile("(auto|baseline|sub|super|top|middle|bottom|text-top|text-bottom)");
    private static final Pattern V_TABLE_BORDER = Pattern.compile("[0-2]+px");

    // Caches of the "blank tag" patterns, keyed by tag name. Bounded by vRemoveBlanks.
    private static final ConcurrentMap<String, Pattern> P_REMOVE_PAIR_BLANKS =
            new ConcurrentHashMap<String, Pattern>();
    private static final ConcurrentMap<String, Pattern> P_REMOVE_SELF_BLANKS =
            new ConcurrentHashMap<String, Pattern>();

    /**
     * Shared instance behind {@link #filter(String)}. Declared after every static
     * pattern because the constructor reads them.
     */
    private static final WebWhiteNameFilter INSTANCE = new WebWhiteNameFilter();

    /** Allowed html elements, along with the allowed attributes for each element. */
    private final Map<String, List<Attribute>> vAllowed;
    /** Counts of currently open tags for each (allowable) html element. */
    private final Map<String, Integer> vTagCounts = new HashMap<String, Integer>();
    /** Html elements which must always be self-closing (e.g. "<img />"). */
    private final String[] vSelfClosingTags;
    /** Html elements which must always have separate opening and closing tags (e.g. "<b></b>"). */
    private final String[] vNeedClosingTags;
    /** Disallowed html elements. */
    private final String[] vDisallowed;
    /** Attributes which should be checked for valid protocols. */
    private final String[] vProtocolAtts;
    /** Allowed protocols. */
    private final String[] vAllowedProtocols;
    /** Tags which are removed if they contain no content (e.g. "<b></b>" or "<b />"). */
    private final String[] vRemoveBlanks;
    /** Entities allowed within html markup. */
    private final String[] vAllowedEntities;
    /** When true, html comments are removed from the output. */
    private final boolean stripComment;
    /**
     * When true, unbalanced angle brackets are turned into tags (e.g. "<b text </b>"
     * becomes "<b> text </b>"); when false they are html escaped.
     */
    private final boolean alwaysMakeTags;
    /** Shared attribute list for tags that accept no attributes at all. */
    private final List<Attribute> emptyAtts = new ArrayList<Attribute>();
    private boolean vDebug = false;

    /**
     * One whitelisted attribute. When {@code allowedAttrValues} is non-null the
     * attribute holds CSS-like declarations (e.g. style="color:red;font-style:italic")
     * and each property must match the pattern registered for it.
     */
    private static final class Attribute {
        final String attrName;
        final Map<String, Pattern> allowedAttrValues;

        Attribute(final String attrName) {
            this(attrName, null);
        }

        Attribute(final String attrName, final Map<String, Pattern> map) {
            this.attrName = attrName;
            this.allowedAttrValues = map;
        }
    }

    /**
     * Filters {@code input} with the default whitelist.
     *
     * @param input user-submitted text that may contain html; may be {@code null}
     * @return the cleaned text; {@code null} or empty input is returned unchanged
     */
    public static String filter(final String input) {
        return INSTANCE.dofilter(input);
    }

    /** Default constructor: builds the whitelist used by the rich-text editor. */
    public WebWhiteNameFilter() {
        vAllowed = new HashMap<String, List<Attribute>>();

        // font : ['color', 'size', 'face', '.background-color']
        final Map<String, Pattern> fontStyle = new HashMap<String, Pattern>();
        fontStyle.put("background-color", V_COLOR);
        vAllowed.put("font", attributes(fontStyle, "color", "size", "face"));

        // span : ['.color', '.background-color', '.font-size', '.font-family', '.background',
        //         '.font-weight', '.font-style', '.text-decoration', '.vertical-align']
        final Map<String, Pattern> spanStyle = textStyleRules();
        spanStyle.put("background", V_COLOR);
        spanStyle.put("vertical-align", V_VERTICAL_ALIGN);
        vAllowed.put("span", attributes(spanStyle));

        // div : ['align', '.border', '.margin', '.padding', '.text-align', '.color',
        //        '.background-color', '.font-size', '.font-family', '.font-weight', '.background',
        //        '.font-style', '.text-decoration', '.vertical-align', '.margin-left']
        final Map<String, Pattern> divStyle = textStyleRules();
        divStyle.put("border", V_ANY);
        divStyle.put("margin", V_SIDE);
        divStyle.put("padding", V_ANY);
        divStyle.put("text-align", V_TEXT_ALIGN);
        divStyle.put("background", V_ANY);
        divStyle.put("vertical-align", V_VERTICAL_ALIGN);
        divStyle.put("margin-left", V_ANY);
        vAllowed.put("div", attributes(divStyle, "align"));

        // table : ['border', 'cellspacing', 'cellpadding', 'width', 'height', 'align', 'bordercolor',
        //          '.padding', '.margin', '.border', 'bgcolor', '.text-align', '.color',
        //          '.background-color', '.font-size', '.font-family', '.font-weight', '.font-style',
        //          '.text-decoration', '.background', '.width', '.height']
        final Map<String, Pattern> tableStyle = textStyleRules();
        tableStyle.put("padding", V_ANY);
        tableStyle.put("margin", V_SIDE);
        tableStyle.put("border", V_TABLE_BORDER);
        tableStyle.put("bgcolor", V_COLOR);
        tableStyle.put("text-align", V_TEXT_ALIGN);
        tableStyle.put("background", V_ANY);
        tableStyle.put("width", V_ANY);
        tableStyle.put("height", V_ANY);
        vAllowed.put("table", attributes(tableStyle, "border", "cellspacing", "cellpadding",
                "width", "height", "align", "bordercolor"));

        // 'td,th' : ['align', 'valign', 'width', 'height', 'colspan', 'rowspan', 'bgcolor',
        //            '.text-align', '.color', '.background-color', '.font-size', '.font-family',
        //            '.font-weight', '.font-style', '.text-decoration', '.vertical-align', '.background']
        vAllowed.put("td", cellAttributes());
        vAllowed.put("th", cellAttributes());

        // a : ['href', 'target', 'name']
        vAllowed.put("a", attributes(null, "href", "target", "name"));

        // embed : ['src', 'width', 'height', 'type', 'loop', 'autostart', 'quality',
        //          '.width', '.height', 'align', 'allowscriptaccess', '/']
        vAllowed.put("embed", attributes(sizeStyleRules(), "src", "width", "height", "type", "loop",
                "autostart", "quality", "align", "allowscriptaccess"));

        // img : ['src', 'width', 'height', 'border', 'alt', 'title', '.width', '.height', '/']
        vAllowed.put("img", attributes(sizeStyleRules(), "src", "width", "height", "border", "alt",
                "title"));

        // 'p,ol,ul,li,blockquote,h1..h6' : ['align', ...]
        // The style whitelist for these tags (text-align, color, background-color, font-size,
        // font-family, background, font-weight, font-style, text-decoration, vertical-align,
        // text-indent, margin-left) is intentionally not enabled; the original author asked for
        // this placeholder to be kept.
        final String[] alignOnly = {"p", "ol", "ul", "li", "blockquote",
                "h1", "h2", "h3", "h4", "h5", "h6", "h7"};
        for (String tag : alignOnly) {
            vAllowed.put(tag, attributes(null, "align"));
        }

        // 'tbody,tr,strong,b,sub,sup,em,i,u,strike' : []
        // NOTE(review): the list above mentions 'sup' (and earlier notes mention 'br'/'hr'), but
        // the original code never whitelisted them, so they are still stripped. Confirm intent.
        final String[] noAttributes = {"tbody", "tr", "strong", "b", "sub", "em", "i", "u", "strike"};
        for (String tag : noAttributes) {
            vAllowed.put(tag, emptyAtts);
        }

        // noEndTags : ['br', 'hr', 'img', 'area', 'col', 'embed', 'input', 'param']
        vSelfClosingTags = new String[] {"img", "br", "hr", "area", "col", "embed", "input", "param"};

        // inlineTags + endlineTags (minus the self-closing ones)
        vNeedClosingTags = new String[] {"b", "del", "em", "font", "i", "span", "strike", "strong",
                "sub", "sup", "u", "table", "tbody", "td", "tr", "th", "div", "p", "ol", "ul", "li",
                "blockquote", "h1", "h2", "h3", "h4", "h5", "h6", "h7", "marquee"};

        vDisallowed = new String[] {};
        // no ftp. NOTE(review): "https" is not listed either, so https links are rewritten to
        // local anchors; confirm whether that is still wanted.
        vAllowedProtocols = new String[] {"http", "mailto"};
        vProtocolAtts = new String[] {"src", "href"};
        vRemoveBlanks = new String[] {"a", "b", "strong", "i", "em", "u"};
        vAllowedEntities = new String[] {"amp", "gt", "lt", "quot", "nbsp", "rsquo"};
        stripComment = true;
        alwaysMakeTags = true;
    }

    /**
     * Same whitelist as the default constructor, with debug logging switched on or off.
     *
     * @param debug turn debug logging on with a true argument
     */
    public WebWhiteNameFilter(final boolean debug) {
        this();
        vDebug = debug;
    }

    /** Style rules shared by span, div, table, td and th. Returns a fresh mutable map. */
    private static Map<String, Pattern> textStyleRules() {
        final Map<String, Pattern> rules = new HashMap<String, Pattern>();
        rules.put("color", V_COLOR);
        rules.put("background-color", V_COLOR);
        rules.put("font-size", V_ANY);
        rules.put("font-family", V_ANY);
        rules.put("font-weight", V_FONT_WEIGHT);
        rules.put("font-style", V_FONT_STYLE);
        rules.put("text-decoration", V_TEXT_DECORATION);
        return rules;
    }

    /** Style rules for embed and img: only width and height. */
    private static Map<String, Pattern> sizeStyleRules() {
        final Map<String, Pattern> rules = new HashMap<String, Pattern>();
        rules.put("width", V_ANY);
        rules.put("height", V_ANY);
        return rules;
    }

    /** Attribute whitelist shared by td and th. */
    private static List<Attribute> cellAttributes() {
        final Map<String, Pattern> style = textStyleRules();
        style.put("text-align", V_TEXT_ALIGN);
        style.put("vertical-align", V_VERTICAL_ALIGN);
        style.put("background", V_ANY);
        return attributes(style, "align", "valign", "width", "height", "colspan", "rowspan",
                "bgcolor");
    }

    /**
     * Builds an attribute list: an optional validated {@code style} attribute followed
     * by unconstrained attributes.
     *
     * @param styleRules per-property rules for the style attribute, or null for no style
     * @param names names of attributes whose value is not constrained
     */
    private static List<Attribute> attributes(final Map<String, Pattern> styleRules,
            final String... names) {
        final List<Attribute> result = new ArrayList<Attribute>();
        if (styleRules != null) {
            result.add(new Attribute("style", styleRules));
        }
        for (String attrName : names) {
            result.add(new Attribute(attrName));
        }
        return result;
    }

    private void reset() {
        vTagCounts.clear();
    }

    private void debug(final String msg) {
        if (vDebug) {
            Logger.getAnonymousLogger().info(msg);
        }
    }

    // ---------------------------------------------------------------
    // my versions of some PHP library functions

    /** Returns the one-character string for the given UTF-16 code unit. */
    public static String chr(final int decimal) {
        return String.valueOf((char) decimal);
    }

    /**
     * Escapes the four html special characters. The ampersand is replaced first so the
     * entities produced by the later replacements are not escaped a second time.
     *
     * @param s text to escape, not null
     * @return {@code s} with &amp;, ", &lt; and &gt; replaced by their entities
     */
    public static String htmlSpecialChars(final String s) {
        String result = s;
        result = regexReplace(P_AMP, "&amp;", result);
        result = regexReplace(P_QUOTE, "&quot;", result);
        result = regexReplace(P_LEFT_ARROW, "&lt;", result);
        result = regexReplace(P_RIGHT_ARROW, "&gt;", result);
        return result;
    }

    // ---------------------------------------------------------------

    /**
     * Given a user submitted input String, filters out any invalid or restricted html.
     * Synchronized because the open-tag counters are per-instance state and the static
     * {@link #filter(String)} entry point shares one instance between all callers.
     *
     * @param input text (i.e. submitted by a user) that may contain html
     * @return "clean" version of input, with only valid, whitelisted html elements allowed
     */
    private synchronized String dofilter(final String input) {
        if (input == null || input.length() == 0) {
            return input;
        }
        reset();
        String s = input;
        debug("************************************************");
        debug(" INPUT: " + input);
        s = escapeComments(s);
        debug(" escapeComments: " + s);
        s = balanceHTML(s);
        debug(" balanceHTML: " + s);
        s = checkTags(s);
        debug(" checkTags: " + s);
        s = processRemoveBlanks(s);
        debug("processRemoveBlanks: " + s);
        s = validateEntities(s);
        debug(" validateEntites: " + s);
        debug("************************************************\n\n");
        return s;
    }

    public boolean isAlwaysMakeTags() {
        return alwaysMakeTags;
    }

    public boolean isStripComments() {
        return stripComment;
    }

    /** Escapes the body of every html comment so markup inside it cannot be parsed as tags. */
    private String escapeComments(final String s) {
        final Matcher m = P_COMMENTS.matcher(s);
        final StringBuffer buf = new StringBuffer();
        while (m.find()) {
            final String match = m.group(1); // (.*?)
            // quoteReplacement: the comment text may contain '$' or '\'
            m.appendReplacement(buf,
                    Matcher.quoteReplacement("<!--" + htmlSpecialChars(match) + "-->"));
        }
        m.appendTail(buf);
        return buf.toString();
    }

    /** Repairs (or, when alwaysMakeTags is false, escapes) unbalanced angle brackets. */
    private String balanceHTML(String s) {
        if (alwaysMakeTags) {
            // try and form html
            s = regexReplace(P_END_ARROW, "", s);
            s = regexReplace(P_BODY_TO_END, "<$1>", s);
            s = regexReplace(P_XML_CONTENT, "$1<$2", s);
        } else {
            // escape stray brackets
            s = regexReplace(P_STRAY_LEFT_ARROW, "&lt;$1", s);
            s = regexReplace(P_STRAY_RIGHT_ARROW, "$1$2&gt;<", s);
            // the last regexp causes '<>' entities to appear
            // (we need to do a lookahead assertion so that the last bracket can
            // be used in the next pass of the regexp)
            s = regexReplace(P_BOTH_ARROWS, "", s);
        }
        return s;
    }

    /** Runs every tag through {@link #processTag(String)} and closes tags left open. */
    private String checkTags(final String s) {
        final Matcher m = P_TAGS.matcher(s);
        final StringBuffer buf = new StringBuffer();
        while (m.find()) {
            // quoteReplacement: attribute values may legitimately contain '$' or '\'
            m.appendReplacement(buf, Matcher.quoteReplacement(processTag(m.group(1))));
        }
        m.appendTail(buf);

        // open-tag counts are tallied in processTag
        // (they are reset at the start of every dofilter call)
        final StringBuilder out = new StringBuilder(buf);
        for (Map.Entry<String, Integer> open : vTagCounts.entrySet()) {
            for (int ii = 0; ii < open.getValue(); ii++) {
                out.append("</").append(open.getKey()).append('>');
            }
        }
        return out.toString();
    }

    /** Removes empty pairs ("<b></b>") and empty self-closing forms ("<b />") of vRemoveBlanks. */
    private String processRemoveBlanks(final String s) {
        String result = s;
        for (String tag : vRemoveBlanks) {
            result = regexReplace(blankPattern(P_REMOVE_PAIR_BLANKS, tag, "></" + tag + ">"), "",
                    result);
            result = regexReplace(blankPattern(P_REMOVE_SELF_BLANKS, tag, "/>"), "", result);
        }
        return result;
    }

    /** Returns the cached "blank tag" pattern for {@code tag}, compiling it on first use. */
    private static Pattern blankPattern(final ConcurrentMap<String, Pattern> cache,
            final String tag, final String suffix) {
        Pattern pattern = cache.get(tag);
        if (pattern == null) {
            final Pattern fresh = Pattern.compile("<" + tag + "(\\s[^>]*)?" + suffix);
            pattern = cache.putIfAbsent(tag, fresh);
            if (pattern == null) {
                pattern = fresh;
            }
        }
        return pattern;
    }

    private static String regexReplace(final Pattern regex_pattern, final String replacement,
            final String s) {
        final Matcher m = regex_pattern.matcher(s);
        return m.replaceAll(replacement);
    }

    /**
     * Rewrites the inside of one tag (the text between '<' and '>').
     *
     * @param s tag content without the angle brackets
     * @return the sanitised tag including angle brackets, or "" when the tag is dropped
     */
    private String processTag(final String s) {
        // ending tags
        Matcher m = P_END_TAG.matcher(s);
        if (m.find()) {
            final String name = m.group(1).toLowerCase(Locale.ENGLISH);
            if (allowed(name) && !inArray(name, vSelfClosingTags)) {
                final Integer open = vTagCounts.get(name);
                // only close what is actually open; a stray closing tag is dropped
                if (open != null && open.intValue() > 0) {
                    vTagCounts.put(name, Integer.valueOf(open.intValue() - 1));
                    return "</" + name + ">";
                }
            }
        }

        // starting tags
        m = P_START_TAG.matcher(s);
        if (m.find()) {
            final String name = m.group(1).toLowerCase(Locale.ENGLISH);
            final String body = m.group(2);
            String ending = m.group(3);
            if (!allowed(name)) {
                return "";
            }

            final StringBuilder params = new StringBuilder();
            final Matcher attrs = P_ATTRIBUTES.matcher(body);
            while (attrs.find()) {
                final String paramName = attrs.group(1).toLowerCase(Locale.ENGLISH);
                // group 2 is the opening quote; it is null for the unquoted alternative
                String paramValue = attrs.group(2) != null ? attrs.group(3) : attrs.group(4);
                if (allowedAttribute(name, paramName, paramValue)) {
                    paramValue = allMapString(name, paramName, paramValue);
                    if (inArray(paramName, vProtocolAtts)) {
                        paramValue = processParamProtocol(paramValue);
                    }
                    params.append(' ').append(paramName).append("=\"")
                            .append(escapeAttributeValue(paramValue)).append('"');
                }
            }

            if (inArray(name, vSelfClosingTags)) {
                ending = " /";
            }
            if (inArray(name, vNeedClosingTags)) {
                ending = "";
            }
            if (ending == null || ending.length() < 1) {
                final Integer open = vTagCounts.get(name);
                vTagCounts.put(name, Integer.valueOf(open == null ? 1 : open.intValue() + 1));
            } else {
                ending = " /";
            }
            return "<" + name + params + ending + ">";
        }

        // comments
        m = P_COMMENT.matcher(s);
        if (!stripComment && m.find()) {
            return "<" + m.group() + ">";
        }
        return "";
    }

    /**
     * Escapes the characters that could terminate a double-quoted attribute value or
     * open a tag. '&' is left alone: the final validateEntities pass decides which
     * entities survive.
     */
    private static String escapeAttributeValue(final String value) {
        if (value == null) {
            return "";
        }
        String result = regexReplace(P_QUOTE, "&quot;", value);
        result = regexReplace(P_LEFT_ARROW, "&lt;", result);
        result = regexReplace(P_RIGHT_ARROW, "&gt;", result);
        return result;
    }

    /** Turns a URL whose protocol is not whitelisted into a local anchor link. */
    private String processParamProtocol(String s) {
        s = decodeEntities(s);
        final Matcher m = P_PROTOCOL.matcher(s);
        if (m.find()) {
            final String protocol = m.group(1);
            // protocol names are case-insensitive ("HTTP:" is the same as "http:")
            if (!inArray(protocol.toLowerCase(Locale.ENGLISH), vAllowedProtocols)) {
                // bad protocol, turn into local anchor link instead
                s = "#" + s.substring(protocol.length() + 1, s.length());
                if (s.startsWith("#//")) {
                    s = "#" + s.substring(3, s.length());
                }
            }
        }
        return s;
    }

    /** Decodes decimal, hex and percent escapes so the protocol check sees the real text. */
    private String decodeEntities(String s) {
        s = decodeNumeric(P_ENTITY, 10, s);
        s = decodeNumeric(P_ENTITY_UNICODE, 16, s);
        s = decodeNumeric(P_ENCODE, 16, s);
        return validateEntities(s);
    }

    /**
     * Replaces every match of {@code pattern} by the character whose number is in
     * group 1. Numbers are parsed in the given radix (a leading zero does not mean
     * octal); a number too large for an int is removed instead of throwing.
     */
    private static String decodeNumeric(final Pattern pattern, final int radix, final String s) {
        final StringBuffer buf = new StringBuffer();
        final Matcher m = pattern.matcher(s);
        while (m.find()) {
            String decoded;
            try {
                decoded = chr(Integer.parseInt(m.group(1), radix));
            } catch (NumberFormatException tooLarge) {
                decoded = "";
            }
            // the decoded character may be '$' or '\'
            m.appendReplacement(buf, Matcher.quoteReplacement(decoded));
        }
        m.appendTail(buf);
        return buf.toString();
    }

    /** Escapes ampersands that do not start a whitelisted entity, and quotes in text. */
    private String validateEntities(String s) {
        // validate entities throughout the string
        StringBuffer buf = new StringBuffer();
        Matcher m = P_VALID_ENTITIES.matcher(s);
        while (m.find()) {
            final String one = m.group(1); // ([^&;]*)
            final String two = m.group(2); // (?=(;|&|$))
            m.appendReplacement(buf, Matcher.quoteReplacement(checkEntity(one, two)));
        }
        m.appendTail(buf);
        s = buf.toString();

        // validate quotes outside of tags
        buf = new StringBuffer();
        m = P_VALID_QUOTES.matcher(s);
        while (m.find()) {
            final String one = m.group(1); // (>|^)
            final String two = m.group(2); // ([^<]+?)
            final String three = m.group(3); // (<|$)
            m.appendReplacement(buf,
                    Matcher.quoteReplacement(one + regexReplace(P_QUOTE, "&quot;", two) + three));
        }
        m.appendTail(buf);
        return buf.toString();
    }

    /** Keeps "&name" for a whitelisted, ';'-terminated entity; escapes the '&' otherwise. */
    private String checkEntity(final String preamble, final String term) {
        return ";".equals(term) && isValidEntity(preamble) ? '&' + preamble : "&amp;" + preamble;
    }

    private boolean isValidEntity(final String entity) {
        return inArray(entity, vAllowedEntities);
    }

    private static boolean inArray(final String s, final String[] array) {
        for (String item : array) {
            if (item != null && item.equals(s)) {
                return true;
            }
        }
        return false;
    }

    private boolean allowed(final String name) {
        return (vAllowed.isEmpty() || vAllowed.containsKey(name)) && !inArray(name, vDisallowed);
    }

    /** Looks up the whitelist entry for {@code attrName} on {@code tag}, or null. */
    private Attribute findAttribute(final String tag, final String attrName) {
        final List<Attribute> list = vAllowed.get(tag);
        if (list != null) {
            for (Attribute attr : list) {
                if (attr.attrName.equalsIgnoreCase(attrName)) {
                    return attr;
                }
            }
        }
        return null;
    }

    /**
     * Tells whether an attribute may appear on a tag: either it is whitelisted without
     * value rules, or at least one of its declarations passes the rules.
     */
    private boolean allowedAttribute(final String name, final String paramName,
            final String paramValue) {
        if (!allowed(name) || vAllowed.isEmpty()) {
            return false;
        }
        final Attribute attr = findAttribute(name, paramName);
        if (attr == null) {
            return false;
        }
        if (attr.allowedAttrValues == null) {
            return true; // no constraints
        }
        return !acceptedDeclarations(attr, paramValue).isEmpty();
    }

    /**
     * Splits a "prop:value;prop:value" attribute value and keeps, in source order, the
     * declarations whose property is whitelisted and whose value matches the property's
     * pattern. The first valid declaration of a property wins.
     */
    private static Map<String, String> acceptedDeclarations(final Attribute attr,
            final String rawValue) {
        final Map<String, String> accepted = new LinkedHashMap<String, String>();
        if (rawValue == null || rawValue.trim().length() == 0) {
            return accepted;
        }
        String value = rawValue;
        if (value.startsWith("\"")) {
            value = value.substring(1);
        }
        if (value.endsWith("\"")) {
            value = value.substring(0, value.length() - 1);
        }
        for (String declaration : value.split(";")) {
            final int colon = declaration.indexOf(':');
            if (colon <= 0) {
                continue; // no property name
            }
            final String property = declaration.substring(0, colon).trim().toLowerCase(Locale.ENGLISH);
            final String propertyValue = declaration.substring(colon + 1).trim();
            final Pattern rule = attr.allowedAttrValues.get(property);
            if (rule != null && !accepted.containsKey(property)
                    && rule.matcher(propertyValue).matches()) {
                accepted.put(property, propertyValue);
            }
        }
        return accepted;
    }

    /**
     * Returns the value to emit for an attribute: unchanged for unconstrained
     * attributes, otherwise only the validated declarations, each followed by ';'.
     *
     * @param type tag name
     * @param htmltag attribute name
     * @param old_str attribute value as submitted
     */
    private String allMapString(final String type, final String htmltag, final String old_str) {
        if (old_str == null || old_str.length() == 0) {
            return old_str;
        }
        final Attribute attr = findAttribute(type, htmltag);
        if (attr == null) {
            return "";
        }
        if (attr.allowedAttrValues == null) {
            return old_str;
        }
        final StringBuilder kept = new StringBuilder();
        for (Map.Entry<String, String> declaration : acceptedDeclarations(attr, old_str).entrySet()) {
            kept.append(declaration.getKey()).append(':').append(declaration.getValue()).append(';');
        }
        return kept.toString();
    }

    public static void main(String[] args) {
        String input = "<p>"
                + "<table style=\"width:100%;\" bordercolor=\"#000000\" cellspacing=\"0\" cellpadding=\"2\" border=\"1\">"
                + "<tbody>"
                + "<tr>"
                + "<td><span style=\"background-color:#e56600;\"><a href=\"http://www.baidu.com\" target=\"_blank\"> xx</a></span></td>"
                + "<td><span style=\"background-color:#e56600;\"> xxx</span></td>"
                + "</tr>"
                + "<tr>" + "<td> </td>" + "<td> </td>" + "</tr>"
                + "<tr>" + "<td> </td>" + "<td> </td>" + "</tr>"
                + "</tbody>"
                + "</table>"
                + "</p>"
                + "<ul>"
                + "<li>"
                + "<h3 align=\"center\"><em><u><strike><span style=\"font-size:32px;color:#60d978;font-family:Arial Black;\">xxxxxxxxxxxxxxxxx</span></strike></u></em></h3>"
                + "</li>"
                + "</ul>";
        System.out.println(WebWhiteNameFilter.filter(input));
    }
}