關於String替換操作的一點筆記
阿新 • • 發佈:2019-01-24
最近專案需要抓取學校百合的一些熱點資訊,免不了頻繁使用正則和String的一些替換操作,遇到了一些問題,值得小記一下。
下面是一個操作的片段
// Pattern used to locate a "<table" followed by a hidden "<textarea" in the fetched HTML.
// (?s) turns on DOTALL so '.' also matches line terminators.
//   group(1) = the literal "<table"
//   group(2) = the shortest run of text between "<table" and the next "<textarea"
//   group(3) = the shortest run of text between "class=hide>" and "</textarea>"
Pattern textareaContent = Pattern.compile("(?s)(<table)(.*?)<textarea.*?class=hide>(.*?)</textarea>");
由於抓取的內容可能含有‘$’、‘\\’等字元,在appendReplacement(StringBuffer,String replacement)中可能會導致錯誤,比如‘$’在replacement中會被當作group的引用符。其實可以通過jdk的原始碼明確地看出appendReplacement的處理方式:Matcher contentMatcher = textareaContent.matcher(resultHTML); StringBuffer buff = new StringBuffer(); while(contentMatcher.find()) { contentMatcher.appendReplacement(buff, contentMatcher.group(1) + " style='BORDER: 2px solid;BORDER-COLOR: D0F0C0;' " + contentMatcher.group(2) + contentMatcher.group(3)); } resultHTML = contentMatcher.appendTail(buff).toString();
char nextChar = replacement.charAt(cursor); if (nextChar == '\\') {//當讀到'\\'時直接跳過將nextChar壓入buffer cursor++; nextChar = replacement.charAt(cursor); result.append(nextChar); cursor++; } else if (nextChar == '$') {//當讀取到'$'時,根據nextChar不同處理不同 // Skip past $跳過了'$'!!!!! cursor++; // A StringIndexOutOfBoundsException is thrown if // this "$" is the last character in replacement // string in current implementation, a IAE might be // more appropriate. nextChar = replacement.charAt(cursor); int refNum = -1; if (nextChar == '{') { cursor++;//跳過'{' StringBuilder gsb = new StringBuilder(); while (cursor < replacement.length()) {//將'{'後的字母和數字暫存 nextChar = replacement.charAt(cursor); if (ASCII.isLower(nextChar) || ASCII.isUpper(nextChar) || ASCII.isDigit(nextChar)) { gsb.append(nextChar); cursor++; } else { break; } } if (gsb.length() == 0)//如果buffer裡沒有就報錯 throw new IllegalArgumentException( "named capturing group has 0 length name"); if (nextChar != '}') throw new IllegalArgumentException( "named capturing group is missing trailing '}'"); String gname = gsb.toString(); if (ASCII.isDigit(gname.charAt(0)))//組名不可能以數字開頭 throw new IllegalArgumentException( "capturing group name {" + gname + "} starts with digit character"); if (!parentPattern.namedGroups().containsKey(gname))//在pattern中查詢組 throw new IllegalArgumentException( "No group with name {" + gname + "}"); refNum = parentPattern.namedGroups().get(gname); cursor++; } else {//如果不是上述情況那下一個char應當是字元
處理的方法:Matcher.quoteReplacement()// The first number is always a group refNum = (int)nextChar - '0'; if ((refNum < 0)||(refNum > 9)) throw new IllegalArgumentException( "Illegal group reference"); cursor++; // Capture the largest legal group string boolean done = false; while (!done) { if (cursor >= replacement.length()) { break; } int nextDigit = replacement.charAt(cursor) - '0'; if ((nextDigit < 0)||(nextDigit > 9)) { // not a number break; } int newRefNum = (refNum * 10) + nextDigit; if (groupCount() < newRefNum) { done = true; } else { refNum = newRefNum; cursor++; } } } // Append group if (start(refNum) != -1 && end(refNum) != -1) result.append(text, start(refNum), end(refNum)); }
if ((s.indexOf('\\') == -1) && (s.indexOf('$') == -1))
return s;
StringBuilder sb = new StringBuilder();
for (int i=0; i<s.length(); i++) {
char c = s.charAt(i);
if (c == '\\' || c == '$') {
sb.append('\\');
}
sb.append(c);
}
return sb.toString();
即在特殊字元('\\'和'$')前插入'\\'。
另外String.replace()
/**
 * Replaces every match of {@code target} in this string with {@code replacement},
 * treating both arguments as literal text rather than as regex syntax.
 *
 * @param target      the sequence to search for; compiled with {@code Pattern.LITERAL},
 *                    so regex metacharacters in it have no special meaning
 * @param replacement the sequence to substitute; passed through
 *                    {@code Matcher.quoteReplacement} so that '\' and '$' are not
 *                    interpreted as escape or group-reference characters
 * @return the string produced by {@code Matcher.replaceAll}
 */
public String replace(CharSequence target, CharSequence replacement) {
return Pattern.compile(target.toString(), Pattern.LITERAL).matcher(
this).replaceAll(Matcher.quoteReplacement(replacement.toString()));
}
是通過Matcher.replaceAll來實現的。