JAVA正則解析Pattern.compile(regex)出現java.util.regex.PatternSyntaxException
阿新 • • 發佈:2018-12-11
問題
最近線上專案出現了java.util.regex.PatternSyntaxException。專案本身幾乎沒有改動,只是特殊
字元表原來使用utf8字符集,不支援4個位元組的字元,因此改成了utf8mb4字符集,其餘
都沒有變動。異常原因如下:
異常貼圖
解析
// Abridged caller: passes content through DataFormat.removeSpeChar using the supplied speCharRegex.
private void everyMsgInDB(List<MessageSampleMsg> msgs,String speCharRegex,Map<String, String> varWordMap){
...省略
content = DataFormat. removeSpeChar(content, speCharRegex); // the exception surfaces at this call
...省略
}
/**
 * Removes every substring of {@code content} that matches {@code regex}.
 *
 * @param content text to clean
 * @param regex   regular expression describing what to remove
 * @return {@code content} with all matches replaced by the empty string
 * @throws java.util.regex.PatternSyntaxException if {@code regex} is not a valid pattern
 */
public static String removeSpeChar(String content, String regex) {
    // Pattern.compile is where an invalid regex is rejected.
    return Pattern.compile(regex).matcher(content).replaceAll("");
}
到這發現原來問題出現在組裝的regex,看下面regex的組裝
/**
 * Builds an alternation regex from every special character returned by the DAO:
 * each entry is prefixed with a backslash and the entries are joined with {@code |}.
 *
 * <p>Known limitations of this version, visible in the code below: if the query
 * fails the list stays {@code null} and the loop throws NullPointerException; an
 * empty list makes the final substring call fail; and an entry that starts with a
 * letter (for example {@code ying}) is turned into an escape sequence such as \ying.
 *
 * @param conn connection passed to the special-character DAO
 * @return the assembled regex without the trailing {@code |}
 */
public static String getSpeCharRegex (Connection conn) {
    List<String> specialChars = null;
    try {
        // Load every configured special character.
        specialChars = DaoFactory.getSpecialCharDao().getAll(conn);
    } catch (SQLException e) {
        log.error("特殊字元查詢失敗", e);
    }
    StringBuilder alternation = new StringBuilder();
    // Escape each entry with a backslash and terminate it with the OR operator.
    for (String specialChar : specialChars) {
        alternation.append("\\");
        alternation.append(specialChar);
        alternation.append("|");
    }
    // Drop the trailing '|'.
    return alternation.substring(0, alternation.length() - 1);
}
}
拼接好的regex如圖一所示,那麼為什麼\ying這裡會出現異常呢?接下來分析下
Pattern.compile(String regex)原始碼,下面的原始碼是JDK1.8
//1.
// Public entry point: delegates to the private constructor with flags == 0.
public static Pattern compile(String regex) {
return new Pattern(regex, 0);
}
//2.
// Private constructor: stores the pattern and flags, then compiles the pattern if it is non-empty.
private Pattern(String p, int f) {
pattern = p;
flags = f; // flags == 0 when reached through compile(String)
// With flags == 0 the bitwise AND is 0, so this branch is not taken here
if ((flags & UNICODE_CHARACTER_CLASS) != 0)
flags |= UNICODE_CASE;
// Counter initialisation; not relevant to the failure being traced
capturingGroupCount = 1;
localCount = 0;
// pattern is the string passed in by the caller, \ying in this walkthrough
if (pattern.length() > 0) {
// Non-empty pattern, so execution continues into compile()
compile();
} else {
root = new Start(lastAccept);
matchRoot = lastAccept;
}
}
//3.
// Abridged: copies the pattern into the int array temp, then builds the match tree.
private void compile() {
...省略
temp = new int[patternLength + 2];// temp receives the pattern's code points as ints
// Fill temp one code point at a time (see the temp array screenshot further down)
for (int x = 0; x < patternLength; x += Character.charCount(c)) {
c = normalizedPattern.codePointAt(x);
if (isSupplementary(c)) {
hasSupplementary = true;
}
temp[count++] = c;
}
...省略
if (has(LITERAL)) {
matchRoot = newSlice(temp, patternLength, hasSupplementary);
matchRoot.next = lastAccept;
} else {
// LITERAL is not set, so parsing continues in the recursive-descent parser
matchRoot = expr(lastAccept);
}
...省略
}
//4.
// Abridged: loops, calling sequence(end) on each iteration.
private Node expr(Node end) {
...省略
for (;;) {
// Execution reaches sequence() from here
Node node = sequence(end);
Node nodeTail = root; // double return
...省略
}
...省略
}
//5.
// Abridged: dispatches on the current character; only the backslash case is shown.
private Node sequence(Node end) {
...省略
LOOP: for (;;) {
// With temp == {92,121,105,110,103}, peek() yields ch == 92,
// which is the backslash character
int ch = peek();
switch (ch) {
...省略
// so this case is selected
case '\\':
// Look at the character that follows the backslash:
// it is 121, i.e. 'y'
ch = nextEscaped();
if (ch == 'p' || ch == 'P') {
boolean oneLetter = true;
boolean comp = (ch == 'P');
ch = next(); // Consume { if present
if (ch != '{') {
unread();
} else {
oneLetter = false;
}
node = family(oneLetter, comp);
// 'y' is neither 'p' nor 'P', so the else branch runs
} else {
// unread() steps the cursor back one position,
// so it points at 92 (the backslash) again
unread();
// then parsing continues in atom()
node = atom();
}
break;
...省略
}
}
//6.繼續下來
// Abridged: only the backslash case is shown; for a non-p/P escape it calls escape().
private Node atom() {
int first = 0;
...省略
int ch = peek();
for (;;) {
switch (ch) {
...省略
// The cursor was rewound by the caller, so the backslash case matches again
case '\\':
ch = nextEscaped();
if (ch == 'p' || ch == 'P') {
if (first > 0) {
unread();
break;
} else {
boolean comp = (ch == 'P');
boolean oneLetter = true;
ch = next();
if (ch != '{')
unread();
else
oneLetter = false;
return family(oneLetter, comp);
}
}
unread();
prev = cursor;
// Execution continues here;
// the arguments passed are (false, true, false)
ch = escape(false, first == 0, false);
...省略
}
// Abridged: resolves the character that follows a backslash.
// Cases that break out of the switch reach the throw on the last line.
private int escape(boolean inclass, boolean create, boolean isrange) {
// Moves on to the character after the backslash, 'y' here
// (recall that the cursor had been rewound to the backslash)
int ch = skip();
// Cases in the switch below that return are fine;
// cases that break fall through to the throw and abort compilation
switch (ch) {
case '0':
return o();
case '1':
case '2':
case '3':
case '4':
case '5':
case '6':
case '7':
case '8':
case '9':
if (inclass)
break;
if (create) {
root = ref((ch - '0'));
}
return -1;
...省略
case 'l':
// 'l' and 'm' are rejected as well
case 'm':
break;
case 'n':
return '\n';
// Note: 'o', 'p' and 'q' break, so they reach the throw on the last line
case 'o':
case 'p':
case 'q':
break;
...省略
case 'w':
if (create)
root = has(UNICODE_CHARACTER_CLASS) ? new Utype(UnicodeProp.WORD) : new Ctype(ASCII.WORD);
return -1;
case 'x':
return x();
// The pattern being traced is \ying, so ch == 'y' lands here and the exception is thrown;
// this is the root cause of the failure
case 'y':
break;
case 'z':
if (inclass)
break;
if (create)
root = new End();
return -1;
default:
return ch;
}
throw error("Illegal/unsupported escape sequence");
}
下圖是 int[] temp 對應的陣列
結論
如果要通過以下方式進行正則匹配,拼接轉義字元時一定要注意:反斜線(\)後面不要直接跟a-zA-Z0-9,否則可能被解析成非法或意料之外的轉義序列(例如\y),造成PatternSyntaxException。更穩妥的做法是用Pattern.quote(String)把每個特殊字元整體轉成字面量。
/**
 * Removes every substring of {@code content} that matches {@code regex}.
 *
 * @param content text to clean
 * @param regex   regular expression describing what to remove
 * @return {@code content} with all matches replaced by the empty string
 * @throws java.util.regex.PatternSyntaxException if {@code regex} is not a valid pattern
 */
public static String removeSpeChar(String content, String regex) {
    // Pattern.compile is where an invalid regex is rejected.
    return Pattern.compile(regex).matcher(content).replaceAll("");
}
所以程式碼修改了一下。
/**
 * Builds a regular expression that matches any one of the special-character
 * entries returned by the DAO.
 *
 * <p>Every entry is turned into a literal with {@code Pattern.quote}, so no entry,
 * whatever characters it contains, can form an illegal or unintended escape
 * sequence (the cause of the PatternSyntaxException with a hand-written backslash
 * prefix). Null and empty entries are skipped.
 *
 * @param conn connection passed to the special-character DAO
 * @return the quoted entries joined with {@code |}; when the query fails or yields
 *         no usable entry, the pattern {@code (?!)}, which never matches
 */
public static String getSpeCharRegex(Connection conn) {
    SpecialCharDao specialCharDao = DaoFactory.getSpecialCharDao();
    List<String> spchars = null;
    try {
        spchars = specialCharDao.getAll(conn);
    } catch (SQLException e) {
        // Best effort: the failure is logged and the method falls back to a pattern that matches nothing.
        log.error("特殊字元查詢失敗", e);
    }
    StringBuilder regex = new StringBuilder();
    if (spchars != null) {
        for (String spchar : spchars) {
            if (spchar == null || spchar.isEmpty()) {
                continue;
            }
            if (regex.length() > 0) {
                regex.append('|');
            }
            // Fully qualified so that the block needs no additional import.
            regex.append(java.util.regex.Pattern.quote(spchar));
        }
    }
    // An empty negative lookahead always fails, so callers remove nothing.
    return regex.length() > 0 ? regex.toString() : "(?!)";
}