Java 判斷 txt 檔案的編碼格式
阿新 • • 發佈:2019-01-11
/** * txt轉html * @param s * @return */ public static String txtToHtml(String s) { try { StringBuilder builder = new StringBuilder(); File file=new File(s); if(file.isFile() && file.exists()){ //判斷檔案是否存在 String encoding=getFilecharset(new File(s)); InputStreamReader read = new InputStreamReader( new FileInputStream(file),encoding);//考慮到編碼格式 BufferedReader bufferedReader = new BufferedReader(read); String lineTxt = null; while((lineTxt = bufferedReader.readLine()) != null){ boolean previousWasASpace = false; for (char c : (lineTxt+"\n").toCharArray()) { if (c == ' ') { if (previousWasASpace) { builder.append(" "); previousWasASpace = false; continue; } previousWasASpace = true; } else { previousWasASpace = false; } switch (c) { case '<': builder.append("<"); break; case '>': builder.append(">"); break; case '&': builder.append("&"); break; case '"': builder.append(""); break; case '\n': builder.append("<br>"); break; // We need Tab support here, because we print StackTraces as HTML case '\t': builder.append(" "); break; default: builder.append(c); } } } read.close(); String converted = builder.toString(); String str = "(?i)\\b((?:https?://|www\\d{0,3}[.]|[a-z0-9.\\-]+[.][a-z]{2,4}/)(?:[^\\s()<>]+|\\(([^\\s()<>]+|(\\([^\\s()<>]+\\)))*\\))+(?:\\(([^\\s()<>]+|(\\([^\\s()<>]+\\)))*\\)|[^\\s`!()\\[\\]{};:\'\".,<>?«»“”‘’]))"; Pattern patt = Pattern.compile(str); Matcher matcher = patt.matcher(converted); converted = matcher.replaceAll("<a href=\"$1\">$1</a>"); return converted; }else{ logger.error("找不到指定的檔案"); return null; } } catch (Exception e) { logger.error("讀取檔案內容出錯"); e.printStackTrace(); return null; } } //判斷編碼格式方法 private static String getFilecharset(File sourceFile) { String charset = "GBK"; byte[] first3Bytes = new byte[3]; try { boolean checked = false; BufferedInputStream bis = new BufferedInputStream(new FileInputStream(sourceFile)); bis.mark(0); int read = bis.read(first3Bytes, 0, 3); if (read == -1) { return charset; //檔案編碼為 ANSI } else if (first3Bytes[0] == 
(byte) 0xFF && first3Bytes[1] == (byte) 0xFE) { charset = "UTF-16LE"; //檔案編碼為 Unicode checked = true; } else if (first3Bytes[0] == (byte) 0xFE && first3Bytes[1] == (byte) 0xFF) { charset = "UTF-16BE"; //檔案編碼為 Unicode big endian checked = true; } else if (first3Bytes[0] == (byte) 0xEF && first3Bytes[1] == (byte) 0xBB && first3Bytes[2] == (byte) 0xBF) { charset = "UTF-8"; //檔案編碼為 UTF-8 checked = true; } bis.reset(); if (!checked) { int loc = 0; while ((read = bis.read()) != -1) { loc++; if (read >= 0xF0) break; if (0x80 <= read && read <= 0xBF) // 單獨出現BF以下的,也算是GBK break; if (0xC0 <= read && read <= 0xDF) { read = bis.read(); if (0x80 <= read && read <= 0xBF) // 雙位元組 (0xC0 - 0xDF) // (0x80 // - 0xBF),也可能在GB編碼內 continue; else break; } else if (0xE0 <= read && read <= 0xEF) {// 也有可能出錯,但是機率較小 read = bis.read(); if (0x80 <= read && read <= 0xBF) { read = bis.read(); if (0x80 <= read && read <= 0xBF) { charset = "UTF-8"; break; } else break; } else break; } } } bis.close(); } catch (Exception e) { e.printStackTrace(); } return charset; }
轉自:http://blog.163.com/wf_shunqiziran/blog/static/176307209201258102217810/