1. 程式人生 > >十六、檢測txt檔案的編碼格式

十六、檢測txt檔案的編碼格式

private static String detectCodeFormate(File file) {
        String charset = "GBK";
        byte[] first3Bytes = new byte[3];
        try {
            boolean checked = false;
            BufferedInputStream bis = new BufferedInputStream(new FileInputStream(file));
            bis.mark(0);
            int
read = bis.read(first3Bytes, 0, 3); if (read == -1) { //檔案編碼為 ANSI return charset; } else if (first3Bytes[0] == (byte) 0xFF && first3Bytes[1] == (byte) 0xFE) { //檔案編碼為 Unicode charset = "UTF-16LE"; checked =
true; } else if (first3Bytes[0] == (byte) 0xFE && first3Bytes[1] == (byte) 0xFF) { //檔案編碼為 Unicode big endian charset = "UTF-16BE"; checked = true; } else if (first3Bytes[0] == (byte) 0xEF && first3Bytes[1] == (byte) 0xBB
&& first3Bytes[2] == (byte) 0xBF) { //檔案編碼為 UTF-8 charset = "UTF-8"; checked = true; } bis.reset(); if (!checked) { int loc = 0; while ((read = bis.read()) != -1) { loc++; if (read >= 0xF0){ break; } // 單獨出現BF以下的,也算是GBK if (0x80 <= read && read <= 0xBF) {break;} if (0xC0 <= read && read <= 0xDF) { read = bis.read(); // 雙位元組 (0xC0 - 0xDF) if (0x80 <= read && read <= 0xBF) { // (0x80 - 0xBF),也可能在GB編碼內 continue; } else { break; } } else if (0xE0 <= read && read <= 0xEF) {// 也有可能出錯,但是機率較小 read = bis.read(); if (0x80 <= read && read <= 0xBF) { read = bis.read(); if (0x80 <= read && read <= 0xBF) { charset = "UTF-8"; break; } else { break; } } else { break; } } } } bis.close(); } catch (Exception e) { e.printStackTrace(); } return charset; }