1. 程式人生 > 其它 >java如何獲取一個文字檔案的編碼(格式)資訊呢?

java如何獲取一個文字檔案的編碼(格式)資訊呢?

轉自:http://www.java265.com/JavaJingYan/202110/16350332691561.html

文字檔案是我們在windows平臺下常用的一種檔案格式,

這種檔案的預設編碼會隨著作業系統的語言設定不同而有所不同

那麼如何使用程式獲取“文字檔案”的編碼方式呢?

檔案的編碼格式決定了檔案可儲存的字元型別,所以得到檔案的編碼至關重要


下文筆者分享獲取一個文字檔案編碼(格式)資訊的方法,如下所示:

實現思路:
    讀取檔案流的前3個位元組,判斷是否為BOM(位元組順序標記);
    若沒有BOM,再掃描檔案內容是否符合UTF-8的位元組規律,即可推斷文字檔案的編碼方式

例:

package com.java265.other;
import java.io.File;
import java.io.FileInputStream;
import java.io.InputStream;

/**
 * java265.com sample: guesses the character encoding of a text file.
 *
 * <p>The guess is a heuristic. A byte-order mark (BOM) at the start of the file
 * is trusted; otherwise the content is scanned for the first well-formed
 * three-byte UTF-8 sequence. Anything inconclusive is reported as
 * {@code "GBK"}.
 */
public class Test {

    /** Charset name returned when neither the BOM nor the content scan is conclusive. */
    private static final String DEFAULT_CHARSET = "GBK";

    /** File inspected by {@link #main(String[])} when no path is given on the command line. */
    private static final String SAMPLE_PATH = "E://person/java265.com/java.txt";

    /** Longest BOM this class recognises (UTF-8: EF BB BF). */
    private static final int BOM_MAX_LENGTH = 3;

    /**
     * Prints the detected encoding of a file.
     *
     * @param args optional; {@code args[0]} is the path of the file to inspect.
     *             Without arguments the built-in sample path is used.
     */
    public static void main(String[] args) {
        String path = (args != null && args.length > 0) ? args[0] : SAMPLE_PATH;
        System.out.println(GetEncoding(new File(path)));
    }

    /**
     * Guesses the encoding of {@code file}.
     *
     * <p>Detection order:
     * <ol>
     *   <li>An empty file yields {@code "GBK"}.</li>
     *   <li>A leading BOM yields {@code "UTF-16LE"}, {@code "UTF-16BE"} or {@code "UTF-8"}.</li>
     *   <li>Otherwise the bytes are scanned from the start; the first well-formed
     *       three-byte UTF-8 sequence yields {@code "UTF-8"}.</li>
     *   <li>Everything else yields {@code "GBK"}.</li>
     * </ol>
     *
     * <p>Plain-ASCII prefixes are never treated as evidence of an encoding,
     * because ASCII bytes are identical in every charset this method reports.
     *
     * <p>This is best-effort: if the file cannot be opened or read (including a
     * {@code null} argument) the stack trace is printed and {@code "GBK"} is
     * returned instead of propagating the exception.
     *
     * @param file the file to inspect
     * @return the name of the guessed charset, never {@code null}
     */
    public static String GetEncoding(File file) {
        // One buffered stream serves both the BOM probe and the content scan and
        // is closed on every exit path, including early returns and exceptions.
        try (InputStream in = new java.io.BufferedInputStream(new FileInputStream(file))) {
            in.mark(BOM_MAX_LENGTH);
            byte[] head = new byte[BOM_MAX_LENGTH];
            int filled = readHead(in, head);
            if (filled == 0) {
                return DEFAULT_CHARSET; // empty file
            }

            String fromBom = charsetFromBom(head, filled);
            if (fromBom != null) {
                return fromBom;
            }

            // No BOM: rewind so the scan also covers the bytes consumed by the probe.
            in.reset();
            return containsThreeByteUtf8Sequence(in) ? "UTF-8" : DEFAULT_CHARSET;
        } catch (Exception e) {
            // Deliberately broad: the method always answers with a charset name.
            e.printStackTrace();
            return DEFAULT_CHARSET;
        }
    }

    /** Fills {@code head} as far as the stream allows and returns the number of bytes stored. */
    private static int readHead(InputStream in, byte[] head) throws java.io.IOException {
        int filled = 0;
        while (filled < head.length) {
            int n = in.read(head, filled, head.length - filled);
            if (n == -1) {
                break;
            }
            filled += n;
        }
        return filled;
    }

    /** Returns the charset named by a BOM in the first {@code length} bytes of {@code head}, or null. */
    private static String charsetFromBom(byte[] head, int length) {
        if (length >= 2) {
            if (head[0] == (byte) 0xFF && head[1] == (byte) 0xFE) {
                return "UTF-16LE";
            }
            if (head[0] == (byte) 0xFE && head[1] == (byte) 0xFF) {
                return "UTF-16BE";
            }
        }
        if (length >= 3
                && head[0] == (byte) 0xEF
                && head[1] == (byte) 0xBB
                && head[2] == (byte) 0xBF) {
            return "UTF-8";
        }
        return null;
    }

    /**
     * Scans {@code in} until the first byte pattern that settles the question.
     *
     * <p>Returns true as soon as a lead byte 0xE0-0xEF is followed by two
     * continuation bytes. Returns false on a stray continuation byte, a
     * malformed sequence, a lead byte of 0xF0 or above, or end of stream.
     * Well-formed two-byte sequences are skipped without deciding anything.
     */
    private static boolean containsThreeByteUtf8Sequence(InputStream in) throws java.io.IOException {
        int b;
        while ((b = in.read()) != -1) {
            if (b < 0x80) {
                continue; // ASCII says nothing about the encoding
            }
            if (b <= 0xBF) {
                return false; // continuation byte without a lead byte
            }
            if (b <= 0xDF) {
                // Two-byte form (0xC0-0xDF, 0x80-0xBF): keep looking rather than decide.
                if (!isContinuation(in.read())) {
                    return false;
                }
                continue;
            }
            if (b <= 0xEF) {
                return isContinuation(in.read()) && isContinuation(in.read());
            }
            return false; // 0xF0 and above are not accepted by this heuristic
        }
        return false;
    }

    /** True when {@code b} is a UTF-8 continuation byte (0x80-0xBF); false for -1 (end of stream). */
    private static boolean isContinuation(int b) {
        return 0x80 <= b && b <= 0xBF;
    }
}