C#檢查目錄下所有檔案的編碼格式並轉換為UTF8格式
阿新 • • 發佈:2019-02-16
1、檢查目錄下所有檔案的編碼格式(遞迴)
2、將非UTF8編碼的檔案轉換為UTF8格式
.config
<!--
  Settings read once by the static constructor of Form1.
  ExceptList:     comma-separated entries; each entry is trimmed after splitting, so the
                  stray spaces around some entries below are harmless. A directory is
                  skipped when its name equals an entry; a file is skipped when its name
                  ends with an entry (so ".dll" acts as an extension filter and
                  "favicon.ico" as a file-name filter).
  isChangeToUTF8: when "true", every file detected as non-UTF-8 is rewritten as UTF-8;
                  otherwise the tool only writes the report.
-->
<appSettings> <add key="ExceptList" value=".git,.nuget,.vs,.dll,.pdb, .png,.jpg,.gif ,.log,.eot, .ttf, .woff, .swf, packages , ReferenceDLL ,.cache,.xls, .xlsx,.doc,.docx,favicon.ico,_references.js,.exe" /> <add key="isChangeToUTF8" value="true" /> </appSettings>
/// <summary>
/// Main window of the encoding checker. Recursively scans a directory, reports every
/// file whose detected encoding is not UTF-8 and, when enabled in the configuration,
/// rewrites those files as UTF-8.
/// </summary>
public partial class Form1 : Form
{
    /// <summary>
    /// Directory names and file-name suffixes excluded from the scan
    /// (loaded from the "ExceptList" app setting).
    /// </summary>
    private static List<string> ExceptList = new List<string>();

    /// <summary>
    /// True when non-UTF-8 files must be converted, not just reported
    /// (loaded from the "isChangeToUTF8" app setting).
    /// </summary>
    private static bool isChangeToUTF8 = false;

    /// <summary>Report lines collected during the current run.</summary>
    private List<string> ResultList = new List<string>();

    /// <summary>
    /// Loads the exclusion list and the conversion flag from the application settings.
    /// A missing key leaves the corresponding default in place instead of failing type
    /// initialization.
    /// </summary>
    static Form1()
    {
        System.Configuration.AppSettingsReader appReader = new System.Configuration.AppSettingsReader();
        string strExcept = ReadSetting(appReader, "ExceptList");
        string strIsChange = ReadSetting(appReader, "isChangeToUTF8");

        if (!string.IsNullOrEmpty(strExcept))
        {
            var tempExcept = strExcept.Split(new char[] { ',' }, StringSplitOptions.RemoveEmptyEntries);
            foreach (var temp in tempExcept)
            {
                ExceptList.Add(temp.Trim());
            }
        }

        // Ordinal comparison: the flag is machine-readable, not linguistic text.
        if (!string.IsNullOrEmpty(strIsChange)
            && string.Equals(strIsChange.Trim(), "true", StringComparison.OrdinalIgnoreCase))
        {
            isChangeToUTF8 = true;
        }
    }

    /// <summary>Initializes the form and pre-fills the directory text box.</summary>
    public Form1()
    {
        InitializeComponent();
        this.txt_url.Text = "G:\\WorkSpace\\messagecenter";
    }

    /// <summary>
    /// Reads one application setting as a string; returns null when the key is missing.
    /// </summary>
    private static string ReadSetting(System.Configuration.AppSettingsReader reader, string key)
    {
        try
        {
            return Convert.ToString(reader.GetValue(key, typeof(string)));
        }
        catch (InvalidOperationException)
        {
            // GetValue throws when the key does not exist; treat that as "not configured".
            return null;
        }
    }

    /// <summary>
    /// Runs the scan for the directory entered in the text box, optionally converts the
    /// non-UTF-8 files, and writes a timestamped report to the D: drive.
    /// </summary>
    private void btn_start_Click(object sender, EventArgs e)
    {
        ResultList = new List<string>();
        string myPath = txt_url.Text;
        if (string.IsNullOrEmpty(myPath))
        {
            MessageBox.Show("PATH?");
            return;
        }

        List<string> pathList = new List<string>();
        // Counted separately: ResultList also receives "skipped" anomaly entries from the
        // detector, so its length is not the number of non-UTF-8 files.
        int nonUtf8Count = 0;
        try
        {
            // Collect every file below the directory.
            GetDirectory(myPath, pathList);
            foreach (var path in pathList)
            {
                // Detect the encoding of the file.
                var type = EncodingType.GetType(path, ResultList);
                if (type != Encoding.UTF8)
                {
                    nonUtf8Count++;
                    ResultList.Add($"{path},原格式{type.EncodingName}");
                    if (isChangeToUTF8)
                    {
                        // Rewrite the file as UTF-8.
                        ChangeEncoding(path, type);
                    }
                }
            }
        }
        catch (Exception exception)
        {
            MessageBox.Show(exception.ToString());
            return;
        }

        // "result…" when there is anything to report, "success…" otherwise.
        bool hasFindings = ResultList.Any();
        string prefix = hasFindings ? "result" : "success";
        string stamp = DateTime.Now.ToString("yyyyMMddHHmmss");
        string reportPath = $"D:\\{prefix}{stamp}.txt";

        ResultList.Insert(0, myPath);
        ResultList.Insert(1, $"共計檔案{pathList.Count}個");
        ResultList.Insert(2, $"非UTF-8檔案共{nonUtf8Count}個");

        try
        {
            byte[] myByte = Encoding.UTF8.GetBytes(string.Join(Environment.NewLine, ResultList));
            using (FileStream fsWrite = new FileStream(reportPath, FileMode.Append))
            {
                fsWrite.Write(myByte, 0, myByte.Length);
            }
        }
        catch (Exception exception) when (exception is IOException || exception is UnauthorizedAccessException)
        {
            // The report location is fixed; a missing or read-only drive must not crash the UI.
            MessageBox.Show(exception.ToString());
            return;
        }

        // Show the file that was actually written (the name carries a timestamp).
        MessageBox.Show(hasFindings ? reportPath : "finish!");
    }

    /// <summary>
    /// Adds the files of <paramref name="path"/> to <paramref name="list"/> and recurses
    /// into every sub-directory whose name is not in the exclusion list.
    /// </summary>
    private void GetDirectory(string path, List<string> list)
    {
        DirectoryInfo folder = new DirectoryInfo(path);
        GetFile(path, list);
        foreach (var directory in folder.GetDirectories())
        {
            // Case-insensitive so that e.g. "Packages" is excluded by a "packages" entry.
            if (!ExceptList.Contains(directory.Name, StringComparer.OrdinalIgnoreCase))
            {
                GetDirectory(Path.Combine(path, directory.Name), list);
            }
        }
    }

    /// <summary>
    /// Adds every file directly inside <paramref name="path"/> to <paramref name="list"/>,
    /// except files whose name ends with one of the exclusion entries.
    /// </summary>
    private void GetFile(string path, List<string> list)
    {
        DirectoryInfo folder = new DirectoryInfo(path);
        foreach (FileInfo file in folder.GetFiles())
        {
            // Ordinal, case-insensitive suffix match so ".PNG" is excluded by ".png".
            if (!ExceptList.Any(suffix => file.Name.EndsWith(suffix, StringComparison.OrdinalIgnoreCase)))
            {
                list.Add(Path.Combine(path, file.Name));
            }
        }
    }

    /// <summary>
    /// Re-encodes a file in place as UTF-8 (written with a byte-order mark).
    /// </summary>
    /// <param name="filename">Path of the file to rewrite.</param>
    /// <param name="encoding">Encoding the file is currently stored in.</param>
    private void ChangeEncoding(string filename, System.Text.Encoding encoding)
    {
        // ReadAllBytes loops until the whole file is read; a single Stream.Read call may
        // return fewer bytes than requested.
        byte[] fileBytes = File.ReadAllBytes(filename);

        // Skip the source byte-order mark. Decoding it would produce a U+FEFF character
        // that is then written after the UTF-8 preamble, giving the file two BOMs.
        byte[] preamble = encoding.GetPreamble();
        int offset = StartsWith(fileBytes, preamble) ? preamble.Length : 0;
        string text = encoding.GetString(fileBytes, offset, fileBytes.Length - offset);

        using (StreamWriter docWriter = new StreamWriter(filename, false, Encoding.UTF8))
        {
            docWriter.Write(text);
        }
    }

    /// <summary>Returns true when <paramref name="data"/> begins with <paramref name="prefix"/>.</summary>
    private static bool StartsWith(byte[] data, byte[] prefix)
    {
        if (data.Length < prefix.Length)
        {
            return false;
        }

        for (int i = 0; i < prefix.Length; i++)
        {
            if (data[i] != prefix[i])
            {
                return false;
            }
        }

        return true;
    }
}
/// <summary>
/// Detects the text encoding of a file from its byte-order mark and its content.
/// </summary>
public class EncodingType
{
    /// <summary>
    /// Opens the file at the given path, reads its bytes and determines its encoding.
    /// </summary>
    /// <param name="FILE_NAME">Path of the file to inspect.</param>
    /// <param name="ResultList">Receives a message when the file cannot be classified reliably; may be null.</param>
    /// <returns>The detected encoding; <see cref="System.Text.Encoding.Default"/> when nothing else matches.</returns>
    public static System.Text.Encoding GetType(string FILE_NAME, List<string> ResultList)
    {
        // using guarantees the handle is released even when reading throws.
        using (FileStream fs = new FileStream(FILE_NAME, FileMode.Open, FileAccess.Read))
        {
            return GetType(fs, FILE_NAME, ResultList);
        }
    }

    /// <summary>
    /// Determines the encoding of the data in the given file stream. The stream is
    /// closed when this method returns.
    /// </summary>
    /// <param name="fs">Stream to read from.</param>
    /// <param name="FILE_NAME">File name used in messages added to <paramref name="ResultList"/>.</param>
    /// <param name="ResultList">Receives a message when the file cannot be classified reliably; may be null.</param>
    /// <returns>The detected encoding; <see cref="System.Text.Encoding.Default"/> when nothing else matches.</returns>
    public static System.Text.Encoding GetType(FileStream fs, string FILE_NAME, List<string> ResultList)
    {
        // Disposing the reader also closes fs, which is what the previous r.Close() did.
        using (BinaryReader reader = new BinaryReader(fs, System.Text.Encoding.Default))
        {
            if (fs.Length > int.MaxValue)
            {
                // Too large for a single buffer. Report it and return UTF-8 so that the
                // caller leaves the file untouched.
                ResultList?.Add($"{FILE_NAME},異常:檔案過大,無法判斷編碼格式,已跳過");
                return Encoding.UTF8;
            }

            byte[] bytes = reader.ReadBytes((int)fs.Length);

            // Byte-order marks. UTF-32 LE (FF FE 00 00) is tested before UTF-16 LE (FF FE)
            // because the former starts with the latter. A UTF-16 BOM is exactly two
            // bytes; the byte after it is already text and must not be tested.
            if (HasPrefix(bytes, 0xFF, 0xFE, 0x00, 0x00))
            {
                return Encoding.UTF32;
            }

            if (HasPrefix(bytes, 0xFE, 0xFF))
            {
                return Encoding.BigEndianUnicode;
            }

            if (HasPrefix(bytes, 0xFF, 0xFE))
            {
                return Encoding.Unicode;
            }

            // The content scan runs first so that a truncated tail is still reported;
            // the UTF-8 BOM is the fallback for files whose content fails the scan.
            if (IsUTF8Bytes(bytes, FILE_NAME, ResultList) || HasPrefix(bytes, 0xEF, 0xBB, 0xBF))
            {
                return Encoding.UTF8;
            }

            return Encoding.Default;
        }
    }

    /// <summary>Returns true when <paramref name="data"/> begins with the given bytes.</summary>
    private static bool HasPrefix(byte[] data, params byte[] prefix)
    {
        if (data.Length < prefix.Length)
        {
            return false;
        }

        for (int i = 0; i < prefix.Length; i++)
        {
            if (data[i] != prefix[i])
            {
                return false;
            }
        }

        return true;
    }

    /// <summary>
    /// Checks whether the bytes are structurally valid UTF-8 (lead bytes followed by the
    /// right number of continuation bytes). Pure ASCII and empty input count as UTF-8.
    /// </summary>
    /// <param name="data">Bytes to inspect.</param>
    /// <param name="FILE_NAME">File name used in the message added to <paramref name="ResultList"/>.</param>
    /// <param name="ResultList">Receives a message when the data ends inside a multi-byte sequence; may be null.</param>
    /// <returns>False as soon as a malformed sequence is found; true otherwise.</returns>
    private static bool IsUTF8Bytes(byte[] data, string FILE_NAME, List<string> ResultList)
    {
        // Number of continuation bytes (10xxxxxx) still expected for the current character.
        int pending = 0;

        foreach (byte current in data)
        {
            if (pending == 0)
            {
                if (current < 0x80)
                {
                    continue; // single-byte (ASCII) character
                }

                // The number of leading 1 bits of a lead byte is the sequence length.
                // One leading 1 is a stray continuation byte; more than four is not
                // valid UTF-8 (RFC 3629 limits sequences to four bytes).
                int sequenceLength = CountLeadingOnes(current);
                if (sequenceLength < 2 || sequenceLength > 4)
                {
                    return false;
                }

                pending = sequenceLength - 1;
            }
            else
            {
                // Inside a sequence every byte must look like 10xxxxxx.
                if ((current & 0xC0) != 0x80)
                {
                    return false;
                }

                pending--;
            }
        }

        if (pending > 0)
        {
            // The data ends in the middle of a character: report it, but still treat the
            // file as UTF-8 so that it is skipped rather than converted.
            ResultList?.Add($"{FILE_NAME},異常:非預期的byte格式,無法判斷是否是UTF8(不帶BOM)格式,已跳過");
        }

        return true;
    }

    /// <summary>Counts the consecutive 1 bits at the most significant end of a byte.</summary>
    private static int CountLeadingOnes(byte value)
    {
        int count = 0;
        for (int mask = 0x80; mask != 0 && (value & mask) != 0; mask >>= 1)
        {
            count++;
        }

        return count;
    }
}