1. 程式人生 > 其它 > 將txt文字由utf-8轉gbk

將txt文字由utf-8轉gbk

import codecs

# file = open(path, encoding='gbk', errors='ignore')
# print(file.readline())


# Input file path. The file must be UTF-8 encoded (on Windows, open it in a
# text editor and "Save As" UTF-8).
# On Linux the file can be converted directly with iconv instead:
#   iconv -f utf-8 -t gbk utf_file > gbk_file
path = "locator5.txt"
path2 = "locator6.txt"
path3 = "locator.json"


def ReadFile(filePath, encoding="utf-8"):
    """Return the whole content of *filePath*, decoded with *encoding*.

    Args:
        filePath: Path of the file to read.
        encoding: Codec name used for decoding (default ``"utf-8"``).

    Returns:
        The decoded text of the file.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the bytes are not valid in *encoding*.
    """
    with codecs.open(filePath, "r", encoding) as f:
        return f.read()


def ReadFile2(filePath, encoding="gbk"):
    """Same as :func:`ReadFile`, but decodes as GBK by default."""
    return ReadFile(filePath, encoding)


def WriteFile(filePath, u, encoding="gbk"):
    """Write the text *u* to *filePath*, encoded with *encoding*.

    Args:
        filePath: Path of the file to create or overwrite.
        u: Text to write.
        encoding: Codec name used for encoding (default ``"gbk"``).

    Raises:
        OSError: If the file cannot be opened for writing.
        UnicodeEncodeError: If *u* contains a character that *encoding*
            cannot represent.
    """
    with codecs.open(filePath, "w", encoding) as f:
        f.write(u)


def UTF8_2_GBK(src, dst):
    """Read *src* as UTF-8 and write the same text to *dst* as GBK."""
    content = ReadFile(src, encoding="utf-8")
    WriteFile(dst, content, encoding="gbk")


def UTF8_2_GBK2(src, dst):
    """Read *src* as UTF-8 and write the same text to *dst* as GB18030.

    GB18030 can encode every Unicode character, so this variant does not
    fail on text that plain GBK cannot represent.
    """
    content = ReadFile(src, encoding="utf-8")
    WriteFile(dst, content, encoding="gb18030")


def UTF8_2_GBK3(src, dst):
    """Read *src* as GBK and write the same text to *dst* as GBK.

    NOTE(review): despite the name, the source is decoded as GBK rather
    than UTF-8 -- confirm whether that is intentional.
    """
    content = ReadFile(src, encoding="gbk")
    WriteFile(dst, content, encoding="gbk")


if __name__ == "__main__":
    # Run the conversion only when executed as a script, so importing this
    # module does not touch the file system.
    UTF8_2_GBK2(path, path2)
    # a = ReadFile2(path3)
    # b = WriteFile(path2)
    # UTF8_2_GBK2(a, b)

``

另外順便補充一個去除中文字元與中文標點的函式:

#隨便記一個去除中文的
```python
import re
from zhon.hanzi import punctuation
from zhon.hanzi import characters

def lm_find_unchinese(file):
    """Return *file* with Chinese characters and Chinese punctuation removed.

    Despite its name, *file* is the text to clean (it is passed directly to
    ``re.sub``), not a path or file object.

    Args:
        file: Text to strip.

    Returns:
        The text without any character in the U+4E00..U+9FA5 range and
        without any character listed in ``zhon.hanzi.punctuation``.
    """
    # Step 1: drop the Chinese characters.
    without_hanzi = re.compile(r'[\u4e00-\u9fa5]').sub("", file)
    # Step 2: drop Chinese punctuation, using a character class built from
    # zhon's punctuation string.
    punctuation_class = '[{}]'.format(punctuation)
    return re.sub(punctuation_class, "", without_hanzi)