將txt文字由utf-8轉gbk
阿新 • • 發佈:2020-12-24
import codecs
# file = open(path, encoding='gbk', errors='ignore')
# print(file.readline())
# Input file path. The file must be UTF-8 encoded (on Windows, open the text
# file and use "Save As" with UTF-8 encoding).
path = "locator5.txt"
# On Linux the file can be converted directly from the shell instead:
#   iconv -f utf-8 -t gbk utf_file > gbk_file
# Destination path handed to the UTF8_2_GBK2 call at the bottom of the script.
path2 = "locator6.txt"
# Only referenced from the commented-out experiment at the bottom of the script.
path3 = "locator.json"
def ReadFile(filePath, encoding="utf-8"):
    """Read a whole text file and return its decoded contents.

    Args:
        filePath: Path of the file to read.
        encoding: Name of the codec used to decode the file. Defaults to
            "utf-8".

    Returns:
        The complete contents of the file as a str.

    Raises:
        UnicodeDecodeError: If the file's bytes are not valid in ``encoding``
            (codecs.open uses strict error handling by default).
    """
    # codecs.open always opens the underlying file in binary mode, so line
    # endings come back exactly as stored (no "\r\n" -> "\n" translation).
    with codecs.open(filePath, "r", encoding) as f:
        return f.read()
def ReadFile2(filePath, encoding="gbk"):
    """Read a whole text file, decoding it as GBK unless told otherwise.

    Same behaviour as ReadFile except for the default codec.

    Args:
        filePath: Path of the file to read.
        encoding: Name of the codec used to decode the file. Defaults to
            "gbk".

    Returns:
        The complete contents of the file as a str.

    Raises:
        UnicodeDecodeError: If the file's bytes are not valid in ``encoding``
            (codecs.open uses strict error handling by default).
    """
    # codecs.open works on the raw bytes, so line endings are not translated.
    with codecs.open(filePath, "r", encoding) as f:
        return f.read()
def WriteFile(filePath, u, encoding="gbk"):
    """Write a string to a file, replacing any existing contents.

    Args:
        filePath: Path of the file to create or overwrite.
        u: The text (str) to write.
        encoding: Name of the codec used to encode ``u``. Defaults to "gbk".

    Raises:
        UnicodeEncodeError: If ``u`` contains a character that ``encoding``
            cannot represent (codecs.open uses strict error handling by
            default).
    """
    # Mode "w" truncates the target first; codecs.open writes in binary mode,
    # so "\n" is stored as-is rather than being converted to the OS line ending.
    with codecs.open(filePath, "w", encoding) as f:
        f.write(u)
def UTF8_2_GBK(src, dst):
    """Convert a UTF-8 text file to GBK.

    Args:
        src: Path of the UTF-8 encoded input file.
        dst: Path of the GBK encoded output file; overwritten if it exists.

    The whole file is read into memory before being written back out. Any
    character in ``src`` that GBK cannot represent makes the write fail with
    UnicodeEncodeError; UTF8_2_GBK2 writes GB18030 instead.
    """
    content = ReadFile(src, encoding="utf-8")
    WriteFile(dst, content, encoding="gbk")
def UTF8_2_GBK2(src, dst):
    """Convert a UTF-8 text file to GB18030.

    Args:
        src: Path of the UTF-8 encoded input file.
        dst: Path of the GB18030 encoded output file; overwritten if it
            exists.

    Despite the "GBK" in the name, the output codec is "gb18030".
    NOTE(review): GB18030 is presumably used because it covers characters
    that plain GBK cannot encode - confirm with the author.
    """
    content = ReadFile(src, encoding="utf-8")
    WriteFile(dst, content, encoding="gb18030")
def UTF8_2_GBK3(src, dst):
    """Copy a GBK text file to another GBK text file.

    Args:
        src: Path of the input file, decoded as GBK.
        dst: Path of the GBK encoded output file; overwritten if it exists.

    NOTE(review): the name says "UTF8", yet the source is decoded as GBK, so
    this is a GBK -> GBK round trip rather than a conversion. The behaviour is
    kept as written; confirm whether reading as "utf-8" was intended.
    """
    content = ReadFile(src, encoding="gbk")
    WriteFile(dst, content, encoding="gbk")
# Script entry: runs at module level, reading `path` as UTF-8 and writing its
# contents to `path2` encoded as GB18030.
UTF8_2_GBK2(path,path2)
#
# a = ReadFile2(path3)
# b = WriteFile(path2)
# UTF8_2_GBK2(a,b)
```
另外,順便補充一個去除中文字元(漢字與中文標點符號)的函式:
#隨便記一個去除中文的
```python
import re
from zhon.hanzi import punctuation
from zhon.hanzi import characters
def lm_find_unchinese(file):
    """Strip Chinese characters and Chinese punctuation from a string.

    Args:
        file: The text to clean. Despite the name, this is the string itself
            (it is passed straight to re.sub), not a file object or a path.

    Returns:
        ``file`` with every character in the range U+4E00-U+9FA5 and every
        character listed in zhon.hanzi.punctuation removed.
    """
    pattern = re.compile(r'[\u4e00-\u9fa5]')
    unchinese = re.sub(pattern, "", file)  # drop Han characters
    # Build a character class from zhon's punctuation string and drop those too.
    unchinese = re.sub('[{}]'.format(punctuation), "", unchinese)
    return unchinese