語料處理之全形轉半形
阿新 • • 發佈:2019-02-15
該指令碼的功能是把文字檔案中的全形字元轉換為半形字元:
# -*- coding: utf-8 -*-
"""Convert full-width characters in a UTF-8 text file to half-width."""
import io

# Translation table (code point -> code point) for full-width to half-width.
# U+FF01..U+FF5E mirror ASCII U+0021..U+007E at a fixed offset of 0xFEE0
# (65248); the ideographic space U+3000 is the one exception and maps
# straight to the ASCII space.
_FULL_TO_HALF = {code: code - 0xFEE0 for code in range(0xFF01, 0xFF5F)}
_FULL_TO_HALF[0x3000] = 0x20


def strQ2B(inputFilePath, outputFilePath):
    """Copy a UTF-8 text file, replacing full-width characters.

    Every full-width character in U+FF01..U+FF5E is replaced by its ASCII
    counterpart and the ideographic space (U+3000) by a plain space; all
    other characters, including line endings, are written unchanged.

    Args:
        inputFilePath: Path of the UTF-8 encoded file to read.
        outputFilePath: Path of the UTF-8 encoded file to write
            (created or truncated).

    Raises:
        IOError/OSError: If either file cannot be opened.
        UnicodeDecodeError: If the input is not valid UTF-8.
    """
    # The input is opened first so a missing input never creates or
    # truncates the output. newline="" disables newline translation in both
    # directions so line endings are copied through untouched.
    with io.open(inputFilePath, "r", encoding="utf-8", newline="") as inputFile, \
            io.open(outputFilePath, "w", encoding="utf-8", newline="") as outputFile:
        # Stream line by line instead of loading the whole corpus in memory.
        for line in inputFile:
            outputFile.write(line.translate(_FULL_TO_HALF))


if __name__ == "__main__":
    inputFilePath = "../1.txt"
    outputFilePath = "../2.txt"
    strQ2B(inputFilePath, outputFilePath)