python 全形半形字元轉換
阿新 • • 發佈:2018-12-10
1.相關原理
全形即:Double Byte Character,簡稱DBC
半形即:Single Byte Character,簡稱SBC
在 windows 中,中文和全形字元都佔兩個位元組,並且使用了 ascii chart 2 (codes 128–255);
全形字元的第一個位元組總是被置為 163,而第二個位元組則是相同半形字元碼加上128(不包括空格,全形空格和半形空格也要考慮進去);
對於中文來說,它的第一個位元組被置為大於163,如’阿’為:176 162,檢測到中文時不進行轉換。
例如:半形 A 為 65,則全形 A(A)是 163(第一個位元組)、193(第二個位元組,128+65)。
具體規則為:
- 全形字元unicode編碼從65281~65374 (十六進位制 0xFF01 ~ 0xFF5E)
- 半形字元unicode編碼從33~126 (十六進位制 0x21~ 0x7E)
- 空格比較特殊,全形為 12288(0x3000),半形為 32(0x20)
- 而且除空格外,全形/半形按unicode編碼排序在順序上是對應的(半形 + 65248 = 全形)
所以可以直接通過用加減法來處理非空格資料,對空格單獨處理。
用到的一些函式
chr():以一個範圍在 range(256) 內(即 0~255)的整數作引數,返回對應的字元。
unichr():與 chr() 類似,只不過返回的是 Unicode 字元。
ord():與 chr()/unichr() 相反,以一個字元作引數,返回其對應的整數編碼。
下面做個試驗:
# Demo: print each half-width ASCII code point (33..126) next to the
# full-width code point that sits exactly 65248 (0xFEE0) above it.
try:
    to_text = unichr  # Python 2: chr() only covers 0..255
except NameError:
    to_text = chr  # Python 3: chr() already returns Unicode text

for i in range(33, 127):
    # A single pre-formatted string prints the same on Python 2 and 3.
    print(u"%d %s %d %s" % (i, chr(i), i + 65248, to_text(i + 65248)))
結果如下:
33 ! 65281 ! 34 " 65282 " 35 # 65283 # 36 $ 65284 $ 37 % 65285 % 38 & 65286 & 39 ' 65287 ' 40 ( 65288 ( 41 ) 65289 ) 42 * 65290 * 43 + 65291 + 44 , 65292 , 45 - 65293 - 46 . 65294 . 47 / 65295 / 48 0 65296 0 49 1 65297 1 50 2 65298 2 51 3 65299 3 52 4 65300 4 53 5 65301 5 54 6 65302 6 55 7 65303 7 56 8 65304 8 57 9 65305 9 58 : 65306 : 59 ; 65307 ; 60 < 65308 < 61 = 65309 = 62 > 65310 > 63 ? 65311 ? 64 @ 65312 @ 65 A 65313 A 66 B 65314 B 67 C 65315 C 68 D 65316 D 69 E 65317 E 70 F 65318 F 71 G 65319 G 72 H 65320 H 73 I 65321 I 74 J 65322 J 75 K 65323 K 76 L 65324 L 77 M 65325 M 78 N 65326 N 79 O 65327 O 80 P 65328 P 81 Q 65329 Q 82 R 65330 R 83 S 65331 S 84 T 65332 T 85 U 65333 U 86 V 65334 V 87 W 65335 W 88 X 65336 X 89 Y 65337 Y 90 Z 65338 Z 91 [ 65339 [ 92 \ 65340 \ 93 ] 65341 ] 94 ^ 65342 ^ 95 _ 65343 _ 96 ` 65344 ` 97 a 65345 a 98 b 65346 b 99 c 65347 c 100 d 65348 d 101 e 65349 e 102 f 65350 f 103 g 65351 g 104 h 65352 h 105 i 65353 i 106 j 65354 j 107 k 65355 k 108 l 65356 l 109 m 65357 m 110 n 65358 n 111 o 65359 o 112 p 65360 p 113 q 65361 q 114 r 65362 r 115 s 65363 s 116 t 65364 t 117 u 65365 u 118 v 65366 v 119 w 65367 w 120 x 65368 x 121 y 65369 y 122 z 65370 z 123 { 65371 { 124 | 65372 | 125 } 65373 } 126 ~ 65374 ~
2. 相關程式碼
2.1 全形轉半形
# Translation table: full-width code point -> half-width code point.
# U+FF01..U+FF5E sit exactly 0xFEE0 above ASCII 0x21..0x7E; the ideographic
# space U+3000 is the one exception and maps to the ASCII space 0x20.
_FULL_TO_HALF_TABLE = dict(
    (code, code - 0xFEE0) for code in range(0xFF01, 0xFF5F)
)
_FULL_TO_HALF_TABLE[0x3000] = 0x20


def full2half(s):
    """Convert every full-width character in *s* to its half-width form.

    Args:
        s: UTF-8 encoded bytes, or already-decoded text. Bytes are decoded
            as UTF-8 first; text is used as is (the original always called
            ``decode`` and therefore failed on decoded non-ASCII input).

    Returns:
        The converted text. Characters outside U+FF01..U+FF5E and U+3000
        (for example CJK ideographs) are left unchanged.
    """
    if isinstance(s, bytes):
        s = s.decode('utf-8')
    # Mapping-based translate works on Python 2 ``unicode`` and Python 3 ``str``.
    return s.translate(_FULL_TO_HALF_TABLE)
2.2 半形轉全形
# Translation table: half-width code point -> full-width code point.
# ASCII 0x21..0x7E map to U+FF01..U+FF5E (offset 0xFEE0); the ASCII space
# 0x20 is the one exception and maps to the ideographic space U+3000.
_HALF_TO_FULL_TABLE = dict(
    (code, code + 0xFEE0) for code in range(0x21, 0x7F)
)
_HALF_TO_FULL_TABLE[0x20] = 0x3000


def half2full(s):
    """Convert every half-width ASCII character in *s* to its full-width form.

    Fixes two defects of the original: it called ``char(char)`` instead of
    ``ord(char)`` and compared the space against ``320`` instead of ``32``.

    Args:
        s: UTF-8 encoded bytes, or already-decoded text. Bytes are decoded
            as UTF-8 first; text is used as is.

    Returns:
        The converted text. Characters outside 0x20..0x7E are left unchanged.
    """
    if isinstance(s, bytes):
        s = s.decode('utf-8')
    # Mapping-based translate works on Python 2 ``unicode`` and Python 3 ``str``.
    return s.translate(_HALF_TO_FULL_TABLE)
2.3 自定義轉換
上面的實現方式非常的簡單,但是現實情況下可能並不會把所有的字元統一進行轉換,比如中文文章中我們期望將所有出現的字母和數字全部轉化成半形,而常見標點符號統一使用全形,上面的轉化就不適合了。解決方案,是自定義詞典。
#!/usr/bin/env python
# -*- coding: utf-8 -*-

# Conversion tables. Each FH_* table is a tuple of (full-width, half-width)
# pairs; each HF_* entry yields the same pairs flipped to
# (half-width, full-width).
#
# The full-width side is written with \uXXXX escapes on purpose: when the
# literal characters are used, any tool that applies Unicode compatibility
# normalization to this file silently turns every pair into an identity
# mapping such as (u"0", u"0"), and the tables stop converting anything.

# Ideographic space U+3000 <-> ASCII space.
FH_SPACE = FHS = ((u"\u3000", u" "),)

# Digits: U+FF10..U+FF19 <-> 0..9.
FH_NUM = FHN = tuple(zip(
    u"\uff10\uff11\uff12\uff13\uff14\uff15\uff16\uff17\uff18\uff19",
    u"0123456789",
))

# Letters: U+FF41..U+FF5A <-> a..z, then U+FF21..U+FF3A <-> A..Z.
FH_ALPHA = FHA = tuple(zip(
    u"\uff41\uff42\uff43\uff44\uff45\uff46\uff47\uff48\uff49\uff4a\uff4b\uff4c\uff4d"
    u"\uff4e\uff4f\uff50\uff51\uff52\uff53\uff54\uff55\uff56\uff57\uff58\uff59\uff5a"
    u"\uff21\uff22\uff23\uff24\uff25\uff26\uff27\uff28\uff29\uff2a\uff2b\uff2c\uff2d"
    u"\uff2e\uff2f\uff30\uff31\uff32\uff33\uff34\uff35\uff36\uff37\uff38\uff39\uff3a",
    u"abcdefghijklmnopqrstuvwxyz"
    u"ABCDEFGHIJKLMNOPQRSTUVWXYZ",
))

# Punctuation. Most entries are the regular "ASCII + 0xFEE0" full-width
# forms; the curly quotes (U+201D, U+2019, U+2018) and the hyphen U+2010 are
# typographic characters that the table folds onto ASCII as well.
FH_PUNCTUATION = FHP = (
    (u"\uff0e", u"."), (u"\uff0c", u","), (u"\uff01", u"!"), (u"\uff1f", u"?"), (u"\u201d", u'"'),
    (u"\u2019", u"'"), (u"\u2018", u"`"), (u"\uff20", u"@"), (u"\uff3f", u"_"), (u"\uff1a", u":"),
    (u"\uff1b", u";"), (u"\uff03", u"#"), (u"\uff04", u"$"), (u"\uff05", u"%"), (u"\uff06", u"&"),
    (u"\uff08", u"("), (u"\uff09", u")"), (u"\u2010", u"-"), (u"\uff1d", u"="), (u"\uff0a", u"*"),
    (u"\uff0b", u"+"), (u"\uff0d", u"-"), (u"\uff0f", u"/"), (u"\uff1c", u"<"), (u"\uff1e", u">"),
    # NOTE(review): the yen sign is restored as FULLWIDTH YEN SIGN (U+FFE5) on
    # the assumption that it was normalized like the other entries - confirm.
    (u"\uff3b", u"["), (u"\uffe5", u"\\"), (u"\uff3d", u"]"), (u"\uff3e", u"^"), (u"\uff5b", u"{"),
    (u"\uff5c", u"|"), (u"\uff5d", u"}"), (u"\uff5e", u"~"),
)

# Callables (not tuples): each call returns a fresh generator over the pairs,
# so the combined/flipped tables can be iterated more than once.
FH_ASCII = HAC = lambda: ((fr, to) for m in (FH_ALPHA, FH_NUM, FH_PUNCTUATION) for fr, to in m)

HF_SPACE = HFS = ((u" ", u"\u3000"),)
HF_NUM = HFN = lambda: ((h, z) for z, h in FH_NUM)
HF_ALPHA = HFA = lambda: ((h, z) for z, h in FH_ALPHA)
HF_PUNCTUATION = HFP = lambda: ((h, z) for z, h in FH_PUNCTUATION)
HF_ASCII = ZAC = lambda: ((h, z) for z, h in FH_ASCII())
def convert(text, *maps, **ops):
    """Apply full-width/half-width replacement tables to *text*.

    Args:
        text: text (unicode) string to convert.
        *maps: conversion tables, applied in order. Each one may be an
            iterable of ``(from, to)`` pairs, a dict ``{from: to}``, or a
            zero-argument callable returning such an iterable.
        **ops: options. Only ``skip`` is recognised: a string or a
            collection of ``from`` values that must NOT be replaced. A string
            is treated as a collection of its individual characters. Any
            other keyword is ignored.

    Returns:
        The converted string.
    """
    try:
        string_types = basestring  # Python 2: covers both str and unicode
    except NameError:
        string_types = str  # Python 3

    # ``or ()`` also tolerates an explicit skip=None.
    skip = ops.get("skip") or ()
    if isinstance(skip, string_types):
        # Split into characters so membership is per character, not substring.
        skip = tuple(skip)

    for m in maps:
        if callable(m):
            m = m()
        elif isinstance(m, dict):
            m = m.items()
        for fr, to in m:
            if fr not in skip:
                text = text.replace(fr, to)
    return text
if __name__ == '__main__':
    text = u"成田空港—【JR特急成田エクスプレス號・橫浜行,2站】—東京—【JR新幹線はやぶさ號・新青森行,6站 】—新青森—【JR特急スーパー白鳥號・函館行,4站 】—函館"
    # Second pass: fold the lenticular brackets to ASCII and widen common
    # ASCII punctuation. Full-width values are written as escapes so they
    # cannot be normalized back into identity pairs.
    punctuation_map = {
        u"【": u"[",
        u"】": u"]",
        u",": u"\uff0c",  # fullwidth comma
        u".": u"。",
        u"?": u"\uff1f",  # fullwidth question mark
        u"!": u"\uff01",  # fullwidth exclamation mark
    }
    # Full-width punctuation that the FH_ASCII pass must leave untouched:
    # fullwidth comma, ideographic full stop, fullwidth ?/!, curly quotes.
    keep_full_width = u"\uff0c\u3002\uff1f\uff01\u201c\u201d"
    # The keyword must be ``skip``; the original passed ``spit``, which
    # convert() ignores, so nothing was ever skipped.
    print(convert(text, FH_ASCII, punctuation_map, skip=keep_full_width))
2.4 demo
# Read the whole input log up front; the handle is closed as soon as the
# lines are in memory (the original left it open for the rest of the run).
with open("foward_standard.log", 'r') as fin:
    lines = fin.readlines()
# The output stays open: it is written by the loop at the end of the script.
fout = open("foward_standard2.log", 'w')
# Half-width -> full-width table for the printable ASCII range 33..126
# (offset 65248 == 0xFEE0). The space (32) is intentionally absent.
_B2Q_TABLE = dict((code, code + 65248) for code in range(33, 127))


def strB2Q(ustring):
    """Convert half-width ASCII characters to their full-width forms.

    The ASCII space is deliberately left as a half-width space (it is not
    widened to U+3000), matching the original behaviour.

    Args:
        ustring: GB18030-encoded bytes, or already-decoded text. Bytes are
            decoded as GB18030 first; text is used as is.

    Returns:
        The converted text (not re-encoded).
    """
    if isinstance(ustring, bytes):
        ustring = ustring.decode('gb18030')
    # One translate pass instead of building the result with repeated ``+=``.
    return ustring.translate(_B2Q_TABLE)
def transferStr(ustring):
    """Normalise text to lower-case half-width form, encoded as GB18030.

    Per character:
      * ASCII space (U+0020) and ideographic space (U+3000) become ``?``;
      * full-width U+FF01..U+FF5E become the matching ASCII character
        (so the full-width question mark also becomes ``?``);
      * everything else is kept.
    The result is lower-cased and encoded with GB18030.

    NOTE(review): the original also had a later branch mapping U+3000 to a
    plain space, but it was unreachable because the first test already
    matches U+3000. The effective behaviour (space -> ``?``) is kept here;
    confirm that this is what is intended.

    Args:
        ustring: decoded text.

    Returns:
        GB18030-encoded bytes. If the per-character conversion fails, the
        input is only lower-cased and encoded (best-effort fallback).
    """
    try:
        to_text = unichr  # Python 2
    except NameError:
        to_text = chr  # Python 3

    try:
        pieces = []
        for uchar in ustring:
            inside_code = ord(uchar)
            if inside_code in (0x0020, 0x3000):
                inside_code = 0x3F  # both kinds of space are folded to '?'
            elif 0xFF01 <= inside_code <= 0xFF5E:
                inside_code -= 0xFEE0
            pieces.append(to_text(inside_code))
        return u"".join(pieces).lower().encode('gb18030')
    except Exception:
        # Best-effort fallback kept from the original, but no longer a bare
        # ``except:`` that would also swallow KeyboardInterrupt/SystemExit.
        return ustring.lower().encode('gb18030')
import sys
# Python 2 only: switch the implicit str<->unicode codec to UTF-8 so that the
# unicode result of strB2Q() can be concatenated with the raw second column.
reload(sys)
sys.setdefaultencoding('utf-8')
try:
    for line in lines:
        # Each line is expected to look like "<key>\t<rest>"; only the key
        # is widened. A line without a tab raises IndexError, as before.
        fields = line.split("\t")
        fout.write((strB2Q(fields[0]) + '\t' + fields[1]).encode("gb18030"))
finally:
    # The original never closed its handles; closing flushes the output.
    fout.close()
    fin.close()