1. 程式人生 > 其它 >配合python學習正則




Learning regular expressions in Python.



正則表示式是從左到右來匹配一個字串的。“Regular Expression”這個詞太長了,我們通常使用它的縮寫“regex”或者“regexp”。






import re

def main() -> None:
    """Run a series of regular-expression demos and print each result.

    Each demo sits inside an ``<editor-fold>`` comment pair, compiles a
    pattern, applies it to a small sample text and prints what was found.
    The expected output is recorded in a comment next to every ``print``.

    Conventions used throughout:

    * When several ``pattern = re.compile(...)`` lines follow one another,
      only the LAST one is actually used; the earlier ones are alternatives
      kept for comparison, each preceded by the result it would give.
    * Adjacent string literals are concatenated by Python with no separator,
      so sample "lines" are only separated where a newline escape is written
      explicitly.
    * The sample e-mail addresses and the sample IP address are placeholders
      reconstructed for this demo (the published copy of this script had
      them scrubbed); any addresses of the same shape work equally well.
    """
    # <editor-fold desc='Match any single character: .'>
    pattern = re.compile(r'.a.\.xls')
    result1 = pattern.findall(
        'sales.xls'
        'sales2.xls'
        'sales3.xls'
        'apac1.xls'
        'europe2.xls'
        'na1.xls'
        'na2.xls'
        'sa2.xls'
        'ca1.xls'
    )
    print(result1)
    # ['na1.xls', 'na2.xls', 'sa2.xls', 'ca1.xls']
    # </editor-fold>

    # <editor-fold desc='Match one character out of a set: []'>
    pattern = re.compile(r'[ns]a.\.xls')
    result1 = pattern.findall(
        'sales.xls'
        'sales1.xls'
        'orders3.xls'
        'sales2.xls'
        'sales3.xls'
        'apac1.xls'
        'europe2.xls'
        'na1.xls'
        'na2.xls'
        'sa2.xls'
        'ca1.xls'
    )
    print(result1)
    # ['na1.xls', 'na2.xls', 'sa2.xls']
    # </editor-fold>

    # <editor-fold desc='Upper/lower case via sets'>
    pattern = re.compile(r'[Rr]eg[Ee]x')
    result1 = pattern.findall('RegEx or regex or REGEX.')
    print(result1)
    # ['RegEx', 'regex']
    # </editor-fold>

    # <editor-fold desc='Character ranges inside a set'>
    # The explicit digit list and the 0-9 range are equivalent.
    pattern = re.compile(r'[ns]a[0123456789]\.xls')
    pattern = re.compile(r'[ns]a[0-9]\.xls')
    result1 = pattern.findall(
        'sales.xls'
        'sales1.xls'
        'orders3.xls'
        'sales2.xls'
        'sales3.xls'
        'apac1.xls'
        'europe2.xls'
        'sam.xls'
        'na1.xls'
        'na2.xls'
        'sa1.xls'
        'ca1.xls'
    )
    print(result1)
    # ['na1.xls', 'na2.xls', 'sa1.xls']
    # </editor-fold>

    # <editor-fold desc='Read RGB colour values'>
    # [A-Z]        every upper-case letter A..Z
    # [a-z]        every lower-case letter a..z
    # [A-F]        the upper-case letters A..F
    # [A-z]        every ASCII character from 'A' to 'z' (this also includes
    #              the punctuation that sits between 'Z' and 'a')
    # [A-Za-z0-9]  any letter of either case, or any digit
    pattern = re.compile(
        r'#[0-9A-Fa-f][0-9A-Fa-f][0-9A-Fa-f][0-9A-Fa-f][0-9A-Fa-f][0-9A-Fa-f]'
    )
    result1 = pattern.findall(
        'body { background-color: #fefdb8; }'
        'h1 { background-color: #0000ff; }'
        'div { background-color: #d0f4e6; }'
        'span { background-color: #f08970; }'
    )
    print(result1)
    # ['#fefdb8', '#0000ff', '#d0f4e6', '#f08970']
    # </editor-fold>

    # <editor-fold desc='Negated set: ^ inside []'>
    pattern = re.compile(r'[ns]a[^0-9]\.xls')
    result1 = pattern.findall(
        'sales.xls'
        'sales1.xls'
        'orders3.xls'
        'sales2.xls'
        'sales3.xls'
        'apac1.xls'
        'europe2.xls'
        'sam.xls'
        'na1.xls'
        'na2.xls'
        'sa1.xls'
        'ca1.xls'
    )
    print(result1)
    # ['sam.xls']
    # </editor-fold>

    # <editor-fold desc='Match whitespace characters'>
    # Escapes:
    # [\b]  backspace
    # \f    form feed
    # \n    line feed
    # \r    carriage return
    # \t    tab
    # \v    vertical tab
    pattern = re.compile(r'\r\n\r\n')
    # The sample needs real CRLF line breaks, otherwise there is no blank
    # line for the pattern to find.
    result1 = pattern.findall(
        '101\r\n'
        '202\r\n'
        '\r\n'
        '303'
    )
    print(result1)
    # Matches the blank line: ['\r\n\r\n']
    # </editor-fold>

    # <editor-fold desc='Match character classes'>
    # Digits / non-digits
    # \d  any digit, same as [0-9]
    # \D  any non-digit, same as [^0-9]
    # Word / non-word characters
    # \w  any letter, digit or underscore, same as [a-zA-Z0-9_]
    # \W  anything that is not a letter, digit or underscore,
    #     same as [^a-zA-Z0-9_]
    pattern = re.compile(r'\w\d\w\d\w\d')
    result1 = pattern.findall(
        '11213\r\n'
        'A1C2E3\r\n'
        '48075\r\n'
        '48237\r\n'
        'M1B4F2\r\n'
        '90046\r\n'
        'H1H2H2\r\n'
    )
    print(result1)
    # ['A1C2E3', 'M1B4F2', 'H1H2H2']
    # </editor-fold>

    # <editor-fold desc='Whitespace / non-whitespace classes'>
    # \s  any whitespace character, same as [ \f\n\r\t\v]
    # \S  any non-whitespace character, same as [^ \f\n\r\t\v]
    # </editor-fold>

    # <editor-fold desc='Hexadecimal and octal values'>
    # \x  hexadecimal, e.g. \x0A is ASCII 10, the same as \n
    # \0  octal,       e.g. \011 is ASCII 9, the same as \t
    # </editor-fold>

    # <editor-fold desc='POSIX classes (not supported by Python re)'>
    # [:alnum:]   any letter or digit, same as [a-zA-Z0-9]
    # [:alpha:]   any letter, same as [a-zA-Z]
    # [:blank:]   space or tab, same as [\t ]
    # [:cntrl:]   ASCII control characters (0-31 plus 127)
    # [:digit:]   any digit, same as [0-9]
    # [:graph:]   same as [:print:] but without the space
    # [:lower:]   any lower-case letter, same as [a-z]
    # [:print:]   any printable character
    # [:punct:]   any character in neither [:alnum:] nor [:cntrl:]
    # [:space:]   any whitespace including space, same as [\f\n\r\t\v ]
    # [:upper:]   any upper-case letter, same as [A-Z]
    # [:xdigit:]  any hexadecimal digit, same as [a-fA-F0-9]
    # </editor-fold>

    # <editor-fold desc='One or more: +'>
    # + matches 1..N repetitions.
    # Inside [] an escaped \. and a bare . mean the same thing.
    pattern = re.compile(r'[\w\.]+@[\w.]+\.\w+')
    result1 = pattern.findall(
        'ben@example.com\n'
        'ben.forta@example.com\n'
        'support@example.com\n'
        'spam@example.com\n'
        'ben@urgent.example.com\n'
    )
    # ['ben@example.com', 'ben.forta@example.com', 'support@example.com',
    #  'spam@example.com', 'ben@urgent.example.com']
    print(result1)
    # </editor-fold>

    # <editor-fold desc='Zero or more: *'>
    # * matches 0..N repetitions.
    # Requiring \w first means an address cannot start with a dot.
    pattern = re.compile(r'\w+[\w.]*@[\w.]+\.\w+')
    result1 = pattern.findall(
        'ben@example.com\n'
        '.ben.forta@example.com\n'
    )
    # ['ben@example.com', 'ben.forta@example.com']
    print(result1)
    # </editor-fold>

    # <editor-fold desc='Zero or one: ?'>
    # ? matches 0..1 repetitions (at most once).
    pattern = re.compile(r'https?:\/\/[\w.\/]+')
    result1 = pattern.findall(
        'http://www.forta.com/ test https://www.forta.com/'
    )
    # ['http://www.forta.com/', 'https://www.forta.com/']
    print(result1)
    # </editor-fold>

    # <editor-fold desc='Exact repetition count'>
    # {n}
    pattern = re.compile(r'#[0-9A-Fa-f]{6}')
    result1 = pattern.findall(
        'body { background-color: #fefdb8; }'
        'h1 { background-color: #0000ff; }'
        'div { background-color: #d0f4e6; }'
        'span { background-color: #f08970; }'
    )
    print(result1)
    # ['#fefdb8', '#0000ff', '#d0f4e6', '#f08970']
    # </editor-fold>

    # <editor-fold desc='Repetition range'>
    # {min,max}, e.g. {2,4} means at least 2 and at most 4 repetitions.
    # {0,1} is the same as ?
    pattern = re.compile(r'\d{1,2}[-\/]\d{1,2}[-\/]\d{2,4}')
    result1 = pattern.findall(
        '4/8/17\n'
        '10-6-2018\n'
        '2/2/2\n'
        '01-01-01\n'
    )
    # ['4/8/17', '10-6-2018', '01-01-01']
    print(result1)
    # </editor-fold>

    # <editor-fold desc='At least n repetitions'>
    # {n,} means n or more repetitions.
    # {1,} is the same as +
    # Find the amounts of $100 or more (three or more integer digits).
    pattern = re.compile(r'\d+: \$\d{3,}\.\d{2}')
    result1 = pattern.findall(
        '1001: $496.80\n'
        '1002: $1290.69\n'
        '1003: $26.43\n'
        '1004: $613.42\n'
        '1005: $7.61\n'
        '1006: $414.90\n'
        '1007: $25.00\n'
    )
    # ['1001: $496.80', '1002: $1290.69', '1004: $613.42', '1006: $414.90']
    print(result1)
    # </editor-fold>

    # <editor-fold desc='Preventing over-matching'>
    # * + {} are all greedy; appending ? makes them lazy.
    # Greedy version gives: ['<b>AK</b> and <b>HI</b>']
    pattern = re.compile(r'<[Bb]>.*<\/[Bb]>')
    # Lazy version gives: ['<b>AK</b>', '<b>HI</b>']
    pattern = re.compile(r'<[Bb]>.*?<\/[Bb]>')
    result1 = pattern.findall(
        'This offer is not available to customer living in '
        '<b>AK</b> and <b>HI</b>'
    )
    print(result1)
    # </editor-fold>

    # <editor-fold desc='Word boundary'>
    # \b matches at a word boundary.
    pattern = re.compile(r'\bcat\b')
    result1 = pattern.findall('The cat scattered his food all over the room.')
    # ['cat']
    print(result1)

    pattern = re.compile(r'\bcap')
    result1 = pattern.findall('captain cap cape recap')
    # ['cap', 'cap', 'cap']
    print(result1)

    pattern = re.compile(r'cap\b')
    result1 = pattern.findall('captain cap cape recap')
    # ['cap', 'cap']
    print(result1)
    # </editor-fold>

    # <editor-fold desc='Non-word boundary'>
    # \B matches anywhere that is NOT a word boundary.
    pattern = re.compile(r'\B-\B')
    result1 = pattern.findall('color - coded nine-digit')
    # ['-']
    print(result1)
    # </editor-fold>

    # <editor-fold desc='String boundaries'>
    # ^ start of string (it only means negation inside [])
    # $ end of string
    # If anything other than whitespace precedes the XML declaration, the
    # first pattern fails to match.
    # ["<?xml version='1.0' encoding='utf-8'?>"]
    pattern = re.compile(r'^\s*<\?xml.*\?>')
    # Anchored at the end instead:
    # ['<manifest>']
    pattern = re.compile(r'<manifest>$')
    result1 = pattern.findall(
        '<?xml version=\'1.0\' encoding=\'utf-8\'?><manifest><manifest>'
    )
    print(result1)
    # </editor-fold>

    # <editor-fold desc='Multi-line mode'>
    # (?m) turns on multi-line mode, in which line breaks delimit "strings",
    # so ^ and $ also match at the start and end of every line.
    pattern = re.compile(r'(?m)^\s*\/\/.*$')
    result1 = pattern.findall(
        '// 註釋1 \n'
        'code123\n'
        '// 註釋2\n'
        'code123 \n'
        '//註釋123 註釋456'
    )
    # ['// 註釋1 ', '// 註釋2', '//註釋123 註釋456']
    print(result1)
    # </editor-fold>

    # <editor-fold desc='Sub-expressions'>
    # () groups a sub-expression.
    # Without a group, {2,} applies only to the ';', so the first pattern
    # can only match text such as '&nbsp;;;'.
    pattern = re.compile(r'&nbsp;{2,}')
    pattern = re.compile(r'(&nbsp;){2,}')
    result1 = pattern.findall('Test&nbsp;&nbsp;&nbsp;&nbsp;Test')
    # ['&nbsp;']  (with one group, findall reports that group's last capture)
    print(result1)

    # ['12.159.46.200']
    pattern = re.compile(r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}')
    # [('12.159.46.200', '46.')]
    pattern = re.compile(r'((\d{1,3}\.){3}\d{1,3})')
    # ['46.']
    pattern = re.compile(r'(\d{1,3}\.){3}\d{1,3}')
    # ['12.159.46.200']  ((?:...) groups without capturing)
    pattern = re.compile(r'(?:\d{1,3}\.){3}\d{1,3}')
    result1 = pattern.findall('[12.159.46.200]')
    print(result1)
    # <re.Match object; span=(1, 14), match='12.159.46.200'>
    print(re.search(pattern, '[12.159.46.200]'))

    # [('1967', '19')]
    pattern = re.compile(r'((19|20)\d{2})')
    result1 = pattern.findall('1967-08-17')
    print(result1)
    # </editor-fold>

    # <editor-fold desc='How () affects findall in Python'>
    s = 'adfad asdfasdf asdfas asdfawef asd adsfas '

    re_obj1 = re.compile(r'((\w+)\s+\w+)')
    # [('adfad asdfasdf', 'adfad'), ('asdfas asdfawef', 'asdfas'),
    #  ('asd adsfas', 'asd')]
    print(re_obj1.findall(s))

    re_obj2 = re.compile(r'(\w+)\s+\w+')
    # ['adfad', 'asdfas', 'asd']
    print(re_obj2.findall(s))

    re_obj3 = re.compile(r'\w+\s+\w+')
    # ['adfad asdfasdf', 'asdfas asdfawef', 'asd adsfas']
    print(re_obj3.findall(s))

    # Explanation of the three examples above:
    #
    # findall always returns a list of every match of the pattern in the
    # string; what differs is the shape of each list element.
    #
    # 1. Pattern with several groups:
    #    each element is a tuple of strings, one per group, ordered by the
    #    position of each group's opening parenthesis.
    #
    # 2. Pattern with exactly one group:
    #    each element is a string holding that group's text (NOT the text
    #    matched by the whole pattern).
    #
    # 3. Pattern with no group:
    #    each element is a string holding the text matched by the whole
    #    pattern.
    # </editor-fold>

    # <editor-fold desc='Match a valid IP address'>
    # The order of the alternatives matters.
    pattern = re.compile(
        r'((((25[0-5])|(2[0-4]\d)|(1[0-9]\d)|(\d{1,2}))\.){3}'
        r'((25[0-5])|(2[0-4]\d)|(1[0-9]\d)|(\d{1,2})))'
    )
    result1 = pattern.findall('[12.159.46.200]')
    # [('12.159.46.200', '46.', '46', '', '', '159', '46',
    #   '200', '', '200', '', '')]
    print(result1)

    # Ordering the alternatives "more logically" (short ones first) breaks
    # the match: alternatives are tried left to right and the first one that
    # succeeds wins, so the last octet stops at '20'.
    pattern = re.compile(
        r'((((\d{1,2})|(1[0-9]\d)|(2[0-4]\d)|(25[0-5])|)\.){3}'
        r'((\d{1,2})|(1[0-9]\d)|(2[0-4]\d)|(25[0-5])|))'
    )
    result1 = pattern.findall('[12.159.46.200]')
    # [('12.159.46.20', '46.', '46', '46', '159', '', '',
    #   '20', '20', '', '', '')]
    print(result1)
    # </editor-fold>

    # <editor-fold desc='Backreferences'>
    # () defines a sub-expression; afterwards \n (counting from 1) refers
    # to the text captured by the n-th sub-expression.
    # Some regex implementations let \0 stand for the whole match.
    # Mind the numbering when several () are nested.
    # ['of', 'and', 'are']
    pattern = re.compile(r'[ ](\w+)[ ]\1')
    # [(' of ', 'of'), (' and ', 'and'), (' are ', 'are')]
    pattern = re.compile(r'([ ](\w+)[ ])\2')
    # [(' of of', 'of'), (' and and', 'and'), (' are are', 'are')]
    pattern = re.compile(r'([ ](\w+)[ ]\2)')
    result1 = pattern.findall('xx of of xxx and and xxx are are.')
    print(result1)

    # The closing tag must carry the same heading level as the opening one.
    pattern = re.compile(r'(<[hH]([1-6])>.*?<\/[hH]\2>)')
    result1 = pattern.findall(
        '<body>\n'
        '<h1>Test1</h1>\n'
        'Test2\n'
        '<h2>Test3</h2>\n'
        'Test4\n'
        '<h3>Test5</h3>\n'
        'Test6<br/>\n'
        '</body>\n'
    )
    # [('<h1>Test1</h1>', '1'), ('<h2>Test3</h2>', '2'),
    #  ('<h3>Test5</h3>', '3')]
    print(result1)
    # </editor-fold>

    # <editor-fold desc='Replacement'>
    # In the replacement string, \n refers to a sub-expression just like a
    # backreference does inside the pattern.
    pattern = re.compile(r'([\w\.]+@[\w.]+\.\w+)')
    s = (
        'ben@example.com\n'
        'abcdefg\n'
        'support@example.com\n'
        'abcdefg'
    )
    result1 = pattern.findall(s)
    replstr = r'<a href="mailto:\1">\1</a>'
    # ['ben@example.com', 'support@example.com']
    print(result1)
    print('---------------')
    print(s)
    print('----↓↓↓↓↓↓↓----')
    print(pattern.sub(replstr, s))
    print('---------------')

    pattern = re.compile(r'(\d{3})(-)(\d{3})(-)(\d{4})')
    s = (
        '333-157-1507\n'
        '123-403-1570\n'
        '111-578-8456\n'
        '234-237-4856'
    )
    result1 = pattern.findall(s)
    replstr = r'(\1) \3-\5'
    # [('333', '-', '157', '-', '1507'),
    #  ('123', '-', '403', '-', '1570'),
    #  ('111', '-', '578', '-', '8456'),
    #  ('234', '-', '237', '-', '4856')]
    print(result1)
    print('---------------')
    print(s)
    print('----↓↓↓↓↓↓↓----')
    print(pattern.sub(replstr, s))
    print('---------------')
    # </editor-fold>

    # <editor-fold desc='Case conversion'>
    # Case conversion combined with backreferences (other regex engines):
    # \E  terminates \L or \U
    # \l  convert the next character to lower case
    # \L  convert everything up to \E to lower case
    # \u  convert the next character to upper case
    # \U  convert everything up to \E to upper case
    # Python rejects these escapes in a replacement string, so a callback
    # function is used instead.
    pattern = re.compile(r'([\w\.]+@[\w.]+\.\w+)')
    s = (
        'ben@example.com\n'
        'abcdefg\n'
        'support@example.com\n'
        'abcdefg'
    )
    result1 = pattern.findall(s)

    # Equivalent of the replacement string 'Test[{\U\1\E}]'.
    def callback(word):
        return 'Test[{}]'.format(word.group(1).upper())

    # ['ben@example.com', 'support@example.com']
    print(result1)
    print('---------------')
    print(s)
    print('----↓↓↓↓↓↓↓----')
    print(pattern.sub(callback, s))
    print('---------------')
    # </editor-fold>

    # <editor-fold desc='Lookahead'>
    # (?=...) inspects the text that follows without consuming it.
    # ['https:', 'https:']
    pattern = re.compile(r'.+:')
    # ['https', 'https']
    pattern = re.compile(r'.+(?=:)')
    result1 = pattern.findall(
        'https://www.bilibili.com/\n'
        'https://www.baidu.com/\n'
    )
    print(result1)
    # </editor-fold>

    # <editor-fold desc='Lookbehind'>
    # (?<=...) inspects the text that precedes without consuming it.
    pattern = re.compile(r'(?<=\$)[\d.]+')
    result1 = pattern.findall(
        '1.24\n'
        '4685.6845\n'
        '$ 4685.6845\n'
        '$15978685.45\n'
        '$12346785852.54$\n'
        '4568.96 $64987.69\n'
    )
    # ['15978685.45', '12346785852.54', '64987.69']
    print(result1)
    # </editor-fold>

    # <editor-fold desc='Lookbehind and lookahead combined'>
    pattern = re.compile(
        r'(?<=\<[tT][iI][tT][lL][eE]\>).*(?=\<\/[tT][iI][tT][lL][eE]\>)'
    )
    result1 = pattern.findall(
        '<head>\n'
        '<title>Test 123 learning regex.</title>\n'
        '</head>'
    )
    # ['Test 123 learning regex.']
    print(result1)
    # </editor-fold>

    # <editor-fold desc='Negative lookaround'>
    # Replace = with !
    # ?=  --> ?!
    # ?<= --> ?<!
    pattern = re.compile(r'\b(?<!\$)\d+\b')
    result1 = pattern.findall(
        '1 24\n'
        '$30\n'
        '200\n'
        '$ 300123\n'
        '$15945\n'
        '$123454$\n'
        '456896 $6498769\n'
    )
    # ['1', '24', '200', '300123', '456896']
    print(result1)
    # </editor-fold>

    # <editor-fold desc='Conditional on a sub-expression'>
    # (?(n)yes|no) -- n is the number of the sub-expression to test.
    # Reading the conditional pattern below step by step:
    #   first an optional opening parenthesis is captured as group 2;
    #   only if group 2 matched must a closing parenthesis follow,
    #   otherwise a '-' is required.
    #
    # Naive attempt WITHOUT a conditional; '(2)?' is just an optional
    # literal '2', so the unbalanced '(123-456-7890' is wrongly accepted:
    # [('123-456-7890', '', '-', ''), ('(123)456-7890', '(', ')', ''),
    #  ('(123-456-7890', '(', '-', '')]
    pattern = re.compile(r'((\()?\d{3}((2)?\)|-)\d{3}-\d{4})')
    # Real conditional (this is the pattern that is run):
    # [('123-456-7890', ''), ('(123)456-7890', '('), ('123-456-7890', '')]
    pattern = re.compile(r'((\()?\d{3}(?(2)\)|-)\d{3}-\d{4})')
    result1 = pattern.findall(
        '123-456-7890\n'
        '(123)456-7890\n'
        '(123)-456-7890\n'
        '(123-456-7890\n'
        '1234567890\n'
        '123 456 7890\n'
    )
    # [('123-456-7890', ''), ('(123)456-7890', '('), ('123-456-7890', '')]
    print(result1)

    # Combined with ?=
    pattern = re.compile(r'(\d{5}((?=-)?-\d{4}))')
    result1 = pattern.findall(
        '11111\n'
        '22222\n'
        '33333-\n'
        '44444-5555\n'
    )
    # [('44444-5555', '-5555')]
    print(result1)
    # </editor-fold>


if __name__ == '__main__':
    main()