配合python學習正則

阿新 • • 發佈：2021-01-07

技術標籤：Python

最近在git開了個小庫…

Learning regular expressions in Python.

什麼是正則表示式?

正則表示式是一種用來描述文字模式的特殊字串，可用於從文字中檢索符合特定模式的內容。

正則表示式是從左到右來匹配一個字串的。“Regular Expression”這個詞太長了，我們通常使用它的縮寫“regex”或者“regexp”。
正則表示式可以被用來替換字串中的文字、驗證表單、基於模式匹配從一個字串中提取字串等等。

這個庫是幹嘛的?

現有的正則教程大致分為兩類：一類完全脫離具體的程式語言，另一類則結合某種語言進行教學。
但是多多少少都有一些問題，例如完全脫離語言的正則教程中的部分語法在某些語言中使用起來會和期望結果不一致。

於是就有了這個python實現的一些常用正則的用法展示，方便學習。

庫中的內容

轉載請註明出處啊~~

#!/usr/bin/python
import re


def _print_substitution(before, after):
    """Print a text and its substituted form between separator lines."""
    print('---------------')
    print(before)
    print('----↓↓↓↓↓↓↓----')
    print(after)
    print('---------------')


def main():
    """Run every regex demo in order and print each result to stdout.

    This is a linear tutorial script: each ``<editor-fold>`` section compiles
    a pattern, applies it to a small sample text and prints the outcome.  The
    comment next to each ``print`` shows the expected output.  Where several
    patterns are assigned in a row, only the last one is actually used; the
    earlier ones are kept for comparison, each annotated with the result it
    would produce on the same sample.
    """
    # Sample texts shared by several demos.  Adjacent string literals are
    # joined at compile time, so each sample is ONE string with no separators
    # between the entries.
    file_names = ('sales.xls'
                  'sales1.xls'
                  'orders3.xls'
                  'sales2.xls'
                  'sales3.xls'
                  'apac1.xls'
                  'europe2.xls'
                  'na1.xls'
                  'na2.xls'
                  'sa2.xls'
                  'ca1.xls')
    file_names_with_sam = ('sales.xls'
                           'sales1.xls'
                           'orders3.xls'
                           'sales2.xls'
                           'sales3.xls'
                           'apac1.xls'
                           'europe2.xls'
                           'sam.xls'
                           'na1.xls'
                           'na2.xls'
                           'sa1.xls'
                           'ca1.xls')
    css_rules = ('body { background-color: #fefdb8; }'
                 'h1   { background-color: #0000ff; }'
                 'div  { background-color: #d0f4e6; }'
                 'span { background-color: #f08970; }')

    # <editor-fold desc='Match any single character: .'>
    pattern = re.compile(r'.a.\.xls')
    matches = pattern.findall(file_names)

    print(matches)  # ['na1.xls', 'na2.xls', 'sa2.xls', 'ca1.xls']
    # </editor-fold>

    # <editor-fold desc='Match one character out of a set: []'>
    pattern = re.compile(r'[ns]a.\.xls')
    matches = pattern.findall(file_names)

    print(matches)  # ['na1.xls', 'na2.xls', 'sa2.xls']
    # </editor-fold>

    # <editor-fold desc='Upper / lower case'>
    pattern = re.compile(r'[Rr]eg[Ee]x')

    matches = pattern.findall('RegEx or regex or REGEX.')

    print(matches)  # ['RegEx', 'regex']
    # </editor-fold>

    # <editor-fold desc='Character ranges inside a set'>
    # Spelled-out set, shown for comparison; the range below is equivalent.
    pattern = re.compile(r'[ns]a[0123456789]\.xls')
    pattern = re.compile(r'[ns]a[0-9]\.xls')

    matches = pattern.findall(file_names_with_sam)

    print(matches)  # ['na1.xls', 'na2.xls', 'sa1.xls']
    # </editor-fold>

    # <editor-fold desc='Reading RGB values'>
    # [A-Z] matches every upper-case letter A to Z
    # [a-z] matches every lower-case letter a to z
    # [A-F] matches the upper-case letters A to F
    # [A-z] matches every ASCII character from A to z; besides the letters
    #       this also covers [ \ ] ^ _ ` which sit between Z and a
    # [A-Za-z0-9] matches any single letter (either case) or digit

    pattern = re.compile(r'#[0-9A-Fa-f][0-9A-Fa-f][0-9A-Fa-f][0-9A-Fa-f][0-9A-Fa-f][0-9A-Fa-f]')

    matches = pattern.findall(css_rules)

    print(matches)  # ['#fefdb8', '#0000ff', '#d0f4e6', '#f08970']
    # </editor-fold>

    # <editor-fold desc='Negated set: [^...]'>
    pattern = re.compile(r'[ns]a[^0-9]\.xls')

    matches = pattern.findall(file_names_with_sam)

    print(matches)  # ['sam.xls']
    # </editor-fold>

    # <editor-fold desc='Matching whitespace characters'>
    # Metacharacters:
    # [\b] backspace
    # \f form feed
    # \n line feed
    # \r carriage return
    # \t tab
    # \v vertical tab
    pattern = re.compile(r'\r\n\r\n')

    # The sample needs real CRLF line breaks, otherwise there is no blank
    # line for the pattern to find.
    matches = pattern.findall('101\r\n'
                              '202\r\n'
                              '\r\n'
                              '303')

    print(matches)  # ['\r\n\r\n'] -- the blank line between 202 and 303
    # </editor-fold>

    # <editor-fold desc='Character class shorthands'>
    # Digits / non-digits
    # \d any digit, equivalent to [0-9] for ASCII text
    # \D any non-digit, equivalent to [^0-9] for ASCII text

    # Word / non-word characters
    # \w any letter, digit or underscore ([a-zA-Z0-9_] for ASCII text)
    # \W any character that is not a letter, digit or underscore
    #    ([^a-zA-Z0-9_] for ASCII text)
    pattern = re.compile(r'\w\d\w\d\w\d')

    matches = pattern.findall('11213\r\n'
                              'A1C2E3\r\n'
                              '48075\r\n'
                              '48237\r\n'
                              'M1B4F2\r\n'
                              '90046\r\n'
                              'H1H2H2\r\n')

    print(matches)  # ['A1C2E3', 'M1B4F2', 'H1H2H2']
    # </editor-fold>

    # <editor-fold desc='Whitespace / non-whitespace shorthands'>
    # \s any whitespace character; in Python this is [ \t\n\r\f\v]
    #    (the space included) for ASCII text
    # \S any non-whitespace character, i.e. [^ \t\n\r\f\v] for ASCII text

    # </editor-fold>

    # <editor-fold desc='Hexadecimal and octal escapes'>
    # \x hexadecimal
    # e.g. \x0A is ASCII 10, the same as \n

    # \0 octal
    # e.g. \011 is ASCII 9, the same as \t

    # </editor-fold>

    # <editor-fold desc='POSIX classes (not supported by Python re)'>

    # [:alnum:] any letter or digit (same as [a-zA-Z0-9])
    # [:alpha:] any letter (same as [a-zA-Z])
    # [:blank:] space or tab (same as [\t ])
    # [:cntrl:] ASCII control characters (ASCII 0 to 31, plus ASCII 127)
    # [:digit:] any digit (same as [0-9])
    # [:graph:] same as [:print:] but without the space
    # [:lower:] any lower-case letter (same as [a-z])
    # [:print:] any printable character
    # [:punct:] any character that is in neither [:alnum:] nor [:cntrl:]
    # [:space:] any whitespace character, space included
    #           (same as [\f\n\r\t\v\ ])
    # [:upper:] any upper-case letter (same as [A-Z])
    # [:xdigit:] any hexadecimal digit (same as [a-fA-F0-9])

    # </editor-fold>

    # <editor-fold desc='One or more: +'>
    # + repeats the preceding item 1..N times
    # Inside [] the forms \. and . are equivalent
    pattern = re.compile(r'[\w\.]+@[\w.]+\.\w+')

    matches = pattern.findall('ben@example.com\n'
                              'ben.forta@example.com\n'
                              'support@example.com\n'
                              'spam@example.com\n'
                              'ben@urgent.example.com\n')

    # ['ben@example.com', 'ben.forta@example.com', 'support@example.com',
    #  'spam@example.com', 'ben@urgent.example.com']
    print(matches)

    # </editor-fold>

    # <editor-fold desc='Zero or more: *'>
    # * repeats the preceding item 0..N times
    # Inside [] the forms \. and . are equivalent
    # The leading \w+ forces the address to start with a word character, so
    # the stray leading dot in the second sample is left out of the match.
    pattern = re.compile(r'\w+[\w.]*@[\w.]+\.\w+')

    matches = pattern.findall('ben.forta@example.com\n'
                              '.ben@example.com\n')

    # ['ben.forta@example.com', 'ben@example.com']
    print(matches)

    # </editor-fold>

    # <editor-fold desc='Zero or one: ?'>
    # ? repeats the preceding item 0..1 times (at most once)
    pattern = re.compile(r'https?:\/\/[\w.\/]+')

    matches = pattern.findall('http://www.forta.com/ test https://www.forta.com/')

    # ['http://www.forta.com/', 'https://www.forta.com/']
    print(matches)

    # </editor-fold>

    # <editor-fold desc='Exact repetition count: {n}'>
    # {count}
    pattern = re.compile(r'#[0-9A-Fa-f]{6}')

    matches = pattern.findall(css_rules)

    print(matches)  # ['#fefdb8', '#0000ff', '#d0f4e6', '#f08970']

    # </editor-fold>

    # <editor-fold desc='Repetition range: {min,max}'>
    # {min,max}, e.g. {2,4}: at least 2 and at most 4 repetitions
    # {0,1} is equivalent to ?
    pattern = re.compile(r'\d{1,2}[-\/]\d{1,2}[-\/]\d{2,4}')

    matches = pattern.findall('4/8/17\n'
                              '10-6-2018\n'
                              '2/2/2\n'
                              '01-01-01\n')

    # ['4/8/17', '10-6-2018', '01-01-01']
    print(matches)

    # </editor-fold>

    # <editor-fold desc='At least n repetitions: {n,}'>
    # {min,} repeats min times or more
    # {1,} is equivalent to +
    # Find the amounts of at least $100 (three or more integer digits)
    pattern = re.compile(r'\d+: \$\d{3,}\.\d{2}')

    matches = pattern.findall('1001: $496.80\n'
                              '1002: $1290.69\n'
                              '1003: $26.43\n'
                              '1004: $613.42\n'
                              '1005: $7.61\n'
                              '1006: $414.90\n'
                              '1007: $25.00\n')

    # ['1001: $496.80', '1002: $1290.69', '1004: $613.42', '1006: $414.90']
    print(matches)

    # </editor-fold>

    # <editor-fold desc='Preventing over-matching (greedy vs lazy)'>
    # * + {} are all greedy
    # Appending ? gives the lazy version

    # Greedy, shown for comparison: ['<b>AK</b> and <b>HI</b>']
    pattern = re.compile(r'<[Bb]>.*<\/[Bb]>')
    # Lazy: ['<b>AK</b>', '<b>HI</b>']
    pattern = re.compile(r'<[Bb]>.*?<\/[Bb]>')

    matches = pattern.findall('This offer is not available to customer living in <b>AK</b> and <b>HI</b>')

    print(matches)

    # </editor-fold>

    # <editor-fold desc='Word boundaries'>
    # \b matches at a word boundary

    pattern = re.compile(r'\bcat\b')

    matches = pattern.findall('The cat scattered his food all over the room.')

    # ['cat']
    print(matches)

    pattern = re.compile(r'\bcap')

    matches = pattern.findall('captain cap cape recap')

    # ['cap', 'cap', 'cap']
    print(matches)

    pattern = re.compile(r'cap\b')

    matches = pattern.findall('captain cap cape recap')

    # ['cap', 'cap']
    print(matches)

    # </editor-fold>

    # <editor-fold desc='Non-word boundaries'>
    # \B matches anywhere that is NOT a word boundary

    pattern = re.compile(r'\B-\B')

    matches = pattern.findall('color - coded nine-digit')

    # ['-']
    print(matches)
    # </editor-fold>

    # <editor-fold desc='String boundaries'>
    # ^ start of the string (^ only means negation inside [])
    # $ end of the string

    # Fails if anything other than whitespace precedes the XML declaration.
    # Shown for comparison: ["<?xml version='1.0' encoding='utf-8'?>"]
    pattern = re.compile(r'^\s*<\?xml.*\?>')
    # Anchored at the end instead:
    # ['<manifest>']
    pattern = re.compile(r'<manifest>$')

    matches = pattern.findall('<?xml version=\'1.0\' encoding=\'utf-8\'?><manifest><manifest>')

    print(matches)

    # </editor-fold>

    # <editor-fold desc='Multi-line mode'>
    # (?m) turns on multi-line mode, in which line breaks act as string
    # separators, so ^ and $ also match at the start and end of every line.

    pattern = re.compile(r'(?m)^\s*\/\/.*$')

    matches = pattern.findall('// 註釋1 \n'
                              'code123\n'
                              '// 註釋2\n'
                              'code123 \n'
                              '//註釋123 註釋456')

    # ['// 註釋1 ', '// 註釋2', '//註釋123 註釋456']
    print(matches)

    # </editor-fold>

    # <editor-fold desc='Subexpressions (groups)'>
    # () groups a subexpression

    # Without a group the quantifier applies to ';' only, so this would match
    # '&nbsp;;;' rather than repeated '&nbsp;' entities.
    pattern = re.compile(r'&nbsp;{2,}')

    pattern = re.compile(r'(&nbsp;){2,}')

    matches = pattern.findall('Test&nbsp;&nbsp;&nbsp;&nbsp;Test')

    # findall reports the single capture group, i.e. its last repetition:
    # ['&nbsp;']
    print(matches)

    # Alternatives shown for comparison; only the last pattern is used.
    # ['12.123.14.200']
    pattern = re.compile(r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}')

    # [('12.123.14.200', '14.')]
    pattern = re.compile(r'((\d{1,3}\.){3}\d{1,3})')

    # ['14.']
    pattern = re.compile(r'(\d{1,3}\.){3}\d{1,3}')

    # Non-capturing group, so findall returns the whole match:
    # ['12.123.14.200']
    pattern = re.compile(r'(?:\d{1,3}\.){3}\d{1,3}')

    matches = pattern.findall('[12.123.14.200]')

    print(matches)

    # <re.Match object; span=(1, 14), match='12.123.14.200'>
    print(pattern.search('[12.123.14.200]'))

    # [('1967', '19')]
    pattern = re.compile(r'((19|20)\d{2})')

    matches = pattern.findall('1967-08-17')

    print(matches)

    # </editor-fold>

    # <editor-fold desc='How () changes what findall returns in Python'>

    words = 'adfad asdfasdf asdfas asdfawef asd adsfas '

    two_groups = re.compile(r'((\w+)\s+\w+)')

    # [('adfad asdfasdf', 'adfad'), ('asdfas asdfawef', 'asdfas'), ('asd adsfas', 'asd')]
    print(two_groups.findall(words))

    one_group = re.compile(r'(\w+)\s+\w+')

    # ['adfad', 'asdfas', 'asd']
    print(one_group.findall(words))

    no_group = re.compile(r'\w+\s+\w+')

    # ['adfad asdfasdf', 'asdfas asdfawef', 'asd adsfas']
    print(no_group.findall(words))

    # What the three examples above show:
    #
    # findall always returns a list with one element per match found in the
    # string; what differs is the shape of each element.
    # 1. With several capture groups in the pattern
    #    each element is a tuple of strings,
    #    with one string per capture group,
    #    each holding the text matched by that group,
    #    ordered by the position of the opening parenthesis.
    #
    # 2. With exactly one capture group
    #    each element is a string
    #    holding the text matched by the group (NOT the whole match).
    #
    # 3. With no capture group
    #    each element is a string
    #    holding the text matched by the whole pattern.
    #
    # </editor-fold>

    # <editor-fold desc='Matching a valid IP address'>
    # The order of the alternatives matters: most specific first.
    pattern = re.compile(
        r'((((25[0-5])|(2[0-4]\d)|(1[0-9]\d)|(\d{1,2}))\.){3}((25[0-5])|(2[0-4]\d)|(1[0-9]\d)|(\d{1,2})))')

    matches = pattern.findall('[12.159.46.200]')

    # [('12.159.46.200', '46.', '46', '', '', '159', '46', '200', '', '200', '', '')]
    print(matches)

    # Rewriting the alternatives in the "more natural" ascending order breaks
    # the pattern: alternatives are tried left to right and the first one
    # that lets the match succeed wins, so the last octet stops at '20'.
    pattern = re.compile(
        r'((((\d{1,2})|(1[0-9]\d)|(2[0-4]\d)|(25[0-5])|)\.){3}((\d{1,2})|(1[0-9]\d)|(2[0-4]\d)|(25[0-5])|))')

    matches = pattern.findall('[12.159.46.200]')

    # [('12.159.46.20', '46.', '46', '46', '159', '', '', '20', '20', '', '', '')]
    print(matches)

    # </editor-fold>

    # <editor-fold desc='Backreferences'>
    # () defines a subexpression
    # Later, \N (N starting at 1) refers to the text matched by group N
    # In some regex implementations \0 refers to the whole match
    # Mind the numbering when there are several groups

    # Alternatives shown for comparison; only the last pattern is used.
    # ['of', 'and', 'are']
    pattern = re.compile(r'[ ](\w+)[ ]\1')

    # [(' of ', 'of'), (' and ', 'and'), (' are ', 'are')]
    pattern = re.compile(r'([ ](\w+)[ ])\2')

    # [(' of of', 'of'), (' and and', 'and'), (' are are', 'are')]
    pattern = re.compile(r'([ ](\w+)[ ]\2)')

    matches = pattern.findall('xx of of xxx and and xxx are are.')

    print(matches)

    # The backreference makes the closing tag use the same heading level as
    # the opening tag.
    pattern = re.compile(r'(<[hH]([1-6])>.*?<\/[hH]\2>)')

    matches = pattern.findall('<body>\n'
                              '<h1>Test1</h1>\n'
                              'Test2\n'
                              '<h2>Test3</h2>\n'
                              'Test4\n'
                              '<h3>Test5</h3>\n'
                              'Test6<br/>\n'
                              '</body>\n')

    # [('<h1>Test1</h1>', '1'), ('<h2>Test3</h2>', '2'), ('<h3>Test5</h3>', '3')]
    print(matches)

    # </editor-fold>

    # <editor-fold desc='Substitution'>
    # In the replacement string \N stands for group N, as in a backreference

    pattern = re.compile(r'([\w\.]+@[\w.]+\.\w+)')

    text = ('ben@example.com\n'
            'abcdefg\n'
            'ben.forta@example.com\n'
            'abcdefg')

    matches = pattern.findall(text)
    template = r'<a href="mailto:\1">\1</a>'

    # ['ben@example.com', 'ben.forta@example.com']
    print(matches)

    _print_substitution(text, pattern.sub(template, text))

    pattern = re.compile(r'(\d{3})(-)(\d{3})(-)(\d{4})')

    text = ('333-157-1507\n'
            '123-403-1570\n'
            '111-578-8456\n'
            '234-237-4856')

    matches = pattern.findall(text)
    template = r'(\1) \3-\5'

    # [('333', '-', '157', '-', '1507'),
    # ('123', '-', '403', '-', '1570'),
    # ('111', '-', '578', '-', '8456'),
    # ('234', '-', '237', '-', '4856')]
    print(matches)

    _print_substitution(text, pattern.sub(template, text))

    # </editor-fold>

    # <editor-fold desc='Case conversion'>
    # Other regex flavours combine these with backreferences:
    # \E ends a \L or \U run
    # \l converts the next character to lower case
    # \L converts everything between \L and \E to lower case
    # \u converts the next character to upper case
    # \U converts everything between \U and \E to upper case
    # Python rejects these escapes in a replacement template, so the
    # conversion is done in a callback instead, as below.

    pattern = re.compile(r'([\w\.]+@[\w.]+\.\w+)')

    text = ('ben@example.com\n'
            'abcdefg\n'
            'ben.forta@example.com\n'
            'abcdefg')

    matches = pattern.findall(text)

    # Equivalent of the template 'Test[{\U\1\E}]' in other flavours.
    def callback(word):
        return 'Test[{}]'.format(word.group(1).upper())

    # ['ben@example.com', 'ben.forta@example.com']
    print(matches)

    _print_substitution(text, pattern.sub(callback, text))

    # </editor-fold>

    # <editor-fold desc='Lookahead'>
    # Inspects what follows the matched text without consuming it
    # (?=)

    # Without lookahead the colon is part of the match: ['https:', 'https:']
    pattern = re.compile(r'.+:')

    # ['https', 'https']
    pattern = re.compile(r'.+(?=:)')

    matches = pattern.findall('https://www.bilibili.com/\n'
                              'https://www.baidu.com/\n')

    print(matches)

    # </editor-fold>

    # <editor-fold desc='Lookbehind'>
    # (?<=)

    pattern = re.compile(r'(?<=\$)[\d.]+')

    matches = pattern.findall('1.24\n'
                              '4685.6845\n'
                              '$ 4685.6845\n'
                              '$15978685.45\n'
                              '$12346785852.54$\n'
                              '4568.96 $64987.69\n')

    # ['15978685.45', '12346785852.54', '64987.69']
    print(matches)

    # </editor-fold>

    # <editor-fold desc='Lookahead and lookbehind combined'>

    pattern = re.compile(r'(?<=\<[tT][iI][tT][lL][eE]\>).*(?=\<\/[tT][iI][tT][lL][eE]\>)')

    matches = pattern.findall('<head>\n'
                              '<title>Test 123 learning regex.</title>\n'
                              '</head>')

    # ['Test 123 learning regex.']
    print(matches)

    # </editor-fold>

    # <editor-fold desc='Negative lookaround'>
    # Replace = with !
    # ?=  --> ?!
    # ?<= --> ?<!

    pattern = re.compile(r'\b(?<!\$)\d+\b')

    matches = pattern.findall('1 24\n'
                              '$30\n'
                              '200\n'
                              '$ 300123\n'
                              '$15945\n'
                              '$123454$\n'
                              '456896 $6498769\n')

    # Only numbers NOT directly preceded by '$':
    # ['1', '24', '200', '300123', '456896']
    print(matches)

    # </editor-fold>

    # <editor-fold desc='Conditionals on a subexpression'>
    # (?(n)yes|no) where n is the number of a capture group
    # Adds an if/else to the pattern

    # How to read the conditional pattern below:
    # first an optional opening parenthesis "(" is captured as group 2;
    # if group 2 matched, a closing ")" is required after the area code,
    # otherwise a "-" is required.

    # Look-alike WITHOUT the conditional syntax, shown for comparison: here
    # "(2)?" is merely an optional literal "2", so ")" or "-" is accepted no
    # matter whether "(" was seen and the unbalanced "(123-456-7890" matches.
    # [('123-456-7890', '', '-', ''), ('(123)456-7890', '(', ')', ''), ('(123-456-7890', '(', '-', '')]
    pattern = re.compile(r'((\()?\d{3}((2)?\)|-)\d{3}-\d{4})')

    # The real conditional; this is the pattern that is run.
    pattern = re.compile(r'((\()?\d{3}(?(2)\)|-)\d{3}-\d{4})')

    matches = pattern.findall('123-456-7890\n'
                              '(123)456-7890\n'
                              '(123)-456-7890\n'
                              '(123-456-7890\n'
                              '1234567890\n'
                              '123 456 7890\n')

    # [('123-456-7890', ''), ('(123)456-7890', '('), ('123-456-7890', '')]
    print(matches)

    # Combined with ?=
    # NOTE(review): "(?=-)?" is an optional lookahead and therefore cannot
    # reject anything; the "-" and four digits are always required here.
    pattern = re.compile(r'(\d{5}((?=-)?-\d{4}))')

    matches = pattern.findall('11111\n'
                              '22222\n'
                              '33333-\n'
                              '44444-5555\n')

    # [('44444-5555', '-5555')]
    print(matches)

    # </editor-fold>


# Run the demos only when this file is executed directly as a script.
if __name__ == '__main__':
    main()