python模塊之re模塊
阿新 • • 發佈:2019-01-12
sea 進行 可能 重復 列表 ani 哈哈 開頭 dot
1. 正則
正則就是用一些具有特殊意義的符號組合到一起(正則表達式)來描述字符或者字符串的方法,在Python中正則匹配是通過re模塊來實現的
2. re模塊
單個字符匹配
# \w與\W # s2 = "df當你 的_眼 睛瞇|著/笑?sh29 sedn" # print(re.findall("\w", s2)) # [‘d‘, ‘f‘, ‘當‘, ‘你‘, ‘的‘, ‘_‘, ‘眼‘, ‘睛‘, ‘瞇‘, ‘著‘, ‘笑‘, ‘s‘, ‘h‘, ‘2‘, ‘9‘, ‘s‘, ‘e‘, ‘d‘, ‘n‘] # print(re.findall("\W", s2)) # [‘ ‘, ‘ ‘, ‘ ‘, ‘|‘, ‘/‘, ‘?‘, ‘ ‘]# \s與\S # s3 = "df當\b 2\t瞇|著/笑\r?sh\n29 d\nn" # print(re.findall("\s", s3)) # [‘ ‘, ‘ ‘, ‘\t‘, ‘\r‘, ‘\n‘, ‘ ‘, ‘\n‘] # print(re.findall("\S", s3)) # [‘d‘, ‘f‘, ‘當‘, ‘\x08‘, ‘2‘, ‘瞇‘, ‘|‘, ‘著‘, ‘/‘, ‘笑‘, ‘?‘, ‘s‘, ‘h‘, ‘2‘, ‘9‘, ‘d‘, ‘n‘] # \d與\D # print(re.findall("\d", s3)) # [‘2‘, ‘2‘, ‘9‘]# print(re.findall("\D", s3)) # [‘d‘, ‘f‘, ‘當‘, ‘\x08‘, ‘ ‘, ‘ ‘, ‘\t‘, ‘瞇‘, ‘|‘, ‘著‘, ‘/‘, ‘笑‘, ‘\r‘, ‘?‘, ‘s‘, ‘h‘, ‘\n‘, ‘ ‘, ‘d‘, ‘\n‘, ‘n‘] # \A與^ # print(re.findall("\Adf", s3)) # [‘df‘] # print(re.findall("\Ad", s3)) # [‘d‘] # print(re.findall("\A當", s3)) # [] # print(re.findall("^df", s3)) # [‘df‘]# print(re.findall("^當", s3)) # [] # s4 = "df當\b 你的眼 睛瞇|著/?sh\n29 dn笑" # \Z $ \z不能用 # print(re.findall("笑\Z", s4)) # [‘笑‘] # print(re.findall("笑$", s4)) # [‘笑‘] # s5 = "s_\t\t\nhe\t哈哈\n\n 愛好 \ru\n" # \n與\t # print(re.findall("\n", s5)) # [‘\n‘, ‘\n‘, ‘\n‘, ‘\n‘] # print(re.findall("\t", s5)) # [‘\t‘, ‘\t‘, ‘\t‘]
重復匹配
# . ? * + {m,n} .* .*? # . 匹配任意字符,除了換行符(加上re.DOTALL這個參數可以匹配\n) # s1 = "aa bbb aabb acb agb bba babbcb" # print(re.findall("a.b", s1)) # [‘a b‘, ‘aab‘, ‘acb‘, ‘agb‘, ‘a b‘, ‘abb‘] # print(re.findall("aa.b", s1)) """ 匹配邏輯 1. 讀取三個字符 2. 進行匹配 3. 成功則返回這三個字符,並從最後一個字符下一個字符開始匹配 4. 失敗則從第一個字符的下一個字符開始匹配 """ # s2 = "aa babb aabb aaab aaaab bab ba" # ? 匹配0個或1個左邊(單個)字符表達式,滿足貪婪規則 # print(re.findall("a?b", s2)) # [‘b‘, ‘ab‘, ‘b‘, ‘ab‘, ‘b‘, ‘ab‘, ‘ab‘, ‘b‘, ‘ab‘, ‘b‘] # * 匹配0個或多個左邊(單個)字符的表達式 滿足貪婪規則 # s3 = "aa babb aabb aacb aaab bba ba" # print(re.findall("aa*b", s3)) # [‘ab‘, ‘aab‘, ‘aaab‘] # print(re.findall("a*b", s3)) # [‘b‘, ‘ab‘, ‘b‘, ‘aab‘, ‘b‘, ‘b‘, ‘aaab‘, ‘b‘, ‘b‘, ‘b‘] # + 匹配一個或多個左邊字符的表達式,滿足貪婪規則 # print(re.findall("a+b", s3)) # [‘ab‘, ‘aab‘, ‘aaab‘] # print(re.findall("ab+b", s3)) # [‘abb‘, ‘abb‘] # {m,n} 匹配m個至n個左邊表達式,滿足貪婪規則 # s4 = ‘ab aab aaab aaaaabb‘ # print(re.findall("a{2,4}b", s4)) # [‘aab‘, ‘aaab‘, ‘aaaab‘] # .* 貪婪匹配(盡可能地多),從頭到尾 s5 = "ab aa_b a*()b" # print(re.findall("a.*b", s5)) # [‘ab aa_b a*()b‘] 匹配以a開頭以b結尾的任意長度的字符串 # 上式匹配邏輯:從a開始,找到最後一個b,停止 # print(re.findall("a.*_", s5)) # [‘ab aa_‘] # .*?從頭到尾匹配,非貪婪 # print(re.findall("a.*?b", s5)) # [‘ab‘, ‘aa_b‘, ‘a*()b‘] # 上式匹配邏輯:從a開始,找到第一個b,停止,繼續下一輪匹配
# []
# [] 括號中可以放任意一個字符 # - 在括號中表示範圍,如果你要匹配上-,那麽這個不能放在中間 # s1 = ‘a1b a3b abb a*b acb a_b‘ # print(re.findall("a[abc]b", s1)) # [‘abb‘, ‘acb‘] # [abc]表示abc中的任意一個字符 # print(re.findall("a[1-9]b", s1)) # [‘a1b‘, ‘a3b‘] # s2 = ‘aAb aWb aeb a*b arb a_b‘ # print(re.findall("a[A-Z]b", s2)) # [‘aAb‘, ‘aWb‘] # print(re.findall("a[a-z]b", s2)) # [‘aeb‘, ‘arb‘] # print(re.findall("a[A-Za-z]b", s2)) # [‘aAb‘, ‘aWb‘, ‘aeb‘, ‘arb‘]
分組
# 分組 # ()制定一個規則,將滿足規則的結果匹配出來 # 練習1:找到s4裏面的hang juan min # s4 = "hang_1 hang_gr juan_1 min_1" # print(re.findall("(.*?)_1", s4)) # [‘hang‘, ‘ hang_gr juan‘, ‘ min‘] # print(re.findall("([a-z]+)_1", s4)) # [‘hang‘, ‘juan‘, ‘min‘] # 分析:都是以字母開頭,以_1結尾,字母可以有多個 # 練習2:找到一個標簽裏的網址 # s5 = ‘<a href="http://www.baidu.com">點擊</a>‘ # print(re.findall(‘href="([a-z].*?)"‘, s5)) # [‘http://www.baidu.com‘] # | 匹配左邊或右邊 # s6 = "hanser:149 yousa:148 mandy:160" # print(re.findall("hanser|yousa|mandy", s6)) # [‘hanser‘, ‘yousa‘, ‘mandy‘] # s7 = ‘Too many companies have gone bankrupt, and the next one is my company‘ # print(re.findall("compan(?:y|ies)", s7)) # [‘companies‘, ‘company‘] # ?:表示將整體匹配出來而不只是()裏面的內容
3. 常用方法
findall
全部找到並返回一個列表
# 找到下面標簽裏面的網址
import re s1 = ‘<img src="https://pic3.zhimg.com/80/v2-1d1a5e4f422a77372514a57f38503f3e_hd.jpg" data-rawwidth="564" data-rawheight="699" data-size="normal" data-default-watermark-src="https://pic1.zhimg.com/v2-22b99e59d8efc7e7dec3faba8fbf2a24_b.jpg" class="origin_image zh-lightbox-thumb lazy" width="564" data-original="https://pic3.zhimg.com/v2-1d1a5e4f422a77372514a57f38503f3e_r.jpg" data-actualsrc="https://pic3.zhimg.com/v2-1d1a5e4f422a77372514a57f38503f3e_b.jpg">‘ print(re.findall(‘src="([a-z].*?)"‘, s1))
# 結果[‘https://pic3.zhimg.com/80/v2-1d1a5e4f422a77372514a57f38503f3e_hd.jpg‘, ‘https://pic1.zhimg.com/v2-22b99e59d8efc7e7dec3faba8fbf2a24_b.jpg‘, ‘https://pic3.zhimg.com/v2-1d1a5e4f422a77372514a57f38503f3e_b.jpg‘]
search
找到第一個並返回包含匹配信息的對象,該對象可以通過group()方法得到匹配的字符串,沒找到返回None
s = "Hanser is a little girl in kindergarten" ret = re.search("[A-Z][a-z]*", "Hanser is a little girl in kindergarten") print(ret) # <_sre.SRE_Match object; span=(0, 6), match=‘Hanser‘> print(ret.group()) # Hanser
match
同search,區別在於從字符串開始處進行匹配,可以用search+^代替
s = "Hanser is a little girl in kindergarten" print(re.match("Hanser", s).group()) # Hanser
split
按照指定的分割符分割
s = "Hanser is a little girl in kindergarten" print(re.split(" ", s)) # [‘Hanser‘, ‘is‘, ‘a‘, ‘little‘, ‘girl‘, ‘in‘, ‘kindergarten‘]
s1 = "花褪殘紅青杏小,燕子飛時,綠水人家繞。枝上柳綿吹又少,天涯何處無芳草。" # 方法一: lst = re.split("[,。]", s1) # [‘花褪殘紅青杏小‘, ‘燕子飛時‘, ‘綠水人家繞‘, ‘枝上柳綿吹又少‘, ‘天涯何處無芳草‘, ‘‘] for i in lst: if i: # 過濾空字符 print(i[0]) # 花 燕 綠 枝 天 # 方法二: lst1 = re.findall(r"[,。]([^,。])", s1) print(lst1) # [‘燕‘, ‘綠‘, ‘枝‘, ‘天‘] 只能找到除去開頭的短句首字
sub
替換
s2 = "大家好,我是常山趙子龍" print(re.sub("常山", "石家莊", s2)) # 大家好,我是石家莊趙子龍
compile
# compile 制定一個匹配規則 obj = re.compile("\d{2}") print(obj.search("sdfs14523sdf").group()) # 14 print(obj.findall("sdfs14523sdf")) # [‘14‘, ‘52‘]
finditer
返回一個存放匹配結果的迭代器
ret = re.finditer("\d", "sd283sef8w3o7sh") print(ret) # 叠代器 <callable_iterator object at 0x000001C37F3A9C50> print(next(ret)) # match對象 print(next(ret).group()) # 8 print(next(ret).group()) # 3 print(next(ret).group()) # 8 print([i.group() for i in ret]) # 查看剩余結果
命名分組
# ret = re.search(r"<(?P<tag_name>\w+)>\w+</(?P=tag_name)>", "<h1>hello</h1>") # 在分組中利用?P<name>給分組起名字 # 獲取的匹配結果可以直接用group("名字")拿到對應的值 # print(ret.group()) # <h1>hello</h1> # print(ret.group("tag_name")) # h1 # 如果不給組起名字,也可以用\序號來找到對應的組,獲取的結果可以直接用group(序號)拿到對應的值 # ret = re.search(r"<(\w+)>\w+</\1>", "<h1>hello</h1>") # print(ret.group()) # <h1>hello</h1> # print(ret.group(1)) # h1 # ret = re.findall(r"<(?P<tag_name>\w+)>\w+</(?P=tag_name)>", "<h1>hello</h1>") # print(ret) # [‘h1‘]
python模塊之re模塊