Python2.7 使用HTMLParser簡單解析HTML
阿新 • 發佈:2019-01-30
import HTMLParser
class MyHTMLParser(HTMLParser.HTMLParser):
    """HTML parser that collects ``data-cityid`` values from ``<li>`` tags.

    After markup has been fed to the parser, ``links`` holds every collected
    value in the order the tags were encountered.
    """

    def __init__(self):
        # Explicit base-class call instead of super(): this file targets
        # Python 2.7, where the HTMLParser base class is old-style.
        HTMLParser.HTMLParser.__init__(self)
        # Collected ``data-cityid`` attribute values.
        self.links = []

    def handle_starttag(self, tag, attrs):
        """Record the ``data-cityid`` value(s) of an ``<li>`` start tag.

        Args:
            tag: Name of the start tag being reported by the base parser.
            attrs: Sequence of ``(name, value)`` pairs for the tag's
                attributes; may be empty.
        """
        if tag != "li":
            return
        # An empty ``attrs`` simply contributes nothing, so no special
        # case is needed for attribute-less tags.
        self.links.extend(
            value for name, value in attrs if name == "data-cityid"
        )
if __name__ == "__main__":
    # Demo: pull every data-cityid out of a sample city-picker list.
    hp = MyHTMLParser()

    # Sample markup, one <li> per source line. Adjacent literals are joined
    # by the compiler into a single string.
    html_code = (
        ' <ul class="open-context">'
        '<li data-cityid="76" data-cityname="廣州" class="open-context-item">廣州</li>'
        '<li data-cityid="77" data-cityname="深圳" class="open-context-item active">深圳</li>'
        '<li data-cityid="52" data-cityname="北京" class="open-context-item">北京</li>'
        '<li data-cityid="321" data-cityname="上海" class="open-context-item">上海</li>'
        '<li data-cityid="180" data-cityname="武漢" class="open-context-item">武漢</li>'
        '<li data-cityid="192" data-cityname="咸寧" class="open-context-item">咸寧</li>'
        '<li data-cityid="220" data-cityname="南京" class="open-context-item">南京</li>'
        '<li data-cityid="221" data-cityname="蘇州" class="open-context-item">蘇州</li>'
        '<li data-cityid="222" data-cityname="無錫" class="open-context-item">無錫</li>'
        '<li data-cityid="235" data-cityname="贛州" class="open-context-item">贛州</li>'
        '<li data-cityid="244" data-cityname="瀋陽" class="open-context-item">瀋陽</li>'
        '<li data-cityid="245" data-cityname="大連" class="open-context-item">大連</li>'
        '<li data-cityid="284" data-cityname="青島" class="open-context-item">青島</li>'
        '<li data-cityid="322" data-cityname="成都" class="open-context-item">成都</li>'
        '<li data-cityid="343" data-cityname="天津" class="open-context-item">天津</li>'
        '<li data-cityid="144" data-cityname="廊坊" class="open-context-item">廊坊</li>'
        '<li data-cityid="111" data-cityname="貴陽" class="open-context-item">貴陽</li>'
        '<li data-cityid="53" data-cityname="福州" class="open-context-item">福州</li>'
        '<li data-cityid="60" data-cityname="廈門" class="open-context-item">廈門</li>'
        '<li data-cityid="78" data-cityname="潮州" class="open-context-item">潮州</li>'
        '<li data-cityid="79" data-cityname="東莞" class="open-context-item">東莞</li>'
        '<li data-cityid="80" data-cityname="佛山" class="open-context-item">佛山</li>'
        '<li data-cityid="82" data-cityname="惠州" class="open-context-item">惠州</li>'
        '<li data-cityid="83" data-cityname="江門" class="open-context-item">江門</li>'
        '<li data-cityid="86" data-cityname="梅州" class="open-context-item">梅州</li>'
        '<li data-cityid="92" data-cityname="雲浮" class="open-context-item">雲浮</li>'
        # The original literal contains a space and a line break inside this tag.
        '<li data-cityid="95" data-cityname="中山" \nclass="open-context-item">中山</li>'
        '<li data-cityid="96" data-cityname="珠海" class="open-context-item">珠海</li>'
        '<li data-cityid="97" data-cityname="南寧" class="open-context-item">南寧</li>'
        '<li data-cityid="383" data-cityname="杭州" class="open-context-item">杭州</li>'
        '</ul>'
    )

    hp.feed(html_code)
    hp.close()
    # Print the collected city ids.
    print(hp.links)
class MyHTMLParser(HTMLParser.HTMLParser):
    """HTML parser that collects ``data-cityid`` values from ``<li>`` tags.

    After markup has been fed to the parser, ``links`` holds every collected
    value in the order the tags were encountered.
    """

    def __init__(self):
        # Explicit base-class call instead of super(): this file targets
        # Python 2.7, where the HTMLParser base class is old-style.
        HTMLParser.HTMLParser.__init__(self)
        # Collected ``data-cityid`` attribute values.
        self.links = []

    def handle_starttag(self, tag, attrs):
        """Record the ``data-cityid`` value(s) of an ``<li>`` start tag.

        Args:
            tag: Name of the start tag being reported by the base parser.
            attrs: Sequence of ``(name, value)`` pairs for the tag's
                attributes; may be empty.
        """
        if tag != "li":
            return
        # An empty ``attrs`` simply contributes nothing, so no special
        # case is needed for attribute-less tags.
        self.links.extend(
            value for name, value in attrs if name == "data-cityid"
        )
if __name__ == "__main__":
    # Demo: pull every data-cityid out of a sample city-picker list.
    hp = MyHTMLParser()

    # Sample markup, one <li> per source line. Adjacent literals are joined
    # by the compiler into a single string.
    html_code = (
        ' <ul class="open-context">'
        '<li data-cityid="76" data-cityname="廣州" class="open-context-item">廣州</li>'
        '<li data-cityid="77" data-cityname="深圳" class="open-context-item active">深圳</li>'
        '<li data-cityid="52" data-cityname="北京" class="open-context-item">北京</li>'
        '<li data-cityid="321" data-cityname="上海" class="open-context-item">上海</li>'
        '<li data-cityid="180" data-cityname="武漢" class="open-context-item">武漢</li>'
        '<li data-cityid="192" data-cityname="咸寧" class="open-context-item">咸寧</li>'
        '<li data-cityid="220" data-cityname="南京" class="open-context-item">南京</li>'
        '<li data-cityid="221" data-cityname="蘇州" class="open-context-item">蘇州</li>'
        '<li data-cityid="222" data-cityname="無錫" class="open-context-item">無錫</li>'
        '<li data-cityid="235" data-cityname="贛州" class="open-context-item">贛州</li>'
        '<li data-cityid="244" data-cityname="瀋陽" class="open-context-item">瀋陽</li>'
        '<li data-cityid="245" data-cityname="大連" class="open-context-item">大連</li>'
        '<li data-cityid="284" data-cityname="青島" class="open-context-item">青島</li>'
        '<li data-cityid="322" data-cityname="成都" class="open-context-item">成都</li>'
        '<li data-cityid="343" data-cityname="天津" class="open-context-item">天津</li>'
        '<li data-cityid="144" data-cityname="廊坊" class="open-context-item">廊坊</li>'
        '<li data-cityid="111" data-cityname="貴陽" class="open-context-item">貴陽</li>'
        '<li data-cityid="53" data-cityname="福州" class="open-context-item">福州</li>'
        '<li data-cityid="60" data-cityname="廈門" class="open-context-item">廈門</li>'
        '<li data-cityid="78" data-cityname="潮州" class="open-context-item">潮州</li>'
        '<li data-cityid="79" data-cityname="東莞" class="open-context-item">東莞</li>'
        '<li data-cityid="80" data-cityname="佛山" class="open-context-item">佛山</li>'
        '<li data-cityid="82" data-cityname="惠州" class="open-context-item">惠州</li>'
        '<li data-cityid="83" data-cityname="江門" class="open-context-item">江門</li>'
        '<li data-cityid="86" data-cityname="梅州" class="open-context-item">梅州</li>'
        '<li data-cityid="92" data-cityname="雲浮" class="open-context-item">雲浮</li>'
        # The original literal contains a space and a line break inside this tag.
        '<li data-cityid="95" data-cityname="中山" \nclass="open-context-item">中山</li>'
        '<li data-cityid="96" data-cityname="珠海" class="open-context-item">珠海</li>'
        '<li data-cityid="97" data-cityname="南寧" class="open-context-item">南寧</li>'
        '<li data-cityid="383" data-cityname="杭州" class="open-context-item">杭州</li>'
        '</ul>'
    )

    hp.feed(html_code)
    hp.close()
    # Print the collected city ids.
    print(hp.links)