python之xpath爬蟲
阿新 • • 發佈:2019-02-04
# coding=utf-8
"""XPath scraping demo built on lxml.

Steps performed, in order:

1. Parse an inline HTML sample and print the text of every ``<li>`` under
   ``<ul id="useful">``.
2. Print the ``href`` of every ``<a>`` element.
3. Write the first ``title`` attribute to ``5.txt`` and print the encoding
   that chardet detects for that file.
4. Read the local file ``6.htm`` as UTF-8 and run the same ``<li>`` query
   against it.

NOTE: ``6.htm`` is not created by this script; it must already exist in the
working directory.
"""
import codecs
import sys

import chardet
from lxml import etree

# Legacy Python 2 workaround that makes implicit unicode -> bytes conversions
# (for example printing unicode to a pipe) use UTF-8 instead of ASCII.
# ``reload`` is not a builtin on Python 3, so the hack is limited to Python 2.
if sys.version_info[0] == 2:
    reload(sys)  # noqa: F821 - Python 2 builtin
    sys.setdefaultencoding("utf-8")

# Sample document. The second link's ``title`` attribute was originally
# written as title=杜帥加油啊"" (quotes misplaced); it is now quoted properly.
html = u'''
<!DOCTYPE html>
<html>
<head>
    <meta charset="utf-8" />
    <title>測試常用規範</title>
</head>
<body>
    <div id="content">
        <ul id="useful">
            <li>546545645645645</li>
            <li>這是第一條資訊</li>
            <li>這是第一條資訊</li>
        </ul>
        <ul id="useless">
            <li>不需要資訊</li>
            <li>不需要資訊</li>
            <li>不需要資訊</li>
        </ul>
        <div>
            <a href="http://lotluck.com">lotluck專欄</a>
            <a href="http://lotluck/68525233653.com" title="杜帥加油啊">linux自學之旅</a>
        </div>
    </div>
</body>
</html>
'''

print(html)

selector = etree.HTML(html)

# Extract element text: every <li> inside the "useful" list.
content = selector.xpath('//ul[@id="useful"]/li/text()')
print(type(content))
for each in content:
    print(each)

# Extract attributes: the href of every link.
link = selector.xpath('//a/@href')
for each in link:
    print(each)

print('開始')
title = selector.xpath('//a/@title')

# Encode explicitly on write so the result does not depend on the process
# default encoding.
with codecs.open('5.txt', 'w', 'utf-8') as f:
    f.write(title[0])

# chardet works on raw bytes, so read the file back in binary mode; the
# ``with`` block also closes the handle, which was previously left open.
with open('5.txt', 'rb') as f1:
    fencoding = chardet.detect(f1.read())
print(fencoding)
print('結束')
print(type(title[0]))

print("我們是小青哇")

# Run the same <li> query against a document loaded from disk.
with codecs.open("6.htm", "r", "utf-8") as f3:
    content1 = f3.read()

tree = etree.HTML(content1)
node = tree.xpath('//ul[@id="useful"]/li/text()')
for each in node:
    print(each)