lxml和bs4對頁面進行解析
阿新 • • 發佈:2018-12-06
from lxml import etree
from bs4 import BeautifulSoup
# Sample markup used as parser input: two Baidu search-result footer blocks
# (<div class="f13">). Each block holds the displayed-URL anchor
# (class "c-showurl") followed by a "百度快照" (Baidu snapshot) anchor (class "m").
# NOTE(review): the markup looks like it was pasted verbatim from a live page
# (attribute values contain unescaped double quotes); it is data, keep it as-is.
html = '''
<div class="f13">
<a target="_blank" href="http://www.baidu.com/link?url=8NwIRNMjHgSZMoBsMFARK9L2W2rAZbunOrnxOnwCcVSJ26Bm5tv5M0dfhcoWioKRvigAqBe_oriwar0_DMx2ldSAemA5mR1yKEhEorCmAyi" class="c-showurl" style="text-decoration:none;">https://www.linkedin.com/jobs/... </a>
<div class="c-tools" id="tools_3067671572010737909_113" data-tools="{"title":"122 個“Ibm”職位 - VIC,Melbourne | 領英 ","url":"http://www.baidu.com/link?url=8NwIRNMjHgSZMoBsMFARK9L2W2rAZbunOrnxOnwCcVSJ26Bm5tv5M0dfhcoWioKRvigAqBe_oriwar0_DMx2ldSAemA5mR1yKEhEorCmAyi"}">
<a class="c-tip-icon"><i class="c-icon c-icon-triangle-down-g"></i></a></div><span class="c-icons-outer"><span class="c-icons-inner">
<span class="c-vline"></span><span class="c-trust-as vstar " data_key="4678554371015554406" hint-data="{"label":"LINKEDINIRELAND","url":"https://www.baidu.com/ [email protected]&vmp_ec=cd88488d93644919848a6666cf250p3b8d23Xbab4803aa037=584Wj5ibyasReZ3r5WXda0dd33sJ2c7e11d802&vmp_ectm=1541580796&from=vs","hint":[{"txt":"[ecard 109]","vlevel":"3"}]}" hint-type="vstar" render="render">
<a href="https://www.baidu.com/ [email protected]&vmp_ec=cd88488d93644919848a6666cf250p3b8d23Xbab4803aa037=584Wj5ibyasReZ3r5WXda0dd33sJ2c7e11d802&vmp_ectm=1541580796&from=vs&product=v&rsv_dl=0_left_v_3" class="c-icon c-icon-v c-icon-v3" target="_blank" data-click="{'title':'vstar','rsv_vlevel':'3'}"></a></span></span></span> -
<a data-click="{'rsv_snapshot':'1'}" href="http://cache.baiducontent.com/c?m=9f65cb4a8c8507ed19fa950d100b92235c438014628c8c4c2882c81484642c101a39fee37a7251198895237001d91101bab12172415c77e9cb95cf0a81ec852859cc7c65671df207528a0eaebd0467817dc44de9d941a6edb07087eb8f93895b089a0c&p=8649cd15d9c908f60cbe9b7c54&newp=8e33c64ad49b11a052b9c1124453d8234f08d30e3cd1c44324b9d71fd325001c1b69e3b82127160ed2c17a6c15e9241dbdb239256b5578&user=baidu&fm=sc&query=IBM&qid=b328826100031292&p1=113" target="_blank" class="m">百度快照</a></div>
<div class="f13">
<a target="_blank" href="http://www.baidu.com/link?url=FKsYxTZ8lADvW6bWVlG9IjN3AqtLxcxi_gvrOHsubuqH0wvOYxR-XB42hxE-GQGNwRJOeRzxv92HZjNxCxhz1GzEzt5D5NWiQvrvEaG2kmK" class="c-showurl" style="text-decoration:none;">https://www.linkedin.com/in/fr... </a>
<div class="c-tools" id="tools_11389073726320003270_114" data-tools="{"title":"Fred Balboni - Retired - IBM | 領英 ","url":"http://www.baidu.com/link?url=FKsYxTZ8lADvW6bWVlG9IjN3AqtLxcxi_gvrOHsubuqH0wvOYxR-XB42hxE-GQGNwRJOeRzxv92HZjNxCxhz1GzEzt5D5NWiQvrvEaG2kmK"}">
<a class="c-tip-icon"><i class="c-icon c-icon-triangle-down-g"></i></a></div>
<span class="c-icons-outer"><span class="c-icons-inner"><span class="c-vline"></span>
<span class="c-trust-as vstar " data_key="4678554371015554406" hint-data="{"label":"LINKEDINIRELAND","url":"https://www.baidu.com/ [email protected]&vmp_ec=cd88488d93644919848a6666cf250p3b8d23Xbab4803aa037=584Wj5ibyasReZ3r5WXda0dd33sJ2c7e11d802&vmp_ectm=1541580796&from=vs","hint":[{"txt":"[ecard 109]","vlevel":"3"}]}" hint-type="vstar" render="render">
<a href="https://www.baidu.com/[email protected]&vmp_ec=cd88488d93644919848a6666cf250p3b8d23Xbab4803aa037=584Wj5ibyasReZ3r5WXda0dd33sJ2c7e11d802&vmp_ectm=1541580796&from=vs&product=v&rsv_dl=0_left_v_3" class="c-icon c-icon-v c-icon-v3" target="_blank" data-click="{'title':'vstar','rsv_vlevel':'3'}"></a></span></span></span> -
<a data-click="{'rsv_snapshot':'1'}" href="http://cache.baiducontent.com/c?m=9f65cb4a8c8507ed19fa950d100b92235c438014628c8c4c2882c81484642c101a39fee07b3f444484936b6777ff1a02baad6a29200356e798c8884adeb8943567d23034064dda55578e59f9c41d759e&p=cb759a46d6c21dfc57efce665741c6&newp=8e7bc64ad4825aff57ee947f110e88231610db2151d4d1146b82c825d7331b001c3bbfb423261000d3ce7c6201a94a5be8f732763d0923a3dda5c91d9fb4c57479c93c&user=baidu&fm=sc&query=IBM&qid=b328826100031292&p1=114" target="_blank" class="m">百度快照</a></div>
'''
# BeautifulSoup alternative, kept for reference:
# soup = BeautifulSoup(html,'lxml')
# print(soup.a.string)

# Parse the markup with lxml; note that this rebinds `html` from the raw
# string to the root element of the parsed tree.
html = etree.HTML(html)

# Anchors whose visible text contains ".linkedin.com/in/".
urls = html.xpath('//a[contains(text(),".linkedin.com/in/")]')

# For every matching anchor, print its own href and the href of the
# "百度快照" (Baidu snapshot) anchor that belongs to the same result block.
for anchor in urls:
    url = anchor.xpath('@href')
    # The relative path must be evaluated on the matched anchor, not on the
    # document root: `..` is then the enclosing result <div>, so the snapshot
    # link found is the one paired with this anchor rather than the first
    # snapshot link anywhere in the page.
    kuaizhao = anchor.xpath('..//a[contains(text(),"百度快照")]/@href')
    # Skip results lacking either link instead of raising IndexError on [0].
    if not url or not kuaizhao:
        continue
    print(url[0])
    print(kuaizhao[0])