1. 程式人生 > lxml和bs4對頁面進行解析

lxml和bs4對頁面進行解析

from lxml import etree

from bs4 import BeautifulSoup
# Sample Baidu search-result markup: two result footers (<div class="f13">),
# each holding a displayed-URL link and a "百度快照" (Baidu snapshot) link.
html = '''
<div class="f13">
<a target="_blank" href="http://www.baidu.com/link?url=8NwIRNMjHgSZMoBsMFARK9L2W2rAZbunOrnxOnwCcVSJ26Bm5tv5M0dfhcoWioKRvigAqBe_oriwar0_DMx2ldSAemA5mR1yKEhEorCmAyi" class="c-showurl" style="text-decoration:none;">https://www.linkedin.com/jobs/...&nbsp;</a>
<div class="c-tools" id="tools_3067671572010737909_113" data-tools="{&quot;title&quot;:&quot;122 個“Ibm”職位 - VIC,Melbourne | 領英 &quot;,&quot;url&quot;:&quot;http://www.baidu.com/link?url=8NwIRNMjHgSZMoBsMFARK9L2W2rAZbunOrnxOnwCcVSJ26Bm5tv5M0dfhcoWioKRvigAqBe_oriwar0_DMx2ldSAemA5mR1yKEhEorCmAyi&quot;}">
<a class="c-tip-icon"><i class="c-icon c-icon-triangle-down-g"></i></a></div><span class="c-icons-outer"><span class="c-icons-inner">
<span class="c-vline"></span><span class="c-trust-as vstar " data_key="4678554371015554406" hint-data="{&quot;label&quot;:&quot;LINKEDINIRELAND&quot;,&quot;url&quot;:&quot;https://www.baidu.com/
[email protected]
&amp;vmp_ec=cd88488d93644919848a6666cf250p3b8d23Xbab4803aa037=584Wj5ibyasReZ3r5WXda0dd33sJ2c7e11d802&amp;vmp_ectm=1541580796&amp;from=vs&quot;,&quot;hint&quot;:[{&quot;txt&quot;:&quot;[ecard 109]&quot;,&quot;vlevel&quot;:&quot;3&quot;}]}" hint-type="vstar" render="render"> <a href="https://www.baidu.com/
[email protected]
&amp;vmp_ec=cd88488d93644919848a6666cf250p3b8d23Xbab4803aa037=584Wj5ibyasReZ3r5WXda0dd33sJ2c7e11d802&amp;vmp_ectm=1541580796&amp;from=vs&amp;product=v&amp;rsv_dl=0_left_v_3" class="c-icon c-icon-v c-icon-v3" target="_blank" data-click="{'title':'vstar','rsv_vlevel':'3'}"></a></span></span></span>&nbsp;-&nbsp; <a data-click="{'rsv_snapshot':'1'}" href="http://cache.baiducontent.com/c?m=9f65cb4a8c8507ed19fa950d100b92235c438014628c8c4c2882c81484642c101a39fee37a7251198895237001d91101bab12172415c77e9cb95cf0a81ec852859cc7c65671df207528a0eaebd0467817dc44de9d941a6edb07087eb8f93895b089a0c&amp;p=8649cd15d9c908f60cbe9b7c54&amp;newp=8e33c64ad49b11a052b9c1124453d8234f08d30e3cd1c44324b9d71fd325001c1b69e3b82127160ed2c17a6c15e9241dbdb239256b5578&amp;user=baidu&amp;fm=sc&amp;query=IBM&amp;qid=b328826100031292&amp;p1=113" target="_blank" class="m">百度快照</a></div> <div class="f13"> <a target="_blank" href="http://www.baidu.com/link?url=FKsYxTZ8lADvW6bWVlG9IjN3AqtLxcxi_gvrOHsubuqH0wvOYxR-XB42hxE-GQGNwRJOeRzxv92HZjNxCxhz1GzEzt5D5NWiQvrvEaG2kmK" class="c-showurl" style="text-decoration:none;">https://www.linkedin.com/in/fr...&nbsp;</a> <div class="c-tools" id="tools_11389073726320003270_114" data-tools="{&quot;title&quot;:&quot;Fred Balboni - Retired - IBM | 領英 &quot;,&quot;url&quot;:&quot;http://www.baidu.com/link?url=FKsYxTZ8lADvW6bWVlG9IjN3AqtLxcxi_gvrOHsubuqH0wvOYxR-XB42hxE-GQGNwRJOeRzxv92HZjNxCxhz1GzEzt5D5NWiQvrvEaG2kmK&quot;}"> <a class="c-tip-icon"><i class="c-icon c-icon-triangle-down-g"></i></a></div> <span class="c-icons-outer"><span class="c-icons-inner"><span class="c-vline"></span> <span class="c-trust-as vstar " data_key="4678554371015554406" hint-data="{&quot;label&quot;:&quot;LINKEDINIRELAND&quot;,&quot;url&quot;:&quot;https://www.baidu.com/
[email protected]
&amp;vmp_ec=cd88488d93644919848a6666cf250p3b8d23Xbab4803aa037=584Wj5ibyasReZ3r5WXda0dd33sJ2c7e11d802&amp;vmp_ectm=1541580796&amp;from=vs&quot;,&quot;hint&quot;:[{&quot;txt&quot;:&quot;[ecard 109]&quot;,&quot;vlevel&quot;:&quot;3&quot;}]}" hint-type="vstar" render="render"> <a href="https://www.baidu.com/[email protected]&amp;vmp_ec=cd88488d93644919848a6666cf250p3b8d23Xbab4803aa037=584Wj5ibyasReZ3r5WXda0dd33sJ2c7e11d802&amp;vmp_ectm=1541580796&amp;from=vs&amp;product=v&amp;rsv_dl=0_left_v_3" class="c-icon c-icon-v c-icon-v3" target="_blank" data-click="{'title':'vstar','rsv_vlevel':'3'}"></a></span></span></span>&nbsp;-&nbsp; <a data-click="{'rsv_snapshot':'1'}" href="http://cache.baiducontent.com/c?m=9f65cb4a8c8507ed19fa950d100b92235c438014628c8c4c2882c81484642c101a39fee07b3f444484936b6777ff1a02baad6a29200356e798c8884adeb8943567d23034064dda55578e59f9c41d759e&amp;p=cb759a46d6c21dfc57efce665741c6&amp;newp=8e7bc64ad4825aff57ee947f110e88231610db2151d4d1146b82c825d7331b001c3bbfb423261000d3ce7c6201a94a5be8f732763d0923a3dda5c91d9fb4c57479c93c&amp;user=baidu&amp;fm=sc&amp;query=IBM&amp;qid=b328826100031292&amp;p1=114" target="_blank" class="m">百度快照</a></div> '''

# NOTE: the same markup can also be loaded with BeautifulSoup(html, 'lxml');
# this script uses lxml's XPath API instead.

# Parse into an element tree; keep `html` bound to the raw string so the two
# are not confused.
tree = etree.HTML(html)

# Links whose visible text contains ".linkedin.com/in/" (LinkedIn profile
# results). For each one we want the Baidu snapshot link of the same result.
profile_links = tree.xpath('//a[contains(text(),".linkedin.com/in/")]')

for link in profile_links:
    hrefs = link.xpath('@href')
    # Evaluate relative to the matched <a>: ".." is its enclosing result
    # container, so only that result's snapshot link is found. Running this
    # XPath on the document root would always yield the first snapshot link
    # in the page, regardless of which result matched.
    snapshots = link.xpath('..//a[contains(text(),"百度快照")]/@href')
    if not hrefs or not snapshots:
        # Skip results lacking either link instead of raising IndexError.
        continue
    print(hrefs[0])
    print(snapshots[0])