Python 包之 PyQuery 網頁解析教程
阿新 • • 發佈:2022-04-22
一、安裝
- PyQuery 是一個非常強大又靈活的網頁解析庫
- PyQuery 是 Python 仿照 jQuery 的嚴格實現
- 語法與 jQuery 幾乎完全相同,更多操作可以參考 jQuery
pip install pyquery
二、字串初始化
from pyquery import PyQuery as pq

# Sample HTML fragment used as the parser input.
html = '''
<ul id="container">
<li class="wow fadeIn">
<div class="d-flex latest-small-thumb">
<div class="post-thumb d-flex mr-15 border-radius-10 img-hover-scale overflow-hidden">
<a class="color-white" href="single.html" tabindex="0">
<img src="assets/imgs/news/thumb-11.jpg" alt="">
</a>
</div>
<div class="post-content media-body align-self-center">
<h5 class="post-title mb-15 text-limit-3-row font-medium">
<a href="single.html" tabindex="0">9 Things I Love About Shaving My Head During Quarantine</a>
</h5>
</div>
</div>
</li>
</ul>
'''

# Build a PyQuery document directly from the HTML string.
doc = pq(html)
print(doc)        # the parsed document
print(type(doc))  # the type of the object returned by pq()
print(doc('li'))  # the result of calling the document with the selector 'li'
三、url初始化
from pyquery import PyQuery as pq

# Build the document from a URL; the encoding keyword is passed to pq() as-is.
doc = pq(url="http://www.baidu.com", encoding='utf-8')
# Print the result of selecting 'head' (the original line was missing its
# closing parenthesis and did not compile).
print(doc('head'))
四、檔案初始化
from pyquery import PyQuery as pq
# Build the document from a file, passing the path 'index.html' via filename=.
doc = pq(filename='index.html')
# Print the resulting document.
print(doc)
五、css選擇器
from pyquery import PyQuery as pq

# Sample HTML fragment used as the parser input.
html = '''
<ul id="container">
<li class="wow fadeIn">
<div class="d-flex latest-small-thumb">
<div class="post-thumb d-flex mr-15 border-radius-10 img-hover-scale overflow-hidden">
<a class="color-white" href="single.html" tabindex="0">
<img src="assets/imgs/news/thumb-11.jpg" alt="">
</a>
</div>
<div class="post-content media-body align-self-center">
<h5 class="post-title mb-15 text-limit-3-row font-medium">
<a href="single.html" tabindex="0">9 Things I Love About Shaving My Head During Quarantine</a>
</h5>
</div>
</div>
</li>
</ul>
'''

doc = pq(html)
# CSS descendant selector: elements with class "fadeIn" inside the element
# whose id is "container".
print(doc('#container .fadeIn'))
六、查詢子元素
from pyquery import PyQuery as pq

# Sample HTML fragment used as the parser input.
html = '''
<ul id="container">
<li class="wow fadeIn">
<div class="d-flex latest-small-thumb">
<div class="post-thumb d-flex mr-15 border-radius-10 img-hover-scale overflow-hidden">
<a class="color-white" href="single.html" tabindex="0">
<img src="assets/imgs/news/thumb-11.jpg" alt="">
</a>
</div>
<div class="post-content media-body align-self-center">
<h5 class="post-title mb-15 text-limit-3-row font-medium">
<a href="single.html" tabindex="0">9 Things I Love About Shaving My Head During Quarantine</a>
</h5>
</div>
</div>
</li>
</ul>
'''

doc = pq(html)
# Select the element with id "container" first, then search inside that
# selection for 'li' with find().
items = doc('#container')
lis = items.find('li')
print(type(lis))  # the type of the object returned by find()
print(lis)
七、兄弟元素
from pyquery import PyQuery as pq

# Sample HTML fragment used as the parser input.
html = '''
<ul id="container">
<li class="wow fadeIn">
<div class="d-flex latest-small-thumb">
<div class="post-thumb d-flex mr-15 border-radius-10 img-hover-scale overflow-hidden">
<a class="color-white" href="single.html" tabindex="0">
<img src="assets/imgs/news/thumb-11.jpg" alt="">
</a>
</div>
<div class="post-content media-body align-self-center">
<h5 class="post-title mb-15 text-limit-3-row font-medium">
<a href="single.html" tabindex="0">9 Things I Love About Shaving My Head During Quarantine</a>
</h5>
</div>
</div>
</li>
</ul>
'''

doc = pq(html)
# Select the "post-thumb" element under #container and print what
# siblings() returns for it.
print(doc('#container .post-thumb').siblings())
八、獲取屬性
from pyquery import PyQuery as pq

# Sample HTML fragment used as the parser input.
html = '''
<ul id="container">
<li class="wow fadeIn">
<div class="d-flex latest-small-thumb">
<div class="post-thumb d-flex mr-15 border-radius-10 img-hover-scale overflow-hidden">
<a class="color-white" href="single.html" tabindex="0">
<img src="assets/imgs/news/thumb-11.jpg" alt="">
</a>
</div>
<div class="post-content media-body align-self-center">
<h5 class="post-title mb-15 text-limit-3-row font-medium">
<a href="single.html" tabindex="0">9 Things I Love About Shaving My Head During Quarantine</a>
</h5>
</div>
</div>
</li>
</ul>
'''

doc = pq(html)
# Select the <a> inside the "post-content" element under #container.
title_link = doc('#container .post-content a')
print(title_link)
# Read the href attribute two ways: call syntax, then attribute syntax.
print(title_link.attr('href'))
print(title_link.attr.href)
九、獲取文字
from pyquery import PyQuery as pq

# Sample HTML fragment used as the parser input.
html = '''
<ul id="container">
<li class="wow fadeIn">
<div class="d-flex latest-small-thumb">
<div class="post-thumb d-flex mr-15 border-radius-10 img-hover-scale overflow-hidden">
<a class="color-white" href="single.html" tabindex="0">
<img src="assets/imgs/news/thumb-11.jpg" alt="">
</a>
</div>
<div class="post-content media-body align-self-center">
<h5 class="post-title mb-15 text-limit-3-row font-medium">
<a href="single.html" tabindex="0">9 Things I Love About Shaving My Head During Quarantine</a>
</h5>
</div>
</div>
</li>
</ul>
'''

doc = pq(html)
# Select the <a> inside the "post-content" element, then print what text()
# returns for that selection.
title_link = doc('#container .post-content a')
title_text = title_link.text()
print(title_text)
十、類操作
from pyquery import PyQuery as pq

# Sample HTML fragment used as the parser input.
html = '''
<ul id="container">
<li class="wow fadeIn">
<div class="d-flex latest-small-thumb">
<div class="post-thumb d-flex mr-15 border-radius-10 img-hover-scale overflow-hidden">
<a class="color-white" href="single.html" tabindex="0">
<img src="assets/imgs/news/thumb-11.jpg" alt="">
</a>
</div>
<div class="post-content media-body align-self-center">
<h5 class="post-title mb-15 text-limit-3-row font-medium">
<a href="single.html" tabindex="0">9 Things I Love About Shaving My Head During Quarantine</a>
</h5>
</div>
</div>
</li>
</ul>
'''

doc = pq(html)
list_item = doc('#container li')

# Print the selection before any change.
print(list_item)

# Call removeClass('fadeIn') and print the same selection object again
# (the return value is not reassigned).
list_item.removeClass('fadeIn')
print(list_item)

# Call addClass('fadeIn') and print once more.
list_item.addClass('fadeIn')
print(list_item)