1. 程式人生 > 其它 >python 包之 PyQuery 網頁解析教程

python 包之 PyQuery 網頁解析教程

一、安裝

  • PyQuery 是一個非常強大又靈活的網頁解析庫

  • PyQuery 是 Python 仿照 jQuery 的嚴格實現

  • 語法與 jQuery 幾乎完全相同,更多操作可以參考jQuery

pip install pyquery

 

二、字串初始化

from pyquery import PyQuery as pq

# Sample markup for the example: a single <li> inside <ul id="container">.
html = '''
<ul id="container">
    <li class="wow fadeIn">
        <div class="d-flex latest-small-thumb">
            <div class="post-thumb d-flex mr-15 border-radius-10 img-hover-scale overflow-hidden">
                <a class="color-white" href="single.html" tabindex="0">
                    <img src="assets/imgs/news/thumb-11.jpg" alt="">
                </a>
            </div>
            <div class="post-content media-body align-self-center">
                <h5 class="post-title mb-15 text-limit-3-row font-medium">
                    <a href="single.html" tabindex="0">9 Things I Love About Shaving My Head During Quarantine</a>
                </h5>
            </div>
        </div>
    </li>
</ul>
'''

# Build a PyQuery document directly from the HTML string.
page = pq(html)

# Show the parsed markup, the wrapper type, and the result of a CSS query.
print(page)
print(type(page))
print(page('li'))

 

三、url初始化

from pyquery import PyQuery as pq

# Fetch the page over HTTP and parse the response body; the `encoding`
# keyword asks PyQuery to decode the response as UTF-8.
doc = pq(url="http://www.baidu.com", encoding='utf-8')

# Print only the <head> element of the fetched document.
# (The original line was missing its closing parenthesis.)
print(doc('head'))

 

四、檔案初始化

from pyquery import PyQuery as pq

# Parse a local HTML file instead of an in-memory string.
page = pq(filename='index.html')
print(page)

 

五、css選擇器

from pyquery import PyQuery as pq

# Sample markup for the example: a single <li> inside <ul id="container">.
html = '''
<ul id="container">
    <li class="wow fadeIn">
        <div class="d-flex latest-small-thumb">
            <div class="post-thumb d-flex mr-15 border-radius-10 img-hover-scale overflow-hidden">
                <a class="color-white" href="single.html" tabindex="0">
                    <img src="assets/imgs/news/thumb-11.jpg" alt="">
                </a>
            </div>
            <div class="post-content media-body align-self-center">
                <h5 class="post-title mb-15 text-limit-3-row font-medium">
                    <a href="single.html" tabindex="0">9 Things I Love About Shaving My Head During Quarantine</a>
                </h5>
            </div>
        </div>
    </li>
</ul>
'''

page = pq(html)

# Descendant selector: elements with class "fadeIn" nested under #container.
matches = page('#container .fadeIn')
print(matches)

 

六、查詢子元素

from pyquery import PyQuery as pq

# Sample markup for the example: a single <li> inside <ul id="container">.
html = '''
<ul id="container">
    <li class="wow fadeIn">
        <div class="d-flex latest-small-thumb">
            <div class="post-thumb d-flex mr-15 border-radius-10 img-hover-scale overflow-hidden">
                <a class="color-white" href="single.html" tabindex="0">
                    <img src="assets/imgs/news/thumb-11.jpg" alt="">
                </a>
            </div>
            <div class="post-content media-body align-self-center">
                <h5 class="post-title mb-15 text-limit-3-row font-medium">
                    <a href="single.html" tabindex="0">9 Things I Love About Shaving My Head During Quarantine</a>
                </h5>
            </div>
        </div>
    </li>
</ul>
'''

page = pq(html)
container = page('#container')

# find() applies a CSS selector to the descendants of the current selection.
list_items = container.find('li')
print(type(list_items))
print(list_items)

 

七、兄弟元素

from pyquery import PyQuery as pq

# Sample markup for the example: a single <li> inside <ul id="container">.
html = '''
<ul id="container">
    <li class="wow fadeIn">
        <div class="d-flex latest-small-thumb">
            <div class="post-thumb d-flex mr-15 border-radius-10 img-hover-scale overflow-hidden">
                <a class="color-white" href="single.html" tabindex="0">
                    <img src="assets/imgs/news/thumb-11.jpg" alt="">
                </a>
            </div>
            <div class="post-content media-body align-self-center">
                <h5 class="post-title mb-15 text-limit-3-row font-medium">
                    <a href="single.html" tabindex="0">9 Things I Love About Shaving My Head During Quarantine</a>
                </h5>
            </div>
        </div>
    </li>
</ul>
'''

page = pq(html)
thumb = page('#container .post-thumb')

# siblings() yields the other elements that share the selection's parent
# (here, the .post-content <div> next to .post-thumb).
neighbours = thumb.siblings()
print(neighbours)

 

八、獲取屬性

from pyquery import PyQuery as pq

# Sample markup for the example: a single <li> inside <ul id="container">.
html = '''
<ul id="container">
    <li class="wow fadeIn">
        <div class="d-flex latest-small-thumb">
            <div class="post-thumb d-flex mr-15 border-radius-10 img-hover-scale overflow-hidden">
                <a class="color-white" href="single.html" tabindex="0">
                    <img src="assets/imgs/news/thumb-11.jpg" alt="">
                </a>
            </div>
            <div class="post-content media-body align-self-center">
                <h5 class="post-title mb-15 text-limit-3-row font-medium">
                    <a href="single.html" tabindex="0">9 Things I Love About Shaving My Head During Quarantine</a>
                </h5>
            </div>
        </div>
    </li>
</ul>
'''

page = pq(html)
link = page('#container .post-content a')
print(link)

# Two equivalent ways to read an attribute: method call and attribute access.
print(link.attr('href'))
print(link.attr.href)

 

九、獲取文字

from pyquery import PyQuery as pq

# Sample markup for the example: a single <li> inside <ul id="container">.
html = '''
<ul id="container">
    <li class="wow fadeIn">
        <div class="d-flex latest-small-thumb">
            <div class="post-thumb d-flex mr-15 border-radius-10 img-hover-scale overflow-hidden">
                <a class="color-white" href="single.html" tabindex="0">
                    <img src="assets/imgs/news/thumb-11.jpg" alt="">
                </a>
            </div>
            <div class="post-content media-body align-self-center">
                <h5 class="post-title mb-15 text-limit-3-row font-medium">
                    <a href="single.html" tabindex="0">9 Things I Love About Shaving My Head During Quarantine</a>
                </h5>
            </div>
        </div>
    </li>
</ul>
'''

page = pq(html)
link = page('#container .post-content a')

# text() returns the text content of the selection, without the tags.
title = link.text()
print(title)

 

十、類操作

from pyquery import PyQuery as pq

# Sample markup for the example: a single <li> inside <ul id="container">.
html = '''
<ul id="container">
    <li class="wow fadeIn">
        <div class="d-flex latest-small-thumb">
            <div class="post-thumb d-flex mr-15 border-radius-10 img-hover-scale overflow-hidden">
                <a class="color-white" href="single.html" tabindex="0">
                    <img src="assets/imgs/news/thumb-11.jpg" alt="">
                </a>
            </div>
            <div class="post-content media-body align-self-center">
                <h5 class="post-title mb-15 text-limit-3-row font-medium">
                    <a href="single.html" tabindex="0">9 Things I Love About Shaving My Head During Quarantine</a>
                </h5>
            </div>
        </div>
    </li>
</ul>
'''

page = pq(html)
item = page('#container li')
print(item)

# removeClass()/addClass() change the selected element itself, so each
# print below reflects the class list after the preceding call.
item.removeClass('fadeIn')
print(item)

item.addClass('fadeIn')
print(item)