1. 程式人生 > 其它 >Beautiful Soup的使用

Beautiful Soup的使用

Beautiful Soup的使用

1. 基本使用

from bs4 import BeautifulSoup

html = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title" name="dromouse"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1"><!-- Elsie --></a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
"""

# Parse the sample markup with the lxml parser.
soup = BeautifulSoup(html, 'lxml')
# prettify() renders the parsed document as an indented string.
print(soup.prettify())
# Text held by the <title> node.
print(soup.title.string)

----------------------
<html>
 <head>
  <title>
   The Dormouse's story
  </title>
 </head>
 <body>
  <p class="title" name="dromouse">
   <b>
    The Dormouse's story
   </b>
  </p>
  <p class="story">
   Once upon a time there were three little sisters; and their names were
   <a class="sister" href="http://example.com/elsie" id="link1">
    <!-- Elsie -->
   </a>
   ,
   <a class="sister" href="http://example.com/lacie" id="link2">
    Lacie
   </a>
   and
   <a class="sister" href="http://example.com/tillie" id="link3">
    Tillie
   </a>
   ;
and they lived at the bottom of a well.
  </p>
  <p class="story">
   ...
  </p>
 </body>
</html>
The Dormouse's story
  • 變數html中的HTML文字,<body>和<html>節點都沒有閉合;從輸出結果來看,Beautiful Soup自動更正了錯誤的HTML格式字串。
  • soup.prettify(),這個方法是將要解析的字串按照縮排的格式進行輸出
  • soup.title.string的意思是,將要解析的字串中title節點中的文字輸出

2. 節點選擇器

from bs4 import BeautifulSoup

html = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title" name="dromouse"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1"><!-- Elsie --></a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
"""

soup = BeautifulSoup(html, 'lxml')
print(soup.title)  # the <title> node of the parsed document
print(soup.title.string)  # the text inside the <title> node
print(type(soup.title))  # a node is a bs4.element.Tag
print(type(soup.title.string))  # its text is a bs4.element.NavigableString
print(soup.head)  # the <head> node
print(soup.p)  # several <p> nodes exist; attribute access returns only the first match
print(soup.a)  # several <a> nodes exist; attribute access returns only the first match
----------------
<title>The Dormouse's story</title>
The Dormouse's story
<class 'bs4.element.Tag'>
<class 'bs4.element.NavigableString'>
<head><title>The Dormouse's story</title></head>
<p class="title" name="dromouse"><b>The Dormouse's story</b></p>
<a class="sister" href="http://example.com/elsie" id="link1"><!-- Elsie --></a>

3.關聯選擇

3.1 子節點和子孫節點

from bs4 import BeautifulSoup

html = """
<html>
    <head>
        <title>The Dormouse's story</title>
    </head>
    <body>
        <p class="story">
            Once upon a time there were three little sisters; and their names were
            <a href="http://example.com/elsie" class="sister" id="link1">
                <span>Elsie</span>
            </a>
            <a href="http://example.com/lacie" class="sister" id="link2">Lacie</a>
            and
            <a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>
            and they lived at the bottom of a well.
        </p>
        <p class="story">...</p>
"""
soup = BeautifulSoup(html, 'lxml')
for i in soup.p.contents:   # direct children of the first <p>, returned as a list; the <span> is not listed on its own
    print(i)
for i, child in enumerate(soup.p.children):  # children walks the same direct children, lazily instead of as a list
    print(i, child)
print(soup.p.descendants)  # prints the iterable object itself, not the nodes it yields
for i, child in enumerate(soup.p.descendants):  # descendants walks every nested node, so the <span> now appears separately
    print(i, child)

3.2 父節點和祖先節點

from bs4 import BeautifulSoup

html = """
<html>
    <head>
        <title>The Dormouse's story</title>
    </head>
    <body>
        <p class="story">
            Once upon a time there were three little sisters; and their names were
            <a href="http://example.com/elsie" class="sister" id="link1">
                <span>Elsie</span>
            </a>
            <a href="http://example.com/lacie" class="sister" id="link2">Lacie</a>
            and
            <a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>
            and they lived at the bottom of a well.
        </p>
        <p class="story">...</p>
"""
soup = BeautifulSoup(html, 'lxml')
print(soup.a.parent)  # parent gives the direct parent of the first <a>; note that only that one node is printed
print(soup.a.parents)  # prints the iterable object itself, not the ancestors it yields
print(list(enumerate(soup.a.parents)))  # parents yields every ancestor of the node

3.3 兄弟節點(同級節點)

from bs4 import BeautifulSoup
html = """
<html>
    <head>
        <title>The Dormouse's story</title>
    </head>
    <body>
        <p class="story">
            Once upon a time there were three little sisters; and their names were
            <a href="http://example.com/elsie" class="sister" id="link1">
                <span>Elsie</span>
            </a>
            aaa
            <a href="http://example.com/lacie" class="sister" id="link2">Lacie</a>
            and
            <a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>
            and they lived at the bottom of a well.
        </p>
        <p class="story">...</p>
"""
soup = BeautifulSoup(html, 'lxml')
print(soup.a.next_sibling)  # next_sibling: the next node at the same level, relative to the first <a>
print(soup.a.previous_sibling)  # previous_sibling: the preceding node at the same level
print(list(enumerate(soup.a.next_siblings)))  # next_siblings: every same-level node after this one
print(list(enumerate(soup.a.previous_siblings)))  # previous_siblings: every same-level node before this one

3.4 提取資訊

from bs4 import BeautifulSoup

html = """
<html>
    <head>
        <title>The Dormouse's story</title>
    </head>
    <body>
        <p class="story">
            Once upon a time there were three little sisters; and their names were
            <a href="http://example.com/elsie" class="sister" id="link1">
                <span>Elsie</span>
            </a>
            aaa
            <a href="http://example.com/lacie" class="sister" id="link2">Lacie</a>
            and
            <a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>
            and they lived at the bottom of a well.
        </p>
        <p class="story">...</p>
"""
soup = BeautifulSoup(html, 'lxml')
print(soup.a.next_sibling.string)  # a single node exposes its text directly through the string attribute
print(soup.a.previous_sibling.string)
print(list(soup.a.parents)[0])  # when the result yields several nodes, convert it to a list first, pick an element, then read string, attrs, etc.
print(list(soup.a.parents)[0].attrs['class'])  # class attribute of the nearest ancestor

4.方法選擇器

  • find_all

查詢所有符合條件的元素,給find_all傳入一些屬性和文字來得到符合條件的元素

  • name

根據name引數來查詢元素

from bs4 import BeautifulSoup

html = '''
<div class="panel">
    <div class="panel-heading">
        <h4>Hello</h4>
    </div>
    <div class="panel-body">
        <ul class="list" id="list-1">
            <li class="element">Foo</li>
            <li class="element">Bar</li>
            <li class="element">Jay</li>
        </ul>
        <ul class="list list-small" id="list-2">
            <li class="element">Foo</li>
            <li class="element">Bar</li>
        </ul>
    </div>
</div>
'''
soup = BeautifulSoup(html, 'lxml')
print(soup.find_all(name="ul"))  # find_all looks up every node named ul and returns them as a list
print(type(soup.find_all(name="ul")[0]))  # each element is a bs4.element.Tag, so queries can be nested on it
for ul in soup.find_all(name="ul"):
    # Nested query: the <li> nodes inside this particular <ul>.
    print(ul.find_all(name="li"))
    for li in ul.find_all(name="li"):
        print(li.string)
  • attrs
from bs4 import BeautifulSoup

html='''
<div class="panel">
    <div class="panel-heading">
        <h4>Hello</h4>
    </div>
    <div class="panel-body">
        <ul class="list" id="list-1" name="elements">
            <li class="element">Foo</li>
            <li class="element">Bar</li>
            <li class="element">Jay</li>
        </ul>
        <ul class="list list-small" id="list-2">
            <li class="element">Foo</li>
            <li class="element">Bar</li>
        </ul>
    </div>
</div>
'''
soup = BeautifulSoup(html, 'lxml')
print(soup.find_all(attrs={'id': 'list-1'}))  # attrs takes a dict of attribute names to values; the result is a list
print(soup.find(id="list-1"))  # common attributes such as id can be given as keyword arguments instead of attrs; find returns only the first match
print(soup.find_all(attrs={'class': 'element'}))
print(soup.find_all(class_="element"))  # class is a Python keyword, so the keyword-argument form is spelled class_ (trailing underscore)
  • text

text引數可以匹配節點中的文字,傳入形式可以是字串,也可以是正則表示式物件(自Beautiful Soup 4.4.0起,此引數的正式名稱為string,text是舊名稱,新版本已不建議使用)

import re

from bs4 import BeautifulSoup
html = '''
<div class="panel">
    <div class="panel-body">
        <a>Hello, this is a link</a>
        <a>Hello, this is a link, too</a>
        <a>a</a>
    </div>
</div>
'''
soup = BeautifulSoup(html, 'lxml')
# string= is the current name of the argument that older Beautiful Soup
# releases (before 4.4.0) call text=; the text= spelling is deprecated.
# A compiled pattern is searched for inside each text node, whereas a plain
# string would have to equal the node's text exactly.
print(soup.find_all(string=re.compile('link')))  # every text node containing "link"
print(soup.find_all(string=re.compile(r'\w{2}')))  # two consecutive word characters: selects the same text nodes as the line above
  • find

find與find_all的功能差不多,都可以查詢符合條件的元素,但是find返回的只是第一個匹配的元素,而find_all返回的是所有匹配的元素的列表

from bs4 import BeautifulSoup

html = '''
<div class="panel">
    <div class="panel-heading">
        <h4>Hello</h4>
    </div>
    <div class="panel-body">
        <ul class="list" id="list-1">
            <li class="element">Foo</li>
            <li class="element">Bar</li>
            <li class="element">Jay</li>
        </ul>
        <ul class="list list-small" id="list-2">
            <li class="element">Foo</li>
            <li class="element">Bar</li>
        </ul>
    </div>
</div>
'''

soup = BeautifulSoup(html, 'lxml')
print(soup.find(class_="element"))  # the first node whose class attribute is element
print(type(soup.find(class_="element")))  # the result is a single bs4.element.Tag, not a list

5.CSS選擇器

在寫CSS的時候,標籤名不做修飾,id前加#,class名前加.;在使用CSS選擇器呼叫select方法時,用同樣的寫法篩選元素,返回結果的型別是列表

from bs4 import BeautifulSoup
html = '''
<div class="panel">
    <div class="panel-heading">
        <h4>Hello</h4>
    </div>
    <div class="panel-body">
        <ul class="list" id="list-1">
            <li class="element">Foo</li>
            <li class="element">Bar</li>
            <li class="element">Jay</li>
        </ul>
        <ul class="list list-small" id="list-2">
            <li class="element">Foo</li>
            <li class="element">Bar</li>
        </ul>
    </div>
</div>
'''

soup = BeautifulSoup(html, 'lxml')
print(soup.select('div'))  # select by tag name
print(soup.select('.element'))  # select by class name
print(soup.select('#list-2'))  # select by id
print(soup.select('ul li'))  # every <li> node below a <ul> node
print(soup.select('div #list-1'))  # combined query: tag name, class and id selectors separated by spaces
#  here: the node with id list-1 inside a <div>
print(soup.select('ul')[1])  # the second <ul> node with everything inside it
print(type(soup.select('ul')[1]))  # elements of the returned list are bs4.element.Tag objects

5.1 巢狀查詢

select方法支援巢狀選擇

from bs4 import BeautifulSoup


html = '''
<div class="panel">
    <div class="panel-heading">
        <h4>Hello</h4>
    </div>
    <div class="panel-body">
        <ul class="list" id="list-1">
            <li class="element">Foo</li>
            <li class="element">Bar</li>
            <li class="element">Jay</li>
        </ul>
        <ul class="list list-small" id="list-2">
            <li class="element">Foo</li>
            <li class="element">Bar</li>
        </ul>
    </div>
</div>
'''
soup = BeautifulSoup(html, 'lxml')
# select can be called again on each selected node to narrow the search.
for ul in soup.select('ul'):
    print(ul.select('li'))   # the <li> nodes below this <ul>

5.2 獲取屬性

# Imported here so that this snippet runs on its own, like the other examples.
from bs4 import BeautifulSoup

html = '''
<div class="panel">
    <div class="panel-heading">
        <h4>Hello</h4>
    </div>
    <div class="panel-body">
        <ul class="list" id="list-1">
            <li class="element">Foo</li>
            <li class="element">Bar</li>
            <li class="element">Jay</li>
        </ul>
        <ul class="list list-small" id="list-2">
            <li class="element">Foo</li>
            <li class="element">Bar</li>
        </ul>
    </div>
</div>
'''
soup = BeautifulSoup(html, 'lxml')
for ul in soup.select('ul'):
    print(ul['id'])  # id of each <ul>, read by item access on the node
    print(ul.attrs['id'])  # the same value, read through the attrs dict

5.3 獲取文字

# Imported here so that this snippet runs on its own, like the other examples.
from bs4 import BeautifulSoup

html = '''
<div class="panel">
    <div class="panel-heading">
        <h4>Hello</h4>
    </div>
    <div class="panel-body">
        <ul class="list" id="list-1">
            <li class="element">Foo</li>
            <li class="element">Bar</li>
            <li class="element">Jay</li>
        </ul>
        <ul class="list list-small" id="list-2">
            <li class="element">Foo</li>
            <li class="element">Bar</li>
        </ul>
    </div>
</div>
'''
soup = BeautifulSoup(html, 'lxml')
for li in soup.select('li'):
    print(li.get_text())  # text of every <li>; get_text() is a second way to read it besides the string attribute