Beautiful Soup的使用
阿新 • • 發佈:2021-12-17
Beautiful Soup的使用
1. 基本使用
from bs4 import BeautifulSoup

# Sample markup. Note that <body> and <html> are never closed here; the
# prettified output below shows the closing tags Beautiful Soup adds.
html = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title" name="dromouse"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1"><!-- Elsie --></a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>; and they lived at the bottom of a well.</p>
<p class="story">...</p>
"""

# Parse the markup with the lxml parser.
soup = BeautifulSoup(html, 'lxml')
print(soup.prettify())    # the parsed document, one node per line, indented by depth
print(soup.title.string)  # the text inside the <title> node

# Output (indentation reconstructed, one space per nesting level):
# ----------------------
# <html>
#  <head>
#   <title>
#    The Dormouse's story
#   </title>
#  </head>
#  <body>
#   <p class="title" name="dromouse">
#    <b>
#     The Dormouse's story
#    </b>
#   </p>
#   <p class="story">
#    Once upon a time there were three little sisters; and their names were
#    <a class="sister" href="http://example.com/elsie" id="link1">
#     <!-- Elsie -->
#    </a>
#    ,
#    <a class="sister" href="http://example.com/lacie" id="link2">
#     Lacie
#    </a>
#    and
#    <a class="sister" href="http://example.com/tillie" id="link3">
#     Tillie
#    </a>
#    ; and they lived at the bottom of a well.
#   </p>
#   <p class="story">
#    ...
#   </p>
#  </body>
# </html>
# The Dormouse's story
- 變數html中的HTML文字,<body>和<html>節點沒有閉合;從輸出結果來看,Beautiful Soup自動更正了錯誤的HTML格式字串。
- soup.prettify():這個方法是將要解析的字串按照縮排的格式進行輸出。
- soup.title.string:意思是將要解析的字串中title節點中的文字輸出。
2. 節點選擇器
from bs4 import BeautifulSoup

# Sample markup shared with the basic-usage example (closing </body> and
# </html> are absent from the literal).
html = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title" name="dromouse"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1"><!-- Elsie --></a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>; and they lived at the bottom of a well.</p>
<p class="story">...</p>
"""

soup = BeautifulSoup(html, 'lxml')
print(soup.title)               # the <title> node of the parsed document
print(soup.title.string)        # the text inside the <title> node
print(type(soup.title))         # a node selected by name is a bs4.element.Tag
print(type(soup.title.string))  # its text is a bs4.element.NavigableString
print(soup.head)                # the <head> node
print(soup.p)                   # several <p> nodes exist; only the first match is returned
print(soup.a)                   # several <a> nodes exist; only the first match is returned

# Output:
# ----------------
# <title>The Dormouse's story</title>
# The Dormouse's story
# <class 'bs4.element.Tag'>
# <class 'bs4.element.NavigableString'>
# <head><title>The Dormouse's story</title></head>
# <p class="title" name="dromouse"><b>The Dormouse's story</b></p>
# <a class="sister" href="http://example.com/elsie" id="link1"><!-- Elsie --></a>
3.關聯選擇
3.1 子節點和子孫節點
from bs4 import BeautifulSoup

html = """
<html>
<head>
<title>The Dormouse's story</title>
</head>
<body>
<p class="story">
Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">
<span>Elsie</span>
</a>
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a>
and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>
and they lived at the bottom of a well.
</p>
<p class="story">...</p>
"""

soup = BeautifulSoup(html, 'lxml')

# .contents holds the direct children of the first <p> as a list. The <span>
# is not listed on its own because it is nested inside the first <a>.
for node in soup.p.contents:
    print(node)

# .children walks the same direct children, but lazily (an iterator, not a list).
for i, child in enumerate(soup.p.children):
    print(i, child)

# .descendants is lazy too, so printing it shows the iterator object itself
# rather than the nodes.
print(soup.p.descendants)

# Walking .descendants visits every nested node, so the <span> now shows up
# as an entry of its own.
for i, child in enumerate(soup.p.descendants):
    print(i, child)
3.2 父節點和祖先節點
from bs4 import BeautifulSoup

html = """
<html>
<head>
<title>The Dormouse's story</title>
</head>
<body>
<p class="story">
Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">
<span>Elsie</span>
</a>
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a>
and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>
and they lived at the bottom of a well.
</p>
<p class="story">...</p>
"""

soup = BeautifulSoup(html, 'lxml')
first_link = soup.a  # the first <a> node in the document

# .parent gives the direct parent only (here the enclosing <p>), printed
# together with everything it contains.
direct_parent = first_link.parent
print(direct_parent)

# .parents is lazy: printing it shows the iterator object, not the nodes.
print(first_link.parents)

# Materialise it to see every ancestor of the node, numbered from the nearest.
numbered_ancestors = list(enumerate(first_link.parents))
print(numbered_ancestors)
3.3 兄弟節點(同級節點)
from bs4 import BeautifulSoup

html = """
<html>
<head>
<title>The Dormouse's story</title>
</head>
<body>
<p class="story">
Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">
<span>Elsie</span>
</a>
aaa
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a>
and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>
and they lived at the bottom of a well.
</p>
<p class="story">...</p>
"""

soup = BeautifulSoup(html, 'lxml')
first_link = soup.a  # every lookup below is relative to the first <a> node

# Single neighbours: the node right after and right before the first <a>
# at the same nesting level.
following = first_link.next_sibling
preceding = first_link.previous_sibling
print(following)
print(preceding)

# All neighbours on each side, numbered in iteration order.
all_following = list(enumerate(first_link.next_siblings))
all_preceding = list(enumerate(first_link.previous_siblings))
print(all_following)
print(all_preceding)
3.4 提取資訊
from bs4 import BeautifulSoup

html = """
<html>
<head>
<title>The Dormouse's story</title>
</head>
<body>
<p class="story">
Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">
<span>Elsie</span>
</a>
aaa
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a>
and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>
and they lived at the bottom of a well.
</p>
<p class="story">...</p>
"""

soup = BeautifulSoup(html, 'lxml')
first_link = soup.a

# When a lookup yields a single node, .string can be read from it directly.
print(first_link.next_sibling.string)
print(first_link.previous_sibling.string)

# When a lookup yields a lazy sequence of nodes, turn it into a list first,
# pick an element, and then read .string / .attrs from that element.
ancestors = list(first_link.parents)
nearest_ancestor = ancestors[0]
print(nearest_ancestor)
print(nearest_ancestor.attrs['class'])
4.方法選擇器
- find_all
查詢所有符合條件的元素,給find_all傳入一些屬性和文字來得到符合條件的元素
- name
根據name引數來查詢元素
from bs4 import BeautifulSoup

html = '''
<div class="panel">
<div class="panel-heading">
<h4>Hello</h4>
</div>
<div class="panel-body">
<ul class="list" id="list-1">
<li class="element">Foo</li>
<li class="element">Bar</li>
<li class="element">Jay</li>
</ul>
<ul class="list list-small" id="list-2">
<li class="element">Foo</li>
<li class="element">Bar</li>
</ul>
</div>
</div>
'''

soup = BeautifulSoup(html, 'lxml')

# find_all(name=...) collects every node with that tag name into a list.
ul_nodes = soup.find_all(name="ul")
print(ul_nodes)

# Each hit is a bs4.element.Tag, so find_all can be called on it again.
print(type(ul_nodes[0]))

# Nested search: for each <ul>, list its <li> nodes, then print the text of
# each one.
for ul in ul_nodes:
    li_nodes = ul.find_all(name="li")
    print(li_nodes)
    for li in li_nodes:
        print(li.string)
- attrs
from bs4 import BeautifulSoup

html = '''
<div class="panel">
<div class="panel-heading">
<h4>Hello</h4>
</div>
<div class="panel-body">
<ul class="list" id="list-1" name="elements">
<li class="element">Foo</li>
<li class="element">Bar</li>
<li class="element">Jay</li>
</ul>
<ul class="list list-small" id="list-2">
<li class="element">Foo</li>
<li class="element">Bar</li>
</ul>
</div>
</div>
'''

soup = BeautifulSoup(html, 'lxml')

# attrs takes a dict of attribute name -> value; find_all returns a list of hits.
matched_by_id = soup.find_all(attrs={'id': 'list-1'})
print(matched_by_id)

# Common attributes such as id can be passed as keyword arguments instead of
# going through attrs; find() gives back a single node.
print(soup.find(id="list-1"))

# The same two spellings for the class attribute. Because `class` is a
# reserved word in Python, the keyword form is written class_.
matched_by_class = soup.find_all(attrs={'class': 'element'})
print(matched_by_class)
print(soup.find_all(class_="element"))
- text
text引數可以匹配節點中的文字,傳入形式可以是字串,也可以是正則表示式物件(自Beautiful Soup 4.4.0起,該引數的正式名稱為string,text是舊名稱)
import re

from bs4 import BeautifulSoup

html = '''
<div class="panel">
<div class="panel-body">
<a>Hello, this is a link</a>
<a>Hello, this is a link, too</a>
<a>a</a>
</div>
</div>
'''

soup = BeautifulSoup(html, 'lxml')

# `string=` is the current name of the argument older Beautiful Soup releases
# called `text=`; it filters on the text of nodes rather than on tags.
# A plain str must equal the node text exactly, whereas a compiled pattern
# matches any text the pattern is found in.
print(soup.find_all(string=re.compile('link')))

# Any text containing two consecutive word characters; on this markup that
# selects the same two strings as the pattern above.
print(soup.find_all(string=re.compile(r'\w{2}')))
- find
find與find_all的功能差不多,都可以查詢符合條件的元素,但是find返回的只是第一個匹配的元素,而find_all返回的是所有匹配的元素的列表
from bs4 import BeautifulSoup

html = '''
<div class="panel">
<div class="panel-heading">
<h4>Hello</h4>
</div>
<div class="panel-body">
<ul class="list" id="list-1">
<li class="element">Foo</li>
<li class="element">Bar</li>
<li class="element">Jay</li>
</ul>
<ul class="list list-small" id="list-2">
<li class="element">Foo</li>
<li class="element">Bar</li>
</ul>
</div>
</div>
'''

soup = BeautifulSoup(html, 'lxml')

# find() stops at the first node whose class is "element" ...
first_element = soup.find(class_="element")
print(first_element)

# ... and hands it back as a single bs4.element.Tag rather than a list.
print(type(first_element))
5.CSS選擇器
在寫CSS的時候,標籤名不做修飾,id前加#,class名前加.;在使用CSS選擇器呼叫select方法時使用類似的寫法篩選元素,返回結果的型別是列表
from bs4 import BeautifulSoup

html = '''
<div class="panel">
<div class="panel-heading">
<h4>Hello</h4>
</div>
<div class="panel-body">
<ul class="list" id="list-1">
<li class="element">Foo</li>
<li class="element">Bar</li>
<li class="element">Jay</li>
</ul>
<ul class="list list-small" id="list-2">
<li class="element">Foo</li>
<li class="element">Bar</li>
</ul>
</div>
</div>
'''

soup = BeautifulSoup(html, 'lxml')

# Each selector is passed to select(), which returns a list of matching nodes.
selectors = (
    'div',          # by tag name
    '.element',     # by class name
    '#list-2',      # by id
    'ul li',        # every <li> nested under a <ul>
    'div #list-1',  # space-separated parts combine: the node with id list-1 under a <div>
)
for selector in selectors:
    print(soup.select(selector))

# Indexing the result picks one hit: here the second <ul> with all its content.
second_ul = soup.select('ul')[1]
print(second_ul)

# The items of the returned list are bs4.element.Tag objects.
print(type(second_ul))
5.1 巢狀查詢
select方法支援巢狀選擇
from bs4 import BeautifulSoup

html = '''
<div class="panel">
<div class="panel-heading">
<h4>Hello</h4>
</div>
<div class="panel-body">
<ul class="list" id="list-1">
<li class="element">Foo</li>
<li class="element">Bar</li>
<li class="element">Jay</li>
</ul>
<ul class="list list-small" id="list-2">
<li class="element">Foo</li>
<li class="element">Bar</li>
</ul>
</div>
</div>
'''

soup = BeautifulSoup(html, 'lxml')

# select() can be called on a node returned by a previous select(): for each
# <ul>, list the <li> nodes found under it.
for ul in soup.select('ul'):
    print(ul.select('li'))
5.2 獲取屬性
from bs4 import BeautifulSoup

html = '''
<div class="panel">
<div class="panel-heading">
<h4>Hello</h4>
</div>
<div class="panel-body">
<ul class="list" id="list-1">
<li class="element">Foo</li>
<li class="element">Bar</li>
<li class="element">Jay</li>
</ul>
<ul class="list list-small" id="list-2">
<li class="element">Foo</li>
<li class="element">Bar</li>
</ul>
</div>
</div>
'''

soup = BeautifulSoup(html, 'lxml')

# Print the id of every <ul>, read in two equivalent ways: by subscripting
# the node itself, and through its attrs mapping.
for ul in soup.select('ul'):
    print(ul['id'])
    print(ul.attrs['id'])
5.3 獲取文字
from bs4 import BeautifulSoup

html = '''
<div class="panel">
<div class="panel-heading">
<h4>Hello</h4>
</div>
<div class="panel-body">
<ul class="list" id="list-1">
<li class="element">Foo</li>
<li class="element">Bar</li>
<li class="element">Jay</li>
</ul>
<ul class="list list-small" id="list-2">
<li class="element">Foo</li>
<li class="element">Bar</li>
</ul>
</div>
</div>
'''

soup = BeautifulSoup(html, 'lxml')

# Print the text of every <li>. get_text() is a second way to read a node's
# text, alongside the .string attribute used in the earlier examples.
for li in soup.select('li'):
    print(li.get_text())