使用BeautifulSoup解析HTML
阿新 • • 發佈:2018-11-19
# Walkthrough of the main BeautifulSoup APIs against a live page: the script
# downloads the iqiyi.com home page, parses it, and prints the result of tree
# navigation, find/find_all searches, and CSS selectors.
from bs4 import BeautifulSoup
import requests

# A timeout is passed so a stalled connection raises instead of hanging the
# script forever (requests applies no timeout by default).
req = requests.get('http://www.iqiyi.com/', timeout=10)
ret = req.content.decode('utf-8')
# print(ret)

# Parse the HTML with BeautifulSoup.
# soup = BeautifulSoup(ret, 'lxml')
soup = BeautifulSoup(ret, 'html.parser')  # stdlib-backed parser: fast and lenient
pret = soup.prettify()  # pretty-printed (re-indented) markup
print(pret)

# Navigate the tree by attribute access.
body = soup.html.body
# body = soup.html.body.div.div.div.div.div.div.img
# .string is only usable when the tag contains no child tags.
# print(body.string)

# .strings returns an iterable; loop over it to output each string.
content = soup.html.body.strings

# children
# NOTE: .children is a lazy iterator, so this prints the iterator object
# itself; iterate over it to see the individual child nodes.
child = soup.html.body.children
print(child)

# All of the text underneath a tag.
text = body.div.div.text
print(text)

# Parent node, ancestors, sibling / previous / next nodes.
print(body.div.parent)
# NOTE: .parents is lazy as well; this prints the generator object.
print(body.div.parents)
# Fixed: the attribute was misspelled "next_sibiling". BeautifulSoup resolves
# unknown attributes as a child-tag lookup, so the typo printed None instead
# of the sibling node.
print(body.div.div.next_sibling)

# find_all: every matching tag under body (here, all <span> tags).
print(body.find_all('span'))
print(body.find_all(['span', 'a']))
print(body.find_all(class_='title-txt', attrs={'name': 'hahah'}))

# find: only the first match.
print(body.find(class_='title-txt').text)
print(body.find('a')['onfocus'])
print(body.find('a').attrs['onfocus'])

# Limit the number of results returned.
print(body.find_all('a', limit=2))

# url = 'http://www.runoob.com/python/python-100-examples.html'
# header = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36'}
# req = requests.get(url, headers=header)
# soup = BeautifulSoup(req.content.decode('utf-8'), 'html.parser')
# print(soup)
# content = soup.find(id='content').ul.find_all('a')
# for i in content:
#     print(i['href'])

# CSS selectors
# 1. Look up by tag name (and, second line, by element id).
print(soup.select('a'))
print(soup.select('#adClick'))

# 2. Combined selectors:
#    the first part is the class selector of the ancestor element, the
#    second part is the class selector of the descendant element.
print(soup.select('.nav-list-item .nav-list-link'))

# 3. Read an attribute value from each selected element.
for i in soup.select('.nav-list-item .nav-list-link'):
    print(i['rseat'])