# python3爬蟲03(find_all用法等)
# 阿新 • • 發佈:2018-12-05
#read1.html檔案
# <html><head><title>The Dormouse's story</title></head>
# <body>
# <p class="title"><b>The Dormouse's story</b></p>
#
# <p class="story">Once upon a time there were three little sisters; and their names were
# <a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
# <a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
# <a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
# and they lived at the bottom of a well.</p>
#
# <p class="story">...</p></body></html>
#!/usr/bin/env python
# # -*- coding:UTF-8 -*-
import os
import re
import requests
from bs4 import NavigableString
from bs4 import BeautifulSoup
# Locate read1.html next to this script and parse it with the built-in parser.
curpath = os.path.dirname(os.path.realpath(__file__))
hmtlpath = os.path.join(curpath, 'read1.html')

# read1.html is a local file: requests.get() only accepts URLs and rejects a
# bare filesystem path, so the document is read straight from disk instead.
# Opening in binary mode lets BeautifulSoup detect the encoding itself.
with open(hmtlpath, "rb") as html_file:
    soup = BeautifulSoup(html_file, features="html.parser")

# Print every text fragment of the document with surrounding whitespace removed.
for text in soup.stripped_strings:
    print(repr(text))

# find_all() returns a list-like ResultSet; navigation attributes such as
# .parents or .next_sibling exist on the individual tags, not on the list,
# so each matched tag is inspected in turn.
links = soup.find_all(class_="sister")
for link in links:
    # Walk up the ancestor chain of this tag.
    for parent in link.parents:
        if parent is None:
            print(parent)
        else:
            print(parent.name)
    # Neighbours in parse order (element) and within the same parent (sibling).
    print(link.next_element)
    print(link.next_sibling)
    print(link.previous_element)
    print(link.previous_sibling)
def has_class_no_id(tag):
    """Return True when *tag* has a ``class`` attribute but no ``id`` attribute.

    Used as a filter function for ``find_all``. *tag* only needs to provide
    a ``has_attr(name)`` method.
    """
    return tag.has_attr('class') and not tag.has_attr('id')
def not_lacie(href):
    """Tell whether *href* is present and does not contain ``"lacie"``.

    Used as an attribute-value filter for ``find_all(href=...)``.

    Returns *href* itself when it is falsy (``None`` or ``""``), otherwise
    ``True`` if the pattern is absent and ``False`` if it is found.
    """
    return href and not re.search("lacie", href)
def not_tillie(href):
    """Tell whether *href* is present and does not contain ``"tillie"``.

    Returns *href* itself when it is falsy (``None`` or ``""``), otherwise
    ``True`` if the pattern is absent and ``False`` if it is found.
    """
    return href and not re.search("tillie", href)
def not_tillie1(id):
    """Tell whether *id* is present and does not contain ``"link2"``.

    Used as an attribute-value filter for ``find_all(id=...)``.

    NOTE(review): despite the function name, the pattern rejected here is
    ``"link2"``, not anything containing "tillie" -- confirm which one is
    intended before relying on the name.

    Returns *id* itself when it is falsy (``None`` or ``""``), otherwise
    ``True`` if the pattern is absent and ``False`` if it is found.
    """
    return id and not re.search("link2", id)
# Parse soup.html (path is relative to the current working directory) with
# the lxml parser. The context manager closes the handle once the document
# has been read.
with open("soup.html", "r", encoding="utf-8") as file:
    soup = BeautifulSoup(file, features="lxml")

# find_all usage: each call below passes a different kind of filter.
# Every result is assigned to `tags`, so only the last one is printed.
tags = soup.find_all(re.compile('^b'))   # regular expression
tags = soup.find_all('b')                # single tag name
tags = soup.find_all(['a', 'b'])         # list of tag names
tags = soup.find_all(has_class_no_id)    # function receiving each tag
tags = soup.find_all(True)               # True
tags = soup.find_all(href=not_lacie)     # function applied to the href value
for tag in tags:
    print(tag.name)
def surrounded_by_strings(tag):
    """Return True when both neighbours of *tag* are ``NavigableString`` objects.

    The neighbours checked are ``tag.next_element`` and
    ``tag.previous_element``.
    """
    return (isinstance(tag.next_element, NavigableString)
            and isinstance(tag.previous_element, NavigableString))
# Filter on the id attribute with a function; not_tillie1 rejects ids
# containing "link2".
tags = soup.find_all(id=not_tillie1)
for tag in tags:
    print(tag)

# Filter on attributes through an explicit attrs dictionary.
tags = soup.find_all(attrs={"id": "link3"})
for tag in tags:
    print(tag)

# Non-recursive search; the result is not stored or printed.
soup.find_all(recursive=False)

# CSS selector examples. Every result is assigned to `tags`, so only the
# last selector's matches are printed below.
tags = soup.select("body a")                      # <a> anywhere under <body>
tags = soup.select("p > a")                       # <a> that is a direct child of <p>
tags = soup.select("p > #link1")                  # id=link1 directly under <p>
tags = soup.select("html head title")             # nested descendant chain
tags = soup.select(".sister")                     # by CSS class
tags = soup.select("[class~=sister]")             # class list contains "sister"
tags = soup.select("#link1 + .sister")            # .sister immediately after #link1
tags = soup.select("#link1")                      # by id
tags = soup.select("a#link1")                     # <a> with id=link1
tags = soup.select("a[href]")                     # <a> that has an href attribute
tags = soup.select('a[href^="http://example"]')   # href starts with the value
tags = soup.select('a[href$="tillie"]')           # href ends with the value
tags = soup.select('a[href*=".com/el"]')          # href contains the value
for tag in tags:
    print(tag)