1. 程式人生 > >BS4庫詳解

BS4庫詳解

his 單個 ant lan css選擇器 all ice pre ces

  1 from bs4 import BeautifulSoup
  2 
  3 
  4 
  5 
  6 html = """
  7 <html><head><title>This is a python demo page</title></head>
  8 <body>
  9 <p class="title"><a>The demo python introduces several python courses.</a></p>
 10 <p class="course">Python is a wonderful general-purpose programming language. You can learn Python from novice to professional by tracking the following courses:
11 <a href="http://www.icourse163.org/course/BIT-268001" class="py1" id="link1"><b class="element">Basic Python</b></a> and <a href="http://www.icourse163.org/course/BIT-1001870001" class="py2" id="link2">Advanced Python</a>.</p> 12 </body></html> 13 """ 14 15 soup = BeautifulSoup(html,
lxml) 16 #基本使用 17 # print(soup.prettify()) 18 # print(soup.title.string) 19 20 #標簽選擇器 21 #選擇元素 22 # print(soup.title) 23 # print(type(soup.title)) 24 # print(soup.head) 25 # print(soup.p)#返回第一個Tag 26 # 27 # #獲取名稱 28 # 29 # print(soup.title.name) 30 # 31 # #獲取屬性 32 # print(soup.a.attrs[‘href‘])
33 # print(soup.a[‘href‘]) 34 # 35 # #獲取內容 36 # print(soup.p.string) 37 # 38 # #嵌套選擇 39 # print(soup.head.title.string) 40 41 #子節點和子孫節點 42 # print(soup.body.contents)#獲取子節點,返回列表類型 43 # print(soup.body.children) #返回叠代器類型 44 # for i,child in enumerate(soup.body.children): 45 # print(i,child) 46 47 # print(soup.body.descendants) #子孫節點,返回叠代類型 48 # for i,child in enumerate(soup.body.descendants): 49 # print(i,child) 50 51 #父節點和祖先節點 52 # print(soup.a.parent) 53 # 54 # print(list(enumerate(soup.a.parents))) 55 56 #兄弟節點 57 # print(list(enumerate(soup.a.next_siblings))) 58 # print(list(enumerate(soup.a.previous_siblings))) 59 60 #標準選擇器 61 #find_all(name,attrs,recursive,text,**kwargs) 可根據標簽名、屬性、內容查找文檔 62 #name 63 # print(soup.find_all(‘p‘)) 64 # print(type(soup.find_all(‘p‘)[0])) 65 # for i in soup.find_all(‘p‘): 66 # print(i.find_all(‘a‘)) #嵌套選擇 67 68 # #attrs 69 # print(soup.find_all(attrs={‘href‘:"http://www.icourse163.org/course/BIT-268001"})) 70 # print(soup.find_all(attrs={‘id‘:‘link1‘})) 71 # 72 # print(soup.find_all(id=‘link1‘)) 73 # print(soup.find_all(class_=‘py1‘)) 74 # 75 # #text查找內容 76 # print(soup.find_all(text=‘This is a python demo page‘))#用來做內容匹配 77 # 78 # #find(name,attrs,recursive,text,**kwargs) 79 # #用法一樣,find只是返回單個元素,find_all返回所有元素 80 # print(soup.find(‘p‘,attrs={‘class‘:‘course‘})) 81 # print(type(soup.find(‘p‘))) 82 83 84 #CSS選擇器,返回列表 85 #通過select()直接傳入CSS選擇器即可完成選擇 86 #選擇class屬性就直接用‘.‘代替,例:class=‘course’--》.course;#代表id 87 # print(soup.select(‘.course .py1‘)) 88 # print(soup.select(‘p a‘))#嵌套選擇 89 # print(soup.select(‘#link1 .element‘)) 90 # print(type(soup.select(‘p‘)[0])) 91 # 92 # #嵌套選擇 93 # for p in soup.select(‘p‘): 94 # print(p.select(‘a‘)) 95 # 96 97 #獲取屬性 98 for p in soup.select(p): 99 print(p[class]) 100 print(p.attrs[class]) 101 102 103 #獲取內容 104 for p in soup.select(p): 105 print(p.get_text())

BS4庫詳解