1. 程式人生 > 爬蟲入門總結

爬蟲入門總結

int abc html print 過濾 tle soup ali pan

 # Web-scraping notes: fetch a page with requests, then query HTML with BeautifulSoup.
# Written as personal study notes, so the layout is informal.

import requests
from bs4 import BeautifulSoup  # note the capital "S" in BeautifulSoup

# ---------------------------------------------------------------------------
# 1. Fetching a page
# ---------------------------------------------------------------------------
# "http://..." is the placeholder from the notes; replace it with a real URL
# before running. The timeout keeps the call from hanging on a dead server.
res = requests.get("http://...", timeout=10)  # note: the HTTP method is GET
res.encoding = "utf-8"  # set the encoding explicitly to avoid garbled Chinese text
print(res.text)  # .text is a property, not a method
# print(type(res))

# ---------------------------------------------------------------------------
# 2. Parsing HTML
# ---------------------------------------------------------------------------
# Sample document reconstructed from the expected outputs noted below.
html_sample = (
    "<html><body>"
    '<h1 id="title">Hello World</h1>'
    '<a class="link" href="# link1">This is link1</a>'
    '<a class="link" href="# link2">This is link2</a>'
    "</body></html>"
)
soup = BeautifulSoup(html_sample, "html.parser")  # name the parser explicitly
print(soup.text)

# Use select() to find elements by tag name: <h1>.
header = soup.select("h1")
print(header)          # [<h1 id="title">Hello World</h1>]
print(header[0])       # <h1 id="title">Hello World</h1>
print(header[0].text)  # Hello World

# Use select() to find every <a> element.
alink = soup.select("a")
print(alink)  # a list holding both <a class="link"> elements
for link in alink:
    print(link.text)  # This is link1 / This is link2

# Use select() to find elements by id (prefix the id with "#").
alink = soup.select("#title")
print(alink)  # [<h1 id="title">Hello World</h1>]

# Use select() to find elements by class (prefix the class with ".").
for link in soup.select(".link"):
    print(link)  # each <a class="link"> element in turn

# Read the href attribute of every <a> tag.
alinks = soup.select("a")
for link in alinks:
    print(link)
    print(link["href"])  # "# link1" / "# link2"

# ---------------------------------------------------------------------------
# 3. Tag attributes behave like a dictionary
# ---------------------------------------------------------------------------
# Example: custom attributes are read by key just like href.
a = '<a href="#" qoo=123 abc=456> This is a link</a>'
soup = BeautifulSoup(a, "html.parser")
print(soup.select("a"))            # a list holding the single parsed <a> element
print(soup.select("a")[0])         # the <a> element itself
print(soup.select("a")[0]["abc"])  # 456

爬蟲入門總結