爬取"最好大學網站"大學排名
阿新 • • 發佈:2019-01-29
這是上半年學習 bs4 時寫的一個爬蟲,主要針對 table 標籤;程式碼容易修改,稍作調整即可用於爬取其他類似的網站。
# -*- coding: utf-8 -*-
import requests
from bs4 import BeautifulSoup
import bs4
def getHTMLText(url):
    """Download ``url`` and return the response body as text.

    Args:
        url: Address of the page to fetch.

    Returns:
        The decoded page text, or an empty string if the request fails
        (connection error, timeout, or a non-2xx status code).
    """
    try:
        r = requests.get(url, timeout=30)
        # Turn 4xx/5xx responses into an exception so they hit the
        # fallback below instead of returning an error page.
        r.raise_for_status()
        # Use the encoding detected from the body rather than the one
        # declared in the HTTP headers.
        r.encoding = r.apparent_encoding
        return r.text
    except requests.RequestException:
        # Only network/HTTP failures are swallowed; KeyboardInterrupt,
        # SystemExit and programming errors still propagate.
        return ""
def fillUnivList(ulist, html):
    """Parse the ranking table in ``html`` and append its rows to ``ulist``.

    For every ``<tr>`` found directly under the first ``<tbody>``, a
    three-item list made of the ``.string`` of cells 0, 1 and 3 is
    appended to ``ulist``.

    Args:
        ulist: List that receives the extracted rows (mutated in place).
        html: HTML source of the page; may be empty.
    """
    soup = BeautifulSoup(html, "html.parser")
    # find() returns a single Tag (or None when nothing matches), while
    # its .children attribute is an iterator over the direct child nodes.
    tbody = soup.find('tbody')
    # No <tbody> at all (e.g. the download failed and html is empty):
    # treat it as a table with no rows instead of crashing on None.
    rows = tbody.children if tbody is not None else ()
    for tr in rows:
        # Children also include bare text nodes between the rows.
        if not isinstance(tr, bs4.element.Tag):
            continue
        tds = tr('td')  # calling a Tag is shorthand for find_all()
        if len(tds) < 4:
            # Header/spacer rows do not carry the cells read below.
            continue
        ulist.append([tds[0].string, tds[1].string, tds[3].string])
    # Dump of the collected rows, kept from the original script.
    print(ulist)
def printUnivList(ulist, num):
    """Print a header and the first ``num`` rows of ``ulist`` as a table.

    Args:
        ulist: List of rows; the first three items of each row are
            printed as rank, university name and total score.
        num: Maximum number of rows to print. When ``ulist`` holds fewer
            rows, only the available ones are printed.
    """
    # Field 3 of the template supplies the fill character of the name
    # column: U+3000, the full-width (ideographic) space.
    tplt = "{0:^10}\t{1:{3}^10}\t{2:^10}"
    fill = chr(12288)
    print(tplt.format("排名","學校名稱","總分",fill))
    # Slice instead of indexing range(num) so a short list does not raise
    # IndexError; max() keeps a negative num meaning "no rows".
    for u in ulist[:max(num, 0)]:
        # A cell may be None, which does not accept the '^10' format
        # spec, so every cell is converted to text first.
        rank, name, score = ("" if cell is None else str(cell) for cell in u[:3])
        print(tplt.format(rank, name, score, fill))
def main():
    """Fetch the 2016 ranking page, parse it and print the first 20 rows."""
    universities = []
    ranking_url = 'http://www.zuihaodaxue.cn/zuihaodaxuepaiming2016.html'
    # Download and parse in one step; the parser appends to the list in place.
    fillUnivList(universities, getHTMLText(ranking_url))
    printUnivList(universities, 20)  # show 20 universities
# Run the scraper only when executed as a script, so importing this
# module does not trigger a network request.
if __name__ == "__main__":
    main()