1. 程式人生 > >2019.1.7

2019.1.7

import urllib.request
import urllib.error
import re
# Download the page at bbs.hupu.com and collect the URL of every link on it
# whose href ends in ".html" and is followed by target="_blank" and a title
# attribute (presumably the post links -- confirm against the live markup).
with urllib.request.urlopen("http://bbs.hupu.com/") as response:
    # The context manager closes the HTTP response once the body is read.
    data = response.read()
# Bytes that are not valid UTF-8 are dropped instead of raising.
data = data.decode("utf-8", "ignore")
# [^"]* keeps the capture inside one href attribute (a lazy ".*?" could start
# at an earlier <a href=" on the same line and swallow the markup between),
# and the escaped dot requires a literal ".html" suffix.
pat = r'<a href="([^"]*\.html)" target="_blank" title='
# Every captured href gets the site prefix prepended.
# NOTE(review): assumes the hrefs are site-relative -- confirm.
allurl = ['https://bbs.hupu.com/' + href for href in re.findall(pat, data)]
# Fetch every URL in ``allurl`` and append the text found between
# "<title>\n" and "\n</title>" on each page to ./result.txt, one per line.
# Progress and failures are reported on stdout.
pat = '<title>\n(.*?)\n</title>'
# Compiled once here rather than on every loop iteration.
title_re = re.compile(pat)

# The ``with`` block closes the output file even if an unexpected error
# escapes the loop.
with open('./result.txt', 'a', encoding='utf8') as fh:
    for i, nowurl in enumerate(allurl):
        print('正在爬取第'+str(i+1)+'個帖子')
        print(nowurl)
        try:
            # Close each HTTP response as soon as its body has been read.
            with urllib.request.urlopen(nowurl) as response:
                data = response.read()
        except urllib.error.URLError as e:
            print('爬取第' + str(i + 1) + '個帖子失敗')
            # HTTPError carries both ``code`` and ``reason``; a plain
            # URLError only has ``reason``.
            if hasattr(e, "code"):
                print(e.code)
            if hasattr(e, "reason"):
                print(e.reason)
            continue

        data = data.decode("utf-8", "ignore")
        result = title_re.findall(data)
        if not result:
            # No title matched: report the failure and move on instead of
            # crashing on result[0] with an IndexError.
            print('爬取第' + str(i + 1) + '個帖子失敗')
            continue

        fh.write(result[0]+'\n')
        print('----列印成功----')