Python簡單爬取網頁_黃海鋒
阿新 • • 發佈:2022-03-20
"""Fetch one Baidu Tieba listing page and print the title/text of its links.

The script downloads a single page, dumps the raw HTML, then prints every
``<a rel="noopener" ... title=...>text</a>`` anchor it can find on that page.
"""
import re
from typing import Dict, List, Tuple
from urllib import error, request

PAGE = 100

# NOTE(review): the page number is appended directly to the end of the query
# string, so it becomes part of the ``dyTabStr`` value rather than a separate
# parameter -- confirm this is the intended way to select a page.
URL = (
    "https://tieba.baidu.com/f?kw=%B6%CE%D7%D3&fr=ala0&tpl=5&dyTabStr=MCw2LDIsNCw1LDMsMSw4LDcsOQ%3D%3D"
    + str(PAGE)
)

# A browser-like User-Agent is sent with the request.
HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.74 Safari/537.36"
}

# '<a rel="noopener"' acts as the marker for the anchors we want.
# '.*?'   skips the parts of the tag we do not need.
# '(.*?)' captures the parts we do need: the title attribute and the link text.
# The title group stops at the first whitespace, so the surrounding quotes are
# kept and a title containing a space is cut short.  '.' does not match a
# newline, so an anchor that spans several lines is not matched.
LINK_PATTERN = re.compile(r'<a rel="noopener".*?title=(.*?)\s.*?>(.*?)</a>')


def fetch_page(url: str, headers: Dict[str, str]) -> str:
    """Download *url* with the given request headers and return the body as text.

    The response is used as a context manager so the connection is always
    closed, including when reading or decoding fails.

    Raises:
        urllib.error.URLError: if the request fails (``HTTPError`` included).
        UnicodeDecodeError: if the body is not valid UTF-8.
    """
    req = request.Request(url, headers=headers)
    with request.urlopen(req) as resp:
        return resp.read().decode('utf-8')


def extract_items(html: str) -> List[Tuple[str, str]]:
    """Return ``(title, text)`` pairs for every anchor matched in *html*."""
    return LINK_PATTERN.findall(html)


def format_item(item: Tuple[str, str]) -> str:
    """Render one ``(title, text)`` pair as the line printed to the user."""
    title, text = item
    return '標題:' + title + '內容:' + text


def main() -> None:
    """Fetch the page, print its raw HTML, then print each extracted item."""
    try:
        # Only the network call is guarded; parsing and printing cannot raise
        # URLError, so they stay outside the try block.
        content = fetch_page(URL, HEADERS)
    except error.URLError as e:
        # HTTPError carries both a status code and a reason; a plain URLError
        # only has a reason.
        if hasattr(e, 'code'):
            print(e.code)
        if hasattr(e, 'reason'):
            print(e.reason)
        return

    print(content)
    for item in extract_items(content):
        print(format_item(item))


if __name__ == "__main__":
    main()