1. 程式人生 > python爬取連結去重

python爬取連結去重

from urllib.request import urlopen
from bs4 import BeautifulSoup
import re

# Every /wiki/ href discovered so far. getLinks checks this set before
# following a link, so each distinct href is printed and fetched only once.
pages = set()
# Matches hrefs that begin with "/wiki/" (site-relative links on the wiki).
_WIKI_HREF = re.compile("^(/wiki/)")


def _wikiHrefs(pageUrl):
    """Fetch one page and return the href of every /wiki/ anchor, in document order.

    Args:
        pageUrl: Path appended to "http://en.wikipedia.org".

    Returns:
        A list of href strings taken from the matching ``<a>`` tags.
    """
    # The context manager closes the HTTP response even if parsing raises.
    with urlopen("http://en.wikipedia.org" + pageUrl) as html:
        # Name the parser explicitly so the result does not depend on which
        # optional parser libraries happen to be installed.
        bsObj = BeautifulSoup(html, "html.parser")
    # The href= filter only matches tags that carry an href attribute, so no
    # separate presence check is needed.
    return [link.attrs["href"] for link in bsObj.findAll("a", href=_WIKI_HREF)]


def getLinks(pageUrl):
    """Crawl depth-first from ``pageUrl``, printing each new /wiki/ link once.

    Every href not already in the module-level ``pages`` set is printed, added
    to ``pages``, and then crawled itself before the remaining links of the
    current page are examined.

    The traversal uses an explicit stack instead of recursion: one recursive
    call per discovered page would exceed the interpreter's recursion limit
    on any sizeable crawl.

    Args:
        pageUrl: Path appended to "http://en.wikipedia.org"; "" requests the
            site root.

    Returns:
        None. Results are reported via ``print`` and accumulated in ``pages``.

    Raises:
        Whatever ``urlopen`` raises for a failed request; errors are not
        caught, so a failing page aborts the crawl.
    """
    # Each stack entry is an iterator over the not-yet-examined links of one
    # page; the top entry belongs to the page currently being processed.
    pending = [iter(_wikiHrefs(pageUrl))]
    while pending:
        for newPage in pending[-1]:
            if newPage in pages:
                continue
            # Found a page we have not seen before.
            print(newPage)
            pages.add(newPage)
            # Descend into the new page first; the parent's iterator keeps
            # its position and is resumed once the child is exhausted.
            pending.append(iter(_wikiHrefs(newPage)))
            break
        else:
            # No unseen links remain on this page: return to its parent.
            pending.pop()
# Start the crawl: an empty path makes getLinks request
# "http://en.wikipedia.org" itself.
getLinks("")