Python 爬蟲總結(一)
阿新 • 發佈:2019-01-27
__author__ = 'fen'
# coding=utf8
#
# Python 2 examples of four ways to download a web page:
#   base1 - plain GET with urllib2
#   agent - GET through an installed proxy opener plus a browser User-Agent
#   para1 - GET with query-string ('?') parameters via requests
#   para2 - POST with form-encoded ('&') parameters via urllib2, gzip-aware

import gzip
import urllib
import urllib2
from StringIO import StringIO

import bs4


def _read_body(response):
    """Read the whole body of an urllib2 response and always close it."""
    try:
        return response.read()
    finally:
        response.close()


def base1(url):
    """Fetch *url* with a plain GET request.

    Args:
        url: Address to open with ``urllib2.urlopen``.

    Returns:
        The response body as returned by ``read()``.
    """
    # Fix: the original returned the bound ``read`` method instead of
    # calling it, so callers never received the page content.
    return _read_body(urllib2.urlopen(url))


def agent(url):
    """Fetch *url* via a proxy opener, retrying with a browser User-Agent.

    The page is first downloaded through the installed opener.  A second
    request carrying a browser-like ``User-Agent`` header is then sent (the
    header is meant to get past servers that answer crawlers with 403).

    Args:
        url: Address to fetch.

    Returns:
        The body of the second response when it was served from *url*
        itself (no redirect); otherwise the body of the first response.
    """
    # NOTE(review): the target url is registered as the 'http' proxy, exactly
    # as in the original; presumably a real proxy address was intended here
    # - confirm before relying on this.
    proxy_support = urllib2.ProxyHandler({'http': url})
    opener = urllib2.build_opener(proxy_support, urllib2.HTTPHandler)
    # install_opener makes this opener the default for later urlopen() calls.
    urllib2.install_opener(opener)
    content = _read_body(urllib2.urlopen(url))

    # Add header information to imitate a browser.
    i_headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/31.0.1650.48'}
    req = urllib2.Request(url, headers=i_headers)
    response = urllib2.urlopen(req)
    try:
        if url == response.geturl():
            return response.read()
    finally:
        response.close()
    # Fix: the original ended with the fused token ``returncontent``, which
    # raised NameError whenever the request had been redirected.
    return content


def para1(url, page):
    """Fetch *url* with the page number passed as a '?' query parameter.

    Args:
        url: Base address without the query string.
        page: Value sent as the ``curr_Page`` query parameter.

    Returns:
        The response text decoded by ``requests``.
    """
    import requests

    # Some sites reject crawlers outright, so present a browser User-Agent;
    # the header value can be swapped for other browser strings.
    header_agent = 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:37.0) Gecko/20100101 Firefox/37.0'
    headers = {'User-Agent': header_agent}
    # Parameters appended after '?', e.g. curr_Page or condition.pageNo.
    payload = {'curr_Page': page}
    # Fix: the original first issued an extra, header-less request whose
    # result was never used; only the request with headers is kept.
    return requests.get(url, headers=headers, params=payload).text


def para2(url, page, rn1, rn2):
    """POST form-encoded paging parameters to *url* and return the HTML.

    Args:
        url: Address to post to, e.g.
            'http://gsxt.hnaic.gov.cn/notice/search/ent_except_list'.
        page: Page number, sent as ``page.currentPageNo``.
        rn1: Value spliced into the User-Agent after ``Mozilla/5.``.
        rn2: Value spliced into the User-Agent after ``Firefox/36.``.

    Returns:
        The page as a ``str`` produced from the BeautifulSoup document.
    """
    # Vary the User-Agent between calls so that access is less likely to be
    # blocked.
    header_agent = "Mozilla/5."+str(rn1)+"(X11; Ubuntu; Linux x86_32; rv:37.0) Gecko/20100101 Firefox/36."+str(rn2)
    headers = {'User-Agent': header_agent}
    # Parameters that are joined with '&' in the request body.
    values = {
        'random': '1440940998226',
        'cxyzm': 'no',
        'page.currentPageNo': str(page),
    }
    data = urllib.urlencode(values)
    req = urllib2.Request(url, data, headers=headers)
    req.add_header('Accept-encoding', 'gzip')
    response = urllib2.urlopen(req)
    try:
        body = response.read()
        content_encoding = response.info().get('Content-Encoding') or ''
    finally:
        response.close()
    # Fix: the request asks for gzip, but the original only wrapped the raw
    # bytes in StringIO and never decompressed them, so a gzip-encoded reply
    # reached the parser still compressed.
    if content_encoding.lower() in ('gzip', 'x-gzip'):
        body = gzip.GzipFile(fileobj=StringIO(body)).read()
    # BeautifulSoup detects the document encoding itself; detecting it by
    # hand with chardet is possible too, but slower.
    soup = bs4.BeautifulSoup(body)
    # Convert the bs4 document to a plain str before returning it.
    return str(soup)


# Example usage (Python 2):
# print agent('http://gsxt.ngsh.gov.cn/ECPS/enterpriseAbnAction_enterpriseList.action?curr_Page=2')
# print para1('http://gsxt.ngsh.gov.cn/ECPS/enterpriseAbnAction_enterpriseList.action',2)
# print para2(url='http://gsxt.hnaic.gov.cn/notice/search/ent_except_list',page=2,rn1=3,rn2=2)