Python爬資料之全國中小學資訊
阿新 • • 發佈:2019-01-09
技術路線:requests + BeautifulSoup
這個網站的反爬蟲似乎挺強的,經常會返回一個自動跳轉的139網站,所以得換著IP多試幾次。
需要先準備一份中國城市名稱與拼音的對照表,存在EXCEL中(程式碼會讀取其中名為 city 的工作表):第一列是市名;第二列是拼音;到市級就可以。
需要挖掘哪些城市就放哪些,如果挖全國,就要放所有市名。
如:
輸出是一個EXCEL,包括:
城市 | 型別 | 學習名稱 | 地址 | 電話 | 網址 |
如:
直接上程式碼:
import os
import re
import sys

from bs4 import BeautifulSoup
import requests
import xlrd
import xlwt
from xlutils.copy import copy

# Sentinel text returned by getHtmlText when a download fails. Kept as a
# string (rather than raising) so existing callers keep working.
HTML_ERROR_TEXT = "獲取html異常"

# Column order of the output sheet; these are also the keys of every record.
COLUMNS = ('城市', '型別', '學習名稱', '地址', '電話', '網址')

USER_AGENT = ('Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 '
              '(KHTML, like Gecko) Chrome/51.0.2704.63 Safari/537.36')


def getHtmlText(url, code="GBK"):
    """Download *url* and return its body as text.

    Args:
        url: Absolute URL to fetch.
        code: Encoding forced onto the response before decoding
            (defaults to "GBK").

    Returns:
        The decoded page text, or the sentinel string "獲取html異常" when the
        request fails or the server answers with an HTTP error status.
    """
    try:
        r = requests.get(url, headers={'User-Agent': USER_AGENT}, timeout=30)
        r.raise_for_status()
    except requests.RequestException as err:
        # Report the failure instead of swallowing it silently.
        print("getHtmlText異常", url, err)
        return HTML_ERROR_TEXT
    r.encoding = code
    return r.text


def getPageCode(htext, typeitem):
    """Return the number of result pages advertised by a listing page.

    The count is read from the ``<a class="last">`` link, whose href has the
    form ``<typeitem>pn<N>.html``.

    Args:
        htext: HTML text of the first listing page.
        typeitem: URL path of the school type, e.g. ``'/xiaoxue/'``.

    Returns:
        The page count as an int, or 0 when the page has no such link
        (single-page listing, failed download, unexpected markup).
    """
    if not htext:
        return 0
    soup = BeautifulSoup(htext, "html.parser")
    lastLink = soup.find('a', attrs={'class': 'last'})
    href = lastLink.get('href') if lastLink else None
    if not href:
        return 0
    # Escape the path and the dot so they are matched literally.
    found = re.search(re.escape(typeitem) + r'pn([0-9]+)\.html', href)
    return int(found.group(1)) if found else 0


def _parseSchools(htext, cityitem, typeitem):
    """Build one record dict (keyed by COLUMNS) per school entry in *htext*."""
    soup = BeautifulSoup(htext, "html.parser")
    # Entries live in <dl class="left"> and <dl class="right"> elements;
    # all "left" entries are emitted first, then all "right" ones.
    entries = (soup.find_all('dl', attrs={'class': 'left'})
               + soup.find_all('dl', attrs={'class': 'right'}))
    records = []
    for item in entries:
        title = item.find('p')
        fields = item.find_all('li')
        if title is None or len(fields) < 3:
            # Skip a malformed entry rather than abandoning the whole page.
            continue
        records.append({
            '城市': cityitem,
            '型別': typeitem,
            '學習名稱': title.text,
            '地址': fields[0].text,
            '電話': fields[1].text,
            '網址': fields[2].text,
        })
    return records


def _appendRows(records, fileAddress):
    """Append *records* to the first sheet of *fileAddress* in one save.

    The workbook is created (with a header row) when it does not exist yet.
    NOTE: the legacy .xls format caps a sheet at 65536 rows, so a very large
    crawl may need to be split across several files.
    """
    if not records:
        return
    if os.path.exists(fileAddress):
        book = xlrd.open_workbook(fileAddress)
        firstRow = book.sheet_by_index(0).nrows
        # xlrd workbooks are read-only; xlutils.copy yields a writable xlwt copy.
        wb = copy(book)
        ws = wb.get_sheet(0)
    else:
        wb = xlwt.Workbook(encoding='utf-8')
        ws = wb.add_sheet('school')
        for col, title in enumerate(COLUMNS):
            ws.write(0, col, title)
        firstRow = 1
    for offset, record in enumerate(records):
        for col, key in enumerate(COLUMNS):
            ws.write(firstRow + offset, col, record[key])
    wb.save(fileAddress)


def getSchoolList(htext, fileAddress, cityitem, typeitem):
    """Parse the schools on one listing page and append them to the workbook.

    Args:
        htext: HTML text of a listing page.
        fileAddress: Path of the output .xls workbook.
        cityitem: City name stored in the '城市' column.
        typeitem: School type name stored in the '型別' column.

    Returns:
        The list of record dicts that were saved; an empty list when nothing
        could be parsed or the workbook could not be written.
    """
    try:
        records = _parseSchools(htext, cityitem, typeitem)
        # One read/copy/save cycle per page instead of one per school.
        _appendRows(records, fileAddress)
    except Exception as err:
        # Best-effort crawl: report the page failure and carry on.
        print("getSchoolList異常", err)
        return []
    return records


def savefile(schoolDict, fileAddress):
    """Append a single school record to the first sheet of *fileAddress*.

    Args:
        schoolDict: Mapping holding the keys '城市', '型別', '學習名稱', '地址',
            '電話' and '網址'.
        fileAddress: Path of the output .xls workbook.

    Raises:
        KeyError: If *schoolDict* lacks one of the expected keys.
    """
    _appendRows([schoolDict], fileAddress)


def getCityList(cityFileAddress=r'D:\中國省市名稱拼音.xls'):
    """Load the city-name -> pinyin mapping from an Excel workbook.

    The sheet named 'city' is read: column 0 holds the city name and column 1
    its pinyin, which is lower-cased because it becomes the site's subdomain.

    Args:
        cityFileAddress: Path of the .xls file to read.

    Returns:
        A dict mapping city name to lower-case pinyin; empty when the file or
        sheet cannot be read.
    """
    try:
        sheet = xlrd.open_workbook(cityFileAddress).sheet_by_name('city')
        # Read each column once instead of once per row.
        names = sheet.col_values(0)
        pinyins = sheet.col_values(1)
    except Exception as err:
        print("getCityList失敗", err)
        return {}
    cityDic = {}
    for name, pinyin in zip(names, pinyins):
        # Ignore blank or non-text cells: they cannot form a valid hostname.
        if not name or not isinstance(pinyin, str) or not pinyin.strip():
            continue
        cityDic[name] = pinyin.strip().lower()
    return cityDic


def main():
    """Crawl every listed city and school type and save the results."""
    fileAddress = 'D:/school.xls'
    typeList = {'小學': '/xiaoxue/', '初中': '/chuzhong/', '高中': '/gaozhong/'}
    for cityitem, pinyin in getCityList().items():
        searchUrl = 'http://' + pinyin + '.xuexiaodaquan.com'
        for typeitem, typePath in typeList.items():
            htext = getHtmlText(searchUrl + typePath)
            if htext == HTML_ERROR_TEXT:
                # The first page failed to download; nothing to parse.
                continue
            getSchoolList(htext, fileAddress, cityitem, typeitem)
            # Page 1 is already handled; the rest are at pn2.html .. pnN.html.
            for i in range(2, getPageCode(htext, typePath) + 1):
                h1text = getHtmlText(searchUrl + typePath + 'pn' + str(i) + '.html')
                getSchoolList(h1text, fileAddress, cityitem, typeitem)


if __name__ == '__main__':
    main()