Python爬蟲系列之郵編區號爬取
阿新 • • 發佈:2018-11-11
Python爬蟲之全國郵編區號爬取
- 僅供交流探討
- 歡迎提出改進
程式碼部分
import logging
import re
import time

import MySQLdb
import requests
'''
@author:王磊
@time :2018/11/8 21:15:05
'''
# Keep a named reference to the connection instead of discarding it right
# after creating the cursor: without one the connection can never be
# committed or closed explicitly, and driver versions whose cursors hold only
# a weak reference to the connection may lose it to garbage collection.
conn = MySQLdb.connect(user='root', password='root', database='python', charset='utf8')
# Module-level cursor shared by the functions that write to the database.
cursor = conn.cursor()
def getHTML(url, timeout=10):
    """Download ``url`` and return the response body as decoded text.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server. Without a timeout a stalled
            server would block the call indefinitely.

    Returns:
        The response body decoded with the encoding detected from its
        content; bytes that cannot be decoded are dropped.

    Raises:
        requests.exceptions.RequestException: If the request fails or times
            out.
    """
    res = requests.get(url, timeout=timeout)
    # apparent_encoding can be None when detection fails (e.g. an empty
    # body); bytes.decode(None, ...) would raise TypeError, so fall back.
    encoding = res.apparent_encoding or 'utf-8'
    return res.content.decode(encoding, 'ignore')
def getPrivince(html):
    """Extract the per-province page URLs from the index page HTML.

    Every ``<td><a href="..." target="_blank">`` link is collected. Links
    whose href contains ``http`` (absolute links) are discarded, and the
    remaining site-relative hrefs are prefixed with the site root.

    Args:
        html: Decoded HTML of the index page.

    Returns:
        A list of absolute URLs, without the last two collected links.
    """
    req1 = re.compile(r'<td><a href="(.*?)" target="_blank">.*?</a></td>')
    # Build a new list rather than popping from the list being iterated:
    # removing an element mid-iteration makes the loop skip the next entry,
    # leaving it neither filtered nor prefixed.
    urls = [
        "http://www.ip138.com" + href
        for href in req1.findall(html)
        if 'http' not in href
    ]
    # The last two links are dropped, as in the original code; presumably
    # they are not province pages -- TODO confirm against the live page.
    return urls[:-2]
def getCity(html):
    """Extract (name, postal code, area code) tuples from a province page.

    Args:
        html: Decoded HTML of one province page.

    Returns:
        A list of 3-tuples of strings. When bold city rows are present, the
        city rows come first, followed by the plain-text district rows.
        Otherwise every row matching the generic three-cell pattern is
        returned.
    """
    city_pattern = re.compile(r'<tr bgcolor="#ffffff"><td><a href=".*?"><b>(.*?)</a></b></td><td><a href=".*?">(.*?)</a></td><td><a href=".*?">(.*?)</a></td>' )
    any_row_pattern = re.compile(r'<td>(.*?)</td><td><a href=".*?">(.*?)</a></td><td><a href=".*?">(.*?)</a></td>')
    # NOTE(review): the leading character class rejects a first character of
    # '<', 'a', '|', '^' or space; it looks intended to skip cells that start
    # with a link or a blank -- confirm before changing it.
    district_pattern = re.compile(r'<td>([^<a|^ ].*?)</td><td><a href=".*?">(.*?)</a></td><td><a href=".*?">(.*?)</a></td>')

    city_rows = city_pattern.findall(html)
    if not city_rows:
        # Municipality page: there are no city rows, only districts.
        return any_row_pattern.findall(html)

    # Province page with city rows; append district rows when present.
    district_rows = district_pattern.findall(html)
    if not district_rows:
        return city_rows
    return city_rows + district_rows
def run():
    """Crawl every province page and store each extracted record.

    The index page is fetched with ``getHTML`` and turned into province URLs
    by ``getPrivince``. Each province page is parsed by ``getCity``, and
    every resulting 3-tuple is appended to the text file and inserted into
    the ``city_zip_code`` table through the module-level ``cursor``.

    Storing is best-effort per record: a failure is logged and the crawl
    continues with the next record.
    """
    logger = logging.getLogger(__name__)
    urlIndex = 'http://www.ip138.com/post/'
    indexHtml = getHTML(urlIndex)
    provinceUrls = getPrivince(indexHtml)
    # encoding: do not depend on the platform default code page, which may be
    # unable to encode the Chinese text being written.
    # newline='': write the explicit "\r\n" verbatim; text mode on Windows
    # would otherwise expand it to "\r\r\n".
    with open('c:/Users/asus/Desktop/pc/text/zipCode.txt', 'a',
              encoding='utf-8', newline='') as f:
        for provinceUrl in provinceUrls:
            provinceHtml = getHTML(provinceUrl)
            citys = getCity(provinceHtml)
            for city in citys:
                try:
                    wStr = "地區:%s ,郵編:%s ,區號:%s \r\n" % city
                    f.write(wStr)
                    # Let the driver bind and escape the values instead of
                    # interpolating scraped text into the SQL statement.
                    cursor.execute("insert city_zip_code values(%s, %s, %s)", city)
                except Exception:
                    # Deliberately broad: one bad record must not abort the
                    # crawl, but the failure is recorded rather than hidden.
                    logger.exception("failed to store record %r", city)
            try:
                # Inserts are not persisted until the transaction is
                # committed; commit once per province.
                cursor.connection.commit()
            except Exception:
                logger.exception("failed to commit records for %s", provinceUrl)
            # Pause between province requests to avoid hammering the site.
            time.sleep(2)
# Start the crawl only when this file is executed as a script, not on import.
if "__main__" == __name__:
    run()
♪♪♪♪♪♪♪♪♪♪♪♪♪♪♪♪♪♪♪♪♪♪♪♪♪♪♪♪♪♪♪♪♪♪♪♪♪♪♪♪♪♪♪♪♪♪♪♪♪♪♪♪♪
♪♪後續會更新系列基於Python的爬蟲小例子,歡迎關注。♪♪
♪♪♪♪♪♪♪♪♪♪♪♪♪♪♪♪♪♪♪♪♪♪♪♪♪♪♪♪♪♪♪♪♪♪♪♪♪♪♪♪♪♪♪♪♪♪♪♪♪♪♪♪♪