Scraping province, city, district, street and community data from the National Bureau of Statistics with Python
阿新 · Posted 2018-12-21
1. Overview
Searching Baidu for province/city/district data, every download either costs points or costs money, and being broke with neither, I had to find another way. Having studied Python for a few days, I used it to scrape the data myself, borrowing some of other people's code, then patching and debugging until it basically runs; the scraped data is saved into a MySQL database (swap in your own database configuration before running). The development environment is Python 3.6. The source code is attached below. I only know the basics of Python and the code is not well written, so better scraping approaches or styles are welcome.
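For reference, the insert_data function in the script only issues INSERT statements, so the five target tables must already exist. Below is a minimal sketch that creates them through pymysql; the table and column names are taken from the dicts the scraper builds, but the column types and sizes are my own guesses, so adjust them to your needs.

# Sketch: create the five tables the scraper writes to.
# Table/column names come from the script below; types and sizes are assumptions.
import pymysql

tables = {
    'province':  "id VARCHAR(2) PRIMARY KEY, name VARCHAR(64), code VARCHAR(2), href VARCHAR(255)",
    'city':      "id VARCHAR(4) PRIMARY KEY, name VARCHAR(64), province_id VARCHAR(2), code VARCHAR(12), href VARCHAR(255)",
    'country':   "id VARCHAR(6) PRIMARY KEY, name VARCHAR(64), city_id VARCHAR(4), code VARCHAR(12), href VARCHAR(255)",
    'street':    "id VARCHAR(9) PRIMARY KEY, name VARCHAR(64), country_id VARCHAR(6), code VARCHAR(12), href VARCHAR(255)",
    # category holds the 3-digit urban-rural classification code from the source pages
    'community': "id VARCHAR(12) PRIMARY KEY, name VARCHAR(64), street_id VARCHAR(9), code VARCHAR(12), category VARCHAR(3)",
}

conn = pymysql.connect(host='192.168.1.11', port=3306, user='****', passwd='****',
                       db='db_me', charset='utf8')
with conn.cursor() as cur:
    for name, cols in tables.items():
        cur.execute('CREATE TABLE IF NOT EXISTS %s (%s)' % (name, cols))
conn.commit()
conn.close()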
2. Source Code
#!/usr/bin/python3
# -*- coding: utf-8 -*-
# author=He
import sys
import os
import re
from bs4 import BeautifulSoup
import pymysql
import requests


# Request a page and decode it; retry a few times instead of recursing forever.
def http_request(url, charset='utf8', retries=3):
    try:
        print(url)
        data = requests.get(url, headers=header, timeout=50)
        return data.content.decode(charset)
    except Exception as e:
        print(e)
        if retries > 0:
            return http_request(url, charset, retries - 1)
        return None


# Fetch all provinces and municipalities.
def province():
    print('Fetching province data...')
    t = http_request(url, 'gbk')
    province_list = {}
    if t:
        soup = BeautifulSoup(t, 'html.parser')
        for i in soup.find_all(attrs={'class': 'provincetr'}):
            for a in i.find_all('a'):
                id = re.sub(r'\D', '', a.get('href'))
                province_list[id] = {'id': id, 'name': a.text, 'code': id,
                                     'href': url + a.get('href')}
    insert_data('province', province_list)
    return province_list


# Fetch the cities under each province.
def city(province_list):
    print('Fetching city data...')
    city_list = {}
    for i in province_list:
        t = http_request(province_list[i]['href'], 'gbk')
        parent_href = get_parent_url(province_list[i]['href'])
        if not t:
            continue
        soup = BeautifulSoup(t, 'html.parser')
        for v in soup.find_all(attrs={'class': 'citytr'}):
            id = str(v.find_all('td')[0].text)
            city_href = str(v.find_all('td')[1].find_all('a')[0].get('href'))
            city_list[id[0:4]] = {'id': id[0:4], 'name': str(v.find_all('td')[1].text),
                                  'province_id': i, 'code': id,
                                  'href': parent_href + city_href}
    insert_data('city', city_list)
    return city_list


# Fetch the districts/counties under each city.
def country(city_list):
    print('Fetching district/county data...')
    county_list = {}
    for i in city_list:
        t = http_request(city_list[i]['href'], 'gbk')
        parent_href = get_parent_url(city_list[i]['href'])
        if not t:
            continue
        soup = BeautifulSoup(t, 'html.parser')
        for v in soup.find_all(attrs={'class': 'countytr'}):
            id = str(v.find_all('td')[0].text)
            # Some rows carry no link (no lower level to drill into); skip them.
            if len(v.find_all('td')[1].find_all('a')):
                country_href = str(v.find_all('td')[1].find_all('a')[0].get('href'))
            else:
                continue
            county_list[id[0:6]] = {'id': id[0:6], 'name': str(v.find_all('td')[1].text),
                                    'city_id': i, 'code': id,
                                    'href': parent_href + country_href}
    insert_data('country', county_list)
    return county_list


# Fetch the streets/towns under each district or county.
def street(county_list):
    print('Fetching street data...')
    street_list = {}
    for i in county_list:
        t = http_request(county_list[i]['href'], 'gbk')
        parent_href = get_parent_url(county_list[i]['href'])
        if not t:
            continue
        soup = BeautifulSoup(t, 'html.parser')
        for v in soup.find_all(attrs={'class': 'towntr'}):
            id = str(v.find_all('td')[0].text)
            street_href = str(v.find_all('td')[1].find_all('a')[0].get('href'))
            street_list[id[0:9]] = {'id': id[0:9], 'name': str(v.find_all('td')[1].text),
                                    'country_id': i, 'code': id,
                                    'href': parent_href + street_href}
    insert_data('street', street_list)
    return street_list


# Fetch the communities/neighbourhood committees under each street.
def community(street_list):
    print('Fetching community data...')
    community_list = {}
    for i in street_list:
        t = http_request(street_list[i]['href'], 'gbk')
        if not t:
            continue
        soup = BeautifulSoup(t, 'html.parser')
        for v in soup.find_all(attrs={'class': 'villagetr'}):
            id = str(v.find_all('td')[0].text)
            community_list[id[0:12]] = {'id': id[0:12], 'name': str(v.find_all('td')[2].text),
                                        'street_id': i, 'code': id,
                                        'category': str(v.find_all('td')[1].text)}
    insert_data('community', community_list)
    return community_list


# Strip the last path segment of a URL to get its parent directory.
def get_parent_url(href):
    arr = href.split('/')
    last_value = arr[len(arr) - 1]
    return href.replace(last_value, '')


# Insert one level's records; parameterized so quotes in names cannot break the SQL.
def insert_data(table, data):
    if len(data) == 0:
        return False
    keys = list(data.keys())
    field_names = list(data[keys[0]].keys())
    table_keys = '`' + '`,`'.join(field_names) + '`'
    placeholders = ','.join(['%s'] * len(field_names))
    sql = 'INSERT INTO %s (%s) VALUES (%s)' % (table, table_keys, placeholders)
    for k in keys:
        try:
            cursor.execute(sql, list(data[k].values()))
        except Exception as e:
            print(e)
    conn.commit()


if __name__ == '__main__':
    conn = pymysql.connect(host='192.168.1.11', port=3306, user='****', passwd='****',
                           db='db_me', charset='utf8')
    cursor = conn.cursor()
    sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
    url = 'http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2017/'
    header = {
        'Cookie': 'AD_RS_COOKIE=20181108',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                      'AppleWebKit/537.36 (KHTML, like Gecko) '
                      'Chrome/58.0.3029.110 Safari/537.36'}
    provinceList = province()
    cityList = city(provinceList)
    countryList = country(cityList)
    streetList = street(countryList)
    communityList = community(streetList)
    print('Scraping complete')
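The repeated id[0:4], id[0:6], id[0:9] and id[0:12] slices work because the 12-digit statistical division code is hierarchical: two digits each for province, city and district/county, then three digits each for street/town and village/community. A tiny illustrative helper (hypothetical, not part of the script above) that splits a full code back into the parent keys used by the scraper:

# Hypothetical helper: split a 12-digit statistical division code into the
# hierarchical keys the scraper uses (2/4/6/9/12 leading digits).
def split_division_code(code):
    assert len(code) == 12 and code.isdigit()
    return {
        'province_id': code[0:2],
        'city_id': code[0:4],
        'country_id': code[0:6],
        'street_id': code[0:9],
        'community_id': code[0:12],
    }

print(split_division_code('110101001001'))
# {'province_id': '11', 'city_id': '1101', 'country_id': '110101',
#  'street_id': '110101001', 'community_id': '110101001001'}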