
Scraping province/city/district/street/community data from China's National Bureau of Statistics with Python

1. Overview

Searching Baidu for province/city/district datasets, everything either costs download credits or money. Having neither, I had to find another way: with only a few days of Python behind me, I cobbled together a scraper, borrowing from other people's code and patching and debugging until it basically runs. The scraped data is saved into a MySQL database (swap in your own database configuration before running). The development environment is Python 3.6. The source code is below; I only know the basics of Python and the code is rough, so better scraping approaches and suggestions are welcome.
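The script assumes its five target tables already exist in MySQL; it never creates them. Here is a minimal setup sketch (my addition, not from the original post): the table and column names mirror the dicts the scraper builds, while the VARCHAR sizes are guesses.

# create_tables.py -- schema sketch; run once before the scraper.
# Column names mirror the scraper's dicts; sizes and settings are assumptions.
import pymysql

DDL = [
    "CREATE TABLE IF NOT EXISTS province (id VARCHAR(12), name VARCHAR(64), code VARCHAR(12), href VARCHAR(255))",
    "CREATE TABLE IF NOT EXISTS city (id VARCHAR(12), name VARCHAR(64), province_id VARCHAR(12), code VARCHAR(12), href VARCHAR(255))",
    "CREATE TABLE IF NOT EXISTS country (id VARCHAR(12), name VARCHAR(64), city_id VARCHAR(12), code VARCHAR(12), href VARCHAR(255))",
    "CREATE TABLE IF NOT EXISTS street (id VARCHAR(12), name VARCHAR(64), country_id VARCHAR(12), code VARCHAR(12), href VARCHAR(255))",
    "CREATE TABLE IF NOT EXISTS community (id VARCHAR(12), name VARCHAR(64), street_id VARCHAR(12), code VARCHAR(12), category VARCHAR(4))",
]

conn = pymysql.connect(host='192.168.1.11', port=3306, user='****', passwd='****', db='db_me', charset='utf8')
with conn.cursor() as cursor:
    for statement in DDL:
        cursor.execute(statement)
conn.commit()
conn.close()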

2. Source Code

#!/usr/bin/python3
# -*- coding: utf-8 -*-
# author=He


import sys
import os
import re
from bs4 import BeautifulSoup
import pymysql
import requests


# Fetch a page and decode it with the given charset
def http_request(url, charset='utf8'):
    try:
        print(url)
        data = requests.get(url, headers=header, timeout=50)
        kk = data.content
        kk = kk.decode(charset)

    except Exception as e:
        # on any failure, retry by recursing (unbounded -- see the variant below)
        print(e)
        kk = http_request(url, charset)

    return kk
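

# The recursive retry above never gives up: if the site keeps failing, it
# recurses until Python's recursion limit is hit. A bounded, iterative
# variant (a sketch; `max_retries` is my addition, not the author's):
def http_request_bounded(url, charset='utf8', max_retries=3):
    for attempt in range(max_retries):
        try:
            data = requests.get(url, headers=header, timeout=50)
            return data.content.decode(charset)
        except Exception as e:
            print('attempt %d failed: %s' % (attempt + 1, e))
    return None  # callers already skip falsy results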


# Get all provinces and municipalities
def province():
    print('Crawling province data...')
    t = http_request(url, 'gbk')  # the NBS pages are GBK-encoded
    province_list = {}

    if t:
        soup = BeautifulSoup(t, 'html.parser')
        for i in soup.find_all(attrs={'class': 'provincetr'}):
            for a in i.find_all('a'):
                # the href is like '11.html'; keep only the numeric code
                id = re.sub(r"\D", "", a.get('href'))
                province_list[id] = {'id': id, 'name': a.text, 'code': id, 'href': url + a.get('href')}
    insert_data('province', province_list)
    return province_list
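

# Example of what province() produces (illustrative; '11' is the official
# NBS code for Beijing):
# province_list['11'] == {'id': '11', 'name': '北京市', 'code': '11',
#                         'href': 'http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2017/11.html'}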


# Get the cities under each province
def city(province_list):
    print('Crawling city data...')
    city_list = {}

    for i in province_list:
        t = http_request(province_list[i]['href'], 'gbk')
        parent_href = get_parent_url(province_list[i]['href'])

        if not t:
            continue
        soup = BeautifulSoup(t, 'html.parser')
        for v in soup.find_all(attrs={'class': 'citytr'}):
            tds = v.find_all('td')
            id = str(tds[0].text)
            city_href = str(tds[1].find_all('a')[0].get('href'))

            # the first 4 digits of the 12-digit statistical code identify the city
            city_list[id[0:4]] = {'id': id[0:4], 'name': str(tds[1].text), 'province_id': i, 'code': id,
                                  'href': parent_href + city_href}

    insert_data('city', city_list)
    return city_list


# Get the districts/counties under each city
def country(city_list):
    print('Crawling district/county data...')
    county_list = {}

    for i in city_list:
        t = http_request(city_list[i]['href'], 'gbk')
        parent_href = get_parent_url(city_list[i]['href'])

        if not t:
            continue
        soup = BeautifulSoup(t, 'html.parser')
        for v in soup.find_all(attrs={'class': 'countytr'}):
            tds = v.find_all('td')
            id = str(tds[0].text)

            # rows without a link (e.g. 市轄區 placeholders) have no lower level to crawl
            if len(tds[1].find_all('a')):
                country_href = str(tds[1].find_all('a')[0].get('href'))
            else:
                continue

            county_list[id[0:6]] = {'id': id[0:6], 'name': str(tds[1].text), 'city_id': i, 'code': id,
                                    'href': parent_href + country_href}

    insert_data('country', county_list)
    return county_list


# Get the streets/towns under each district or county
def street(county_list):
    print('Crawling street data...')
    street_list = {}

    for i in county_list:
        t = http_request(county_list[i]['href'], 'gbk')
        parent_href = get_parent_url(county_list[i]['href'])

        if not t:
            continue
        soup = BeautifulSoup(t, 'html.parser')
        for v in soup.find_all(attrs={'class': 'towntr'}):
            tds = v.find_all('td')
            id = str(tds[0].text)
            street_href = str(tds[1].find_all('a')[0].get('href'))
            # the first 9 digits identify the street/town
            street_list[id[0:9]] = {'id': id[0:9], 'name': str(tds[1].text), 'country_id': i, 'code': id,
                                    'href': parent_href + street_href}
    insert_data('street', street_list)
    return street_list


# Get the communities / neighbourhood committees under each street
def community(street_list):
    print('Crawling community data...')
    community_list = {}
    for i in street_list:
        t = http_request(street_list[i]['href'], 'gbk')
        if not t:
            continue
        soup = BeautifulSoup(t, 'html.parser')
        for v in soup.find_all(attrs={'class': 'villagetr'}):
            tds = v.find_all('td')
            id = str(tds[0].text)
            # td[1] holds the urban/rural classification code, td[2] the name
            community_list[id[0:12]] = {'id': id[0:12], 'name': str(tds[2].text), 'street_id': i,
                                        'code': id, 'category': str(tds[1].text)}
    insert_data('community', community_list)
    return community_list


# Return the directory of the given URL (strip the last path segment)
def get_parent_url(href):
    return href.rsplit('/', 1)[0] + '/'
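

# Alternative sketch (not the author's code): the standard library can
# resolve relative hrefs against a page URL directly, avoiding manual
# string surgery.
from urllib.parse import urljoin

def resolve_href(page_url, relative_href):
    # e.g. urljoin('.../2017/11.html', '11/1101.html') -> '.../2017/11/1101.html'
    return urljoin(page_url, relative_href)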


# Insert a batch of rows into the given table
def insert_data(table, data):
    if len(data) == 0:
        return False

    keys = list(data.keys())
    # column list comes from the first row's dict keys; dict order is stable
    # on Python 3.6+, so keys and values line up
    table_keys = '`' + '`,`'.join(list(data[keys[0]].keys())) + '`'

    for k in keys:
        table_values = '\'' + '\',\''.join(list(data[k].values())) + '\''
        sql = 'INSERT INTO %s (%s) VALUES (%s)' % (table, table_keys, table_values)
        try:
            cursor.execute(sql)
        except Exception as e:
            print(e)
    conn.commit()
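

# A safer variant (a sketch, not in the original): let pymysql escape the
# values so names containing quotes can't break the SQL. Only values can be
# parameterized; table and column names still go in via formatting.
def insert_data_safe(table, data):
    if not data:
        return False
    keys = list(data.keys())
    columns = '`' + '`,`'.join(data[keys[0]].keys()) + '`'
    placeholders = ','.join(['%s'] * len(data[keys[0]]))
    sql = 'INSERT INTO %s (%s) VALUES (%s)' % (table, columns, placeholders)
    for k in keys:
        try:
            cursor.execute(sql, list(data[k].values()))
        except Exception as e:
            print(e)
    conn.commit()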


if __name__ == '__main__':
    # swap in your own MySQL connection settings before running
    conn = pymysql.connect(host='192.168.1.11', port=3306, user='****', passwd='****', db='db_me', charset='utf8')
    cursor = conn.cursor()
    sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
    url = 'http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2017/'
    header = {
        'Cookie': 'AD_RS_COOKIE=20181108',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) '
                      'Chrome/58.0.3029.110 Safari/537.36'}
    provinceList = province()
    cityList = city(provinceList)
    countryList = country(cityList)
    streetList = street(countryList)
    communityList = community(streetList)
    print('Data scraping complete')
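
After a full run, a few COUNT queries give a quick sanity check on what landed in each table (a convenience snippet, not part of the original script):

for table in ('province', 'city', 'country', 'street', 'community'):
    cursor.execute('SELECT COUNT(*) FROM %s' % table)
    print(table, cursor.fetchone()[0])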