1. 程式人生 > 其它 >國家統計局爬2020年省市縣鄉村資料

國家統計局爬2020年省市縣鄉村資料

說明

費了好大的勁把資料從官網上爬下來並匯入到MySQL中
在這裡插入圖片描述
國家統計局官網地址:http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2020/54/5402.html

爬蟲程式碼

package com.hc;

import com.baomidou.mybatisplus.core.conditions.query.QueryWrapper;
import com.hc.domain.*;
import com.hc.mapper.*;
import lombok.extern.slf4j.Slf4j;
import org.jsoup.Jsoup;
import
org.jsoup.nodes.Document; import org.jsoup.nodes.Element; import org.jsoup.select.Elements; import org.junit.jupiter.api.Test; import org.springframework.boot.test.context.SpringBootTest; import javax.annotation.Resource; import java.io.IOException; import java.util.ArrayList; import java.util.List;
/** * 全國省市縣鎮村資料爬取 * * @author 樑雲亮 */ @Slf4j @SpringBootTest public class InitAdd5Tables { /** * 建立連線 */ private Document connect(String url) { if (url == null || url.isEmpty()) { throw new IllegalArgumentException("無效的url"); } try { return
Jsoup.connect(url).timeout(100 * 1000).get(); } catch (IOException e) { System.out.println(url+"地址不存在"); return null; } } /** * 獲取所有的省份 * * @return */ public List<String> getProvinces() { List<String> res = new ArrayList<>(); Document connect = connect("http://localhost:8080/2020/default.htm"); Elements rowProvince = connect.select("tr.provincetr"); for (Element provinceElement : rowProvince) {// 遍歷每一行的省份城市 Elements select = provinceElement.select("a"); for (Element province : select) {// 每一個省份(四川省) String name = province.text(); String code = province.select("a").attr("href"); res.add(code.substring(0, code.lastIndexOf(".")) + "*" + name); } } return res; } @Test public void testGetProvince() { getProvinces().forEach(System.out::println); } @Resource private ProvinceMapper provinceMapper; @Test void insertProvinces() { List<Province> list = new ArrayList<>(); for (String p : getProvinces()) { String[] split = p.split("\\*"); Province province = Province.builder().code(split[0]).name(split[1]).build(); list.add(province); } //list.forEach(System.out::println); int res = provinceMapper.batchInsert(list); System.out.println(res); } /** * 根據省份編號獲取該省份下所有的市 * * @param provinceCode 省份編號 * @return */ public List<String> getCitiesByProvince(String provinceCode) { List<String> res = new ArrayList<>(); Document connect = connect("http://localhost:8080/2020/" + provinceCode + ".html"); Elements rowCity = connect.select("tr.citytr"); for (Element cityElement : rowCity) {// 遍歷每一行的省份城市 String name = cityElement.select("td").text(); String[] split = name.split(" "); res.add(split[0].substring(0, 4) + "*" + split[1]); } return res; } @Test public void testGetCitiesByProvince() { getCitiesByProvince("41").forEach(System.out::println); } @Resource private CityMapper cityMapper; @Test void insertCities() { List<String> pList = getProvinces(); for (String p : pList) { List<City> list = new ArrayList<>(); String[] split = p.split("\\*"); List<String> cList = getCitiesByProvince(split[0]); Province pp = provinceMapper.selectOne(new QueryWrapper<Province>().eq("code", split[0])); for (String c : cList) { String[] tmp = c.split("\\*"); City city = City.builder().name(tmp[1]).code(tmp[0]).provinceId(pp.getId()).build(); //System.out.println(city); list.add(city); } //一個省一個省的新增 int res = cityMapper.batchInsert(list); System.out.println(res); } } /** * 根據省市編號獲取該省份下所有的縣 * * @param cityCode 市編號 * @return */ public List<String> getCountriesByCity(String cityCode) { List<String> res = new ArrayList<>(); Document connect = connect("http://localhost:8080/2020/" + cityCode + ".html"); Elements rowCountry = connect.select("tr.countytr"); if (rowCountry.size() == 0) { Elements townCountry = connect.select("tr.towntr"); for (Element townElement : townCountry) { String txt = townElement.select("td").text(); String[] split = txt.split(" "); res.add(split[0].substring(0, 9) + "*" + split[1]); //比如海南省下的儋州市,只有4級目錄,沒有country } } else { for (Element countryElement : rowCountry) {// 遍歷每一行的省份城市 String txt = countryElement.select("td").text(); String[] split = txt.split(" "); res.add(split[0].substring(0, 6) + "*" + split[1]); } } return res; } @Test void testGetCountiesByProvince() { getCountriesByCity("46/4604").forEach(System.out::println); } @Resource private CountryMapper countryMapper; @Test void insertCountry() { List<String> pList = getProvinces(); for (int i = 0; i < pList.size(); i++) { String p = pList.get(i); String[] split = p.split("\\*"); //System.out.println(split[0] +" "+split[1]); // 13 河北省 List<String> cList = getCitiesByProvince(split[0]); for (String c : cList) { String[] split2 = c.split("\\*"); //System.out.println(split2[0] +" * "+split2[1]);//1301 * 石家莊市 List<String> c2List = getCountriesByCity(split2[0].substring(0, 2) + "/" + split2[0]); City city = cityMapper.selectOne(new QueryWrapper<City>().eq("code", split2[0])); List<Country> list = new ArrayList<>(); for (String c2 : c2List) { String[] split3 = c2.split("\\*"); System.out.println(split3[0] + " * " + split3[1]); Country country = Country.builder().name(split3[1]).code(split3[0]).cityId(city.getId()).build(); list.add(country); } int res = countryMapper.batchInsert(list); System.out.println(res); } } } /** * 根據縣編號獲取鄉 * * @param countryCode * @return */ public List<String> getTownsByCountry(String countryCode) { List<String> res = new ArrayList<>(); Document connect = connect("http://localhost:8080/2020/" + countryCode + ".html"); if (connect != null) { Elements rowTown = connect.select("tr.towntr"); if (rowTown.size() == 0) { Elements rowVillage = connect.select("tr.villagetr"); for (Element villageElement : rowVillage) { String txt = villageElement.select("td").text(); String[] split = txt.split(" "); res.add(split[1] + "*" + split[2]); //比如海南省下的儋州市,只有4級目錄,沒有country } } else { for (Element townElement : rowTown) {// 遍歷每一行的省份城市 String txt = townElement.select("td").text(); String[] split = txt.split(" "); res.add(split[0].substring(0, 9) + "*" + split[1]); } } } return res; } @Test void testGetTownsByCountry() { getTownsByCountry("41/01/410122").forEach(System.out::println); } @Resource private TownMapper townMapper; @Test void insertTown() { List<String> pList = getProvinces(); for (int i = 0; i < pList.size(); i++) { String p = pList.get(i); String[] split = p.split("\\*"); //System.out.println(split[0] +" "+split[1]); // 13 河北省 List<String> cList = getCitiesByProvince(split[0]); for (String c : cList) { String[] split2 = c.split("\\*"); //System.out.println(split2[0] +" * "+split2[1]);//1301 * 石家莊市 List<String> c2List = getCountriesByCity(split2[0].substring(0, 2) + "/" + split2[0]); for (String c2 : c2List) { String[] split3 = c2.split("\\*"); //System.out.println(split3[0] + " * " + split3[1]); //130324 * 盧龍縣 List<String> tList = getTownsByCountry(split3[0].substring(0, 2) + "/" + split3[0].substring(2, 4) + "/" + split3[0]); List<Town> list = new ArrayList<>(); Country country = countryMapper.selectOne(new QueryWrapper<Country>().eq("code", split3[0])); for (String t : tList) { String[] split4 = t.split("\\*"); //System.out.println(split4[0] + " * " + split4[1]); Town town = Town.builder().name(split4[1]).code(split4[0]).countryId(country.getId()).build(); //System.out.println(town); list.add(town); } if (list.size() != 0) { System.out.println(list); int res = townMapper.batchInsert(list); System.out.println(res); } } } } } /** * 根據鄉編號獲取村 * * @param townCode * @return */ public List<String> getVillagesByCountry(String townCode) { List<String> res = new ArrayList<>(); Document connect = connect("http://localhost:8080/2020/" + townCode + ".html"); Elements rowVillage = connect.select("tr.villagetr"); for (Element villageElement : rowVillage) {// 遍歷每一行的省份城市 String txt = villageElement.select("td").text(); String t = txt.substring(13); res.add(t); } return res; } @Test void testGetVillagesByCountry() { getVillagesByCountry("41/01/22/410122104").forEach(System.out::println); } @Resource private VillageMapper villageMapper; @Test void insertVillage() { List<String> pList = getProvinces(); for (int i = 25; i < pList.size(); i++) { String p = pList.get(i); String[] split = p.split("\\*"); //System.out.println(split[0] +" "+split[1]); // 13 河北省 List<String> cList = getCitiesByProvince(split[0]); for (String c : cList) { String[] split2 = c.split("\\*"); //System.out.println(split2[0] +" * "+split2[1]);//1301 * 石家莊市 List<String> c2List = getCountriesByCity(split2[0].substring(0, 2) + "/" + split2[0]); for (String c2 : c2List) { String[] split3 = c2.split("\\*"); //System.out.println(split3[0] + " * " + split3[1]); //130324 * 盧龍縣 List<String> tList = getTownsByCountry(split3[0].substring(0, 2) + "/" + split3[0].substring(2, 4) + "/" + split3[0]); for (String t : tList) { String[] split4 = t.split("\\*"); if(split4[0].length()!=3){ //System.out.println(split4[0] + " * " + split4[1]); // 140802204 * 上郭鄉 List<String> vList = getVillagesByCountry(split4[0].substring(0, 2) + "/" + split4[0].substring(2, 4) + "/" + split4[0].substring(4, 6) + "/" + split4[0]); Town town = townMapper.selectOne(new QueryWrapper<Town>().eq("code", split4[0])); List<Village> list = new ArrayList<>(); for (String v : vList) { String[] split5 = v.split(" "); Village village = Village.builder().name(split5[1]).code(split5[0]).townId(town.getId()) .build(); list.add(village); } //System.out.println(list); int res = villageMapper.batchInsert(list); //System.out.println(res); } } } } } } }

資料庫SQL檔案下載地址

https://download.csdn.net/download/lianghecai52171314/18317507