爬取國家統計局2020年省市縣鄉村資料
阿新 • • 發佈:2021-05-04
說明
費了好大的勁,才把資料從官網上爬下來並匯入到 MySQL 中。
國家統計局官網地址:http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2020/54/5402.html
爬蟲程式碼
package com.hc;
import com.baomidou.mybatisplus.core.conditions.query.QueryWrapper;
import com.hc.domain.*;
import com.hc.mapper.*;
import lombok.extern.slf4j.Slf4j;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.junit.jupiter.api.Test;
import org.springframework.boot.test.context.SpringBootTest;
import javax.annotation.Resource;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
/**
* 全國省市縣鎮村資料爬取
*
* @author 樑雲亮
*/
@Slf4j
@SpringBootTest
public class InitAdd5Tables {
/**
 * Downloads and parses the HTML page at the given URL with Jsoup.
 *
 * <p>The request uses a timeout of 100 * 1000 milliseconds. Any
 * {@link IOException} raised while fetching is logged together with its
 * cause and reported to the caller as {@code null}.
 *
 * @param url address of the page to download; must not be {@code null} or empty
 * @return the parsed document, or {@code null} when the page could not be fetched
 * @throws IllegalArgumentException if {@code url} is {@code null} or empty
 */
private Document connect(String url) {
    if (url == null || url.isEmpty()) {
        throw new IllegalArgumentException("無效的url");
    }
    try {
        return Jsoup.connect(url).timeout(100 * 1000).get();
    } catch (IOException e) {
        // Keep the null-on-failure contract, but record the real cause
        // instead of discarding the exception.
        log.warn(url + "地址不存在", e);
        return null;
    }
}
/**
 * Reads the province index page and lists every province link found in its
 * {@code tr.provincetr} rows.
 *
 * <p>Each entry has the form {@code code*name}: {@code code} is the link's
 * {@code href} with everything from the last dot onwards removed, and
 * {@code name} is the link text.
 *
 * @return the {@code code*name} entries in page order; empty when the index
 *         page could not be fetched
 */
public List<String> getProvinces() {
    List<String> res = new ArrayList<>();
    Document doc = connect("http://localhost:8080/2020/default.htm");
    if (doc == null) {
        // Same fallback getTownsByCountry uses when the page is unavailable
        return res;
    }
    for (Element row : doc.select("tr.provincetr")) {
        // One table row holds several province links
        for (Element link : row.select("a")) {
            String href = link.attr("href");
            if (href.isEmpty()) {
                continue; // anchor without a target carries no code
            }
            int dot = href.lastIndexOf('.');
            // Without a dot there is no extension to strip; use the href as is
            String code = dot >= 0 ? href.substring(0, dot) : href;
            res.add(code + "*" + link.text());
        }
    }
    return res;
}
/**
 * Prints every entry returned by {@link #getProvinces()}, one per line.
 */
@Test
public void testGetProvince() {
    for (String province : getProvinces()) {
        System.out.println(province);
    }
}
@Resource
private ProvinceMapper provinceMapper;

/**
 * Crawls the province list and stores it with a single batch insert,
 * printing the value returned by the mapper.
 *
 * <p>Nothing is inserted when no province could be parsed.
 */
@Test
void insertProvinces() {
    List<Province> list = new ArrayList<>();
    for (String p : getProvinces()) {
        // Entries are "code*name"
        String[] split = p.split("\\*");
        if (split.length < 2) {
            log.warn("Skipping province entry without a name: {}", p);
            continue;
        }
        list.add(Province.builder().code(split[0]).name(split[1]).build());
    }
    if (list.isEmpty()) {
        // Same guard insertTown applies before calling batchInsert
        log.warn("No provinces were parsed; nothing to insert");
        return;
    }
    int res = provinceMapper.batchInsert(list);
    System.out.println(res);
}
/**
 * Lists the cities on a province page, read from its {@code tr.citytr} rows.
 *
 * <p>Each entry has the form {@code code*name}: {@code code} is the first
 * four characters of the row's first space-separated token and {@code name}
 * is the second token. Rows that do not provide both are skipped.
 *
 * @param provinceCode province code used as the page name, e.g. {@code 41}
 * @return the {@code code*name} entries in page order; empty when the page
 *         could not be fetched
 */
public List<String> getCitiesByProvince(String provinceCode) {
    List<String> res = new ArrayList<>();
    Document doc = connect("http://localhost:8080/2020/" + provinceCode + ".html");
    if (doc == null) {
        // Same fallback getTownsByCountry uses when the page is unavailable
        return res;
    }
    for (Element row : doc.select("tr.citytr")) {
        String[] split = row.select("td").text().split(" ");
        if (split.length < 2 || split[0].length() < 4) {
            continue; // malformed row: no name or code too short
        }
        res.add(split[0].substring(0, 4) + "*" + split[1]);
    }
    return res;
}
/**
 * Prints the cities returned for province code {@code 41}, one per line.
 */
@Test
public void testGetCitiesByProvince() {
    List<String> cities = getCitiesByProvince("41");
    for (String city : cities) {
        System.out.println(city);
    }
}
@Resource
private CityMapper cityMapper;

/**
 * Crawls the cities of every province and inserts them one province at a
 * time, linking each city to the id of its stored province row.
 *
 * <p>Provinces for which no city was parsed are skipped.
 *
 * @throws IllegalStateException if a province with cities has no row in the
 *         province table
 */
@Test
void insertCities() {
    for (String p : getProvinces()) {
        String provinceCode = p.split("\\*")[0];
        List<String> cList = getCitiesByProvince(provinceCode);
        if (cList.isEmpty()) {
            continue; // nothing to insert, so no lookup is needed either
        }
        Province pp = provinceMapper.selectOne(new QueryWrapper<Province>().eq("code", provinceCode));
        if (pp == null) {
            // Fail with context instead of a bare NullPointerException
            throw new IllegalStateException(
                    "Province " + provinceCode + " is not in the database; run insertProvinces first");
        }
        List<City> list = new ArrayList<>();
        for (String c : cList) {
            // Entries are "code*name"
            String[] tmp = c.split("\\*");
            if (tmp.length < 2) {
                log.warn("Skipping city entry without a name: {}", c);
                continue;
            }
            list.add(City.builder().name(tmp[1]).code(tmp[0]).provinceId(pp.getId()).build());
        }
        if (list.isEmpty()) {
            continue;
        }
        // One batch per province
        int res = cityMapper.batchInsert(list);
        System.out.println(res);
    }
}
/**
 * Lists the county-level entries on a city page.
 *
 * <p>The {@code tr.countytr} rows are used when the page has any; each code
 * is then the first six characters of the row's first token. When the page
 * has no such rows (the original author cites Danzhou in Hainan as an
 * example), its {@code tr.towntr} rows are used instead and the code is the
 * first nine characters. Entries have the form {@code code*name}; rows that
 * lack a name or whose code is too short are skipped.
 *
 * @param cityCode page path of the city relative to the site root without
 *                 extension, e.g. {@code 46/4604}
 * @return the {@code code*name} entries in page order; empty when the page
 *         could not be fetched
 */
public List<String> getCountriesByCity(String cityCode) {
    List<String> res = new ArrayList<>();
    Document doc = connect("http://localhost:8080/2020/" + cityCode + ".html");
    if (doc == null) {
        // Same fallback getTownsByCountry uses when the page is unavailable
        return res;
    }
    Elements rows = doc.select("tr.countytr");
    int codeLength = 6;
    if (rows.isEmpty()) {
        // No county rows on this page: fall back to its town rows
        rows = doc.select("tr.towntr");
        codeLength = 9;
    }
    for (Element row : rows) {
        String[] split = row.select("td").text().split(" ");
        if (split.length < 2 || split[0].length() < codeLength) {
            continue; // malformed row: no name or code too short
        }
        res.add(split[0].substring(0, codeLength) + "*" + split[1]);
    }
    return res;
}
/**
 * Prints the county-level entries returned for page {@code 46/4604}, one per line.
 */
@Test
void testGetCountiesByProvince() {
    List<String> counties = getCountriesByCity("46/4604");
    for (String county : counties) {
        System.out.println(county);
    }
}
@Resource
private CountryMapper countryMapper;

/**
 * Crawls the county-level entries of every city and inserts them one city
 * at a time, linking each entry to the id of its stored city row.
 *
 * <p>Cities for which no entry was parsed are skipped.
 *
 * @throws IllegalStateException if a city with entries has no row in the
 *         city table
 */
@Test
void insertCountry() {
    for (String p : getProvinces()) {
        String provinceCode = p.split("\\*")[0]; // e.g. 13
        for (String c : getCitiesByProvince(provinceCode)) {
            String cityCode = c.split("\\*")[0]; // e.g. 1301
            // City pages live under a folder named after the first two digits
            List<String> c2List = getCountriesByCity(cityCode.substring(0, 2) + "/" + cityCode);
            if (c2List.isEmpty()) {
                continue; // nothing to insert, so no lookup is needed either
            }
            City city = cityMapper.selectOne(new QueryWrapper<City>().eq("code", cityCode));
            if (city == null) {
                // Fail with context instead of a bare NullPointerException
                throw new IllegalStateException(
                        "City " + cityCode + " is not in the database; run insertCities first");
            }
            List<Country> list = new ArrayList<>();
            for (String c2 : c2List) {
                // Entries are "code*name"
                String[] split3 = c2.split("\\*");
                if (split3.length < 2) {
                    log.warn("Skipping county entry without a name: {}", c2);
                    continue;
                }
                System.out.println(split3[0] + " * " + split3[1]);
                list.add(Country.builder().name(split3[1]).code(split3[0]).cityId(city.getId()).build());
            }
            if (list.isEmpty()) {
                continue;
            }
            int res = countryMapper.batchInsert(list);
            System.out.println(res);
        }
    }
}
/**
 * Lists the town-level entries on a county page.
 *
 * <p>When the page has {@code tr.towntr} rows, each entry is the first nine
 * characters of the row's first token, a {@code *}, and the second token.
 * When it has none, the {@code tr.villagetr} rows are read instead and each
 * entry is the row's second token, a {@code *}, and the third token.
 *
 * <p>NOTE(review): in the village fallback the second token is used as the
 * code. insertVillage skips entries whose code is three characters long,
 * which presumably targets exactly these entries; confirm before changing.
 *
 * @param countryCode page path of the county relative to the site root
 *                    without extension, e.g. {@code 41/01/410122}
 * @return the {@code code*name} entries in page order; empty when the page
 *         could not be fetched
 */
public List<String> getTownsByCountry(String countryCode) {
    List<String> towns = new ArrayList<>();
    Document page = connect("http://localhost:8080/2020/" + countryCode + ".html");
    if (page == null) {
        return towns;
    }

    Elements townRows = page.select("tr.towntr");
    if (townRows.size() != 0) {
        for (Element row : townRows) {
            String[] parts = row.select("td").text().split(" ");
            towns.add(parts[0].substring(0, 9) + "*" + parts[1]);
        }
        return towns;
    }

    // No town rows on this page: read its village rows instead
    Elements villageRows = page.select("tr.villagetr");
    for (Element row : villageRows) {
        String[] parts = row.select("td").text().split(" ");
        towns.add(parts[1] + "*" + parts[2]);
    }
    return towns;
}
/**
 * Prints the town-level entries returned for page {@code 41/01/410122}, one per line.
 */
@Test
void testGetTownsByCountry() {
    List<String> towns = getTownsByCountry("41/01/410122");
    for (String town : towns) {
        System.out.println(town);
    }
}
@Resource
private TownMapper townMapper;

/**
 * Crawls the town-level entries of every county and inserts them one county
 * at a time, linking each entry to the id of its stored county row.
 *
 * <p>Counties for which no entry was parsed are skipped without a database
 * lookup.
 *
 * @throws IllegalStateException if a county with entries has no row in the
 *         county table
 */
@Test
void insertTown() {
    for (String p : getProvinces()) {
        String provinceCode = p.split("\\*")[0]; // e.g. 13
        for (String c : getCitiesByProvince(provinceCode)) {
            String cityCode = c.split("\\*")[0]; // e.g. 1301
            for (String c2 : getCountriesByCity(cityCode.substring(0, 2) + "/" + cityCode)) {
                String countryCode = c2.split("\\*")[0]; // e.g. 130324
                // County pages are nested as <province>/<city digits>/<county code>
                String pagePath = countryCode.substring(0, 2) + "/" + countryCode.substring(2, 4) + "/" + countryCode;
                List<String> tList = getTownsByCountry(pagePath);
                if (tList.isEmpty()) {
                    continue; // nothing to insert, so no lookup is needed either
                }
                Country country = countryMapper.selectOne(new QueryWrapper<Country>().eq("code", countryCode));
                if (country == null) {
                    // Fail with context instead of a bare NullPointerException
                    throw new IllegalStateException(
                            "County " + countryCode + " is not in the database; run insertCountry first");
                }
                List<Town> list = new ArrayList<>();
                for (String t : tList) {
                    // Entries are "code*name"
                    String[] split4 = t.split("\\*");
                    if (split4.length < 2) {
                        log.warn("Skipping town entry without a name: {}", t);
                        continue;
                    }
                    list.add(Town.builder().name(split4[1]).code(split4[0]).countryId(country.getId()).build());
                }
                if (!list.isEmpty()) {
                    System.out.println(list);
                    int res = townMapper.batchInsert(list);
                    System.out.println(res);
                }
            }
        }
    }
}
/**
 * Lists the villages on a town page, read from its {@code tr.villagetr} rows.
 *
 * <p>Each entry is the combined cell text of one row with its first 13
 * characters removed. insertVillage splits that remainder on a space and
 * uses the first token as the code and the second as the name.
 *
 * <p>NOTE(review): presumably the removed prefix is the 12-digit village
 * code plus a separator, so the stored "code" would be the column that
 * follows it; confirm that this is intended.
 *
 * @param townCode page path of the town relative to the site root without
 *                 extension, e.g. {@code 41/01/22/410122104}
 * @return the row remainders in page order; empty when the page could not
 *         be fetched
 */
public List<String> getVillagesByCountry(String townCode) {
    List<String> res = new ArrayList<>();
    Document doc = connect("http://localhost:8080/2020/" + townCode + ".html");
    if (doc == null) {
        // Same fallback getTownsByCountry uses when the page is unavailable
        return res;
    }
    for (Element row : doc.select("tr.villagetr")) {
        String txt = row.select("td").text();
        if (txt.length() < 13) {
            continue; // too short to contain the prefix that is cut off
        }
        res.add(txt.substring(13));
    }
    return res;
}
/**
 * Prints the village entries returned for page {@code 41/01/22/410122104}, one per line.
 */
@Test
void testGetVillagesByCountry() {
    List<String> villages = getVillagesByCountry("41/01/22/410122104");
    for (String village : villages) {
        System.out.println(village);
    }
}
@Resource
private VillageMapper villageMapper;

/**
 * Crawls the villages of every town and inserts them one town at a time,
 * linking each village to the id of its stored town row.
 *
 * <p>Town entries whose code is three characters long are skipped, as are
 * towns for which no village was parsed.
 *
 * @throws IllegalStateException if a town with villages has no row in the
 *         town table
 */
@Test
void insertVillage() {
    List<String> pList = getProvinces();
    // Index of the first province to process. 0 imports every province;
    // raise it only to resume an import that was interrupted part-way.
    final int startIndex = 0;
    for (int i = startIndex; i < pList.size(); i++) {
        String provinceCode = pList.get(i).split("\\*")[0]; // e.g. 13
        for (String c : getCitiesByProvince(provinceCode)) {
            String cityCode = c.split("\\*")[0]; // e.g. 1301
            for (String c2 : getCountriesByCity(cityCode.substring(0, 2) + "/" + cityCode)) {
                String countryCode = c2.split("\\*")[0]; // e.g. 130324
                String countryPath = countryCode.substring(0, 2) + "/" + countryCode.substring(2, 4) + "/" + countryCode;
                for (String t : getTownsByCountry(countryPath)) {
                    String townCode = t.split("\\*")[0]; // e.g. 140802204
                    if (townCode.length() == 3) {
                        // NOTE(review): presumably these are the entries
                        // getTownsByCountry builds from village rows, which
                        // have no town page of their own; confirm.
                        continue;
                    }
                    // Town pages are nested two digits at a time under the site root
                    String townPath = townCode.substring(0, 2) + "/" + townCode.substring(2, 4) + "/"
                            + townCode.substring(4, 6) + "/" + townCode;
                    List<String> vList = getVillagesByCountry(townPath);
                    if (vList.isEmpty()) {
                        continue; // nothing to insert, so no lookup is needed either
                    }
                    Town town = townMapper.selectOne(new QueryWrapper<Town>().eq("code", townCode));
                    if (town == null) {
                        // Fail with context instead of a bare NullPointerException
                        throw new IllegalStateException(
                                "Town " + townCode + " is not in the database; run insertTown first");
                    }
                    List<Village> list = new ArrayList<>();
                    for (String v : vList) {
                        // Entries are "code name"
                        String[] split5 = v.split(" ");
                        if (split5.length < 2) {
                            log.warn("Skipping village entry without a name: {}", v);
                            continue;
                        }
                        list.add(Village.builder().name(split5[1]).code(split5[0]).townId(town.getId()).build());
                    }
                    if (!list.isEmpty()) {
                        villageMapper.batchInsert(list);
                    }
                }
            }
        }
    }
}
}
資料庫SQL檔案下載地址
https://download.csdn.net/download/lianghecai52171314/18317507