Java 爬取國家統計局統計用區劃程式碼和城鄉劃分程式碼
阿新 • • 發佈:2018-12-17
插入速度比較慢,你可以改成查詢到一條資料就插入,或者儲存一個 List 批量插入。 用的 Spring Boot2、MyBatis Plus(Jdbc 都行,隨便你)、Junit5、okhttp、jsoup、dozer(你可以手動賦值,沒幾個屬性)。 總共 697,103 條資料。
import top.duanluan.Application;
import top.duanluan.entity.AdministrativeDivision;
import top.duanluan.service.IAdministrativeDivisionService;
import okhttp3. OkHttpClient;
import okhttp3.Request;
import okhttp3.Response;
import org.apache.commons.collections4.CollectionUtils;
import org.dozer.DozerBeanMapper;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.junit.jupiter. api.Test;
import org.junit.jupiter.api.extension.ExtendWith;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.boot.test.context.SpringBootTest;
import org.springframework.test.context.junit.jupiter.SpringExtension;
import java.io.IOException;
import java.io. InputStream;
import java.net.URL;
import java.util.*;
import java.util.concurrent.TimeUnit;
@ExtendWith(SpringExtension.class)
@SpringBootTest(classes = Application.class)
public class AdministrativeDivisionTest {
@Autowired
private IAdministrativeDivisionService administrativeDivisionService;
@Autowired
private DozerBeanMapper dozerBeanMapper;
private static final String INDEX_URL = "http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2017/";
private static final String CHARSET_NAME_GB2312 = "GB2312";
private static final String CHARSET_NAME_GBK = "GBK";
private static final String[] CLASS_NAMES = {".citytr", ".countytr", ".towntr", ".villagetr"};
@Test
void test() throws IOException, InterruptedException {
InputStream inputStream = new URL(INDEX_URL).openStream();
Document doc = Jsoup.parse(inputStream, CHARSET_NAME_GB2312, INDEX_URL);
inputStream.close();
Elements provinceElements = doc.select(".provincetr");
List<Map> privinceList = new LinkedList<>();
Map privinceMap;
for (Element provinceElement : provinceElements) {
Elements privinceLinks = provinceElement.select("a");
for (Element privinceLink : privinceLinks) {
privinceMap = new LinkedHashMap();
privinceMap.put("name", privinceLink.text());
List<Map> childList;
while (true) {
// 遞迴獲取 Child
getChild(privinceMap, INDEX_URL + privinceLink.attr("href"), 0);
childList = (List<Map>) privinceMap.get("child");
// 莫名其妙,不知道為什麼會出現 childList 為空的情況。
if (CollectionUtils.isNotEmpty(childList)) {
break;
}
System.out.println("childList 為空");
}
String code = childList.get(0).get("code").toString();
privinceMap.put("code", code.substring(0, 2) + "0000000000");
privinceList.add(privinceMap);
// break;
}
// break;
}
// 遞迴儲存
save(privinceList, 0L, 0);
}
/**
* 遞迴儲存
*
* @param list
* @param parentId
* @param level
* @throws InterruptedException
*/
private void save(List<Map> list, Long parentId, int level) throws InterruptedException {
if (level == CLASS_NAMES.length + 1) {
return;
}
level += 1;
if (list != null) {
for (Map map : list) {
AdministrativeDivision administrativeDivision = dozerBeanMapper.map(map, AdministrativeDivision.class);
administrativeDivision.setParentId(parentId);
administrativeDivision.setLevel((short) level);
while (true) {
try {
administrativeDivisionService.insert(administrativeDivision);
} catch (Exception e) {
if ("connection holder is null".equals(e.getMessage())) {
TimeUnit.MINUTES.sleep(1);
continue;
}
break;
}
break;
}
save((List<Map>) map.get("child"), administrativeDivision.getId(), level);
}
}
}
/**
* 獲取 child
*
* @param map
* @param url
* @param level
*/
private void getChild(Map map, String url, int level) {
if (level == CLASS_NAMES.length) {
return;
}
System.out.println(url);
Document doc;
while (true) {
try {
// OkHttpClient okHttpClient = new OkHttpClient.Builder().connectTimeout(1, TimeUnit.SECONDS).readTimeout(3, TimeUnit.SECONDS).build();
OkHttpClient okHttpClient = new OkHttpClient();
Request request = new Request.Builder().url(url).build();
Response response = okHttpClient.newCall(request).execute();
if (!response.isSuccessful()) {
continue;
}
byte[] bodyBytes = response.body().bytes();
String bodyText = new String(bodyBytes, CHARSET_NAME_GB2312);
if (bodyText.contains("�")) {
bodyText = new String(bodyBytes, CHARSET_NAME_GBK);
}
doc = Jsoup.parse(bodyText);
break;
} catch (IOException e) {
// e.printStackTrace();
System.out.println(e.getMessage());
}
}
List<Map> childList = new LinkedList<>();
Elements Elements = doc.select(CLASS_NAMES[level]);
level += 1;
Map childMap;
for (Element element : Elements) {
Elements links = element.select("td a");
// 市轄區
boolean isContinue = true;
if (links.size() == 0) {
links = element.select("td");
isContinue = false;
}
Element codeLink = links.first();
childMap = new LinkedHashMap();
childMap.put("code", codeLink.text());
childMap.put("name", links.last().text());
if (isContinue) {
getChild(childMap, url.substring(0, url.lastIndexOf("/") + 1) + codeLink.attr("href"), level);
}
childList.add(childMap);
}
map.put("level", level);
map.put("child", childList);
}
}