爬一下最新的行政区划
http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2019/index.html
引入依赖
<!-- https://mvnrepository.com/artifact/com.belerweb/pinyin4j -->
<dependency>
    <groupId>com.belerweb</groupId>
    <artifactId>pinyin4j</artifactId>
    <version>2.5.1</version>
</dependency>
<!-- https://mvnrepository.com/artifact/org.projectlombok/lombok -->
<dependency>
    <groupId>org.projectlombok</groupId>
    <artifactId>lombok</artifactId>
    <version>1.18.8</version>
    <scope>provided</scope>
</dependency>
<!-- https://mvnrepository.com/artifact/org.jsoup/jsoup -->
<dependency>
    <groupId>org.jsoup</groupId>
    <artifactId>jsoup</artifactId>
    <version>1.13.1</version>
</dependency>
<!-- https://mvnrepository.com/artifact/com.squareup.okhttp3/okhttp -->
<dependency>
    <groupId>com.squareup.okhttp3</groupId>
    <artifactId>okhttp</artifactId>
    <version>4.4.1</version>
</dependency>
<!-- https://mvnrepository.com/artifact/com.alibaba/fastjson -->
<!-- Required: the crawler imports com.alibaba.fastjson.JSON to serialize the result. -->
<dependency>
    <groupId>com.alibaba</groupId>
    <artifactId>fastjson</artifactId>
    <version>1.2.83</version>
</dependency>
<!-- https://mvnrepository.com/artifact/org.slf4j/slf4j-api -->
<!-- Required: Lombok's @Slf4j generates an org.slf4j.Logger field. -->
<dependency>
    <groupId>org.slf4j</groupId>
    <artifactId>slf4j-api</artifactId>
    <version>1.7.30</version>
</dependency>
代码
package com.demo.tools; import com.alibaba.fastjson.JSON; import lombok.Getter; import lombok.Setter; import lombok.extern.slf4j.Slf4j; import net.sourceforge.pinyin4j.PinyinHelper; import net.sourceforge.pinyin4j.format.HanyuPinyinCaseType; import net.sourceforge.pinyin4j.format.HanyuPinyinOutputFormat; import net.sourceforge.pinyin4j.format.exception.BadHanyuPinyinOutputFormatCombination; import okhttp3.ConnectionPool; import okhttp3.OkHttpClient; import okhttp3.Request; import okhttp3.Response; import org.jsoup.Jsoup; import org.jsoup.nodes.Document; import org.jsoup.nodes.Element; import org.jsoup.select.Elements; import java.io.FileNotFoundException; import java.io.FileOutputStream; import java.io.IOException; import java.util.*; import java.util.concurrent.TimeUnit; import java.util.regex.Pattern; /** * Created by 小LUA on 2020-03-30 11:39. */ @Slf4j public class GetProvince { private static final OkHttpClient client = new OkHttpClient.Builder() .connectTimeout(5, TimeUnit.MINUTES) .writeTimeout(5, TimeUnit.MINUTES) .readTimeout(5, TimeUnit.MINUTES) .connectionPool(new ConnectionPool(0, 30, TimeUnit.MINUTES)) .build(); /** * 获取首字母 * @param chinese * @return */ private static String getFirstSpell(String chinese) { try { HanyuPinyinOutputFormat defaultFormat = new HanyuPinyinOutputFormat(); defaultFormat.setCaseType(HanyuPinyinCaseType.UPPERCASE); String[] temp = new String[0]; try { temp = PinyinHelper.toHanyuPinyinStringArray(chinese.charAt(0), defaultFormat); } catch (BadHanyuPinyinOutputFormatCombination badHanyuPinyinOutputFormatCombination) { badHanyuPinyinOutputFormatCombination.printStackTrace(); } return temp[0].charAt(0)+""; } catch (Exception e){ return ""; } } /** * 读取URL内容 * @param url * @return * @throws IOException */ public static String readUrl(String url) throws IOException { System.out.println("读取URL:" + url); Request request = new Request.Builder() .url(url) .build(); Response response = client.newCall(request).execute(); String body = new 
String(response.body().bytes(), "gb2312"); // System.out.println(body); return body; } public static void main(String[] args) throws Exception { // 正则 Pattern pattern = Pattern.compile("[0-9]+"); // System.out.println(pattern.matcher("划代码").matches()); // System.out.println(pattern.matcher("110000").matches()); List<Location> all = new ArrayList<>(); String baseUrl = "http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2019/"; // 主页 Elements provinceList = null; do { String indexContent = readUrl(baseUrl + "index.html"); provinceList = Jsoup.parse(indexContent).getElementsByClass("provincetr"); System.out.println("provinceList是否为空:" + provinceList.isEmpty()); } while (provinceList.isEmpty()); for (Element pElement : provinceList) { // 获取省名称 + 子地址 Elements a = pElement.select("a"); for (Element e : a) { String pName = e.text(); String pHref = e.attr("href"); String pCode = pHref.substring(0, pHref.indexOf(".")); String cityUrl = baseUrl + pHref; System.out.println(pName + "," + pCode + "," + cityUrl); Long provinceCode = Long.valueOf(pCode + "0000"); // 设置省信息 Location provinceInfo = new Location(); provinceInfo.setCode(provinceCode); provinceInfo.setName(pName); provinceInfo.setLevel(1); provinceInfo.setLetterSort(getFirstSpell(pName)); List<Location> cities = new ArrayList<>(); provinceInfo.setChilds(cities); // 省下面:市 all.add(provinceInfo); // 读取城市页面 Elements cityList = null; do { String cityContent = readUrl(cityUrl); cityList = Jsoup.parse(cityContent).getElementsByClass("citytr"); System.out.println("cityList是否为空:" + cityList.isEmpty()); } while (cityList.isEmpty()); for (Element cElement : cityList) { Elements aa = cElement.select("a"); for (Element ee : aa) { String cName = ee.text(); // 过滤掉比如name为110100000000的数据,只需要取汉字的 if (pattern.matcher(cName).matches()){ continue; } String cHref = ee.attr("href"); String cCode = cHref.substring(cHref.indexOf("/")+1, cHref.indexOf(".")); String countyUrl = baseUrl + cHref; System.out.println(cName + "," + cCode + "," + 
countyUrl); Long cityCode = Long.valueOf(cCode + "00"); // 设置城市信息 Location city = new Location(); city.setCode(cityCode); city.setName(cName); city.setLevel(2); city.setLetterSort(getFirstSpell(cName)); List<Location> counties = new ArrayList<>(); city.setChilds(counties); // 市下面:区 city.setParentCode(provinceCode); cities.add(city); // 添加到城市列表 // 读取区页面 Elements countyList = null; do{ String countyContent = readUrl(countyUrl); countyList = Jsoup.parse(countyContent).getElementsByClass("countytr"); if ("http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2019/44/4419.html".equals(countyUrl) || "http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2019/44/4420.html".equals(countyUrl) || "http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2019/46/4604.html".equals(countyUrl)){ countyList = Jsoup.parse(countyContent).getElementsByClass("towntr"); } System.out.println("countyList是否为空:" + countyList.isEmpty()); } while (countyList.isEmpty()); for (Element aElement : countyList) { Elements aaa = aElement.select("a"); for (Element eee : aaa) { String aName = eee.text(); // 过滤掉比如name为110100000000的数据,只需要取汉字的 if (pattern.matcher(aName).matches()){ continue; } String aHref = eee.attr("href"); String aCode = aHref.substring(aHref.indexOf("/")+1, aHref.indexOf(".")); System.out.println(aName + "," + aCode); // 设置区信息 Location county = new Location(); county.setCode(Long.valueOf(aCode)); county.setName(aName); county.setLevel(3); county.setLetterSort(getFirstSpell(aName)); county.setParentCode(cityCode); counties.add(county); // 添加到区列表 } } } } } } String jsonString = JSON.toJSONString(all); System.out.println(jsonString); write(jsonString); } public static void write(String str) throws IOException { FileOutputStream out = new FileOutputStream("2019省市区-大陆.json"); out.write(str.getBytes()); out.flush(); out.close(); } } @Getter @Setter class Location{ private Long code; private String name; private Integer level; private String letterSort; private Long parentCode; private List<Location> 
childs; }
爬完的数据我只存到了 JSON 文件里。如果你需要存到数据库,只需要对 all 进行处理即可;或者先读文件再处理:
private static void read() throws IOException { FileInputStream in = new FileInputStream("2019省市区-大陆.json"); BufferedReader reader = new BufferedReader(new InputStreamReader(in)); StringBuilder sb = new StringBuilder(); String t; while ((t = reader.readLine()) != null){ sb.append(t); } List<Location> cities = JSONArray.parseArray(sb.toString(), Location.class); cities.forEach(e -> { // TODO }); }
另附全部数据。其中很小一部分名称爬下来是乱码,导致首字母识别不出来,需要手动改正(在文件中搜索【"letterSort": ""】即可定位)。
一共有6个,不算多。
数据文件:https://github.com/Mysakura/DataFiles
============================================
算了,我帮你们找出来了
{ "code": 341302, "letterSort": "Y", "level": 3, "name": "埇桥区", "parentCode": 341300 } { "code": 410304, "letterSort": "C", "level": 3, "name": "瀍河回族区", "parentCode": 410300 } { "code": 411502, "letterSort": "S", "level": 3, "name": "浉河区", "parentCode": 411500 } { "code": 420104, "letterSort": "Q", "level": 3, "name": "硚口区", "parentCode": 420100 } { "code": 420505, "letterSort": "X", "level": 3, "name": "猇亭区", "parentCode": 420500 } { "code": 610118, "letterSort": "H", "level": 3, "name": "鄠邑区", "parentCode": 610100 }