• 【爬虫】Java爬取省市县行政区域统计数据


    前言

    网上看了好几个Python爬虫来爬取省市县行政区域统计

    官网除了省市县以外,还有区,街道,居委村委层级

    https://zhuanlan.zhihu.com/p/512852193
    

    所以自己用Java写一个完整爬取的,之前写过的一版不是很理想

    这次换了更轻量的库来重构,逻辑也直观些

    依赖库:

    Hutool工具库,有Http工具包和DB操作的API

    Jsoup解析HTML代码,爬虫标配

    Lombok简化PO

    放FastJson是考虑可能不用DB存放,直接写JSON文件,在这里没用到

        <dependencies>
            <dependency>
                <groupId>cn.hutool</groupId>
                <artifactId>hutool-all</artifactId>
                <version>5.8.4</version>
            </dependency>
    
            <dependency>
                <groupId>mysql</groupId>
                <artifactId>mysql-connector-java</artifactId>
                <version>8.0.15</version>
            </dependency>
    
            <dependency>
                <groupId>com.alibaba</groupId>
                <artifactId>fastjson</artifactId>
                <version>1.2.62</version>
            </dependency>
    
            <!-- https://mvnrepository.com/artifact/org.jsoup/jsoup -->
            <dependency>
                <groupId>org.jsoup</groupId>
                <artifactId>jsoup</artifactId>
                <version>1.13.1</version>
            </dependency>
    
            <dependency>
                <groupId>org.projectlombok</groupId>
                <artifactId>lombok</artifactId>
                <version>1.18.10</version>
            </dependency>
    
            <!-- 连接池https://mvnrepository.com/artifact/com.alibaba/druid -->
            <dependency>
                <groupId>com.alibaba</groupId>
                <artifactId>druid</artifactId>
                <version>1.1.14</version>
            </dependency>
    
        </dependencies>
    

    HutoolDb需要的配置文件:

    ## db.setting文件
    
    url = jdbc:mysql://localhost:3308/my?serverTimezone=Asia/Shanghai
    user = root
    pass = 123456
    
    ## 可选配置
    # 是否在日志中显示执行的SQL
    showSql = true
    # 是否格式化显示的SQL
    formatSql = false
    # 是否显示SQL参数
    showParams = true
    # 打印SQL的日志等级,默认debug,可以是info、warn、error
    sqlLevel = debug
    
    #----------------------------------------------------------------------------------------------------------------
    ## 连接池配置项
    #————————————————
    #版权声明:本文为CSDN博主「soulCoke」的原创文章,遵循CC 4.0 BY-SA版权协议,转载请附上原文出处链接及本声明。
    #原文链接:https://blog.csdn.net/qq_36328170/article/details/105687633
    
    ## ---------------------------------------------------- Druid
    # 初始化时建立物理连接的个数。初始化发生在显式调用init方法,或者第一次getConnection时
    initialSize = 1
    # 最大连接池数量
    maxActive = 8
    # 最小连接池数量
    minIdle = 0
    # 获取连接时最大等待时间,单位毫秒。配置了maxWait之后, 缺省启用公平锁,并发效率会有所下降, 如果需要可以通过配置useUnfairLock属性为true使用非公平锁。
    maxWait = 0
    # 是否缓存preparedStatement,也就是PSCache。 PSCache对支持游标的数据库性能提升巨大,比如说oracle。 在mysql5.5以下的版本中没有PSCache功能,建议关闭掉。作者在5.5版本中使用PSCache,通过监控界面发现PSCache有缓存命中率记录, 应该是支持PSCache的。
    poolPreparedStatements = false
    # 要启用PSCache,必须配置大于0,当大于0时, poolPreparedStatements自动触发修改为true。 在Druid中,不会存在Oracle下PSCache占用内存过多的问题, 可以把这个数值配置大一些,比如说100
    maxOpenPreparedStatements = -1
    # 用来检测连接是否有效的sql,要求是一个查询语句。 如果validationQuery为null,testOnBorrow、testOnReturn、 testWhileIdle都不会起作用。
    validationQuery = SELECT 1
    # 申请连接时执行validationQuery检测连接是否有效,做了这个配置会降低性能。
    testOnBorrow = true
    # 归还连接时执行validationQuery检测连接是否有效,做了这个配置会降低性能
    testOnReturn = false
    # 建议配置为true,不影响性能,并且保证安全性。 申请连接的时候检测,如果空闲时间大于 timeBetweenEvictionRunsMillis,执行validationQuery检测连接是否有效。
    testWhileIdle = false
    # 有两个含义: 1) Destroy线程会检测连接的间隔时间 2) testWhileIdle的判断依据,详细看testWhileIdle属性的说明
    timeBetweenEvictionRunsMillis = 60000
    # 物理连接初始化的时候执行的sql
    connectionInitSqls = SELECT 1
    # 属性类型是字符串,通过别名的方式配置扩展插件, 常用的插件有: 监控统计用的filter:stat  日志用的filter:log4j 防御sql注入的filter:wall
    # filters = stat
    # 类型是List<com.alibaba.druid.filter.Filter>, 如果同时配置了filters和proxyFilters, 是组合关系,并非替换关系
    # proxyFilters =

    表结构:

    CREATE TABLE `region2021` (
      `CODE` varchar(24) CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci NOT NULL COMMENT '行政区代码',
      `PARENT_CODE` varchar(24) CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci DEFAULT NULL COMMENT '上级代码',
      `NAME` varchar(24) CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci DEFAULT NULL COMMENT '名称',
      `LINK` varchar(252) CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci DEFAULT NULL,
      `LEVEL` int DEFAULT NULL COMMENT '层级',
      `TYPE_CODE` varchar(12) CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci DEFAULT NULL COMMENT '类型',
      `GEN_TIME` datetime DEFAULT NULL COMMENT '创建时间',
      PRIMARY KEY (`CODE`) USING BTREE
    ) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_general_ci ROW_FORMAT=DYNAMIC;
    

      

    代码部分:

    封装的PO类:

    package cn.cloud9.rdp.po;
    
    import lombok.*;
    import java.time.LocalDateTime;
    
    /**
     * Smallest storage unit for one scraped administrative region.
     *
     * <p>Getters, setters, {@code equals}/{@code hashCode}/{@code toString}, both
     * constructors and the builder are generated by Lombok. {@code @Data} already
     * contributes {@code equals}/{@code hashCode}, so no separate annotation is needed.
     *
     * @projectName: administrative-region crawler
     * @author: Cloud9
     * @date: 2022-06-29 10:30
     * @version: 1.0
     */
    @Builder
    @NoArgsConstructor
    @AllArgsConstructor
    @Data
    public class RegionCell {

        /** Region name. */
        private String name;

        /** Address of the page that lists the next level below this region. */
        private String url;

        /** Statistical division code of this region. */
        private String regionCode;

        /** Statistical division code of the parent region. */
        private String parentCode;

        /** Urban/rural classification code. */
        private String typeCode;

        /** Depth of this region in the administrative hierarchy. */
        private Integer level;

        /** Time at which this record was created. */
        private LocalDateTime genTime;
    }
    

      

    Main启动类:

    本来是想用递归写逻辑的,但是每一层的逻辑不是完全一样,所以不采用递归

    层级是可以确认的,最底层到village就没有了

    package cn.cloud9.rdp;
    
    import cn.cloud9.rdp.po.RegionCell;
    import cn.hutool.core.collection.CollectionUtil;
    import cn.hutool.db.Db;
    import cn.hutool.db.Entity;
    import cn.hutool.http.HttpRequest;
    import cn.hutool.http.HttpResponse;
    import cn.hutool.http.HttpUtil;
    import org.jsoup.Jsoup;
    import org.jsoup.nodes.Document;
    import org.jsoup.nodes.Element;
    import org.jsoup.select.Elements;
    
    import java.net.HttpCookie;
    import java.sql.SQLException;
    import java.time.LocalDateTime;
    import java.util.List;
    import java.util.Map;
    import java.util.Random;
    import java.util.concurrent.ConcurrentHashMap;
    
    /**
     * Crawls the 2021 statistical division code pages of www.stats.gov.cn level by level
     * (province, city, county, town, village) and writes every region it finds into the
     * {@code region2021} table.
     *
     * <p>The province index page has its own layout and is handled in {@link #main}.
     * Every level below it is a table of rows that share one structure, so they are all
     * processed by the same recursive helper.
     *
     * @projectName: administrative-region crawler
     * @author: Cloud9
     * @date: 2022-06-29 09:55
     * @version: 1.0
     */
    public class MainApplication {

        public static final String HEADER_URL = "http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2021/";
        public static final String COOKIE_KEY = "SF_cookie_1";
        public static final String COOKIE_HEADER_KEY = "Cookie";
        public static final String USER_AGENT = "User-Agent";
        public static final String[] BROWSER_AGENTS = {
            "Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/102.0.0.0 Mobile Safari/537.36",
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36",
            "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:102.0) Gecko/20100101 Firefox/102.0"
        };
        public final static Random R = new Random();
        public static final String TABLE_NAME = "region2021";

        /** Level stored for the entries of the province index page. */
        private static final int PROVINCE_LEVEL = 1;
        /** Deepest level; its rows carry a type code and no link. */
        private static final int VILLAGE_LEVEL = 5;
        /** CSS class of the table rows for levels 2 (city) to 5 (village), indexed by {@code level - 2}. */
        private static final String[] ROW_CLASSES = {"citytr", "countytr", "towntr", "villagetr"};

        /**
         * Creates a fresh, mutable header map pre-filled with the {@code Host} header.
         *
         * @return a new map on every call; callers may add further headers to it
         */
        public static Map<String, String> getNewRequestHeader() {
            Map<String, String> header = new ConcurrentHashMap<>();
            header.put("Host", "www.stats.gov.cn");
            return header;
        }

        /**
         * Empties the target table, loads the province index page and crawls every
         * province down to village level.
         *
         * @param args unused
         * @throws SQLException if the table cannot be truncated
         */
        public static void main(String[] args) throws SQLException {
            // Open the MySQL connection and start from an empty table.
            Db db = Db.use();
            db.execute("TRUNCATE TABLE " + TABLE_NAME);

            // First request: the province index page.
            HttpResponse httpResponse = HttpUtil.createGet(HEADER_URL).execute();
            HttpCookie cookie = httpResponse.getCookie(COOKIE_KEY);
            System.out.println(cookie);

            int status = httpResponse.getStatus();
            String body = httpResponse.body();
            System.out.println("省 响应状态 " + status);
            System.out.println("省 响应内容 " + body);
            if (status != 200) {
                System.out.println("爬取异常,程序终止");
                return;
            }

            // The cookie may be absent; in that case no Cookie header is sent at all.
            String cookieHeader = cookie == null ? null : formatCookie(cookie);

            // Parsing with a base URI lets jsoup resolve the relative hrefs of the page.
            Document doc = Jsoup.parse(body, HEADER_URL);
            // Provinces live in <tr class="provincetr">, several <a> tags per row.
            Elements provinceRows = doc.getElementsByClass("provincetr");
            for (int rowIndex = 0; rowIndex < provinceRows.size(); rowIndex++) {
                Elements provinceLinks = provinceRows.get(rowIndex).getElementsByTag("a");
                for (int linkIndex = 0; linkIndex < provinceLinks.size(); linkIndex++) {
                    Element link = provinceLinks.get(linkIndex);
                    RegionCell province = RegionCell.builder()
                        .name(link.text())
                        .url(absoluteLink(link))
                        // The index page shows no code for a province, so one is made up
                        // from the 1-based row position followed by the 1-based link position.
                        .regionCode(String.valueOf(rowIndex + 1) + (linkIndex + 1))
                        // Provinces have no parent.
                        .parentCode(String.valueOf(0))
                        .genTime(LocalDateTime.now())
                        .build();
                    System.out.println(province);
                    saveCell(db, province, PROVINCE_LEVEL);

                    if (province.getUrl() != null) {
                        crawlChildren(db, province, cookieHeader, PROVINCE_LEVEL + 1);
                    }
                }
            }

            System.out.println("爬取完毕");
        }

        /**
         * Fetches the page of {@code parent}, stores every row of the given level and
         * descends into each row that links to a further page.
         *
         * @param db           database handle used for the inserts
         * @param parent       region whose {@code url} is requested; must have a non-null url
         * @param cookieHeader value for the {@code Cookie} header, or {@code null} for none
         * @param level        level of the rows expected on the page, 2 (city) to 5 (village)
         */
        private static void crawlChildren(Db db, RegionCell parent, String cookieHeader, int level) {
            HttpResponse response = fetch(parent.getUrl(), cookieHeader);
            // The status is only reported; a page without matching rows simply yields nothing.
            int status = response.getStatus();
            System.out.println(parent.getName() + " | " + parent.getUrl() + " | 请求状态" + status);

            // Cookies of this response are handed to the requests of the next level.
            String childCookieHeader = toCookieHeader(response.getCookies());
            // Hrefs are relative to the page they appear on, so the page URL is the base URI.
            Document page = Jsoup.parse(response.body(), parent.getUrl());

            for (Element row : page.getElementsByClass(ROW_CLASSES[level - 2])) {
                RegionCell child = level == VILLAGE_LEVEL
                    ? parseVillageRow(row, parent)
                    : parseLinkedRow(row, parent);
                if (child == null) {
                    continue;
                }
                System.out.println(child);
                saveCell(db, child, level);

                // Regions without a link have no further level to crawl.
                if (child.getUrl() != null && level < VILLAGE_LEVEL) {
                    crawlChildren(db, child, childCookieHeader, level + 1);
                }
            }
        }

        /**
         * Reads a city, county or town row: first cell is the code, second the name.
         * Either cell may wrap its text in an {@code <a>} that links to the next level.
         *
         * @return the parsed region, or {@code null} when the row has fewer than two cells
         */
        private static RegionCell parseLinkedRow(Element row, RegionCell parent) {
            Elements cells = row.getElementsByTag("td");
            if (cells.size() < 2) {
                return null;
            }
            Element codeCell = cells.get(0);
            Element nameCell = cells.get(1);
            // Both cells carry the same href; the one in the code cell is used.
            Element codeLink = codeCell.getElementsByTag("a").first();
            Element nameLink = nameCell.getElementsByTag("a").first();

            return RegionCell.builder()
                .name(nameLink != null ? nameLink.text() : nameCell.text())
                .url(codeLink != null ? absoluteLink(codeLink) : null)
                .regionCode(codeLink != null ? codeLink.text() : codeCell.text())
                .parentCode(parent.getRegionCode())
                .genTime(LocalDateTime.now())
                .build();
        }

        /**
         * Reads a village row: code, urban/rural type code, name. Village rows have no link.
         *
         * @return the parsed region, or {@code null} when the row has fewer than three cells
         */
        private static RegionCell parseVillageRow(Element row, RegionCell parent) {
            Elements cells = row.getElementsByTag("td");
            if (cells.size() < 3) {
                return null;
            }
            return RegionCell.builder()
                .regionCode(cells.get(0).text())
                .typeCode(cells.get(1).text())
                .name(cells.get(2).text())
                .parentCode(parent.getRegionCode())
                .genTime(LocalDateTime.now())
                .build();
        }

        /**
         * Resolves the {@code href} of a link against the base URI of its document.
         *
         * @return the absolute URL, or {@code null} when the link has no resolvable href
         */
        private static String absoluteLink(Element link) {
            String absolute = link.absUrl("href");
            return absolute.isEmpty() ? null : absolute;
        }

        /**
         * Sends a GET request with the {@code Host} header, a randomly chosen
         * {@code User-Agent} and, when given, the {@code Cookie} header.
         */
        private static HttpResponse fetch(String url, String cookieHeader) {
            Map<String, String> header = getNewRequestHeader();
            // ConcurrentHashMap rejects null values, so the header is only set when present.
            if (cookieHeader != null && !cookieHeader.isEmpty()) {
                header.put(COOKIE_HEADER_KEY, cookieHeader);
            }
            header.put(USER_AGENT, BROWSER_AGENTS[R.nextInt(BROWSER_AGENTS.length)]);
            return HttpUtil.createGet(url).addHeaders(header).execute();
        }

        /**
         * Joins all cookies into one {@code Cookie} header value ({@code a=1; b=2}).
         *
         * @return the header value, or {@code null} when there are no cookies
         */
        private static String toCookieHeader(List<HttpCookie> cookies) {
            if (cookies == null || cookies.isEmpty()) {
                return null;
            }
            StringBuilder joined = new StringBuilder();
            for (HttpCookie cookie : cookies) {
                if (joined.length() > 0) {
                    joined.append("; ");
                }
                joined.append(formatCookie(cookie));
            }
            return joined.toString();
        }

        /** Renders one cookie as the {@code name=value} pair used in a request header. */
        private static String formatCookie(HttpCookie cookie) {
            return cookie.getName() + "=" + cookie.getValue();
        }

        /**
         * Writes one region to {@link #TABLE_NAME}. A failing insert is reported and
         * skipped so that one bad row does not stop the crawl.
         *
         * @param level value stored in the {@code LEVEL} column
         */
        private static void saveCell(Db db, RegionCell cell, int level) {
            Entity record = Entity.create(TABLE_NAME)
                .set("CODE", cell.getRegionCode())
                .set("PARENT_CODE", cell.getParentCode())
                .set("NAME", cell.getName())
                .set("LEVEL", level)
                .set("LINK", cell.getUrl())
                .set("GEN_TIME", cell.getGenTime());
            // Only village rows carry a type code.
            if (cell.getTypeCode() != null) {
                record.set("TYPE_CODE", cell.getTypeCode());
            }
            try {
                db.insertOrUpdate(record);
            } catch (SQLException e) {
                System.err.println("写入失败: " + cell);
                e.printStackTrace();
            }
        }
    }
    

      

    注意项:

    在爬取居委村委层级时存在反爬限制,请求会被阻塞10分钟,程序不会报链接超时

    这个问题暂时没找到解决办法,就是这样爬取的效率慢很多

    我的思路是想,可不可以判断是否阻塞,如果阻塞就直接重新请求尝试

    2022年7月6日22点49分更新:

    对请求进行封装,使用递归不停止请求

        /**
         * Executes the request and keeps retrying until one attempt completes without
         * throwing.
         *
         * <p>Retries run in a loop rather than by recursion, so a long series of failed
         * attempts cannot exhaust the call stack.
         *
         * @param httpRequest request to send; the timeouts are applied to it before the first attempt
         * @return the response of the first attempt that did not throw
         */
        private static HttpResponse retryConn(HttpRequest httpRequest) {
            // The timeouts do not change between attempts, so they are configured once.
            httpRequest
                    .timeout(TIMEOUT)
                    .setConnectionTimeout(TIMEOUT)
                    .setReadTimeout(TIMEOUT);
            while (true) {
                try {
                    return httpRequest.execute();
                } catch (Exception exception) {
                    // Report the failed attempt and try again.
                    exception.printStackTrace();
                }
            }
        }
    

     

    所有的请求都这样改成异常递归执行

    // HttpResponse response = HttpUtil.createGet(cell.getUrl()).addHeaders(header).execute();
    HttpResponse response = retryConn(HttpUtil.createGet(cell.getLink()).addHeaders(header)); 

    但是发现还是有爆栈的情况:

    所以又对应地写了一份补数据的逻辑:

    1、补数据直接采用递归实现

    2、首先要查询已经爬取的数据,查询那些本该有子节点,但实际为空的记录

    3、查询得到之后遍历记录记载的LINK,继续爬取

    4、发现44和46两个省份的link规则不一样,要单独做调整(Ctrl + F 输44,搜下面的代码)

    5、LEVEL的层级也不能是确定的1 - 2 - 3 - 4 - 5, 所以改用自连接LEFT JOIN,根据上级代码查询

    6、逻辑可重复执行

    package cn.cloud9.fix;
    
    import cn.cloud9.po.RegionCell;
    import cn.hutool.core.collection.CollUtil;
    import cn.hutool.db.Db;
    import cn.hutool.db.Entity;
    import cn.hutool.http.HttpRequest;
    import cn.hutool.http.HttpResponse;
    import cn.hutool.http.HttpUtil;
    import com.alibaba.druid.util.StringUtils;
    import lombok.SneakyThrows;
    import org.jsoup.Jsoup;
    import org.jsoup.nodes.Document;
    import org.jsoup.nodes.Element;
    import org.jsoup.select.Elements;
    
    import java.sql.SQLException;
    import java.time.LocalDateTime;
    import java.util.HashMap;
    import java.util.List;
    import java.util.Map;
    import java.util.Random;
    
    /**
     * @author OnCloud9
     * @description
     * @project RegionReptile
     * @date 2022年07月04日 下午 09:21
     */
    public class DataFixApplication {
        // Value passed to the timeout setters of each HTTP request built in readDataRecursive.
        private static final int TIMEOUT = 3000;
    
        // The next three fields are assigned by fixData(year) before any query or request is made.
        // YEAR is the year suffix, HEADER_URL the site root for that year, TABLE_NAME "region" + YEAR.
        private static String YEAR;
        private static String HEADER_URL;
        private static String TABLE_NAME;
        // Names matching the columns of the region table.
        // NOTE(review): presumably used by the DB read/write helpers further down the class - confirm.
        public static final String CODE = "CODE";
        public static final String PARENT_CODE = "PARENT_CODE";
        public static final String NAME = "NAME";
        public static final String LEVEL = "LEVEL";
        public static final String TYPE_CODE = "TYPE_CODE";
        public static final String LINK = "LINK";
        public static final String GEN_TIME = "GEN_TIME";
        // HTTP header names set on outgoing requests.
        public static final String REFERER = "Referer";
        public static final String USER_AGENT = "User-Agent";
        // Pool of browser User-Agent strings; one is picked at random per request.
        public static final String[] BROWSER_AGENTS = {
                "Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/102.0.0.0 Mobile Safari/537.36",
                "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36",
                "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:102.0) Gecko/20100101 Firefox/102.0",
                "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.5060.53 Safari/537.36 Edg/103.0.1264.37",
                "Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko"
        };
        // Random source used to index into BROWSER_AGENTS.
        public final static Random R = new Random();
    
        // Database handle shared by all static methods of this class.
        private static Db db = Db.use();
    
        /**
         * Program entry point. The body is empty in this listing, so running the class
         * does nothing; the repair itself is started through {@code fixData(year)}.
         *
         * @param args unused
         */
        public static void main(String[] args) {
    
    
        }
    
        /**
         * Repairs the data of one year: for each of the levels 1 to 4 it queries the
         * records reported as lost by {@code queryLostData} and re-crawls each of them.
         * Prints a completion message only when no level reported any lost record.
         *
         * @param year year suffix used for both the table name and the site URL
         */
        public static void fixData(String year) {
            YEAR = year;
            TABLE_NAME = "region" + YEAR;
            HEADER_URL = "http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/" + YEAR + "/";

            int totalLost = 0;
            // Levels are handled in ascending order; each level is queried only after the
            // previous one has been re-crawled.
            for (int level = 1; level <= 4; level++) {
                final List<RegionCell> lostCells = queryLostData(db, TABLE_NAME, level);
                totalLost += lostCells.size();
                for (RegionCell lostCell : lostCells) {
                    readDataRecursive(lostCell);
                }
            }

            if (totalLost == 0) {
                System.out.println(YEAR + "年数据补完!");
            }
        }
    
    
        /**
         * Requests the page linked from {@code regionCell} and hands the parsed document
         * to the city, county, town and village readers, each of which picks up the rows
         * of its own level.
         *
         * <p>A record without a link, a request that throws, and a non-OK response are all
         * skipped without raising, so the remaining records of the pass are still processed.
         *
         * @param regionCell record whose {@code link} is to be crawled; its cookies, if any,
         *                   are sent along with the request
         * @return always {@code null}
         */
        private static RegionCell readDataRecursive(RegionCell regionCell) {
            final String regionCellLink = regionCell.getLink();
            if (StringUtils.isEmpty(regionCellLink)) {
                return null;
            }

            final Map<String, String> headers = new HashMap<>();
            if (!CollUtil.isEmpty(regionCell.getCookies())) {
                // All cookies go into ONE header value; putting them one by one under the
                // same key would keep only the last cookie.
                final StringBuilder cookieHeader = new StringBuilder();
                regionCell.getCookies().forEach(cookie -> {
                    if (cookieHeader.length() > 0) {
                        cookieHeader.append("; ");
                    }
                    cookieHeader.append(cookie);
                });
                headers.put("Cookie", cookieHeader.toString());
            }
            // Browser-like headers are sent on every request, including the first one of
            // a subtree, where no cookies are available yet.
            headers.put(REFERER, regionCellLink);
            headers.put(USER_AGENT, BROWSER_AGENTS[R.nextInt(BROWSER_AGENTS.length)]);
            headers.put("Host", "www.stats.gov.cn");
            headers.put("Upgrade-Insecure-Requests", "1");

            final HttpRequest httpRequest = HttpUtil
                    .createGet(regionCellLink)
                    .timeout(TIMEOUT)
                    .setConnectionTimeout(TIMEOUT)
                    .setReadTimeout(TIMEOUT)
                    .addHeaders(headers);

            final HttpResponse httpResponse;
            try {
                httpResponse = httpRequest.execute();
            } catch (RuntimeException e) {
                // One failed or timed-out request must not abort the whole pass.
                System.err.println("请求失败,已跳过: " + regionCellLink);
                e.printStackTrace();
                return null;
            }
            if (!httpResponse.isOk()) {
                return null;
            }

            // Carry the cookies of this response over to the requests of the next level.
            final RegionCell newCell = RegionCell.builder()
                    .cookies(httpResponse.getCookies())
                    .build();

            final Document document = Jsoup.parse(httpResponse.body());
            readCityData(document, newCell, regionCell);
            readCountyData(document, newCell, regionCell);
            readTownData(document, newCell, regionCell);
            readVillageData(document, newCell, regionCell);
            return null;
        }
    
        /**
         * Stores every village / neighbourhood-committee row ({@code villagetr}) of the page
         * as a level-5 record.
         *
         * @param document  parsed page
         * @param newCell   scratch cell that is overwritten and written to the DB for each row
         * @param superCell parent region; its code becomes the parent code of every row
         */
        private static void readVillageData(Document document, RegionCell newCell, RegionCell superCell) {
            final Elements rows = document.getElementsByClass("villagetr");
            if (CollUtil.isEmpty(rows)) {
                return;
            }
            for (Element row : rows) {
                // Column order on the page: code, type code, name.
                final Elements cells = row.getElementsByTag("td");
                final String code = cells.get(0).text();
                final String typeCode = cells.get(1).text();
                final String name = cells.get(2).text();

                newCell.setCode(code);
                newCell.setParentCode(superCell.getCode());
                newCell.setName(name);
                newCell.setTypeCode(typeCode);
                newCell.setGenTime(LocalDateTime.now());
                writeDataToDb(5, newCell);
            }
        }
    
        /**
         * Stores every town / street row ({@code towntr}) of the page as a level-4 record
         * and descends into the row's child page when it has one.
         * <p>
         * The link is recomputed for every row and cleared when the row has no anchor, so
         * that a link-less row never inherits the previous row's URL from the shared
         * {@code newCell}.
         *
         * @param document  parsed page
         * @param newCell   scratch cell that is overwritten and written to the DB for each row
         * @param superCell parent region; supplies the parent code and, for some codes, the base path
         */
        private static void readTownData(Document document, RegionCell newCell, RegionCell superCell) {
            Elements townTrList = document.getElementsByClass("towntr");
            if (CollUtil.isEmpty(townTrList)) return;
            townTrList.forEach(townTr -> {
                Elements townTds = townTr.getElementsByTag("td");
                Element townCodeTd = townTds.get(0);
                Element townNameTd = townTds.get(1);
                Elements aTagsInTownCodeTd = townCodeTd.getElementsByTag("a");
                Elements aTagsInTownNameTd = townNameTd.getElementsByTag("a");

                // Code and name come from the anchor when the cell is a link, else from the cell itself.
                final boolean isTownLinkTag = !CollUtil.isEmpty(aTagsInTownCodeTd);
                final String regionTownCode = isTownLinkTag
                        ? aTagsInTownCodeTd.get(0).text()
                        : townCodeTd.text();
                final String regionTownName = CollUtil.isEmpty(aTagsInTownNameTd)
                        ? townNameTd.text()
                        : aTagsInTownNameTd.get(0).text();

                String townLink = null;
                if (isTownLinkTag) {
                    final String townNextHref = aTagsInTownCodeTd.get(0).attr("href");
                    // startsWith instead of substring(0, 2): same result, but no exception on short codes.
                    // NOTE(review): 44 / 46 are presumably provinces whose cities list towns directly — confirm.
                    if (regionTownCode.startsWith("44") || regionTownCode.startsWith("46")) {
                        // Resolve the href relative to the parent page's directory.
                        final String link = superCell.getLink();
                        final String basePath = link.substring(0, link.lastIndexOf("/") + 1);
                        townLink = basePath + townNextHref;
                    } else {
                        // Rebuild "<province>/<city>/" from the digits that follow the first slash of the href.
                        int index = townNextHref.indexOf("/");
                        String provincePath = townNextHref.substring(index + 1, index + 3);
                        String cityPath = "/" + townNextHref.substring(index + 3, index + 5) + "/";
                        townLink = HEADER_URL + provincePath + cityPath + townNextHref;
                    }
                }

                // Always overwrite the link (possibly with null) — newCell is shared by all rows.
                newCell.setLink(townLink);
                newCell.setCode(regionTownCode);
                newCell.setParentCode(superCell.getCode());
                newCell.setName(regionTownName);
                newCell.setGenTime(LocalDateTime.now());
                writeDataToDb(4, newCell);
                readDataRecursive(newCell);
            });
        }
    
        /**
         * Stores every county / district row ({@code countytr}) of the page as a level-3
         * record and descends into the row's child page when it has one.
         * <p>
         * The link is recomputed for every row and cleared when the row has no anchor, so
         * that a link-less row never inherits the previous row's URL from the shared
         * {@code newCell}.
         *
         * @param document  parsed page
         * @param newCell   scratch cell that is overwritten and written to the DB for each row
         * @param superCell parent region; its code becomes the parent code of every row
         */
        private static void readCountyData(Document document, RegionCell newCell, RegionCell superCell) {
            Elements countyTrList = document.getElementsByClass("countytr");
            if (CollUtil.isEmpty(countyTrList)) return;
            countyTrList.forEach(countyTr -> {
                Elements countyTds = countyTr.getElementsByTag("td");
                Element countyCodeTd = countyTds.get(0);
                Element countyNameTd = countyTds.get(1);
                Elements aTagsInCountyCodeTd = countyCodeTd.getElementsByTag("a");
                Elements aTagsInCountyNameTd = countyNameTd.getElementsByTag("a");

                // Code and name come from the anchor when the cell is a link, else from the cell itself.
                final boolean isCountyLinkTag = !CollUtil.isEmpty(aTagsInCountyCodeTd);
                final String regionCountyCode = isCountyLinkTag
                        ? aTagsInCountyCodeTd.get(0).text()
                        : countyCodeTd.text();
                final String regionCountyName = CollUtil.isEmpty(aTagsInCountyNameTd)
                        ? countyNameTd.text()
                        : aTagsInCountyNameTd.get(0).text();

                String countyUrl = null;
                if (isCountyLinkTag) {
                    // Prefix the href with "<province>/", taken from the two characters after its first slash.
                    final String countyNextHref = aTagsInCountyCodeTd.get(0).attr("href");
                    int index = countyNextHref.indexOf('/');
                    String provincePath = countyNextHref.substring(index + 1, index + 3) + "/";
                    countyUrl = HEADER_URL + provincePath + countyNextHref;
                }

                // Always overwrite the link (possibly with null) — newCell is shared by all rows.
                newCell.setLink(countyUrl);
                newCell.setCode(regionCountyCode);
                newCell.setParentCode(superCell.getCode());
                newCell.setName(regionCountyName);
                newCell.setGenTime(LocalDateTime.now());
                writeDataToDb(3, newCell);
                readDataRecursive(newCell);
            });
        }
    
        /**
         * Stores every city row ({@code citytr}) of the page as a level-2 record and
         * descends into the row's child page when it has one.
         * <p>
         * The link is recomputed for every row and cleared when the row has no anchor, so
         * that a link-less row never inherits the previous row's URL from the shared
         * {@code newCell}.
         *
         * @param document  parsed page
         * @param newCell   scratch cell that is overwritten and written to the DB for each row
         * @param superCell parent region; its code becomes the parent code of every row
         */
        private static void readCityData(Document document, RegionCell newCell, RegionCell superCell) {
            final Elements citytrList = document.getElementsByClass("citytr");
            if (CollUtil.isEmpty(citytrList)) return;
            citytrList.forEach(cityTr -> {
                Elements cityTds = cityTr.getElementsByTag("td");
                Element codeTd = cityTds.get(0);
                Element nameTd = cityTds.get(1);
                Elements asTagInCodeTd = codeTd.getElementsByTag("a");
                Elements asTagInNameTd = nameTd.getElementsByTag("a");

                // Code and name come from the anchor when the cell is a link, else from the cell itself.
                final boolean isLinkTag = !CollUtil.isEmpty(asTagInCodeTd);
                final String regionCityCode = isLinkTag
                        ? asTagInCodeTd.get(0).text()
                        : codeTd.text();
                final String regionName = CollUtil.isEmpty(asTagInNameTd)
                        ? nameTd.text()
                        : asTagInNameTd.get(0).text();

                // Always overwrite the link (possibly with null) — newCell is shared by all rows.
                newCell.setLink(isLinkTag ? HEADER_URL + asTagInCodeTd.get(0).attr("href") : null);
                newCell.setCode(regionCityCode);
                newCell.setParentCode(superCell.getCode());
                newCell.setName(regionName);
                newCell.setGenTime(LocalDateTime.now());
                writeDataToDb(2, newCell);
                readDataRecursive(newCell);
            });
        }
    
        /**
         * Inserts the cell into {@code TABLE_NAME}, or updates the existing row that has
         * the same {@code CODE}. A {@link SQLException} is printed and otherwise ignored.
         *
         * @param level hierarchy level stored in the {@code LEVEL} column
         * @param cell  region data to persist
         */
        private static void writeDataToDb(int level, RegionCell cell) {
            final Entity row = Entity.create(TABLE_NAME);
            row.set(CODE, cell.getCode());
            row.set(PARENT_CODE, cell.getParentCode());
            row.set(NAME, cell.getName());
            row.set(LEVEL, level);
            row.set(TYPE_CODE, cell.getTypeCode());
            row.set(LINK, cell.getLink());
            row.set(GEN_TIME, cell.getGenTime());

            try {
                // CODE is the key that decides between insert and update.
                db.insertOrUpdate(row, CODE);
            } catch (SQLException sqlFailure) {
                sqlFailure.printStackTrace();
            }
        }
    
        /**
         * Finds the regions of one level that have a link but no child rows in the table,
         * i.e. regions whose sub-page apparently was never stored.
         * <p>
         * Example of the generated statement for table {@code region2021}, level 4:
         * <pre>
         * SELECT
         *  SUPER.*
         * FROM
         *  (SELECT * FROM region2021 WHERE `LEVEL` = 4) AS SUPER
         *  LEFT JOIN (SELECT * FROM region2021) AS SUB ON SUPER.CODE = SUB.PARENT_CODE
         *  WHERE SUB.PARENT_CODE IS NULL AND SUPER.LINK IS NOT NULL
         * </pre>
         *
         * @param db        database handle used to run the query
         * @param tableName table to inspect; concatenated into the SQL text, so it must be a trusted name
         * @param level     hierarchy level of the parent rows, bound as a query parameter
         * @return the parent rows without children, mapped to {@link RegionCell}
         */
        @SneakyThrows
        private static List<RegionCell> queryLostData(
            Db db,
            String tableName,
            int level
        ) {
            // Parent rows of the requested level.
            final String parents =
                "\t(SELECT * FROM " + tableName + " WHERE `LEVEL` = ? ) AS SUPER\n";
            // Every row, joined as a potential child of a parent.
            final String children =
                "\tLEFT JOIN (SELECT * FROM " + tableName + " ) AS SUB ON SUPER.CODE = SUB.PARENT_CODE\n";
            final String lostRowsSql =
                "SELECT \n\tSUPER.*\nFROM \n"
                + parents
                + children
                + "\tWHERE SUB.PARENT_CODE IS NULL AND SUPER.LINK IS NOT NULL";
            return db.query(lostRowsSql, RegionCell.class, level);
        }
    
    }
    

      

     2022年7月9日06点08分更新

    通过写补偿逻辑发现可以进一步优化代码结构:

    1、常量统一存放

    package cn.cloud9.constant;
    
    import java.util.Random;
    
    /**
     * Shared constants of the region crawler: HTTP settings, table column names and
     * the URL template of the source site.
     */
    public interface Constant {

        // ---- source site ----

        /** Root URL template; the {@code ${YEAR}} placeholder is replaced with the year to crawl. */
        String ROOT_PATH = "http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/${YEAR}/";
        /** URL path separator. */
        String PATH_CHAR = "/";

        // ---- HTTP ----

        /** Timeout value passed to the HTTP request timeout setters. */
        int TIMEOUT = 3000;
        /** Name of the Referer request header. */
        String REFERER = "Referer";
        /** Name of the User-Agent request header. */
        String USER_AGENT = "User-Agent";
        /** User-Agent values; one is picked at random per request. */
        String[] BROWSER_AGENTS = {
            "Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/102.0.0.0 Mobile Safari/537.36",
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36",
            "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:102.0) Gecko/20100101 Firefox/102.0",
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.5060.53 Safari/537.36 Edg/103.0.1264.37",
            "Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko"
        };
        /** Shared random source used to pick a User-Agent. */
        Random R = new Random();

        // ---- table columns ----

        String CODE = "CODE";
        String PARENT_CODE = "PARENT_CODE";
        String NAME = "NAME";
        String LEVEL = "LEVEL";
        String TYPE_CODE = "TYPE_CODE";
        String LINK = "LINK";
        String GEN_TIME = "GEN_TIME";
    }
    

      

    2、调用的方法封装在Util中

    package cn.cloud9.util;
    
    import cn.cloud9.constant.Constant;
    import cn.cloud9.po.RegionCell;
    import cn.hutool.core.collection.CollUtil;
    import cn.hutool.db.Db;
    import cn.hutool.db.Entity;
    import cn.hutool.http.HttpRequest;
    import cn.hutool.http.HttpResponse;
    import cn.hutool.http.HttpUtil;
    import com.alibaba.druid.util.StringUtils;
    import lombok.SneakyThrows;
    import org.jsoup.Jsoup;
    import org.jsoup.nodes.Document;
    import org.jsoup.nodes.Element;
    import org.jsoup.select.Elements;
    
    import java.sql.SQLException;
    import java.time.LocalDateTime;
    import java.util.HashMap;
    import java.util.Map;
    
    import static cn.cloud9.constant.Constant.*;
    
    /**
     * Crawling helpers: table creation, HTTP access with retry, page parsing for each
     * administrative level and persistence of the parsed rows.
     * <p>
     * Levels written to the {@code LEVEL} column: 1 province, 2 city, 3 county,
     * 4 town, 5 village.
     *
     * @author OnCloud9
     * @project RegionReptile-Remaster
     * @date 2022-07-07 10:01 PM
     */
    public class MyUtil {
        private static final Db db = Db.use();

        /**
         * Creates a fresh, mutable header map pre-filled with the fixed request headers.
         *
         * @return a new map containing {@code Host} and {@code Upgrade-Insecure-Requests}
         */
        public static Map<String, String> getNewRequestHeader() {
            // Plain HashMap instead of double-brace initialization (no anonymous subclass).
            final Map<String, String> headers = new HashMap<>();
            headers.put("Host", "www.stats.gov.cn");
            headers.put("Upgrade-Insecure-Requests", "1");
            return headers;
        }

        /**
         * Creates the region table if it does not exist yet.
         *
         * @param tableName table to create; concatenated into the DDL, so it must be a trusted name
         */
        @SneakyThrows
        public static void initialTableSpace(String tableName) {
            // LINK must default to NULL ('网页地址' is its comment, not its default value).
            String SQL =
                "CREATE TABLE IF NOT EXISTS "+ tableName +" (\n" +
                "  `CODE` varchar(24) CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci NOT NULL COMMENT '行政区代码',\n" +
                "  `PARENT_CODE` varchar(24) CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci DEFAULT NULL COMMENT '上级代码',\n" +
                "  `NAME` varchar(24) CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci DEFAULT NULL COMMENT '名称',\n" +
                "  `LINK` varchar(252) CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci DEFAULT NULL COMMENT '网页地址',\n" +
                "  `LEVEL` int DEFAULT NULL COMMENT '层级',\n" +
                "  `TYPE_CODE` varchar(12) CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci DEFAULT NULL COMMENT '类型',\n" +
                "  `GEN_TIME` datetime DEFAULT NULL COMMENT '创建时间',\n" +
                "  PRIMARY KEY (`CODE`) USING BTREE, \n" +
                "  KEY `IDX_LEVEL` (`LEVEL`) USING BTREE,\n" +
                "  KEY `IDX_PC` (`PARENT_CODE`) USING BTREE \n" +
                ") ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_general_ci ROW_FORMAT=DYNAMIC;";
            db.execute(SQL);
        }

        /**
         * Executes the request, retrying until an attempt completes without an exception.
         * <p>
         * Retries run in a loop rather than by recursion, so a long outage cannot end in
         * a {@link StackOverflowError}. There is no attempt limit and no pause between
         * attempts: the call blocks until the request succeeds.
         *
         * @param httpRequest request to execute; its timeouts are set to {@code TIMEOUT}
         * @return the response of the first attempt that did not throw
         */
        public static HttpResponse retryConn(HttpRequest httpRequest) {
            httpRequest
                .timeout(TIMEOUT)
                .setConnectionTimeout(TIMEOUT)
                .setReadTimeout(TIMEOUT);
            while (true) {
                try {
                    return httpRequest.execute();
                } catch (Exception exception) {
                    // Report the failed attempt and try again.
                    exception.printStackTrace();
                }
            }
        }

        /**
         * Inserts the cell as a new row; existing rows are never updated.
         * <p>
         * When the insert fails (for example because the {@code CODE} primary key already
         * exists) the error is printed and the method returns, so that the caller can
         * continue with the next row.
         *
         * @param tableName target table
         * @param cell      region data to persist
         */
        public static void writeDataToDb(String tableName, RegionCell cell) {
            final Entity row = Entity
                .create(tableName)
                .set(CODE, cell.getCode())
                .set(PARENT_CODE, cell.getParentCode())
                .set(NAME, cell.getName())
                .set(LEVEL, cell.getLevel())
                .set(TYPE_CODE, cell.getTypeCode())
                .set(LINK, cell.getLink())
                .set(GEN_TIME, cell.getGenTime());
            try {
                db.insert(row);
            } catch (SQLException e) {
                System.err.println("Insert into " + tableName + " failed for code " + cell.getCode());
                e.printStackTrace();
            }
        }

        /**
         * Reads the village / neighbourhood-committee rows ({@code villagetr}) and stores
         * them as level-5 records.
         *
         * @param document  parsed page
         * @param newCell   scratch cell that is overwritten and written to the DB for each row
         * @param superCell parent region; its code becomes the parent code of every row
         * @param tableName target table
         */
        public static void readVillage(
            Document document,
            RegionCell newCell,
            RegionCell superCell,
            String tableName
        ) {
            final Elements villageTrList = document.getElementsByClass("villagetr");
            if (CollUtil.isEmpty(villageTrList)) return;
            for (Element villageTr : villageTrList) {
                // Column order on the page: code, type code, name.
                final Elements villageTdList = villageTr.getElementsByTag("td");
                if (villageTdList.size() < 3) continue; // malformed row, nothing to store
                newCell.setCode(villageTdList.get(0).text());
                newCell.setParentCode(superCell.getCode());
                newCell.setName(villageTdList.get(2).text());
                newCell.setTypeCode(villageTdList.get(1).text());
                newCell.setLevel(5);
                newCell.setGenTime(LocalDateTime.now());
                writeDataToDb(tableName, newCell);
            }
        }

        /**
         * Reads the town / street rows ({@code towntr}), stores them as level-4 records
         * and descends into each row's child page.
         *
         * @param BASE_URL  root URL of the crawled year
         * @param document  parsed page
         * @param newCell   scratch cell that is overwritten and written to the DB for each row
         * @param superCell parent region; supplies the parent code and, for some codes, the base path
         * @param tableName target table
         */
        public static void readTown (
            String BASE_URL,
            Document document,
            RegionCell newCell,
            RegionCell superCell,
            String tableName
        ) {
            final Elements townTrList = document.getElementsByClass("towntr");
            if (CollUtil.isEmpty(townTrList)) return;
            for (Element townTr : townTrList) {
                final Elements townTds = townTr.getElementsByTag("td");
                if (townTds.size() < 2) continue; // malformed row, nothing to store

                final String regionTownCode = anchorOrCellText(townTds.get(0));
                final String regionTownName = anchorOrCellText(townTds.get(1));
                final String townNextHref = firstHref(townTds.get(0));

                String url = null;
                if (townNextHref != null) {
                    // NOTE(review): 44 / 46 are presumably provinces whose cities list towns directly — confirm.
                    if (regionTownCode.startsWith("44") || regionTownCode.startsWith("46")) {
                        // Resolve the href relative to the parent page's directory.
                        final String link = superCell.getLink();
                        url = link.substring(0, link.lastIndexOf(Constant.PATH_CHAR) + 1) + townNextHref;
                    } else {
                        // Rebuild "<province>/<city>/" from the digits that follow the first slash of the href.
                        final int index = townNextHref.indexOf(Constant.PATH_CHAR);
                        final String provincePath = townNextHref.substring(index + 1, index + 3);
                        final String cityPath = Constant.PATH_CHAR + townNextHref.substring(index + 3, index + 5) + Constant.PATH_CHAR;
                        url = BASE_URL + provincePath + cityPath + townNextHref;
                    }
                }
                storeAndDescend(newCell, superCell, regionTownCode, regionTownName, url, 4, BASE_URL, tableName);
            }
        }

        /**
         * Reads the county / district rows ({@code countytr}), stores them as level-3
         * records and descends into each row's child page.
         *
         * @param BASE_URL  root URL of the crawled year
         * @param document  parsed page
         * @param newCell   scratch cell that is overwritten and written to the DB for each row
         * @param superCell parent region; its code becomes the parent code of every row
         * @param tableName target table
         */
        public static void readCounty (
            String BASE_URL,
            Document document,
            RegionCell newCell,
            RegionCell superCell,
            String tableName
        ) {
            final Elements countyTrList = document.getElementsByClass("countytr");
            if (CollUtil.isEmpty(countyTrList)) return;
            for (Element countyTr : countyTrList) {
                final Elements countyTds = countyTr.getElementsByTag("td");
                if (countyTds.size() < 2) continue; // malformed row, nothing to store

                final String regionCountyCode = anchorOrCellText(countyTds.get(0));
                final String regionCountyName = anchorOrCellText(countyTds.get(1));
                final String countyNextHref = firstHref(countyTds.get(0));

                String countyUrl = null;
                if (countyNextHref != null) {
                    // Prefix the href with "<province>/", taken from the two characters after its first slash.
                    final int index = countyNextHref.indexOf(Constant.PATH_CHAR);
                    final String provincePath = countyNextHref.substring(index + 1, index + 3) + Constant.PATH_CHAR;
                    countyUrl = BASE_URL + provincePath + countyNextHref;
                }
                storeAndDescend(newCell, superCell, regionCountyCode, regionCountyName, countyUrl, 3, BASE_URL, tableName);
            }
        }

        /**
         * Reads the city rows ({@code citytr}), stores them as level-2 records and
         * descends into each row's child page.
         *
         * @param BASE_URL  root URL of the crawled year
         * @param document  parsed page
         * @param newCell   scratch cell that is overwritten and written to the DB for each row
         * @param superCell parent region; its code becomes the parent code of every row
         * @param tableName target table
         */
        public static void readCity (
                String BASE_URL,
                Document document,
                RegionCell newCell,
                RegionCell superCell,
                String tableName
        ) {
            final Elements citytrList = document.getElementsByClass("citytr");
            if (CollUtil.isEmpty(citytrList)) return;
            for (Element cityTr : citytrList) {
                final Elements cityTds = cityTr.getElementsByTag("td");
                if (cityTds.size() < 2) continue; // malformed row, nothing to store

                final String regionCityCode = anchorOrCellText(cityTds.get(0));
                final String regionName = anchorOrCellText(cityTds.get(1));
                final String cityNextHref = firstHref(cityTds.get(0));
                final String cityUrl = cityNextHref == null ? null : BASE_URL + cityNextHref;

                storeAndDescend(newCell, superCell, regionCityCode, regionName, cityUrl, 2, BASE_URL, tableName);
            }
        }

        /**
         * Downloads the page referenced by {@code regionCell}'s link, stores every row
         * found on it and descends into the child pages.
         * <p>
         * Nothing is requested when the cell has no link. When the response status is not
         * OK the response body is printed and the page is skipped.
         *
         * @param regionCell parent region whose link is fetched; its cookies (if any) are sent back
         * @param BASE_URL   root URL of the crawled year
         * @param tableName  target table
         */
        public static void readDataRecursive(RegionCell regionCell, String BASE_URL, String tableName) {
            final String regionCellLink = regionCell.getLink();
            if (StringUtils.isEmpty(regionCellLink)) return;

            final HttpRequest httpRequest = HttpUtil
                    .createGet(regionCellLink)
                    .timeout(TIMEOUT)
                    .setConnectionTimeout(TIMEOUT)
                    .setReadTimeout(TIMEOUT);
            if (!CollUtil.isEmpty(regionCell.getCookies())) {
                // Host / Upgrade-Insecure-Requests are already part of the base header map.
                final Map<String, String> headers = MyUtil.getNewRequestHeader();
                headers.put("Cookie", cookieHeaderOf(regionCell));
                headers.put(REFERER, regionCellLink);
                headers.put(USER_AGENT, BROWSER_AGENTS[Constant.R.nextInt(BROWSER_AGENTS.length)]);
                httpRequest.addHeaders(headers);
            }

            final HttpResponse httpResponse = retryConn(httpRequest);
            // Carry the response cookies in the cell that the row readers fill in,
            // so that the next (child) request can send them back.
            final RegionCell newCell = RegionCell.builder()
                    .cookies(httpResponse.getCookies())
                    .build();

            if (!httpResponse.isOk()) {
                System.out.println(httpResponse.body());
                return;
            }
            final Document document = Jsoup.parse(httpResponse.body());
            readCity(BASE_URL, document, newCell, regionCell, tableName);
            readCounty(BASE_URL, document, newCell, regionCell, tableName);
            readTown(BASE_URL, document, newCell, regionCell, tableName);
            readVillage(document, newCell, regionCell, tableName);
        }

        /**
         * Reads the province list from the root page, stores each province as a level-1
         * record (parent code {@code "0"}) and crawls everything below it.
         *
         * @param BASE_URL  root URL of the crawled year
         * @param tableName target table
         */
        public static void readProvinceData(String BASE_URL, String tableName) {
            final String rootHtml = HttpUtil.get(BASE_URL);
            final Elements provincetrs = Jsoup.parse(rootHtml).getElementsByClass("provincetr");
            for (Element tr : provincetrs) {
                for (Element a : tr.getElementsByTag("a")) {
                    final String href = a.attr("href");
                    final RegionCell cell = RegionCell.builder()
                        .name(a.text())
                        // The province code is the page file name without its extension.
                        .code(href.replace(".html", ""))
                        .link(BASE_URL + href)
                        .parentCode(String.valueOf(0))
                        .genTime(LocalDateTime.now())
                        .level(1)
                        .build();
                    MyUtil.writeDataToDb(tableName, cell);
                    readDataRecursive(cell, BASE_URL, tableName);
                }
            }
        }

        /** Returns the text of the cell's first anchor, or the cell's own text when it has none. */
        private static String anchorOrCellText(Element cell) {
            final Elements anchors = cell.getElementsByTag("a");
            return CollUtil.isEmpty(anchors) ? cell.text() : anchors.get(0).text();
        }

        /** Returns the {@code href} of the cell's first anchor, or {@code null} when it has none. */
        private static String firstHref(Element cell) {
            final Elements anchors = cell.getElementsByTag("a");
            return CollUtil.isEmpty(anchors) ? null : anchors.get(0).attr("href");
        }

        /** Joins all cookies of the cell into a single {@code Cookie} header value. */
        private static String cookieHeaderOf(RegionCell cell) {
            final StringBuilder header = new StringBuilder();
            for (Object cookie : cell.getCookies()) {
                if (header.length() > 0) header.append("; ");
                header.append(cookie);
            }
            return header.toString();
        }

        /**
         * Fills the shared scratch cell for one row, stores it and crawls its child page.
         * The link is always overwritten (possibly with {@code null}) so that a row
         * without a link never inherits the previous row's URL.
         */
        private static void storeAndDescend(
            RegionCell newCell,
            RegionCell superCell,
            String code,
            String name,
            String link,
            int level,
            String BASE_URL,
            String tableName
        ) {
            newCell.setLink(link);
            newCell.setCode(code);
            newCell.setParentCode(superCell.getCode());
            newCell.setName(name);
            newCell.setGenTime(LocalDateTime.now());
            newCell.setLevel(level);
            writeDataToDb(tableName, newCell);
            readDataRecursive(newCell, BASE_URL, tableName);
        }

    }
    

      

    启动类就不用写那么多东西了

    package cn.cloud9;
    
    import cn.cloud9.constant.Constant;
    import cn.cloud9.util.MyUtil;
    
    /**
     * Entry point of the crawler: prepares the table for one year and crawls that year
     * starting from the province list.
     *
     * @author OnCloud9
     * @project RegionReptile-Remaster
     * @date 2022-07-07 09:35 PM
     */
    public class MainApplication {

        /**
         * @param args optional; {@code args[0]} is the year to crawl, "2009" when absent
         */
        public static void main(String[] args) {
            final String year;
            if (args.length == 0) {
                year = "2009";
            } else {
                year = args[0];
            }

            // Table name and root URL are both derived from the year.
            final String yearTable = "region" + year;
            final String yearRootUrl = Constant.ROOT_PATH.replace("${YEAR}", year);

            MyUtil.initialTableSpace(yearTable);
            MyUtil.readProvinceData(yearRootUrl, yearTable);
        }
    }
    

      

  • 相关阅读:
    js中setTimeout、setInterval、 clearInterval方法简介
    分享一个VS2008漂亮的黑色主题
    最简单的设计模式
    记一次查数据的需求
    Oracle常用存储过程写法
    关于域名解析
    使用PHP打造QQ空间神奇图片
    自制小工具含源码——SPTC上海交通卡余额查询
    自制小工具含源码——博客园图床ImageBed
    不可不知的mysql 常用技巧总结
  • 原文地址:https://www.cnblogs.com/mindzone/p/16427175.html
Copyright © 2020-2023  润新知