• 基于【 springBoot+jsoup】一 || 爬取全国行政区划数据


    一、代码演示

    如果中途中断,可通过筛选过滤掉已拉取的省份数据后继续爬取

    /**
     * REST controller that crawls the administrative-division pages under
     * stats.gov.cn and stores every level (province, city, district, street,
     * community) through {@link ProvinceService}.
     *
     * <p>The crawl is depth-first and synchronous: the request to {@code /start}
     * returns only after the whole tree has been walked. All crawl state is kept in
     * locals, so the controller holds no mutable fields.
     *
     * @author kevin
     * @createTime 2019-11-18 19:37
     */
    @RestController
    public class CityController {

        /** Upper bound on download attempts for one page before its subtree is skipped. */
        private static final int MAX_FETCH_ATTEMPTS = 5;

        /**
         * Provinces that are not crawled. Presumably these were already stored by an
         * earlier, interrupted run; adjust the list before restarting a crawl.
         */
        private static final java.util.List<String> SKIPPED_PROVINCES =
                java.util.Arrays.asList("北京市", "天津市", "河北省");

        @Autowired
        private ProvinceService provinceService;
        @Autowired
        private HttpUtil httpUtil;

        // Row CSS classes per level: {"provincetr", "citytr", "countytr", "towntr", "villagetr"}
        /**
         * Starts the crawl from the index page, follows the first year link and walks
         * every province found there.
         *
         * @return {@code "fail"} when the index page or the year page cannot be loaded
         *         or contains no year link, otherwise {@code "spider crawl end."}
         * @throws Exception if the initial page requests fail with an I/O error
         */
        @GetMapping("/start")
        public ResultTemplate<String> spider() throws Exception {
            String url = "http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/";
            String charset = "gb2312";
            Document rootDoc = httpUtil.get(url, charset);
            if (rootDoc == null) {
                return of("fail");
            }

            Elements yearLists = rootDoc.getElementsByClass("center_list_contlist");
            if (yearLists.isEmpty()) {
                return of("fail");
            }
            Elements yearLinks = yearLists.get(0).select("a");
            if (yearLinks.isEmpty()) {
                return of("fail");
            }
            // e.g. http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2018/index.html
            String yearHref = yearLinks.get(0).attr("href"); // first link = most recent year
            Document doc = httpUtil.get(yearHref, charset);
            if (doc == null) {
                return of("fail");
            }

            // Province links are relative to the year page (e.g. "11.html").
            String base = baseOf(yearHref);
            for (Element row : doc.getElementsByClass("provincetr")) {
                for (Element link : row.select("a")) {
                    String name = link.text();
                    if (SKIPPED_PROVINCES.contains(name)) {
                        System.out.println("未执行市:" + name);
                        continue;
                    }
                    String href = link.attr("href");
                    DicProvince province = new DicProvince()
                            .setProvinceName(name)
                            .setProvinceCode(codeOf(href))
                            .setCountryId(1196612453660643329L)
                            .setCreateDate(LocalDateTime.now())
                            .setCreateUserid(1L)
                            .setCreateUsername("admin");
                    System.out.println("开始时间:" + LocalDateTime.now());
                    System.out.println("省名称:" + name);
                    Long id = provinceService.insertProvince(province);
                    // e.g. http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2018/11.html
                    getCites(base + href, charset, id);
                }
            }
            return of("spider crawl end.");
        }

        /**
         * Stores every city listed on a province page and descends into its districts.
         *
         * @param url        absolute URL of the province page
         * @param charset    charset used to decode the page
         * @param provinceId id of the parent province
         */
        private void getCites(String url, String charset, Long provinceId) throws Exception {
            Document page = fetchWithRetry(url, charset);
            if (page == null) {
                return;
            }
            String base = baseOf(url);
            for (Element row : page.getElementsByClass("citytr")) {
                RowInfo info = RowInfo.parse(row, base);
                if (info == null) {
                    continue;
                }
                DicCity city = new DicCity()
                        .setCityName(info.name)
                        .setCityCode(info.code)
                        .setProvinceId(provinceId)
                        .setCreateDate(LocalDateTime.now())
                        .setCreateUserid(1L)
                        .setCreateUsername("admin");
                Long id = provinceService.insertCity(city);
                if (info.childUrl != null) {
                    getDistrict(info.childUrl, charset, id);
                }
            }
        }

        /**
         * Stores every district/county listed on a city page and descends into its
         * streets. Rows without links (e.g. "市辖区") are stored but not descended into.
         *
         * @param url     absolute URL of the city page
         * @param charset charset used to decode the page
         * @param idDis   id of the parent city
         */
        private void getDistrict(String url, String charset, Long idDis) throws Exception {
            Document page = fetchWithRetry(url, charset);
            if (page == null) {
                return;
            }
            String base = baseOf(url);
            for (Element row : page.getElementsByClass("countytr")) {
                RowInfo info = RowInfo.parse(row, base);
                if (info == null) {
                    continue;
                }
                boolean leaf = info.childUrl == null;
                if (leaf) {
                    System.out.println("市辖区");
                }
                DicDistrict district = new DicDistrict()
                        .setDistrictName(info.name)
                        .setDistrictCode(info.code)
                        .setCityId(idDis)
                        .setCreateDate(LocalDateTime.now())
                        .setCreateUserid(1L)
                        .setCreateUsername("admin");
                Long id = provinceService.insertDistrict(district);
                if (leaf) {
                    System.out.println("执行完毕");
                } else {
                    getStreet(info.childUrl, charset, id);
                }
            }
        }

        /**
         * Stores every street/town listed on a district page and descends into its
         * communities.
         *
         * @param url     absolute URL of the district page
         * @param charset charset used to decode the page
         * @param idStr   id of the parent district
         */
        private void getStreet(String url, String charset, Long idStr) throws Exception {
            Document page = fetchWithRetry(url, charset);
            if (page == null) {
                return;
            }
            String base = baseOf(url);
            for (Element row : page.getElementsByClass("towntr")) {
                RowInfo info = RowInfo.parse(row, base);
                if (info == null) {
                    continue;
                }
                DicStreet street = new DicStreet()
                        .setStreetName(info.name)
                        .setStreetCode(info.code)
                        .setDistrictId(idStr)
                        .setCreateDate(LocalDateTime.now())
                        .setCreateUserid(1L)
                        .setCreateUsername("admin");
                Long id = provinceService.insertStreet(street);
                if (info.childUrl != null) {
                    getCommunity(info.childUrl, charset, id);
                }
            }
        }

        /**
         * Stores every community/village listed on a street page. This is the lowest
         * level; rows carry three cells: code, classification code and name.
         *
         * @param url     absolute URL of the street page
         * @param charset charset used to decode the page
         * @param idPro   id of the parent street
         */
        private void getCommunity(String url, String charset, Long idPro) throws Exception {
            Document page = fetchWithRetry(url, charset);
            if (page == null) {
                return;
            }
            for (Element row : page.getElementsByClass("villagetr")) {
                Elements cells = row.select("td");
                if (cells.size() < 3) {
                    System.out.println("skipping malformed row: " + row.text());
                    continue;
                }
                DicCommunity community = new DicCommunity()
                        .setCommunityName(cells.get(2).text())
                        .setCommunityCode(cells.get(0).text())
                        .setClassificationCode(cells.get(1).text())
                        .setStreetId(idPro)
                        .setCreateDate(LocalDateTime.now())
                        .setCreateUserid(1L)
                        .setCreateUsername("admin");
                provinceService.insertCommunity(community);
            }
        }

        /**
         * Downloads a page, retrying up to {@link #MAX_FETCH_ATTEMPTS} times.
         *
         * @return the parsed page, or {@code null} when every attempt failed; callers
         *         then skip that page's subtree instead of blocking the crawl forever
         */
        private Document fetchWithRetry(String url, String charset) {
            for (int attempt = 1; attempt <= MAX_FETCH_ATTEMPTS; attempt++) {
                if (attempt >= 3) {
                    System.out.println("循环次数:" + attempt);
                }
                try {
                    Document page = httpUtil.get(url, charset);
                    if (page != null) {
                        return page;
                    }
                } catch (Exception e) {
                    System.out.println("请求网页链接报错: " + e);
                }
            }
            System.out.println("giving up after " + MAX_FETCH_ATTEMPTS + " attempts: " + url);
            return null;
        }

        /** Returns {@code url} up to and including its last '/', the prefix for relative links. */
        private static String baseOf(String url) {
            return url.substring(0, url.lastIndexOf('/') + 1);
        }

        /** Extracts the code from a relative link: "11/1101.html" yields "1101". */
        private static String codeOf(String href) {
            int start = href.lastIndexOf('/') + 1;
            int dot = href.indexOf('.', start);
            return dot < 0 ? href.substring(start) : href.substring(start, dot);
        }

        /** Name, code and optional child-page URL read from one table row. */
        private static final class RowInfo {
            private final String name;
            private final String code;
            /** Absolute URL of the next level, or {@code null} when the row is not linked. */
            private final String childUrl;

            private RowInfo(String name, String code, String childUrl) {
                this.name = name;
                this.code = code;
                this.childUrl = childUrl;
            }

            /**
             * Reads a row. Linked rows take the name from the second link and the code
             * from its href; unlinked rows take code and name from the first two cells.
             *
             * @return the parsed row, or {@code null} when it has neither shape
             */
            private static RowInfo parse(Element row, String base) {
                Elements links = row.select("a");
                if (links.size() >= 2) {
                    Element nameLink = links.get(1); // the second link carries the name
                    String href = nameLink.attr("href");
                    return new RowInfo(nameLink.text(), codeOf(href), base + href);
                }
                Elements cells = row.select("td");
                if (cells.size() >= 2) {
                    return new RowInfo(cells.get(1).text(), cells.get(0).text(), null);
                }
                System.out.println("skipping malformed row: " + row.text());
                return null;
            }
        }

    }

    二、HttpUtil工具类

    /**
     * Small HTTP helper that downloads a page with {@link HttpURLConnection} and
     * parses it into a Jsoup {@link Document}.
     *
     * @author kevin
     * @createTime 2019-11-20 9:17
     */
    @Component
    public class HttpUtil {

        private static final String USER_AGENT =
                "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.119 Safari/537.36";

        /** Connect and read timeout, in milliseconds. */
        private static final int TIMEOUT_MILLIS = 80000;

        /**
         * Issues a GET request and parses the response body.
         *
         * @param url     absolute URL to download; also used as the document's base URI
         * @param charset charset name used to decode the response body
         * @return the parsed document, or {@code null} when the request or the parsing
         *         fails (callers rely on {@code null} to decide whether to retry)
         * @throws IOException if {@code url} is malformed or the connection cannot be
         *                     set up before the request is sent
         */
        public Document get(String url, String charset) throws IOException {
            URL target = new URL(url);
            HttpURLConnection connection = (HttpURLConnection) target.openConnection();
            connection.setRequestMethod("GET");
            // Caching is allowed by default; disable it so every call hits the server.
            connection.setUseCaches(false);
            connection.addRequestProperty("Connection", "close");
            connection.addRequestProperty("user-agent", USER_AGENT);
            connection.setConnectTimeout(TIMEOUT_MILLIS);
            connection.setReadTimeout(TIMEOUT_MILLIS);

            // Close the stream explicitly on every path instead of relying on the parser,
            // and always release the connection.
            try (java.io.InputStream body = connection.getInputStream()) {
                return Jsoup.parse(body, charset, url);
            } catch (Exception e) {
                // Best effort by design: report the cause and let the caller retry.
                System.out.println("parse error: " + url + " (" + e + ")");
                return null;
            } finally {
                connection.disconnect();
            }
        }

    }

    三、service部分,根据需要自行定义数据库表

    /**
     * {@link ProvinceService} implementation that stores each administrative level
     * through its mapper. A failed insert is retried a bounded number of times; when
     * every attempt fails an {@link IllegalStateException} is thrown instead of
     * retrying forever.
     *
     * @author kevin
     * @createTime 2019-11-18 20:41
     */
    @Service
    public class ProvinceServiceImpl implements ProvinceService {

        /** Upper bound on attempts for a single insert. */
        private static final int MAX_INSERT_ATTEMPTS = 3;

        @Autowired
        private ProvinceMapper provinceMapper;
        @Autowired
        private CityMapper cityMapper;
        @Autowired
        private DistrictMapper districtMapper;
        @Autowired
        private StreetMapper streetMapper;
        @Autowired
        private CommunityMapper communityMapper;

        /**
         * Inserts a province.
         *
         * @return the id read back from the entity after the insert (presumably
         *         populated by the mapper; confirm against the mapper configuration)
         * @throws IllegalStateException if the insert does not succeed within the retry limit
         */
        @Override
        public Long insertProvince(DicProvince dicProvince) {
            insertWithRetry("插入省数据失败", () -> provinceMapper.insert(dicProvince));
            return dicProvince.getProvinceId();
        }

        /**
         * Inserts a city.
         *
         * @return the id read back from the entity after the insert
         * @throws IllegalStateException if the insert does not succeed within the retry limit
         */
        @Override
        public Long insertCity(DicCity dicCity) {
            insertWithRetry("插入市数据失败", () -> cityMapper.insert(dicCity));
            return dicCity.getCityId();
        }

        /**
         * Inserts a district/county.
         *
         * @return the id read back from the entity after the insert
         * @throws IllegalStateException if the insert does not succeed within the retry limit
         */
        @Override
        public Long insertDistrict(DicDistrict dicDistrict) {
            insertWithRetry("插入区县数据失败", () -> districtMapper.insert(dicDistrict));
            return dicDistrict.getDistrictId();
        }

        /**
         * Inserts a street/town.
         *
         * @return the id read back from the entity after the insert
         * @throws IllegalStateException if the insert does not succeed within the retry limit
         */
        @Override
        public Long insertStreet(DicStreet dicStreet) {
            insertWithRetry("插入街道数据失败", () -> streetMapper.insert(dicStreet));
            return dicStreet.getStreetId();
        }

        /**
         * Inserts a community/village.
         *
         * @return the id read back from the entity after the insert
         * @throws IllegalStateException if the insert does not succeed within the retry limit
         */
        @Override
        public Long insertCommunity(DicCommunity dicCommunity) {
            insertWithRetry("插入社区数据失败", () -> communityMapper.insert(dicCommunity));
            return dicCommunity.getCommunityId();
        }

        /**
         * Runs {@code insertCall} until it reports exactly one affected row, at most
         * {@link #MAX_INSERT_ATTEMPTS} times.
         *
         * @param failureMessage message printed on each failed attempt and used in the
         *                       final exception
         * @param insertCall     the mapper insert, returning the affected row count
         * @throws IllegalStateException when no attempt succeeded; the cause is the last
         *                               exception raised by the mapper, if any
         */
        private static void insertWithRetry(String failureMessage,
                                            java.util.function.IntSupplier insertCall) {
            Exception lastFailure = null;
            for (int attempt = 1; attempt <= MAX_INSERT_ATTEMPTS; attempt++) {
                try {
                    if (insertCall.getAsInt() == 1) {
                        return;
                    }
                    // The call completed but did not affect exactly one row.
                    lastFailure = null;
                } catch (Exception e) {
                    lastFailure = e;
                    System.out.println(failureMessage);
                    e.printStackTrace();
                }
            }
            throw new IllegalStateException(
                    failureMessage + " (" + MAX_INSERT_ATTEMPTS + " attempts)", lastFailure);
        }

    }
    

      

  • 相关阅读:
    Excel导出采用mvc的ExcelResult继承遇到的问题Npoi导出
    Excel导出采用mvc的ExcelResult继承遇到的问题
    word模板导出的几种方式:第三种:标签替换(DocX组件读取与写入Word)
    word模板导出的几种方式:第二种:C#通过模板导出Word(文字,表格,图片) 占位符替换
    word模板导出的几种方式:第一种:占位符替换模板导出(只适用于word中含有表格形式的)
    vue 学习链接地址
    创建作业(JOB)
    html5 浏览文件
    Guava monitor
    Spring Rabbitmq HelloWorld实例
  • 原文地址:https://www.cnblogs.com/kevin-ying/p/11925782.html
Copyright © 2020-2023  润新知