• Java学习-057-Jsoup爬虫获取中国所有的三级行政区划数据(二),并生成数据库 SQL 脚本插入语句


    多不废话,直接上马,小主您稳着。。。

      1 package com.fanfengping.zeus.uitl;
      2 
      3 import com.alibaba.fastjson.JSONObject;
      4 import lombok.extern.slf4j.Slf4j;
      5 import org.jsoup.Jsoup;
      6 import org.jsoup.nodes.Document;
      7 import org.jsoup.nodes.Element;
      8 import org.jsoup.select.Elements;
      9 import org.testng.annotations.Test;
     10 
     11 import java.io.File;
     12 import java.io.FileWriter;
     13 import java.util.ArrayList;
     14 import java.util.HashMap;
     15 import java.util.List;
     16 import java.util.Map;
     17 
     18 @Slf4j
     19 public class JsoupGetRegionSql {
     20     @Test
     21     public void getRegionSql () throws Exception {
     22         String url = "http://www.mca.gov.cn/article/sj/xzqh/2019/201901-06/201904301706.html";
     23         String fp = System.getProperty("user.dir") + File.separator + "initRegion.sql";
     24 
     25         int count = 0;
     26 
     27         File file = new File(fp);
     28 
     29         if (file.exists()) {
     30             file.delete();
     31         }
     32 
     33         file.createNewFile();
     34 
     35         FileWriter fileWriter = new FileWriter(file.getName(), true);
     36 
     37         Document doc = Jsoup.connect(url)
     38                 .header("User-Agent", "Mozilla/5.0 (Windows NT 10.0 Win64 x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.131 Safari/537.36")
     39                 .header("Accept", "text/html,application/xhtml+xml,application/xmlq=0.9,image/webp,image/apng,*/*q=0.8,application/signed-exchangev=b3")
     40                 .maxBodySize(0)
     41                 .timeout(100000)
     42                 .get();
     43 
     44         Elements trs = doc.select("tr");
     45 
     46         List<Map<String, Object>> adminRegion = new ArrayList<>();
     47         List<Map<String, Object>> adminRegionSec = new ArrayList<>();
     48         List<Map<String, Object>> adminRegionThi = new ArrayList<>();
     49 
     50 
     51         for (Element tr : trs ) {
     52             Elements tds = tr.select("td");
     53 
     54             Map<String, Object> region = new HashMap<>();
     55 
     56             if (tds.size() > 3) {
     57                 String regionCode = tds.get(1).text();
     58                 String regionArea = tds.get(2).text();
     59                 String parentCode = "";
     60 
     61                 if (validCode(regionCode)) {
     62                     int leveType = 2;
     63                     parentCode = regionCode.substring(0,2) + "0000";
     64 
     65                     if (!regionCode.endsWith("00")) {
     66                         leveType = 3;
     67                         parentCode = regionCode.substring(0,4) + "00";
     68                     }
     69 
     70                     if (regionCode.endsWith("0000")) {
     71                         leveType = 1;
     72                         parentCode = "000000";
     73                     }
     74 
     75                     region.put("code", regionCode);
     76                     region.put("region", regionArea);
     77                     region.put("parentCode", parentCode);
     78                     region.put("level", leveType);
     79 
     80                     switch ((Integer) region.get("level")) {
     81                         case 1:
     82                             adminRegion.add(region);
     83                             break;
     84                         case 2:
     85                             adminRegionSec.add(region);
     86                             break;
     87                         default:
     88                             adminRegionThi.add(region);
     89                             break;
     90                     }
     91 
     92                     count++;
     93                     String content = String.format("insert into region_code (code, region, level, parent_code, dtime, note, ctime)" +
     94                             " values (%s, '%s', %s, %s, '201903', '系统生成', NOW());" + System.getProperty("line.separator"), regionCode, regionArea, leveType, parentCode);
     95 
     96                     fileWriter.write(content);
     97                 }
     98             }
     99         }
    100 
    101         System.out.println("总数量:" + count);
    102 
    103         System.out.println(fp);
    104 
    105         fileWriter.close();
    106     }
    107 
    108     public boolean validCode(String code) {
    109         try {
    110             Integer.parseInt(code);
    111             return true;
    112         } catch (Exception e) {
    113             return false;
    114         }
    115     }
    116 }

      

      控制台输出如下所示:

      

      

      数据库文件截图如下所示:

      

      

      

     

     

  • 相关阅读:
    多线程:多线程设计模式(一):总体介绍
    javascript:12种JavaScript MVC框架之比较
    mysql 查询死锁语句
    charles 抓包工具破解方法
    java 自定义log类
    git统计日期之间的代码改动行数
    mac/linux自带定时任务执行crontab的使用
    python MD5步骤
    python 操作excel读写
    python logger日志工具类
  • 原文地址:https://www.cnblogs.com/fengpingfan/p/10903440.html
Copyright © 2020-2023  润新知