• 从国家统计局爬下来的地区信息


    发现网上流传的地区编码有很多版本,其中不少还保留着崇文区、玄武区之类已经撤销的区划。于是想了想,还是自己做一份。不敢保证没问题,但目前还没遇到问题。

    首先,从网上找到一个大神写的jsoup的例子,修改成自己想要的格式,在代码无价的年代,原谅我的抄袭,研究是份任重而道远的任务。

    1.jsoup代码:

      1 package com.test;
      2 
      3 import java.io.BufferedWriter;
      4 import java.io.File;
      5 import java.io.FileWriter;
      6 import java.io.IOException;
      7 import java.util.HashMap;
      8 import java.util.Map;
      9 import java.util.Random;
     10 
     11 import org.apache.log4j.Logger;
     12 import org.jsoup.Jsoup;
     13 import org.jsoup.nodes.Document;
     14 import org.jsoup.nodes.Element;
     15 import org.jsoup.select.Elements;
     16 import org.junit.Test;
     17 
     18 /**
     19  * 全国省市县镇村数据爬取
     20  * 
     21  * @author liushaofeng
     22  * @date 2015-10-11 上午12:19:39
     23  * @version 1.0.0
     24  */
     25 public class JsoupProviceTest {
     26     private static Map<Integer, String> cssMap = new HashMap<Integer, String>();
     27 
     28     static {
     29         cssMap.put(1, "provincetr");//
     30         cssMap.put(2, "citytr");//
     31         cssMap.put(3, "countytr");//
     32         cssMap.put(4, "towntr");//
     33         cssMap.put(5, "villagetr");//
     34     }
     35 
     36     public static void main(String[] args) throws IOException {
     37         int level = 1;
     38 //        TestConDataBase.initDataBase();
     39 
     40         // 获取全国各个省级信息
     41         Document connect = connect("http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2013/");
     42         Elements rowProvince = connect.select("tr." + cssMap.get(level));
     43         for (Element provinceElement : rowProvince)// 遍历每一行的省份城市
     44         {
     45             Elements select = provinceElement.select("a");
     46             for (Element province : select)// 每一个省份(四川省)
     47             {
     48                 String href = province.attr("href");
     49                 String procode = href.substring(href.length()-7, href.length()-5)+"0000000000";
     50                 System.out.println(level+1+","+procode+","+province.text());
     51 //                SysZone zone = new SysZone();
     52 //                try {
     53 //                    zone.set("zoneLevel", level+1)
     54 //                            .set("zoneCode", procode.trim())
     55 //                            .set("parentCode", "000000000000")
     56 //                            .set("zoneName", province.text())
     57 //                            .save();
     58 //                } catch (Exception e1) {
     59 //                    // TODO Auto-generated catch block
     60 //                    e1.printStackTrace();
     61 //                    
     62 //                }
     63                 
     64                 parseNextLevel(province, level + 1,procode);
     65             }
     66         }
     67 //        for (int i = 3; i < rowProvince.size(); i++) {
     68 //            Element provinceElement = rowProvince.get(i);
     69 //            Elements select = provinceElement.select("a");
     70 //            for (int j = 2; j < select.size(); j++) {
     71 //                Element province = select.get(j);
     72 //                System.out.println(province.text());
     73 //                parseNextLevel(province, level + 1);
     74 //            }
     75 //        }
     76     }
     77     
     78     @Test
     79     public void testa(){
     80         // 获取全国各个省级信息
     81         Document connect = connect("http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2013/");
     82         Elements rowProvince = connect.select("tr." + cssMap.get(1));
     83         for (Element provinceElement : rowProvince)// 遍历每一行的省份城市
     84         {
     85             Elements select = provinceElement.select("a");
     86             for (Element province : select)// 每一个省份(四川省)
     87             {
     88                 printProvince(province);
     89             }
     90         }
     91     }
     92 
     93     private static void parseNextLevel(Element parentElement, int level, String parent)
     94             throws IOException {
     95         try {
     96             Thread.sleep(500);
     97         } catch (InterruptedException e) {
     98             e.printStackTrace();
     99         }
    100 
    101         Document doc = connect(parentElement.attr("abs:href"));
    102         if(doc==null){
    103             doc = connect(parentElement.attr("abs:href"));
    104         }
    105         Elements newsHeadlines = doc.select("tr." + cssMap.get(level));//
    106         // 获取表格的一行数据
    107         for (Element element : newsHeadlines) {
    108             String parents = printInfo(element, level + 1,parent);
    109             Elements select = element.select("a");
    110             if (select.size() != 0) {
    111                 parseNextLevel(select.last(), level + 1,parents);
    112             }
    113         }
    114     }
    115 
    116     private static void printProvince(Element province){
    117         BufferedWriter bufferedWriter = null;
    118         try {
    119             bufferedWriter = new BufferedWriter(new FileWriter(new File(
    120                     "F:\provinces.txt"), true));
    121             String pro =  province.text();
    122             String href = province.attr("href");
    123             String procode = href.substring(href.length()-7, href.length()-5)+"0000000000";
    124             System.out.println(2+","+procode+",000000000000,"+pro);
    125             bufferedWriter.write(2+","+procode+",000000000000,"+pro);
    126             bufferedWriter.newLine();
    127             bufferedWriter.flush();
    128         } catch (IOException e) {
    129             e.printStackTrace();
    130         } finally {
    131             if (bufferedWriter != null) {
    132                 try {
    133                     bufferedWriter.close();
    134                 } catch (IOException e) {
    135                     e.printStackTrace();
    136                 }
    137                 bufferedWriter = null;
    138             }
    139         }
    140     }
    141     
    142     private static  String printInfo(Element element, int level,String parent) {
    143         BufferedWriter bufferedWriter = null;
    144         String code = "";
    145         code = element.select("td").first().text();
    146         String name =element.select("td").last().text();
    147         String str =  level + "," + code + ","+parent+","
    148                         + name;
    149     
    150         System.out.println(str);
    151 //        SysZone zone = new SysZone();
    152 //        try {
    153 //            zone.set("zoneLevel", level)
    154 //                    .set("zoneCode", code.trim())
    155 //                    .set("parentCode", parent.trim())
    156 //                    .set("zoneName", name.trim())
    157 //                    .save();
    158 //        } catch (Exception e1) {
    159 //            // TODO Auto-generated catch block
    160 //            e1.printStackTrace();
    161 //            
    162 //        }
    163         try {
    164             bufferedWriter = new BufferedWriter(new FileWriter(new File(
    165                     "F:\AllCity.txt"), true));
    166             
    167             bufferedWriter.write(str);
    168             bufferedWriter.newLine();
    169             bufferedWriter.flush();
    170         } catch (IOException e) {
    171             e.printStackTrace();
    172         } finally {
    173             if (bufferedWriter != null) {
    174                 try {
    175                     bufferedWriter.close();
    176                 } catch (IOException e) {
    177                     e.printStackTrace();
    178                 }
    179                 bufferedWriter = null;
    180             }
    181         }
    182         
    183         return  code;
    184     }
    185 
    186     private static Document connect(String url) {
    187         if (url == null || url.isEmpty()) {
    188             throw new IllegalArgumentException("The input url('" + url
    189                     + "') is invalid!");
    190         }
    191         String [] b = {
    192                    "Mozilla/5.0 (Windows; U; Windows NT 5.1) Gecko/20070309 Firefox/2.0.0.3",
    193                    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
    194                    "Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11",
    195                    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6",
    196                    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6",
    197                    "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1",
    198                    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5",
    199                    "Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5",
    200                    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
    201                    "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
    202                    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
    203                    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
    204                    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
    205                    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
    206                    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
    207                    "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
    208                    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3",
    209                    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
    210                    "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24"
    211     };
    212         try {
    213             
    214             Random rand = new Random();
    215             return Jsoup.connect(url) 
    216             .header("User-Agent",b[rand.nextInt(19)])
    217             .timeout(90 * 1000).get();
    218         } catch (IOException e) {
    219             e.printStackTrace();
    220         }
    221         return null;
    222     }
    223 }
    View Code

    2.可以选择从文本读取后写入数据库,也可直接写入。

    3.最终,我生成了两份,一个是省市县三级的,一个是所有的。

    省市县中去掉了市辖区等无关代码。

    4.爬虫源码:包括数据库保存。下载源码

    5.省市县:下载txt,  下载sql

    6.所有地区(港澳台除外):下载txt,  下载sql

  • 相关阅读:
    图书管理系统---基于form组件和modelform改造添加和编辑
    Keepalived和Heartbeat
    SCAN IP 解释
    Configure Active DataGuard and DG BROKER
    Oracle 11gR2
    我在管理工作中積累的九種最重要的領導力 (李開復)
    公募基金公司超融合基础架构与同城灾备建设实践
    Oracle 11g RAC for LINUX rhel 6.X silent install(静默安装)
    11gR2 静默安装RAC 集群和数据库软件
    Setting Up Oracle GoldenGate 12
  • 原文地址:https://www.cnblogs.com/woshimrf/p/4943382.html
Copyright © 2020-2023  润新知