• 从国家统计局爬下来的地区信息


    网上流传的地区编码有很多版本,其中不少还保留着崇文区,宣武区之类已经撤销的区划。于是想了想,还是自己做一份。不敢保证没问题,但目前还没遇到问题。

    首先,从网上找到一个大神写的jsoup的例子,修改成自己想要的格式,在代码无价的年代,原谅我的抄袭,研究是份任重而道远的任务。

    1.jsoup代码:

      1 package com.test;
      2 
      3 import java.io.BufferedWriter;
      4 import java.io.File;
      5 import java.io.FileWriter;
      6 import java.io.IOException;
      7 import java.util.HashMap;
      8 import java.util.Map;
      9 import java.util.Random;
     10 
     11 import org.apache.log4j.Logger;
     12 import org.jsoup.Jsoup;
     13 import org.jsoup.nodes.Document;
     14 import org.jsoup.nodes.Element;
     15 import org.jsoup.select.Elements;
     16 import org.junit.Test;
     17 
     18 /**
     19  * 全国省市县镇村数据爬取
     20  * 
     21  * @author liushaofeng
     22  * @date 2015-10-11 上午12:19:39
     23  * @version 1.0.0
     24  */
     25 public class JsoupProviceTest {
     26     private static Map<Integer, String> cssMap = new HashMap<Integer, String>();
     27 
     28     static {
     29         cssMap.put(1, "provincetr");//
     30         cssMap.put(2, "citytr");//
     31         cssMap.put(3, "countytr");//
     32         cssMap.put(4, "towntr");//
     33         cssMap.put(5, "villagetr");//
     34     }
     35 
     36     public static void main(String[] args) throws IOException {
     37         int level = 1;
     38 //        TestConDataBase.initDataBase();
     39 
     40         // 获取全国各个省级信息
     41         Document connect = connect("http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2013/");
     42         Elements rowProvince = connect.select("tr." + cssMap.get(level));
     43         for (Element provinceElement : rowProvince)// 遍历每一行的省份城市
     44         {
     45             Elements select = provinceElement.select("a");
     46             for (Element province : select)// 每一个省份(四川省)
     47             {
     48                 String href = province.attr("href");
     49                 String procode = href.substring(href.length()-7, href.length()-5)+"0000000000";
     50                 System.out.println(level+1+","+procode+","+province.text());
     51 //                SysZone zone = new SysZone();
     52 //                try {
     53 //                    zone.set("zoneLevel", level+1)
     54 //                            .set("zoneCode", procode.trim())
     55 //                            .set("parentCode", "000000000000")
     56 //                            .set("zoneName", province.text())
     57 //                            .save();
     58 //                } catch (Exception e1) {
     59 //                    // TODO Auto-generated catch block
     60 //                    e1.printStackTrace();
     61 //                    
     62 //                }
     63                 
     64                 parseNextLevel(province, level + 1,procode);
     65             }
     66         }
     67 //        for (int i = 3; i < rowProvince.size(); i++) {
     68 //            Element provinceElement = rowProvince.get(i);
     69 //            Elements select = provinceElement.select("a");
     70 //            for (int j = 2; j < select.size(); j++) {
     71 //                Element province = select.get(j);
     72 //                System.out.println(province.text());
     73 //                parseNextLevel(province, level + 1);
     74 //            }
     75 //        }
     76     }
     77     
     78     @Test
     79     public void testa(){
     80         // 获取全国各个省级信息
     81         Document connect = connect("http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2013/");
     82         Elements rowProvince = connect.select("tr." + cssMap.get(1));
     83         for (Element provinceElement : rowProvince)// 遍历每一行的省份城市
     84         {
     85             Elements select = provinceElement.select("a");
     86             for (Element province : select)// 每一个省份(四川省)
     87             {
     88                 printProvince(province);
     89             }
     90         }
     91     }
     92 
     93     private static void parseNextLevel(Element parentElement, int level, String parent)
     94             throws IOException {
     95         try {
     96             Thread.sleep(500);
     97         } catch (InterruptedException e) {
     98             e.printStackTrace();
     99         }
    100 
    101         Document doc = connect(parentElement.attr("abs:href"));
    102         if(doc==null){
    103             doc = connect(parentElement.attr("abs:href"));
    104         }
    105         Elements newsHeadlines = doc.select("tr." + cssMap.get(level));//
    106         // 获取表格的一行数据
    107         for (Element element : newsHeadlines) {
    108             String parents = printInfo(element, level + 1,parent);
    109             Elements select = element.select("a");
    110             if (select.size() != 0) {
    111                 parseNextLevel(select.last(), level + 1,parents);
    112             }
    113         }
    114     }
    115 
    116     private static void printProvince(Element province){
    117         BufferedWriter bufferedWriter = null;
    118         try {
    119             bufferedWriter = new BufferedWriter(new FileWriter(new File(
    120                     "F:\provinces.txt"), true));
    121             String pro =  province.text();
    122             String href = province.attr("href");
    123             String procode = href.substring(href.length()-7, href.length()-5)+"0000000000";
    124             System.out.println(2+","+procode+",000000000000,"+pro);
    125             bufferedWriter.write(2+","+procode+",000000000000,"+pro);
    126             bufferedWriter.newLine();
    127             bufferedWriter.flush();
    128         } catch (IOException e) {
    129             e.printStackTrace();
    130         } finally {
    131             if (bufferedWriter != null) {
    132                 try {
    133                     bufferedWriter.close();
    134                 } catch (IOException e) {
    135                     e.printStackTrace();
    136                 }
    137                 bufferedWriter = null;
    138             }
    139         }
    140     }
    141     
    142     private static  String printInfo(Element element, int level,String parent) {
    143         BufferedWriter bufferedWriter = null;
    144         String code = "";
    145         code = element.select("td").first().text();
    146         String name =element.select("td").last().text();
    147         String str =  level + "," + code + ","+parent+","
    148                         + name;
    149     
    150         System.out.println(str);
    151 //        SysZone zone = new SysZone();
    152 //        try {
    153 //            zone.set("zoneLevel", level)
    154 //                    .set("zoneCode", code.trim())
    155 //                    .set("parentCode", parent.trim())
    156 //                    .set("zoneName", name.trim())
    157 //                    .save();
    158 //        } catch (Exception e1) {
    159 //            // TODO Auto-generated catch block
    160 //            e1.printStackTrace();
    161 //            
    162 //        }
    163         try {
    164             bufferedWriter = new BufferedWriter(new FileWriter(new File(
    165                     "F:\AllCity.txt"), true));
    166             
    167             bufferedWriter.write(str);
    168             bufferedWriter.newLine();
    169             bufferedWriter.flush();
    170         } catch (IOException e) {
    171             e.printStackTrace();
    172         } finally {
    173             if (bufferedWriter != null) {
    174                 try {
    175                     bufferedWriter.close();
    176                 } catch (IOException e) {
    177                     e.printStackTrace();
    178                 }
    179                 bufferedWriter = null;
    180             }
    181         }
    182         
    183         return  code;
    184     }
    185 
    186     private static Document connect(String url) {
    187         if (url == null || url.isEmpty()) {
    188             throw new IllegalArgumentException("The input url('" + url
    189                     + "') is invalid!");
    190         }
    191         String [] b = {
    192                    "Mozilla/5.0 (Windows; U; Windows NT 5.1) Gecko/20070309 Firefox/2.0.0.3",
    193                    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
    194                    "Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11",
    195                    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6",
    196                    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6",
    197                    "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1",
    198                    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5",
    199                    "Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5",
    200                    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
    201                    "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
    202                    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
    203                    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
    204                    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
    205                    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
    206                    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
    207                    "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
    208                    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3",
    209                    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
    210                    "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24"
    211     };
    212         try {
    213             
    214             Random rand = new Random();
    215             return Jsoup.connect(url) 
    216             .header("User-Agent",b[rand.nextInt(19)])
    217             .timeout(90 * 1000).get();
    218         } catch (IOException e) {
    219             e.printStackTrace();
    220         }
    221         return null;
    222     }
    223 }
    View Code

    2.可以选择从文本读取后写入数据库,也可直接写入。

    3.最终,我生成了两份,一个是省市县三级的,一个是所有的。

    省市县中去掉了市辖区等无关代码。

    4.爬虫源码:包括数据库保存。下载源码

    5.省市县:下载txt,  下载sql

    6.所有地区(港澳台除外):下载txt,  下载sql

  • 相关阅读:
    105.UDP通信实现广播
    104.tcp多线程读写实现群聊
    103.tcp通信实现远程控制
    102.tcp实现多线程连接与群聊
    101.自动注入
    100.dll调用
    99.遍历进程并直接写入内存
    98.TCP通信传输文件
    97.TCP通信
    96.udp通信
  • 原文地址:https://www.cnblogs.com/woshimrf/p/4943382.html
Copyright © 2020-2023  润新知