发现网上流传的地区编码有很多版本,其中不少还带有崇文区、玄武区之类的条目。于是想了想,还是自己做一份。不敢保证没有问题,但目前还没遇到问题。
首先,从网上找到一个大神写的jsoup的例子,修改成自己想要的格式,在代码无价的年代,原谅我的抄袭,研究是份任重而道远的任务。
1.jsoup代码:
package com.test;

import java.io.BufferedWriter;
import java.io.File;
import java.io.FileWriter;
import java.io.IOException;
import java.util.HashMap;
import java.util.Map;
import java.util.Random;

import org.apache.log4j.Logger;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.junit.Test;

/**
 * Crawler for the nationwide province / city / county / town / village
 * division codes published under the stats.gov.cn 2013 index page.
 *
 * <p>Starting from the index page, every province link is followed
 * recursively, one administrative level at a time. Each table row found is
 * printed to standard output and appended to a text file as
 * {@code level,code,parentCode,name}.
 *
 * @author liushaofeng
 * @date 2015-10-11 12:19:39 AM
 * @version 1.0.0
 */
public class JsoupProviceTest {

    /** Index page that lists all provinces. */
    private static final String ROOT_URL =
            "http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2013/";

    /** Output file used by {@link #printProvince(Element)}. */
    private static final String PROVINCE_FILE = "F:\\provinces.txt";

    /** Output file used by {@link #printInfo(Element, int, String)}. */
    private static final String ALL_CITY_FILE = "F:\\AllCity.txt";

    /** Parent code written for province rows, which have no parent. */
    private static final String ROOT_PARENT_CODE = "000000000000";

    /** Suffix that pads a two-character province prefix to a 12-character code. */
    private static final String PROVINCE_CODE_SUFFIX = "0000000000";

    /** Pause between two page requests, in milliseconds. */
    private static final long REQUEST_DELAY_MILLIS = 500L;

    /** Request timeout handed to jsoup, in milliseconds. */
    private static final int TIMEOUT_MILLIS = 90 * 1000;

    /** Source of the random index used to pick a User-Agent header. */
    private static final Random RANDOM = new Random();

    /** User-Agent values; one is picked at random for every request. */
    private static final String[] USER_AGENTS = {
        "Mozilla/5.0 (Windows; U; Windows NT 5.1) Gecko/20070309 Firefox/2.0.0.3",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
        "Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6",
        "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1",
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5",
        "Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3",
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
        "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24"
    };

    /** Maps the crawl level (1 = province ... 5 = village) to the CSS class of its table rows. */
    private static Map<Integer, String> cssMap = new HashMap<Integer, String>();

    static {
        cssMap.put(1, "provincetr"); // province
        cssMap.put(2, "citytr");     // city
        cssMap.put(3, "countytr");   // county
        cssMap.put(4, "towntr");     // town
        cssMap.put(5, "villagetr");  // village
    }

    /**
     * Crawls every province listed on the index page and, recursively, all
     * levels below it.
     *
     * @param args unused
     * @throws IOException declared for compatibility with {@link #parseNextLevel}
     */
    public static void main(String[] args) throws IOException {
        int level = 1;
        // TestConDataBase.initDataBase();

        // Fetch the index page that lists all provinces.
        Document indexPage = connect(ROOT_URL);
        if (indexPage == null) {
            // connect() has already printed the cause; nothing can be crawled.
            System.err.println("Unable to load index page: " + ROOT_URL);
            return;
        }

        Elements provinceRows = indexPage.select("tr." + cssMap.get(level));
        for (Element provinceRow : provinceRows) { // one table row holds several provinces
            for (Element province : provinceRow.select("a")) { // a single province link
                String procode = provinceCode(province.attr("href"));
                System.out.println((level + 1) + "," + procode + "," + province.text());

                // Optional: persist directly to the database instead of the text file.
                // SysZone zone = new SysZone();
                // try {
                //     zone.set("zoneLevel", level + 1)
                //         .set("zoneCode", procode.trim())
                //         .set("parentCode", "000000000000")
                //         .set("zoneName", province.text())
                //         .save();
                // } catch (Exception e1) {
                //     e1.printStackTrace();
                // }

                parseNextLevel(province, level + 1, procode);
            }
        }
    }

    /**
     * Fetches the index page and appends one line per province to
     * {@code F:\provinces.txt} without descending into lower levels.
     */
    @Test
    public void testa() {
        // Fetch the index page that lists all provinces.
        Document indexPage = connect(ROOT_URL);
        if (indexPage == null) {
            System.err.println("Unable to load index page: " + ROOT_URL);
            return;
        }

        Elements provinceRows = indexPage.select("tr." + cssMap.get(1));
        for (Element provinceRow : provinceRows) { // one table row holds several provinces
            for (Element province : provinceRow.select("a")) { // a single province link
                printProvince(province);
            }
        }
    }

    /**
     * Follows the link of {@code parentElement}, records every row of the
     * requested level and recurses into each row that carries a link.
     *
     * @param parentElement anchor whose absolute {@code href} is the page to load
     * @param level         key into {@link #cssMap} selecting the rows to read
     * @param parent        code of the parent division, written next to each row
     * @throws IOException kept in the signature for existing callers
     */
    private static void parseNextLevel(Element parentElement, int level, String parent)
            throws IOException {
        try {
            Thread.sleep(REQUEST_DELAY_MILLIS);
        } catch (InterruptedException e) {
            // Keep the interrupt visible to whoever runs the crawl.
            Thread.currentThread().interrupt();
            e.printStackTrace();
        }

        String url = parentElement.attr("abs:href");
        Document doc = connect(url);
        if (doc == null) {
            // One retry, as before.
            doc = connect(url);
        }
        if (doc == null) {
            // Previously this ended the whole crawl with a NullPointerException.
            // Report the page so it can be re-crawled, then carry on.
            System.err.println("Skipping page after two failed attempts: " + url);
            return;
        }

        // Each selected element is one table row.
        Elements rows = doc.select("tr." + cssMap.get(level));
        for (Element row : rows) {
            String rowCode = printInfo(row, level + 1, parent);
            Elements links = row.select("a");
            if (links.size() != 0) {
                parseNextLevel(links.last(), level + 1, rowCode);
            }
        }
    }

    /**
     * Prints one province and appends it to {@code F:\provinces.txt} as
     * {@code 2,code,000000000000,name}.
     *
     * @param province anchor element of the province
     */
    private static void printProvince(Element province) {
        String name = province.text();
        String procode = provinceCode(province.attr("href"));
        String line = 2 + "," + procode + "," + ROOT_PARENT_CODE + "," + name;

        System.out.println(line);
        appendLine(PROVINCE_FILE, line);
    }

    /**
     * Prints one table row and appends it to {@code F:\AllCity.txt} as
     * {@code level,code,parent,name}. The code is the text of the row's first
     * {@code td}, the name the text of its last {@code td}.
     *
     * @param element table row to record
     * @param level   level number written in the first column
     * @param parent  code of the parent division
     * @return the code read from the row
     */
    private static String printInfo(Element element, int level, String parent) {
        Elements cells = element.select("td");
        String code = cells.first().text();
        String name = cells.last().text();
        String str = level + "," + code + "," + parent + "," + name;

        System.out.println(str);

        // Optional: persist directly to the database instead of the text file.
        // SysZone zone = new SysZone();
        // try {
        //     zone.set("zoneLevel", level)
        //         .set("zoneCode", code.trim())
        //         .set("parentCode", parent.trim())
        //         .set("zoneName", name.trim())
        //         .save();
        // } catch (Exception e1) {
        //     e1.printStackTrace();
        // }

        appendLine(ALL_CITY_FILE, str);
        return code;
    }

    /**
     * Loads and parses a page with a randomly chosen User-Agent header.
     *
     * @param url address to fetch; must be non-empty
     * @return the parsed document, or {@code null} when the request failed
     * @throws IllegalArgumentException if {@code url} is {@code null} or empty
     */
    private static Document connect(String url) {
        if (url == null || url.isEmpty()) {
            throw new IllegalArgumentException("The input url('" + url
                    + "') is invalid!");
        }
        try {
            // Bound by the array length so the list can be edited safely.
            String userAgent = USER_AGENTS[RANDOM.nextInt(USER_AGENTS.length)];
            return Jsoup.connect(url)
                    .header("User-Agent", userAgent)
                    .timeout(TIMEOUT_MILLIS)
                    .get();
        } catch (IOException e) {
            e.printStackTrace();
        }
        return null;
    }

    /**
     * Builds a province code from a province link: the two characters that
     * precede the last five characters of {@code href}, followed by ten zeros.
     * For an href of {@code "11.html"} this yields {@code "110000000000"}.
     *
     * @param href value of the province anchor's {@code href} attribute
     * @return the padded province code
     */
    private static String provinceCode(String href) {
        return href.substring(href.length() - 7, href.length() - 5) + PROVINCE_CODE_SUFFIX;
    }

    /**
     * Appends {@code line} plus a line separator to the file at {@code path}.
     * I/O failures are printed and otherwise ignored so the crawl continues.
     *
     * @param path file to append to
     * @param line text to write
     */
    private static void appendLine(String path, String line) {
        BufferedWriter writer = null;
        try {
            writer = new BufferedWriter(new FileWriter(new File(path), true));
            writer.write(line);
            writer.newLine();
            writer.flush();
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            if (writer != null) {
                try {
                    writer.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
        }
    }
}
2.可以选择从文本读取后写入数据库,也可直接写入。
3.最终,我生成了两份,一个是省市县三级的,一个是所有的。
省市县中去掉了市辖区等无关代码。
4.爬虫源码:包括数据库保存。下载源码
6.所有地区,港澳台除外txt, 下载sql