统计局官网提供了每年最新的PAC代码,方便大家查询,但没有提供完整版的下载,于是我就"手工"把它复制下来了。
http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/
此工具有两个关键点:
1、Get函数中要注意编码问题:要先读取页面声明的编码方式,再按该编码解码,否则得到的可能是乱码;由于网速或服务器等原因,每次请求之间可以休眠100毫秒,每当出现404或服务器中断等情况时,线程暂停2秒再试(目前第二次Get都正常);
2、第二个关键点是年份、省、市、县、乡、村的解析,当然是用正则表达式来处理。
下面是HTTP请求代码示例:
/// <summary>
/// Downloads the HTML document at <paramref name="strUrl"/> and returns it as text.
/// A failed attempt is retried after a 2-second pause and, if that also fails,
/// once more after a 5-second pause; after the third failure the URL is given up.
/// </summary>
/// <param name="strUrl">
/// Address to fetch. Spaces are removed, and "http://" is prepended when the
/// value does not already start with http:// or https://.
/// </param>
/// <param name="level">Not used by this method; kept so existing callers keep compiling.</param>
/// <returns>
/// The decoded page text, or an empty string when the URL is blank, the response
/// content type is not text/html, or all three attempts fail.
/// </returns>
/// <remarks>
/// A URL whose first attempt fails is added to <c>ErrorUrl</c> and removed again
/// when a later attempt succeeds, so URLs that were given up on remain in it.
/// </remarks>
public static string RequestGet(string strUrl, string level = "")
{
    // A blank URL can never succeed, so do not enter the retry sequence for it.
    if (string.IsNullOrWhiteSpace(strUrl))
    {
        return string.Empty;
    }

    strUrl = strUrl.Replace(" ", "");
    if (!Regex.IsMatch(strUrl, @"^https?://", RegexOptions.IgnoreCase))
    {
        strUrl = "http://" + strUrl;
    }

    // The attempts are written out linearly instead of calling RequestGet
    // recursively: a recursive call swallows its own exception and retries again,
    // so a permanently failing URL would never reach the "give up" branch.
    try
    {
        return RequestGetOnce(strUrl);
    }
    catch (Exception exception1)
    {
        Console.WriteLine(@"-------------------------------");
        ErrorUrl.Add(strUrl);
        Console.WriteLine($@"首次获取失败:{ exception1.Message},{strUrl}");
    }

    // Pause 2 seconds before the second attempt.
    Thread.Sleep(2000);
    try
    {
        string secondResult = RequestGetOnce(strUrl);
        ErrorUrl.Remove(strUrl);
        Console.WriteLine(@"暂停2秒后成功。");
        return secondResult;
    }
    catch (Exception exception2)
    {
        // Pause 5 seconds before the final attempt.
        Thread.Sleep(5000);
        Console.WriteLine($@"再次获取失败:{ exception2.Message},{strUrl}");
    }

    try
    {
        string thirdResult = RequestGetOnce(strUrl);
        ErrorUrl.Remove(strUrl);
        Console.WriteLine(@"暂停5秒后成功。");
        return thirdResult;
    }
    catch (Exception exception3)
    {
        // Give up: the URL stays in ErrorUrl.
        Console.WriteLine($@"放弃获取:{ exception3.Message},{strUrl}");
        return string.Empty;
    }
}

/// <summary>
/// Performs exactly one GET request and returns the decoded body, or an empty
/// string when the content type is not text/html. Any failure is thrown to the caller.
/// </summary>
private static string RequestGetOnce(string url)
{
    // RequestGet guarantees an http:// or https:// prefix, for which
    // WebRequest.Create is expected to return an HttpWebRequest.
    var httpWebRequest = (HttpWebRequest)WebRequest.Create(url);
    httpWebRequest.Timeout = timeOut;
    httpWebRequest.ReadWriteTimeout = 60000;
    httpWebRequest.AllowAutoRedirect = true;
    httpWebRequest.UserAgent = "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; .NET CLR 2.0.50727; Maxthon 2.0)";

    // Dispose the response on every path so the connection is always released.
    using (var httpWebResponse = (HttpWebResponse)httpWebRequest.GetResponse())
    {
        string contentType = httpWebResponse.ContentType ?? string.Empty;
        if (contentType.IndexOf("text/html", StringComparison.OrdinalIgnoreCase) < 0)
        {
            return string.Empty;
        }

        byte[] htmlBytes;
        using (Stream stream = httpWebResponse.GetResponseStream())
        using (var buffer = new MemoryStream())
        {
            // Bulk copy instead of one ReadByte call per byte.
            stream.CopyTo(buffer);
            htmlBytes = buffer.ToArray();
        }

        return DecodeHtml(htmlBytes);
    }
}

/// <summary>
/// Decodes raw page bytes as UTF-8, then re-decodes them with the charset named
/// in the page's meta tag when that charset is present and is not a UTF variant.
/// </summary>
private static string DecodeHtml(byte[] htmlBytes)
{
    // The first pass only needs to expose the ASCII meta tag, so UTF-8 is good enough.
    string html = Encoding.UTF8.GetString(htmlBytes, 0, htmlBytes.Length);
    string charSet = Regex.Match(html, @"<meta.*?charset=""?([a-z0-9-]+)", RegexOptions.IgnoreCase).Groups[1].Value;

    if (charSet != "" && charSet.IndexOf("utf", StringComparison.OrdinalIgnoreCase) < 0)
    {
        // Throws for a charset name the runtime does not know; the caller treats
        // that like any other failed attempt.
        html = Encoding.GetEncoding(charSet).GetString(htmlBytes, 0, htmlBytes.Length);
    }

    return html;
}