• 爬一下国家统计局行政区划代码C#


    目前NBS上有2015-2018四个年度的代码信息,写一个控制台程序爬一下县级行政区下的代码。

    使用HttpWebRequest+HttpWebResponse获取html,使用HtmlAgilityPack类库解析HTML。

    使用POST请求,请求头带Cookie信息,否则会被反爬机制挡死,返回“请开启JavaScript并刷新该页”。

    县级URL Request获取数据的同时记录Response的Cookie信息,在请求镇级数据时,请求头发送此cookie。

    “省-地-县-乡”与“省-县(地)-乡”两种层级结构的URL长度不同,根据长度判断URL正确性时需注意;也许还有其他可能,暂未发现。

    主方法

      1  class Program
      2     {
      3         static void Main(string[] args)
      4         {
      5             Console.ForegroundColor = ConsoleColor.Magenta;
      6             Console.WriteLine("
    ----获取县级行政区乡、村二级区划代码");
      7             Console.WriteLine("----数据年份有:");
      8             Console.ResetColor();
      9             Cursor.WriteAt("A、2018", 2, 0);
     10             Cursor.WriteAt("B、2017", 12, 0);
     11             Cursor.WriteAt("C、2016", 2, 1);
     12             Cursor.WriteAt("D、2015", 12, 1);
     13             Input: Console.ForegroundColor = ConsoleColor.Magenta;
     14             Console.WriteLine();
     15             Console.WriteLine("----请输入一个年份代码(回车提交):");
     16             Console.ResetColor();
     17             char chr = Convert.ToChar( Console.ReadLine().ToLower()[0]);
     18             if ((int)chr >= 97 &&(int)chr <= 100)
     19             {
     20                 string year = string.Empty;
     21                 switch (chr)
     22                 {
     23                     case 'a':
     24                         year = "2018"; break;
     25                     case 'b':
     26                         year = "2017"; break;
     27                     case 'c':
     28                         year = "2016"; break;
     29                     default:
     30                         year = "2015"; break;
     31                 }
     32                 System.Diagnostics.Process.Start($"http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/{year}");
     33                 Console.ForegroundColor = ConsoleColor.Magenta;
     34                 Console.WriteLine("浏览器已加载区划代码起始页,请进入县级行政单位页面,复制url,粘贴到下面(回车提交):");
     35             }
     36             else
     37                 goto Input;
     38             Console.ResetColor();
     39             string cityurl = Console.ReadLine();
     40             if (cityurl.Length != 66&& cityurl.Length!=71)
     41             {
     42                 Console.ForegroundColor = ConsoleColor.Magenta;
     43                 Console.WriteLine("url有误,请确认是县级行政单位页面,重新复制链接,粘贴到下面:");
     44                 Console.ResetColor();
     45                 cityurl = Console.ReadLine();
     46             }
     47             try
     48             {
     49                 Console.ForegroundColor = ConsoleColor.Magenta;
     50                 Func<object, List<TownInfo>> func = new Func<object, List<TownInfo>>(GetTownInfos);
     51                 Task<List<TownInfo>> task = new Task<List<TownInfo>>(func, cityurl);
     52                 task.Start();
     53                 task.Wait();
     54                 if (task.Status == TaskStatus.RanToCompletion && task.Result.Count > 0)
     55                 {
     56 
     57                     List<VillageInfo> villageInfos = new List<VillageInfo>();
     58                     foreach (var item in task.Result)
     59                     {
     60                         //把乡镇信息写入村级列表,实现乡镇信息输出
     61                         VillageInfo villageInfo_town = new VillageInfo(item.Code, "", item.Name);
     62                         villageInfos.Add(villageInfo_town);
     63                         Func<object, List<VillageInfo>> func1 = new Func<object, List<VillageInfo>>(GetVillageInfos);
     64                         Task<List<VillageInfo>> task1 = new Task<List<VillageInfo>>(func1, item.Href);
     65                         task1.Start();
     66                         task1.Wait();
     67                         if (task1.Status == TaskStatus.RanToCompletion)
     68                         {
     69                             villageInfos.AddRange(task1.Result);
     70                         }
     71                     }
     72                     foreach (var item1 in villageInfos)
     73                     {
     74                         Console.WriteLine($"{item1.Name.Trim()}	{item1.Cls.Trim()}	{item1.Code.Trim()}");
     75                     }
     76                 }
     77                 else
     78                 { Console.WriteLine("乡镇列表获取失败!"); }
     79 
     80             }
     81             catch (Exception)
     82             {
     83                 throw new Exception("");
     84             }
     85             Console.ReadKey();
     86         }
     87         static string cookies = "AD_RS_COOKIE=20082854; wzws_cid=453a2d88181321410de83ba7eedaba3a141eb61ee7488027b6ab07a66054605e99e886827afa72708ce170398ea2fdfeec55455a7c0be8e779694026255f2166";
     88         //获取乡镇级信息列表
     89         static List<TownInfo> GetTownInfos(object cityurl)
     90         {
     91             List<TownInfo> townInfos = new List<TownInfo>();
     92             HttpGetHelper httpGetHelper = new HttpGetHelper() { Url =(string) cityurl, ContentType = "text/html; charset=gb2312", Encode = Encoding.GetEncoding(936),RequestMethod="post"};
     93             //HtmlAgilityPack类库解析HTML
     94             HtmlDocument document = new HtmlDocument();
     95             document.LoadHtml(httpGetHelper.GetHtml(1,ref cookies));
     96             //string html = httpGetHelper.GetHtml(ref cookies);
     97             //路径里"//"表示从根节点开始查找,两个斜杠‘//’表示查找所有childnodes;一个斜杠'/'表示只查找第一层的childnodes(即不查找grandchild);点斜杠"./"表示从当前结点而不是根结点开始查找
     98             HtmlNodeCollection htmlNodes = document.DocumentNode.SelectNodes("//tr[@class='towntr']");
     99             foreach (var node in htmlNodes)
    100             {
    101                 HtmlNodeCollection htmlNodes1 = node.SelectNodes("./td");
    102                 HtmlNode htmlNodeHref = node.SelectSingleNode(".//a[@href]");
    103                 HtmlAttribute htmlAttribute = htmlNodeHref.Attributes["href"];
    104                 TownInfo townInfo = new TownInfo(htmlNodes1[0].InnerText, htmlNodes1[1].InnerText,
    105                     (cityurl as string).Substring(0, (cityurl as string).LastIndexOf('/') + 1) + htmlAttribute.Value);
    106                 townInfos.Add(townInfo);
    107             }
    108             return townInfos;
    109         }
    110         //获取村级信息列表
    111         static List<VillageInfo> GetVillageInfos(object townurl)
    112         {
    113             List<VillageInfo> villageInfos = new List<VillageInfo>();
    114             HttpGetHelper httpGetHelper = new HttpGetHelper() { Url = (string)townurl, ContentType = "text/html; charset=gb2312", Encode = Encoding.GetEncoding(936), RequestMethod = "post"};
    115             HtmlDocument document = new HtmlDocument();
    116             document.LoadHtml(httpGetHelper.GetHtml(2,ref cookies));
    117             //string html = httpGetHelper.GetHtml(ref cookies);
    118             HtmlNodeCollection htmlNodes = document.DocumentNode.SelectNodes("//tr[@class='villagetr']");
    119             foreach (var node in htmlNodes)
    120             {
    121                 HtmlNodeCollection htmlNodes1 = node.SelectNodes(".//td");
    122                 VillageInfo villageInfo = new VillageInfo(htmlNodes1[0].InnerText,htmlNodes1[1].InnerText,htmlNodes1[2].InnerText);
    123                 villageInfos.Add(villageInfo);
    124             }
    125             return villageInfos;
    126         }
    127     }

    辅助类/结构

     1   internal class Cursor
     2     {
     3         const int origRow = 3;
     4         const int origCol = 0;
     5         public static void WriteAt(string s, int c, int r)
     6         {
     7             Console.SetCursorPosition(origCol + c, origRow + r);
     8             Console.Write(s);
     9         }
    10     }
    /// <summary>
    /// Town record: division code, name, and the link to the town's page.
    /// </summary>
    struct TownInfo
    {
        /// <summary>Division code of the town.</summary>
        public string Code { get; }

        /// <summary>Name of the town.</summary>
        public string Name { get; }

        /// <summary>Hyperlink stored for the town.</summary>
        public string Href { get; }

        /// <summary>Creates a town record from its three values.</summary>
        /// <param name="code">Division code.</param>
        /// <param name="name">Town name.</param>
        /// <param name="href">Hyperlink for the town.</param>
        public TownInfo(string code, string name, string href)
        {
            Code = code;
            Name = name;
            Href = href;
        }
    }
    /// <summary>
    /// Village record: division code, urban/rural classification, and name.
    /// </summary>
    struct VillageInfo
    {
        /// <summary>Division code of the village.</summary>
        public string Code { get; }

        /// <summary>Urban/rural classification value.</summary>
        public string Cls { get; }

        /// <summary>Name of the village.</summary>
        public string Name { get; }

        /// <summary>Creates a village record from its three values.</summary>
        /// <param name="code">Division code.</param>
        /// <param name="cls">Urban/rural classification.</param>
        /// <param name="name">Village name.</param>
        public VillageInfo(string code, string cls, string name)
        {
            Code = code;
            Cls = cls;
            Name = name;
        }
    }

    获取HTML

     1     public class HttpGetHelper
     2     {
     3         string url = string.Empty;
     4         public string Url
     5         {
     6             set { url = value; }
     7         }
     8 
     9         int timeOut=10*1000;
    10         public int Timeout
    11         {
    12             set { timeOut = value; }
    13         }
    14 
    15         string contentType= "text/html;charset=utf-8";
    16         public string ContentType
    17         {
    18             set { contentType = value; }
    19         }
    20 
    21         string userAgent= "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36 ";
    22         public string UserAgent
    23         {
    24             set { userAgent = value; }
    25         }
    26 
    27         Encoding encode=Encoding.UTF8;
    28         public Encoding Encode
    29         {
    30             set { encode = value; }
    31         }
    32         string request_Method = "get";
    33         public string RequestMethod
    34         {
    35             set { request_Method = value; }
    36         }
    37         /// <summary>
    38         /// get html content
    39         /// </summary>
    40         /// <param name="cls">town=1;village=2</param>
    41         /// <param name="cookies">if cls=1 then ref cookies</param>
    42         /// <returns></returns>
    43         public string GetHtml(int cls,ref string cookies)
    44         {
    45             string html = string.Empty;
    46             try
    47             {
    48                 if (url!=string.Empty)
    49                 {
    50                     HttpWebRequest request = HttpWebRequest.Create(url) as HttpWebRequest;
    51                     request.Timeout = this.timeOut;
    52                     request.ContentType = this.contentType;
    53                     request.UserAgent = this.userAgent;
    54                     request.Headers.Add(HttpRequestHeader.Cookie, cookies);
    55                     request.Method = request_Method;
    56                     using (HttpWebResponse response =request.GetResponse()as HttpWebResponse)
    57                     {
    58                         if (response.StatusCode==HttpStatusCode.OK)
    59                         {//如果是县级url,则记录cookie
    60                             if (cls==1)
    61                             {
    62                                 CookieCollection cookieCollection = response.Cookies;
    63                                 foreach (Cookie item in cookieCollection)
    64                                 {
    65                                     cookies = item.Name + "=" + item.Value + ";";
    66                                 }
    67                                 cookies.Remove(cookies.Length - 1);
    68                             }
    69 
    70                             using (StreamReader streamReader = new StreamReader(response.GetResponseStream(), encode))
    71                             {
    72                                 html = streamReader.ReadToEnd();
    73                                 streamReader.Close();
    74                             }
    75                         }
    76                     }
    77                 }
    78             }
    79             catch (Exception)
    80             {
    81                 throw new Exception($"GetHtml失败,url:{url}");
    82             }
    83             return html;
    84         }
    85     }
  • 相关阅读:
    json学习系列(1)-使用json所要用到的jar包下载
    Java 时间架构图
    时间纪元与时区介绍
    HTML5 Canvas 绘制库存变化折线
    HTML5 Canvas 笛卡尔坐标系转换尝试
    像孩童一样欣喜的看着自己的成长
    《老炮儿》结尾貌似历史上的一幕
    很多人还在守着金饭碗要饭
    还是用文本编辑器编程让人愉悦
    Node.js 网页爬虫再进阶,cheerio助力
  • 原文地址:https://www.cnblogs.com/yzhyingcool/p/10705889.html
Copyright © 2020-2023  润新知