需求:完成一个城市和区号的xml配置文件
处理思路:通过HtmlAgilityPack解析一个区号页面,生成xml文件
页面:http://www.hljboli.gov.cn/html/code.html
代码:
/// <summary>
/// Loads an area-code HTML page, shows the page and its raw text in the UI,
/// and renders the city / area-code table rows as a <c>&lt;codes&gt;</c> XML
/// document into <c>richTextBox2</c>.
/// </summary>
/// <remarks>
/// Rows with a single cell are emitted as XML comments (section titles).
/// Rows with six cells are read as two (name, code) pairs located in cells
/// 0/1 and 3/4; the header row and already-seen area codes are skipped.
/// Any failure is written to <c>richTextBox2</c> instead of being swallowed.
/// </remarks>
/// <param name="url">Address of the HTML page to download and parse.</param>
public void LoadUrl(string url)
{
    try
    {
        // Tab 1: show the live page in the embedded browser.
        this.webBrowser1.Url = new Uri(url);

        HtmlWeb web = new HtmlWeb();
        // Without an explicit encoding the Chinese text comes out garbled.
        // NOTE(review): Encoding.Default depends on the machine/runtime;
        // confirm it matches the charset of the target page.
        web.OverrideEncoding = Encoding.Default;
        HtmlAgilityPack.HtmlDocument doc = web.Load(url);

        // Tab 2: raw text of the downloaded page.
        this.richTextBox1.Text = GetRawHtml(doc);

        var sb = new StringBuilder();
        sb.Append("<?xml version=\"1.0\" encoding=\"utf-8\" ?>").AppendLine();
        sb.Append("<codes>").AppendLine();

        // SelectNodes returns null (not an empty collection) when nothing matches.
        var rows = doc.DocumentNode.SelectNodes("//tr");

        // Area codes already written; used to filter duplicates.
        var seenCodes = new HashSet<string>();

        if (rows != null)
        {
            foreach (var row in rows)
            {
                var cells = row.SelectNodes("td");
                if (cells == null)
                {
                    // Row without <td> children (e.g. only <th>): nothing to read.
                    continue;
                }

                if (cells.Count == 1)
                {
                    // Single-cell row: a section title, unless it is a blank spacer row.
                    string title = CellText(cells[0]);
                    if (title.Length > 0)
                    {
                        sb.AppendFormat("<!--{0}-->", ToXmlCommentText(title)).AppendLine();
                    }
                }
                else if (cells.Count == 6 && CellText(cells[1]) != "长途区号")
                {
                    // Six-cell data row (header row excluded): two name/code pairs.
                    AppendCode(sb, seenCodes, CellText(cells[0]), CellText(cells[1]));
                    // The second pair is blank when the row is only half filled.
                    AppendCode(sb, seenCodes, CellText(cells[3]), CellText(cells[4]));
                }
            }
        }

        sb.Append("</codes>").AppendLine();

        this.richTextBox2.Text = sb.ToString();
    }
    catch (Exception ex)
    {
        // Surface the failure in the output tab instead of hiding it.
        this.richTextBox2.Text = ex.ToString();
    }
}

/// <summary>
/// Returns the raw page text held by the document's non-public "Text" field,
/// falling back to the serialized DOM when that field is not available.
/// </summary>
private static string GetRawHtml(HtmlAgilityPack.HtmlDocument doc)
{
    // The field is not part of the public API, so the lookup may return null.
    FieldInfo info = doc.GetType().GetField("Text", BindingFlags.Instance | BindingFlags.NonPublic);
    object raw = info == null ? null : info.GetValue(doc);
    return raw != null ? raw.ToString() : doc.DocumentNode.OuterHtml;
}

/// <summary>
/// Returns a cell's text with HTML entities decoded and surrounding whitespace
/// (including non-breaking spaces) removed; empty string for a blank cell.
/// </summary>
private static string CellText(HtmlNode cell)
{
    string text = HtmlEntity.DeEntitize(cell.InnerText);
    return text == null ? string.Empty : text.Trim();
}

/// <summary>
/// Makes text safe to place between <c>&lt;!--</c> and <c>--&gt;</c>: XML comments
/// may not contain "--" and may not end with "-".
/// </summary>
private static string ToXmlCommentText(string text)
{
    while (text.Contains("--"))
    {
        text = text.Replace("--", "- -");
    }

    return text.EndsWith("-", StringComparison.Ordinal) ? text + " " : text;
}

/// <summary>
/// Appends one <c>&lt;code name="..." value="..." /&gt;</c> element unless the
/// pair is blank or the area code has already been written.
/// </summary>
private static void AppendCode(StringBuilder sb, HashSet<string> seenCodes, string name, string code)
{
    if (name.Length == 0 || code.Length == 0)
    {
        return;
    }

    // Add returns false when the code is already present.
    if (!seenCodes.Add(code))
    {
        return;
    }

    // Escape so that characters such as & < " cannot break the attribute values.
    sb.AppendFormat(
        "<code name=\"{0}\" value=\"{1}\" />",
        System.Security.SecurityElement.Escape(name),
        System.Security.SecurityElement.Escape(code)).AppendLine();
}
注意:
1.中文乱码,需要设置 web.OverrideEncoding = Encoding.Default;
2.通过反射访问HtmlDocument的内部(internal)字段Text,获取页面的完整HTML文本
FieldInfo info = doc.GetType().GetField("Text", BindingFlags.Instance | BindingFlags.NonPublic);
var text = info.GetValue(doc).ToString();
3.通过xpath表达式获取节点元素