这两天写了一个采集新蛋网手机信息的小程序,网页内容通过 WebClient 类下载。使用时调用 Gather(startIndex, endIndex) 方法即可。希望能有帮助。
代码如下:
/*
 * Created By ChinaAgan 2012-1-18
 */
using System;
using System.Collections.Generic;
using System.Text;
using System.Collections;
using System.Net;
using System.IO;
using System.Text.RegularExpressions;
using CnBlogCollector.Properties;

namespace CnBlogCollector
{
    /// <summary>
    /// Downloads newegg.com.cn sub-category listing pages and writes the phone
    /// name, old price and current price found on each page to a text file.
    /// </summary>
    public class Collector
    {
        #region Fields

        // Listing page URL template; {0} is replaced with the page index.
        private const string PageUrlTemplate = "http://www.newegg.com.cn/SubCategory/1043-{0}.htm";

        // Both the downloaded pages and the output files use this encoding.
        private const string PageEncodingName = "GB2312";

        // The patterns below are unchanged from the original implementation; they are
        // compiled once here instead of being rebuilt for every page.
        private static readonly Regex NamePattern = new Regex(
            @"<p\s+class=""info""><a\s+href=(?<url>.+?)\s+title=""(?<title>.+?)"">(?<content>.+?)</a>",
            RegexOptions.IgnoreCase | RegexOptions.Compiled);

        private static readonly Regex OldPricePattern = new Regex(
            @"<p\s+class=""bypast""><span>¥(?<OldPrice>.+?)</span></p>",
            RegexOptions.IgnoreCase | RegexOptions.Compiled);

        private static readonly Regex NewPricePattern = new Regex(
            @"<p\s+class=""current""><strong\s+class=""price""><span>¥</span>(?<NewPrice>\d+?\..+?)</strong></p>",
            RegexOptions.IgnoreCase | RegexOptions.Compiled);

        private readonly WebClient wc = new WebClient();

        #endregion

        #region Directory helpers

        /// <summary>
        /// Resolves <paramref name="path"/> to a full path and creates the directory
        /// if it does not exist yet.
        /// </summary>
        /// <param name="path">Relative or absolute directory path.</param>
        /// <returns>The full path of the directory.</returns>
        public string CreateFolderIfNot(string path)
        {
            string fullPath = Path.GetFullPath(path);
            if (!Directory.Exists(fullPath))
            {
                Directory.CreateDirectory(fullPath);
            }
            return fullPath;
        }

        #endregion

        #region Page collection

        /// <summary>
        /// Downloads the listing pages with indexes from <paramref name="startIndex"/>
        /// up to, but not including, <paramref name="endIndex"/> and writes one
        /// "&lt;index&gt;.txt" report per page into the folder named by
        /// <c>Settings.Default.OutPath</c>. An existing report for the same index is
        /// overwritten. Download and file-system exceptions are not caught here.
        /// </summary>
        /// <param name="startIndex">First page index to download (inclusive).</param>
        /// <param name="endIndex">Page index at which to stop (exclusive).</param>
        public void Gather(int startIndex, int endIndex)
        {
            wc.Proxy = CreateProxy();

            for (int pageIndex = startIndex; pageIndex < endIndex; pageIndex++)
            {
                string html = DownloadPage(pageIndex);

                List<string> names = ExtractGroup(NamePattern, html, "content");
                List<string> oldPrices = ExtractGroup(OldPricePattern, html, "OldPrice");
                List<string> newPrices = ExtractGroup(NewPricePattern, html, "NewPrice");

                // TODO (from the original author): handle the current price and
                // entities such as &32; that may appear in the captured text.
                string report = BuildReport(names, oldPrices, newPrices);

                // Path.Combine puts the file inside the output folder even when the
                // configured path has no trailing directory separator.
                string reportPath = Path.Combine(
                    CreateFolderIfNot(Settings.Default.OutPath),
                    pageIndex + ".txt");

                // WriteAllText creates the file or replaces an existing one.
                File.WriteAllText(reportPath, report, Encoding.GetEncoding(PageEncodingName));
            }
        }

        /// <summary>Builds the proxy through which pages are downloaded.</summary>
        private static WebProxy CreateProxy()
        {
            // NOTE(review): the address and credentials are hard-coded placeholders
            // carried over from the original code; they should come from configuration.
            WebProxy webProxy = new WebProxy("proxy.cn1.global.***.com:8080");
            webProxy.Credentials = new System.Net.NetworkCredential("user", "password");
            return webProxy;
        }

        /// <summary>Downloads one listing page, decodes it as GB2312 and removes CR/LF pairs.</summary>
        private string DownloadPage(int pageIndex)
        {
            string url = string.Format(PageUrlTemplate, pageIndex.ToString());
            byte[] raw = wc.DownloadData(url);
            return Encoding.GetEncoding(PageEncodingName).GetString(raw).Replace("\r\n", "");
        }

        /// <summary>Returns the value of the named group for every match of the pattern in the input.</summary>
        private static List<string> ExtractGroup(Regex pattern, string input, string groupName)
        {
            List<string> values = new List<string>();
            foreach (Match match in pattern.Matches(input))
            {
                values.Add(match.Groups[groupName].Value);
            }
            return values;
        }

        /// <summary>Formats one CRLF-terminated report line per extracted phone name.</summary>
        private static string BuildReport(List<string> names, List<string> oldPrices, List<string> newPrices)
        {
            StringBuilder report = new StringBuilder();
            for (int i = 0; i < names.Count; i++)
            {
                // The three lists come from independent regex scans and can differ in
                // length (for example when an item shows no old price); a missing
                // entry becomes an empty string instead of throwing.
                report.Append(String.Format(
                    "手机名称:{0},原价:{1},现价:{2}",
                    names[i],
                    ValueAtOrEmpty(oldPrices, i),
                    ValueAtOrEmpty(newPrices, i)));
                report.Append("\r\n");
            }
            return report.ToString();
        }

        /// <summary>Returns the element at the index, or an empty string when the index is past the end.</summary>
        private static string ValueAtOrEmpty(List<string> values, int index)
        {
            return index < values.Count ? values[index] : String.Empty;
        }

        #endregion
    }
}