这两天在做数据采集，因此整理了一些数据采集时会用到的方法。因为我采集的数据比较简单，所以没有用到框架。比较有名的两个框架是 HtmlAgilityPack 和 Jumony，感兴趣的可以研究一下。当然，火车头采集工具也很方便，不过需要付费。下面是整理的代码：
/// <summary>
/// Regular-expression helpers for scraping: stripping tags out of HTML text and
/// extracting the fragment(s) of a document that match a pattern.
/// </summary>
public class HtmlRegex
{
    /// <summary>
    /// Pattern that matches any single HTML tag (opening or closing).
    /// </summary>
    private const string HTMLALLTAG = @"<[^>]+>|</[^>]+>";

    /// <summary>
    /// Removes every HTML tag from the given text, keeping only the text between tags.
    /// </summary>
    /// <param name="content">Original HTML source.</param>
    /// <returns>
    /// The text with all tags removed; an empty string when
    /// <paramref name="content"/> is null or empty.
    /// </returns>
    public static string RemoveAllHtml(string content)
    {
        // A failed download typically yields null; treat it like empty input
        // instead of letting Regex.Replace throw.
        if (string.IsNullOrEmpty(content))
        {
            return string.Empty;
        }

        return Regex.Replace(content, HTMLALLTAG, "");
    }

    /// <summary>
    /// Returns the first fragment of the text that matches a regular expression.
    /// </summary>
    /// <param name="regStr">Regular expression pattern.</param>
    /// <param name="content">Original HTML source.</param>
    /// <param name="hashtml">
    /// True to return the match as-is; false to strip HTML tags from it first.
    /// </param>
    /// <returns>
    /// The first match, or an empty string when nothing matches or
    /// <paramref name="content"/> is null.
    /// </returns>
    /// <exception cref="System.ArgumentException">
    /// <paramref name="regStr"/> is null or is not a valid pattern.
    /// </exception>
    public static string GetStrByRegex(string regStr, string content, bool hashtml = true)
    {
        // Build the regex before the null check so that a bad pattern is still
        // reported to the caller regardless of the content.
        Regex reg = new Regex(regStr);
        if (content == null)
        {
            return string.Empty;
        }

        Match mth = reg.Match(content);
        if (!mth.Success)
        {
            return string.Empty;
        }

        if (hashtml)
        {
            return mth.Value;
        }

        return RemoveAllHtml(mth.Value);
    }

    /// <summary>
    /// Returns the first fragment of the text that begins with <paramref name="start"/>
    /// and ends with the nearest following <paramref name="end"/> (both delimiters
    /// are included in the result).
    /// </summary>
    /// <param name="start">
    /// Start delimiter. It is inserted into the pattern verbatim, so regex
    /// metacharacters in it are interpreted as regex syntax.
    /// </param>
    /// <param name="end">End delimiter; inserted verbatim like <paramref name="start"/>.</param>
    /// <param name="content">Original HTML source.</param>
    /// <param name="hasHtml">
    /// True to return the match as-is; false to strip HTML tags from it first.
    /// </param>
    /// <returns>
    /// The first match, or an empty string when nothing matches or
    /// <paramref name="content"/> is null.
    /// </returns>
    /// <exception cref="System.ArgumentException">
    /// The delimiters do not form a valid pattern.
    /// </exception>
    public static string GetStrByRegex(string start, string end, string content, bool hasHtml = true)
    {
        return GetStrByRegex(BuildRangePattern(start, end), content, hasHtml);
    }

    /// <summary>
    /// Returns every fragment of the text that matches a regular expression.
    /// </summary>
    /// <param name="regStr">Regular expression pattern.</param>
    /// <param name="content">Original HTML source.</param>
    /// <returns>
    /// The matched fragments in document order, or null when there is no match,
    /// when <paramref name="content"/> is null, or when the pattern is invalid.
    /// </returns>
    public static List<string> GetStrListByRegex(string regStr, string content)
    {
        if (content == null)
        {
            return null;
        }

        try
        {
            MatchCollection matches = new Regex(regStr).Matches(content);
            if (matches.Count == 0)
            {
                return null;
            }

            List<string> strList = new List<string>(matches.Count);
            foreach (Match match in matches)
            {
                strList.Add(match.Value);
            }

            return strList;
        }
        catch (ArgumentException)
        {
            // Null or malformed pattern: this method is best-effort and reports
            // the failure as "no result" rather than throwing.
            return null;
        }
        catch (RegexMatchTimeoutException)
        {
            // Only possible when a process-wide regex timeout is configured.
            return null;
        }
    }

    /// <summary>
    /// Returns every fragment of the text that begins with <paramref name="start"/>
    /// and ends with the nearest following <paramref name="end"/> (both delimiters
    /// are included in each result).
    /// </summary>
    /// <param name="start">
    /// Start delimiter. It is inserted into the pattern verbatim, so regex
    /// metacharacters in it are interpreted as regex syntax.
    /// </param>
    /// <param name="end">End delimiter; inserted verbatim like <paramref name="start"/>.</param>
    /// <param name="content">Original HTML source.</param>
    /// <returns>
    /// The matched fragments in document order, or null when there is no match,
    /// when <paramref name="content"/> is null, or when the delimiters do not
    /// form a valid pattern.
    /// </returns>
    public static List<string> GetStrListByRegex(string start, string end, string content)
    {
        return GetStrListByRegex(BuildRangePattern(start, end), content);
    }

    /// <summary>
    /// Builds the "start ... end" pattern shared by the delimiter-based overloads.
    /// The inline options make it case-insensitive (i) and let '.' match
    /// newlines (s); the lazy quantifier stops at the first end delimiter.
    /// </summary>
    private static string BuildRangePattern(string start, string end)
    {
        return @"(?is)(" + start + ").*?(" + end + ")";
    }
}