项目总览
1,log4net
<?xml version="1.0" encoding="utf-8"?>
<log4net>
  <!-- Rolling file appender: the only output target referenced by <root> below. -->
  <appender name="rollingAppender" type="log4net.Appender.RollingFileAppender">
    <!-- NOTE(review): "loglog.txt" may originally have been "log\log.txt" (backslash lost when the listing was pasted); confirm the intended location. -->
    <file value="loglog.txt" />

    <!-- Append to the existing log file instead of overwriting it. -->
    <appendToFile value="true" />

    <!-- MinimalLock acquires the file lock only while a message is being written, so the file is not held open between writes. -->
    <lockingModel type="log4net.Appender.FileAppender+MinimalLock" />

    <!-- Allowed values: Once | Size | Date | Composite. Composite rolls on both size and date. -->
    <rollingStyle value="Composite" />

    <!-- Suffix appended to the file name when a file is rolled by date. -->
    <datePattern value="yyyyMMdd.TXT" />

    <!-- Maximum number of size-rolled backups to keep (newest are kept). -->
    <!-- With rollingStyle=Size this caps the total number of backups; with Composite the cap applies per day. -->
    <maxSizeRollBackups value="20" />

    <!-- Size at which the current file is rolled. Supported units: KB | MB | GB. -->
    <maximumFileSize value="3MB" />

    <!-- true: the active log file always keeps the name given in <file>. -->
    <staticLogFileName value="true" />

    <!-- Only events from INFO up to and including FATAL are written. -->
    <!-- DEBUG events are dropped here even though the root level is ALL. -->
    <filter type="log4net.Filter.LevelRangeFilter">
      <param name="LevelMin" value="INFO" />
      <param name="LevelMax" value="FATAL" />
    </filter>

    <layout type="log4net.Layout.PatternLayout">
      <conversionPattern value="%date [%thread] %-5level %logger - %message%newline"/>
    </layout>
  </appender>

  <!-- levels: OFF > FATAL > ERROR > WARN > INFO > DEBUG > ALL -->
  <root>
    <!-- The legacy <priority> element was a duplicate of <level> and has been dropped. -->
    <level value="ALL"/>
    <appender-ref ref="rollingAppender" />
  </root>
</log4net>
/// <summary>
/// Thin wrapper around a log4net <see cref="ILog"/> that also echoes every message
/// to the console before handing it to log4net.
/// </summary>
public class Logger
{
    /// <summary>
    /// Location of the log4net configuration file, relative to the application base directory.
    /// Written as a verbatim string: a plain "CfgFiles\log4net.cfg.xml" contains the
    /// unrecognized escape sequence "\l" and does not compile.
    /// </summary>
    private const string ConfigRelativePath = @"CfgFiles\log4net.cfg.xml";

    /// <summary>The log4net logger this instance forwards to.</summary>
    private readonly ILog _log;

    /// <summary>
    /// Configures log4net once per AppDomain from the XML file under the base directory
    /// and writes a start-up message.
    /// </summary>
    static Logger()
    {
        string configPath = Path.Combine(AppDomain.CurrentDomain.BaseDirectory, ConfigRelativePath);
        XmlConfigurator.Configure(new FileInfo(configPath));
        ILog startupLog = LogManager.GetLogger(typeof(Logger));
        startupLog.Info("系统初始化Logger模块");
    }

    /// <summary>
    /// Creates a logger named after <paramref name="type"/>.
    /// </summary>
    /// <param name="type">Type used by log4net to name the logger.</param>
    public Logger(Type type)
    {
        _log = LogManager.GetLogger(type);
    }

    /// <summary>
    /// Writes <paramref name="msg"/> to the console and logs it at ERROR level together with
    /// <paramref name="ex"/>.
    /// </summary>
    /// <param name="msg">Message text; only the message (not the exception) is echoed to the console.</param>
    /// <param name="ex">Exception attached to the log entry; may be null.</param>
    public void Error(string msg = "出现异常", Exception ex = null)
    {
        Console.WriteLine(msg);
        _log.Error(msg, ex);
    }

    /// <summary>
    /// Writes <paramref name="msg"/> to the console and logs it at WARN level.
    /// </summary>
    /// <param name="msg">Message text.</param>
    public void Warn(string msg)
    {
        Console.WriteLine(msg);
        _log.Warn(msg);
    }

    /// <summary>
    /// Writes <paramref name="msg"/> to the console and logs it at INFO level.
    /// </summary>
    /// <param name="msg">Message text.</param>
    public void Info(string msg)
    {
        Console.WriteLine(msg);
        _log.Info(msg);
    }

    /// <summary>
    /// Writes <paramref name="msg"/> to the console and logs it at DEBUG level.
    /// </summary>
    /// <param name="msg">Message text.</param>
    public void Debug(string msg)
    {
        Console.WriteLine(msg);
        _log.Debug(msg);
    }
}
2,系统配置项
/// <summary>
/// System configuration values, read once from the appSettings section of the
/// application configuration file when the type is first used.
/// </summary>
public class Constant
{
    /// <summary>
    /// Directory where data files are saved (appSettings key "DataPath").
    /// Null when the key is missing. Read-only so callers cannot overwrite the configured value.
    /// </summary>
    public static readonly string DataPath = ConfigurationManager.AppSettings["DataPath"];

    /// <summary>
    /// Entry URL of the JD category page (appSettings key "JDCategoryUrl").
    /// Null when the key is missing. Read-only so callers cannot overwrite the configured value.
    /// </summary>
    public static readonly string JDCategoryUrl = ConfigurationManager.AppSettings["JDCategoryUrl"];
}
app.config
<?xml version="1.0" encoding="utf-8" ?>
<configuration>
  <startup>
    <supportedRuntime version="v4.0" sku=".NETFramework,Version=v4.5" />
  </startup>
  <appSettings>
    <!-- Directory where data files are saved (read by Constant.DataPath). -->
    <!-- NOTE(review): the path separators were lost in the pasted listing; this value is a reconstruction and is machine-specific, so confirm or replace it with the local output directory. -->
    <add key="DataPath" value="D:\ruanmou\online9\20170711Advanced9Course16Crawler\Ruanmou.Crawler\Ruanmou.Crawler\bin\Debug\Data"/>
    <!-- Entry page listing all JD categories (read by Constant.JDCategoryUrl). -->
    <add key="JDCategoryUrl" value="http://www.jd.com/allSort.aspx"/>
  </appSettings>
  <connectionStrings>
    <!-- NOTE(review): plaintext "sa" credentials are acceptable only for a local demo database; use integrated security or a protected configuration section anywhere else. -->
    <add name="mvc5" connectionString="Data Source=ElevenPC; Database=advanced9; User ID=sa; Password=Passw0rd; MultipleActiveResultSets=True" providerName="System.Data.SqlClient" />
  </connectionStrings>
</configuration>
3,Program
/// <summary>
/// Console entry point of the crawler demo. Topics covered:
/// 1. crawlers and anti-crawler measures;
/// 2. downloading html;
/// 3. parsing html with XPath, extracting data and following links;
/// 4. inconsistent attributes and fetching ajax data;
/// 5. multi-threaded crawling.
/// </summary>
class Program
{
    private static Logger logger = new Logger(typeof(Program));

    /// <summary>
    /// Runs the smoke tests (download one list page, crawl one sample category) and then the
    /// interactive crawl. Any exception is logged and reported on the console; the process
    /// waits for Enter before exiting.
    /// </summary>
    /// <param name="args">Command-line arguments (unused).</param>
    static void Main(string[] args)
    {
        try
        {
            Console.WriteLine("欢迎来到.net高级班vip课程,今天是Eleven老师为大家带来的爬虫的学习");

            #region Test DownloadHtml
            string html = HttpHelper.DownloadHtml(@"https://list.jd.com/list.html?cat=9987,653,655", Encoding.UTF8);
            #endregion

            #region Test fetching the category page
            //string html1 = HttpHelper.DownloadHtml(Constant.JDCategoryUrl, Encoding.UTF8);
            #endregion

            #region Test crawling one commodity list
            // The quotes inside the JSON literal must be escaped, otherwise the string ends at the second quote.
            // The level key is "CategoryLevel" so that it binds to Category.CategoryLevel.
            string testCategory = "{\"Id\":73,\"Code\":\"02f01s01T\",\"ParentCode\":\"02f01s\",\"Name\":\"烟机/灶具\",\"Url\":\"http://list.jd.com/list.html?cat=737,13297,1300\",\"CategoryLevel\":3}";
            Category category = JsonConvert.DeserializeObject<Category>(testCategory);
            ISearch search = new CommoditySearch(category);
            search.Crawler();
            #endregion

            #region Crawl
            CrawlerCenter.Handler();
            #endregion
        }
        catch (Exception ex)
        {
            logger.Error("异常啦,", ex);
            Console.WriteLine("*****************木有成功**********************");
        }
        Console.ReadLine();
    }
}
4,HttpHelper
/// <summary>
/// Minimal HTTP download helper built on <see cref="HttpWebRequest"/>.
/// See http://tool.sufeinet.com/HttpHelper.aspx for a fuller implementation.
/// </summary>
public class HttpHelper
{
    private static Logger logger = new Logger(typeof(HttpHelper));

    /// <summary>Request timeout: 30 seconds.</summary>
    private const int RequestTimeoutMilliseconds = 30 * 1000;

    /// <summary>Browser-like User-Agent sent with every request.</summary>
    private const string BrowserUserAgent = "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.106 Safari/537.36";

    /// <summary>
    /// Downloads the content at <paramref name="url"/> decoded as UTF-8 (GB2312 was used previously).
    /// </summary>
    /// <param name="url">Absolute URL to fetch.</param>
    /// <returns>Same as <see cref="DownloadHtml"/>.</returns>
    public static string DownloadUrl(string url)
    {
        return DownloadHtml(url, Encoding.UTF8);
    }

    /// <summary>
    /// Downloads the response body of a GET request to <paramref name="url"/>.
    /// HttpWebRequest offers more control than WebClient, which is why it is used here.
    /// </summary>
    /// <param name="url">Absolute URL to fetch.</param>
    /// <param name="encode">Encoding used to decode the body; UTF-8 is used when null.</param>
    /// <returns>
    /// The decoded body on HTTP 200; an empty string when the server answers with another
    /// non-error status (a warning is logged); null when the request or the read fails
    /// (the error is logged). This method never throws.
    /// </returns>
    public static string DownloadHtml(string url, Encoding encode)
    {
        string html = string.Empty;
        try
        {
            // A direct cast (instead of "as") makes a non-HTTP url fail with a meaningful
            // InvalidCastException rather than a NullReferenceException on the next line.
            HttpWebRequest request = (HttpWebRequest)WebRequest.Create(url);
            request.Timeout = RequestTimeoutMilliseconds;
            request.UserAgent = BrowserUserAgent;
            request.ContentType = "text/html; charset=utf-8";
            // Host, Cookie, Accept, Accept-Encoding or Referer headers can be set on the
            // request here when a site requires them.

            using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
            {
                if (response.StatusCode != HttpStatusCode.OK)
                {
                    logger.Warn(string.Format("抓取{0}地址返回失败,response.StatusCode为{1}", url, response.StatusCode));
                }
                else
                {
                    try
                    {
                        // The using block disposes the reader (and the stream) even when ReadToEnd throws.
                        using (StreamReader reader = new StreamReader(response.GetResponseStream(), encode ?? Encoding.UTF8))
                        {
                            html = reader.ReadToEnd();
                        }
                    }
                    catch (Exception ex)
                    {
                        // Plain interpolation: wrapping it in string.Format would throw on urls containing braces.
                        logger.Error($"DownloadHtml抓取{url}失败", ex);
                        html = null;
                    }
                }
            }
        }
        catch (WebException ex)
        {
            // Every WebException is reported. Matching on the localized message text would
            // silently drop timeouts, DNS failures and 4xx/5xx answers.
            using (WebResponse failedResponse = ex.Response)
            {
                logger.Error(string.Format("DownloadHtml抓取{0}出现WebException,Status={1}", url, ex.Status), ex);
            }
            html = null;
        }
        catch (Exception ex)
        {
            logger.Error(string.Format("DownloadHtml抓取{0}出现异常", url), ex);
            html = null;
        }
        return html;
    }
}
5,爬虫
/// <summary>
/// Interactive driver of the crawl: optionally re-initializes and crawls the category table,
/// then optionally re-initializes the commodity tables, crawls every level-3 category in
/// parallel and removes duplicate rows.
/// </summary>
public class CrawlerCenter
{
    private static Logger logger = new Logger(typeof(CrawlerCenter));

    /// <summary>Once more than this many tasks are pending, the producer waits for one to finish.</summary>
    private const int MaxPendingTasks = 15;

    /// <summary>Number of sharded tables JD_Commodity_001 .. JD_Commodity_030.</summary>
    private const int CommodityTableCount = 30;

    /// <summary>
    /// Asks on the console whether to rebuild the category data and the commodity data,
    /// and runs the corresponding crawl for each "Y" answer.
    /// </summary>
    public static void Handler()
    {
        Console.WriteLine("请输入Y/N进行类别表初始化确认! Y 删除Category表然后重新创建,然后抓取类型数据,N(或者其他)跳过");
        if (IsYes(Console.ReadLine()))
        {
            DBInit.InitCategoryTable();
            CrawlerCategory();
        }
        else
        {
            Console.WriteLine("你选择不初始化类别数据");
        }
        Console.WriteLine("*****************^_^**********************");

        Console.WriteLine("请输入Y/N进行商品数据初始化确认! Y 删除全部商品表表然后重新创建,然后抓取商品数据,N(或者其他)跳过");
        if (IsYes(Console.ReadLine()))
        {
            DBInit.InitCommodityTable();
            CrawlerCommodity();
        }
        Console.WriteLine("*****************^_^**********************");
    }

    /// <summary>
    /// Returns true when the console answer is "Y" (case-insensitive).
    /// Console.ReadLine returns null when the input stream is closed or redirected to an
    /// empty source, so null is treated as "no" instead of being dereferenced.
    /// </summary>
    private static bool IsYes(string input)
    {
        return string.Equals(input, "Y", StringComparison.OrdinalIgnoreCase);
    }

    /// <summary>
    /// Crawls the category tree.
    /// </summary>
    private static void CrawlerCategory()
    {
        Console.WriteLine($"{ DateTime.Now} jd商品类别开始抓取 - -");
        ISearch search = new CategorySearch();
        search.Crawler();
    }

    /// <summary>
    /// Crawls the commodities of every level-3 category on worker tasks, then removes
    /// duplicate rows.
    /// </summary>
    private static void CrawlerCommodity()
    {
        Console.WriteLine($"{ DateTime.Now} jd商品开始抓取 - -");
        CategoryRepository categoryRepository = new CategoryRepository();
        List<Category> categoryList = categoryRepository.QueryListByLevel(3) ?? new List<Category>();

        List<Task> taskList = new List<Task>();
        TaskFactory taskFactory = new TaskFactory();
        foreach (Category category in categoryList)
        {
            ISearch searcher = new CommoditySearch(category);
            taskList.Add(taskFactory.StartNew(searcher.Crawler));
            if (taskList.Count > MaxPendingTasks)
            {
                // IsCompleted is also true for canceled and faulted tasks, so one test is enough.
                taskList.RemoveAll(t => t.IsCompleted);
                if (taskList.Count > 0)
                {
                    Task.WaitAny(taskList.ToArray());
                }
            }
        }
        Task.WaitAll(taskList.ToArray());
        Console.WriteLine($"{ DateTime.Now} jd商品抓取全部完成 - -");
        CleanAll();
    }

    /// <summary>
    /// Removes duplicate commodity rows from every sharded commodity table, keeping the row
    /// with the highest ID for each (productid, CategoryId) pair. Failures are logged, not thrown.
    /// </summary>
    private static void CleanAll()
    {
        try
        {
            Console.WriteLine($"{ DateTime.Now} 开始清理重复数据 - -");
            StringBuilder sb = new StringBuilder();
            for (int i = 1; i <= CommodityTableCount; i++)
            {
                // A row is a duplicate only when a newer row exists for the SAME product in the
                // SAME category. Filtering on productid alone would also delete the single row
                // of a product that happens to be duplicated in a different category.
                // Example for table 001:
                //   DELETE c FROM [dbo].[JD_Commodity_001] c WHERE EXISTS (SELECT 1 FROM [dbo].[JD_Commodity_001] d
                //   WHERE d.productid = c.productid AND d.CategoryId = c.CategoryId AND d.ID > c.ID);
                sb.AppendFormat(@"DELETE c FROM [dbo].[JD_Commodity_{0}] c WHERE EXISTS (SELECT 1 FROM [dbo].[JD_Commodity_{0}] d
                                WHERE d.productid = c.productid AND d.CategoryId = c.CategoryId AND d.ID > c.ID);", i.ToString("000"));
            }
            Console.WriteLine("执行清理sql:{0}", sb.ToString());
            SqlHelper.ExecuteNonQuery(sb.ToString());
            Console.WriteLine("{0} 完成清理重复数据 - -", DateTime.Now);
        }
        catch (Exception ex)
        {
            logger.Error("CleanAll出现异常", ex);
        }
        finally
        {
            Console.WriteLine("{0} 结束清理重复数据 - -", DateTime.Now);
        }
    }
}
/// <summary>
/// A crawl job that can be started with a single call.
/// </summary>
public interface ISearch
{
    /// <summary>Runs the crawl.</summary>
    void Crawler();
}


/// <summary>
/// Crawls the commodity list pages of one category.
/// XPath syntax reference: http://www.w3school.com.cn/xpath/index.asp
/// </summary>
public class CommoditySearch : ISearch
{
    private Logger logger = new Logger(typeof(CommoditySearch));
    private WarnRepository warnRepository = new WarnRepository();
    private CommodityRepository commodityRepository = new CommodityRepository();
    private Category category = null;

    /// <summary>
    /// Creates a crawl job for <paramref name="_category"/>.
    /// </summary>
    /// <param name="_category">Category whose list pages are crawled.</param>
    public CommoditySearch(Category _category)
    {
        category = _category;
    }

    /// <summary>
    /// Downloads the first list page of the category, reads the page count from the top
    /// pager and fetches the commodities of every page. A failure on one page does not stop
    /// the remaining pages; every failure is logged and/or recorded as a warning. Never throws.
    /// </summary>
    public void Crawler()
    {
        if (category == null)
        {
            // Without a category the warning records below cannot be written either.
            logger.Error("CrawlerMuti出现异常", new ArgumentNullException("category"));
            return;
        }
        try
        {
            if (string.IsNullOrEmpty(category.Url))
            {
                warnRepository.SaveWarn(category, string.Format("Url为空,Name={0} Level={1} Url={2}", category.Name, category.CategoryLevel, category.Url));
                return;
            }

            string html = HttpHelper.DownloadUrl(category.Url);
            if (string.IsNullOrEmpty(html))
            {
                // DownloadUrl returns null or empty on failure; HtmlDocument.LoadHtml(null) would throw.
                warnRepository.SaveWarn(category, string.Format("下载列表页失败,Name={0} Level={1} Url={2}", category.Name, category.CategoryLevel, category.Url));
                return;
            }

            HtmlDocument doc = new HtmlDocument();
            doc.LoadHtml(html);
            string pageNumberPath = @"//*[@id='J_topPage']/span/i";
            HtmlNode pageNumberNode = doc.DocumentNode.SelectSingleNode(pageNumberPath);
            if (pageNumberNode == null)
            {
                logger.Warn(string.Format("分页数据为空,Name={0} Level={1} Url={2}", category.Name, category.CategoryLevel, category.Url));
                return;
            }

            // Parse the page count once instead of on every loop iteration.
            int pageCount;
            if (!int.TryParse(pageNumberNode.InnerText, out pageCount))
            {
                warnRepository.SaveWarn(category, string.Format("分页数据无法解析,Name={0} Level={1} Url={2}", category.Name, category.CategoryLevel, category.Url));
                return;
            }

            for (int page = 1; page <= pageCount; page++)
            {
                string pageUrl = string.Format("{0}&page={1}", category.Url, page);
                // If the category url already carries "&page=1&", point that parameter at the current page too.
                pageUrl = pageUrl.Replace("&page=1&", string.Format("&page={0}&", page));
                try
                {
                    List<Commodity> commodityList = GetCommodityList(category, pageUrl);
                    // NOTE(review): persistence is switched off here, so crawled commodities are
                    // fetched but never stored - re-enable the next line if that is not intended.
                    //commodityRepository.SaveList(commodityList);
                }
                catch (Exception ex) // an error on one page must not affect the other pages
                {
                    logger.Error("Crawler的commodityRepository.SaveList(commodityList)出现异常", ex);
                }
            }
        }
        catch (Exception ex)
        {
            logger.Error("CrawlerMuti出现异常", ex);
            warnRepository.SaveWarn(category, string.Format("出现异常,Name={0} Level={1} Url={2}", category.Name, category.CategoryLevel, category.Url));
        }
    }

    /// <summary>
    /// Prefixes "http:" to a url that has no http/https scheme (the pages use
    /// protocol-relative links). Urls that already start with http: or https: are returned unchanged.
    /// </summary>
    private static string EnsureHttpScheme(string url)
    {
        if (url.StartsWith("http:", StringComparison.OrdinalIgnoreCase)
            || url.StartsWith("https:", StringComparison.OrdinalIgnoreCase))
        {
            return url;
        }
        return "http:" + url;
    }

    /// <summary>
    /// Downloads one list page and extracts its commodities (url, product id, title, image),
    /// then fills in the prices. Items lacking any required node or attribute are skipped.
    /// </summary>
    /// <param name="category">Category the page belongs to.</param>
    /// <param name="url">Url of the list page.</param>
    /// <returns>The commodities found on the page; empty when the page could not be read.</returns>
    private List<Commodity> GetCommodityList(Category category, string url)
    {
        string html = HttpHelper.DownloadUrl(url);
        List<Commodity> commodityList = new List<Commodity>();
        try
        {
            if (string.IsNullOrEmpty(html)) return commodityList;
            HtmlDocument doc = new HtmlDocument();
            doc.LoadHtml(html);
            string liPath = "//*[@id='plist']/ul/li";
            HtmlNodeCollection itemNodes = doc.DocumentNode.SelectNodes(liPath);
            if (itemNodes == null || itemNodes.Count == 0)
            {
                warnRepository.SaveWarn(category, string.Format("GetCommodityList商品数据为空,Name={0} Level={1} category.Url={2} url={3}", category.Name, category.CategoryLevel, category.Url, url));
                return commodityList;
            }
            foreach (var node in itemNodes)
            {
                // Each item is re-parsed on its own so the absolute XPaths below only see this item.
                HtmlDocument docChild = new HtmlDocument();
                docChild.LoadHtml(node.OuterHtml);

                Commodity commodity = new Commodity()
                {
                    CategoryId = category.Id
                };

                HtmlNode urlNode = docChild.DocumentNode.SelectSingleNode("//*[@class='p-name']/a");
                if (urlNode == null || urlNode.Attributes["href"] == null)
                {
                    continue;
                }
                commodity.Url = EnsureHttpScheme(urlNode.Attributes["href"].Value);

                // The product id is the file name of the detail page, e.g. ".../1707419.html".
                // An item whose link does not follow that shape is skipped instead of aborting the page.
                string sId = Path.GetFileName(commodity.Url).Replace(".html", "");
                long productId;
                if (!long.TryParse(sId, out productId))
                {
                    continue;
                }
                commodity.ProductId = productId;

                HtmlNode titleNode = docChild.DocumentNode.SelectSingleNode("//*[@class='p-name']/a/em");
                if (titleNode == null)
                {
                    continue;
                }
                commodity.Title = titleNode.InnerText;

                HtmlNode imageNode = docChild.DocumentNode.SelectSingleNode("//*[@class='p-img']/a/img");
                if (imageNode == null)
                {
                    continue;
                }
                // The image url lives in different attributes depending on the item (lazy loading).
                if (imageNode.Attributes.Contains("src"))
                    commodity.ImageUrl = imageNode.Attributes["src"].Value;
                else if (imageNode.Attributes.Contains("original"))
                    commodity.ImageUrl = imageNode.Attributes["original"].Value;
                else if (imageNode.Attributes.Contains("data-lazy-img"))
                    commodity.ImageUrl = imageNode.Attributes["data-lazy-img"].Value;
                else
                {
                    continue;
                }
                commodity.ImageUrl = EnsureHttpScheme(commodity.ImageUrl);

                // Only the presence of the price node is checked; the price itself is fetched separately.
                HtmlNode priceNode = docChild.DocumentNode.SelectSingleNode("//*[@class='p-price']/strong/i");
                if (priceNode == null)
                {
                    continue;
                }
                commodityList.Add(commodity);
            }
            Console.WriteLine("{0}一共获取了{1}条数据", url, commodityList.Count);
        }
        catch (Exception ex)
        {
            logger.Error(string.Format("GetCommodityList出现异常,url={0}", url), ex);
        }
        return GetCommodityPrice(category, commodityList);
    }

    /// <summary>
    /// Fetches the prices of all listed commodities with one JSONP request and stores them in
    /// <see cref="Commodity.Price"/>. Commodities without a returned price keep their current
    /// price. Failures are logged, never thrown.
    /// </summary>
    /// <param name="category">Category the commodities belong to (not used by the lookup).</param>
    /// <param name="commodityList">Commodities to price; may be null or empty.</param>
    /// <returns>The same list instance that was passed in.</returns>
    private List<Commodity> GetCommodityPrice(Category category, List<Commodity> commodityList)
    {
        try
        {
            if (commodityList == null || commodityList.Count == 0)
                return commodityList;

            string skuIds = string.Join("%2C", commodityList.Select(c => string.Format("J_{0}", c.ProductId)));
            string priceUrl = string.Format("http://p.3.cn/prices/mgets?callback=jQuery1069298&type=1&area=1_72_4137_0&skuIds={0}&pdbp=0&pdtk=&pdpin=&pduid=1945966343&_=1469022843655", skuIds);
            string html = HttpHelper.DownloadUrl(priceUrl);
            if (string.IsNullOrWhiteSpace(html))
            {
                logger.Warn(string.Format("获取url={0}时获取的html为空", priceUrl));
                return commodityList; // nothing to parse
            }

            // Strip the JSONP wrapper "callback( ... );" when present; otherwise parse the body as is.
            int open = html.IndexOf('(');
            int close = html.LastIndexOf(')');
            string json = (open >= 0 && close > open) ? html.Substring(open + 1, close - open - 1) : html;

            List<CommodityPrice> priceList = JsonConvert.DeserializeObject<List<CommodityPrice>>(json);
            if (priceList == null)
            {
                logger.Warn(string.Format("获取url={0}时获取的html为空", priceUrl));
                return commodityList;
            }

            // Index the prices by id (first occurrence wins) so each commodity is one lookup
            // and a missing id no longer dereferences null.
            Dictionary<string, decimal> priceById = new Dictionary<string, decimal>();
            foreach (CommodityPrice price in priceList)
            {
                if (price != null && price.id != null && !priceById.ContainsKey(price.id))
                {
                    priceById.Add(price.id, price.p);
                }
            }
            foreach (Commodity commodity in commodityList)
            {
                decimal value;
                if (priceById.TryGetValue(string.Format("J_{0}", commodity.ProductId), out value))
                {
                    commodity.Price = value;
                }
            }
        }
        catch (Exception ex)
        {
            logger.Error("GetCommodityPrice出现异常", ex);
        }
        return commodityList;
    }
}

/// <summary>
/// Crawls the three-level category tree from the category entry page and saves it.
/// XPath syntax reference: http://www.w3school.com.cn/xpath/index.asp
/// </summary>
public class CategorySearch : ISearch
{
    private static Logger logger = new Logger(typeof(CategorySearch));

    /// <summary>Next category Id to assign; starts at 1 for every new instance.</summary>
    private int _Count = 1;

    /// <summary>
    /// Downloads the category page, builds the level 1/2/3 categories and saves them through
    /// <c>CategoryRepository.Save</c>. Failures are logged; the number of categories collected
    /// is always printed at the end.
    /// </summary>
    public void Crawler()
    {
        List<Category> categoryList = new List<Category>();
        try
        {
            string html = HttpHelper.DownloadUrl(Constant.JDCategoryUrl);
            if (string.IsNullOrEmpty(html))
            {
                logger.Warn(string.Format("获取url={0}时获取的html为空", Constant.JDCategoryUrl));
                return;
            }

            HtmlDocument doc = new HtmlDocument();
            doc.LoadHtml(html);
            string fristPath = "//*[@class='category-item m']";
            HtmlNodeCollection nodeList = doc.DocumentNode.SelectNodes(fristPath);
            if (nodeList == null)
            {
                // SelectNodes returns null, not an empty collection, when nothing matches.
                logger.Warn(string.Format("类别数据为空,Url={0}", Constant.JDCategoryUrl));
                return;
            }

            int k = 1;
            foreach (HtmlNode node in nodeList)
            {
                categoryList.AddRange(this.First(node.InnerHtml, k++.ToString("00") + "f", "root"));
            }

            CategoryRepository categoryRepository = new CategoryRepository();
            categoryRepository.Save(categoryList);
        }
        catch (Exception ex)
        {
            logger.Error("CrawlerMuti出现异常", ex);
        }
        finally
        {
            Console.WriteLine($"类型数据初始化完成,共抓取类别{ categoryList.Count}个");
        }
    }

    /// <summary>
    /// Prefixes "http:" to a url that has no http/https scheme; other urls are returned unchanged.
    /// </summary>
    private static string EnsureHttpScheme(string url)
    {
        if (url.StartsWith("http:", StringComparison.OrdinalIgnoreCase)
            || url.StartsWith("https:", StringComparison.OrdinalIgnoreCase))
        {
            return url;
        }
        return "http:" + url;
    }

    /// <summary>
    /// Builds the level-1 category (or categories) found in one top-level block, followed by
    /// all of its level-2 and level-3 descendants.
    /// </summary>
    /// <param name="html">Inner html of one top-level category block.</param>
    /// <param name="code">Code assigned to the level-1 category.</param>
    /// <param name="parentCode">Code of the parent ("root" for level 1).</param>
    /// <returns>The level-1 entries followed by their descendants.</returns>
    private List<Category> First(string html, string code, string parentCode)
    {
        List<Category> categoryList = new List<Category>();
        HtmlDocument doc = new HtmlDocument();
        doc.LoadHtml(html);
        string path = "//*[@class='mt']/h2/span";
        HtmlNodeCollection nodeList = doc.DocumentNode.SelectNodes(path);
        if (nodeList != null)
        {
            foreach (HtmlNode node in nodeList)
            {
                Category category = new Category()
                {
                    Id = _Count++,
                    State = 0,
                    CategoryLevel = 1,
                    Code = code,
                    ParentCode = parentCode
                };
                category.Name = node.InnerText;
                category.Url = ""; // level-1 headings carry no link
                categoryList.Add(category);
            }
        }
        categoryList.AddRange(this.Second(html, code));
        return categoryList;
    }

    /// <summary>
    /// Builds every level-2 category under one level-1 block, each followed by its level-3 children.
    /// </summary>
    /// <param name="html">Inner html of the level-1 block.</param>
    /// <param name="parentCode">Code of the level-1 category.</param>
    /// <returns>Level-2 entries, each followed by its level-3 entries.</returns>
    private List<Category> Second(string html, string parentCode)
    {
        List<Category> categoryList = new List<Category>();
        HtmlDocument doc = new HtmlDocument();
        doc.LoadHtml(html);
        string path = "//*[@class='items']/dl";
        HtmlNodeCollection nodeList = doc.DocumentNode.SelectNodes(path);
        if (nodeList == null) return categoryList;

        int k = 1;
        foreach (HtmlNode node in nodeList)
        {
            string secondHtml = node.InnerHtml;
            if (string.IsNullOrWhiteSpace(secondHtml)) continue;
            HtmlDocument secondDoc = new HtmlDocument();
            secondDoc.LoadHtml(secondHtml);

            HtmlNode secondNode = secondDoc.DocumentNode.SelectSingleNode("//dt/a");
            if (secondNode == null) // some sections (e.g. books and media) have a plain dt without a link
            {
                secondNode = secondDoc.DocumentNode.SelectSingleNode("//dt");
            }
            if (secondNode == null) continue; // no title at all: nothing to record

            string code = string.Format("{0}{1}s", parentCode, k.ToString("00"));
            // Advance the counter as soon as the code is used. Incrementing only after the
            // level-3 lookup gave the next sibling the same code whenever a dl had no dd.
            k++;

            Category category = new Category()
            {
                Id = _Count++,
                State = 0,
                CategoryLevel = 2,
                Code = code,
                ParentCode = parentCode
            };
            category.Name = secondNode.InnerText;
            if (secondNode.Attributes["href"] != null)
            {
                category.Url = EnsureHttpScheme(secondNode.Attributes["href"].Value);
            }
            categoryList.Add(category);

            HtmlNode thirdNode = secondDoc.DocumentNode.SelectSingleNode("//dd");
            if (thirdNode == null) continue;
            categoryList.AddRange(this.Third(thirdNode.InnerHtml, code));
        }
        return categoryList;
    }

    /// <summary>
    /// Builds every level-3 category (one per link) under one level-2 category.
    /// Links without an href attribute are skipped.
    /// </summary>
    /// <param name="html">Inner html of the dd element holding the level-3 links.</param>
    /// <param name="parentCode">Code of the level-2 category.</param>
    /// <returns>The level-3 entries; empty when there are no links.</returns>
    private List<Category> Third(string html, string parentCode)
    {
        List<Category> categoryList = new List<Category>();
        HtmlDocument doc = new HtmlDocument();
        doc.LoadHtml(html);
        string path = "//a";
        HtmlNodeCollection nodeList = doc.DocumentNode.SelectNodes(path);
        if (nodeList == null || nodeList.Count == 0) return categoryList;

        int k = 1;
        foreach (HtmlNode node in nodeList)
        {
            // A level-3 category is useless without its list url, and dereferencing a missing
            // href would abort the whole category crawl.
            if (node.Attributes["href"] == null) continue;

            string code = string.Format("{0}{1}t", parentCode, k.ToString("00"));
            Category category = new Category()
            {
                Id = _Count++,
                State = 0,
                CategoryLevel = 3,
                Code = code,
                ParentCode = parentCode
            };
            category.Name = node.InnerText;
            category.Url = EnsureHttpScheme(node.Attributes["href"].Value);
            categoryList.Add(category);
            k++;
        }
        return categoryList;
    }
}
6,Model
/// <summary>
/// Common base of the persisted models: carries the integer identifier.
/// </summary>
public class BaseModel
{
    /// <summary>Identifier of the record.</summary>
    public int Id { get; set; }
}

/// <summary>
/// One node of the category tree.
/// </summary>
public class Category : BaseModel
{
    /// <summary>Hierarchical code of this category.</summary>
    public string Code { get; set; }

    /// <summary>Code of the parent category.</summary>
    public string ParentCode { get; set; }

    /// <summary>Display name.</summary>
    public string Name { get; set; }

    /// <summary>Url of the category's list page.</summary>
    public string Url { get; set; }

    /// <summary>Depth in the tree.</summary>
    public int CategoryLevel { get; set; }

    /// <summary>State flag.</summary>
    public int State { get; set; }
}

/// <summary>
/// One commodity found on a category list page.
/// </summary>
public class Commodity : BaseModel
{
    /// <summary>Product id of the commodity.</summary>
    public long ProductId { get; set; }

    /// <summary>Id of the category the commodity was found in.</summary>
    public int CategoryId { get; set; }

    /// <summary>Title text.</summary>
    public string Title { get; set; }

    /// <summary>Price.</summary>
    public decimal Price { get; set; }

    /// <summary>Url of the detail page.</summary>
    public string Url { get; set; }

    /// <summary>Url of the product image.</summary>
    public string ImageUrl { get; set; }
}


// Sample of the JSONP price response this type is parsed from (shortened):
// jQuery5427073([{"id":"J_1707419","p":"5149.00","m":"5499.00"},{"id":"J_1589214","p":"1999.00","m":"2999.00"}]);
/// <summary>
/// Shape of one entry of the price response; exists only for json parsing,
/// so the property names follow the json keys.
/// </summary>
public class CommodityPrice
{
    /// <summary>Json key "id", e.g. "J_1707419".</summary>
    public string id { get; set; }

    /// <summary>Json key "p".</summary>
    public decimal p { get; set; }

    /// <summary>Json key "m".</summary>
    public decimal m { get; set; }
}
7,DataService
(1)SqlHelper
/// <summary>
/// Small ADO.NET helper for SQL Server: executes raw SQL text, maps result rows onto
/// objects by property name, and builds INSERT statements from an object's public properties.
/// The connection string is read from the <c>mvc5</c> entry of the application configuration.
/// </summary>
public class SqlHelper
{
    private static Logger logger = new Logger(typeof(SqlHelper));
    private static string _ConnStr = ConfigurationManager.ConnectionStrings["mvc5"].ConnectionString;

    /// <summary>
    /// Executes a SQL batch on a fresh connection. No explicit transaction is used.
    /// </summary>
    /// <param name="sql">SQL text to execute.</param>
    public static void ExecuteNonQuery(string sql)
    {
        using (SqlConnection sqlConn = new SqlConnection(_ConnStr))
        using (SqlCommand cmd = new SqlCommand(sql, sqlConn))
        {
            sqlConn.Open();
            cmd.ExecuteNonQuery();
        }
    }

    /// <summary>
    /// Executes a SQL batch inside a transaction: commits on success, rolls back and
    /// rethrows the original exception on failure.
    /// </summary>
    /// <param name="sql">SQL text to execute.</param>
    public static void ExecuteNonQueryWithTrans(string sql)
    {
        using (SqlConnection sqlConn = new SqlConnection(_ConnStr))
        {
            sqlConn.Open();
            using (SqlTransaction trans = sqlConn.BeginTransaction())
            using (SqlCommand cmd = new SqlCommand(sql, sqlConn, trans))
            {
                try
                {
                    cmd.ExecuteNonQuery();
                    trans.Commit();
                }
                catch (Exception)
                {
                    // Roll back while the connection is still open. A failing rollback is
                    // logged rather than thrown so it cannot mask the original error.
                    try
                    {
                        if (trans.Connection != null)
                        {
                            trans.Rollback();
                        }
                    }
                    catch (Exception rollbackEx)
                    {
                        logger.Error(string.Format("ExecuteNonQueryWithTrans rollback failed, sql={0}", sql), rollbackEx);
                    }

                    // "throw;" keeps the original stack trace ("throw ex;" would reset it).
                    throw;
                }
            }
        }
    }

    /// <summary>
    /// Runs a query and maps every result row to a new <typeparamref name="T"/>.
    /// </summary>
    /// <typeparam name="T">Row type; each public property is read from the column of the same name.</typeparam>
    /// <param name="sql">SELECT statement to execute.</param>
    /// <returns>One item per row; empty when the query returns no rows.</returns>
    public static List<T> QueryList<T>(string sql) where T : new()
    {
        using (SqlConnection sqlConn = new SqlConnection(_ConnStr))
        using (SqlCommand cmd = new SqlCommand(sql, sqlConn))
        {
            sqlConn.Open();
            using (SqlDataReader reader = cmd.ExecuteReader())
            {
                return TransList<T>(reader);
            }
        }
    }

    /// <summary>
    /// Inserts one object into <paramref name="tableName"/>.
    /// </summary>
    /// <param name="model">Object whose public properties (except <c>Id</c>) become column values.</param>
    /// <param name="tableName">Target table; written into the SQL as-is, so it must come from trusted code.</param>
    /// <exception cref="ArgumentNullException"><paramref name="model"/> is null.</exception>
    public static void Insert<T>(T model, string tableName) where T : new()
    {
        string sql = GetInsertSql<T>(model, tableName);
        ExecuteNonQuery(sql);
    }

    /// <summary>
    /// Inserts all objects of <paramref name="list"/> into <paramref name="tableName"/> as one batch.
    /// A null or empty list is a no-op.
    /// </summary>
    /// <param name="list">Objects to insert.</param>
    /// <param name="tableName">Target table; written into the SQL as-is, so it must come from trusted code.</param>
    /// <exception cref="ArgumentNullException">An element of <paramref name="list"/> is null.</exception>
    public static void InsertList<T>(List<T> list, string tableName) where T : new()
    {
        // An empty batch would reach the server as an empty command text, which throws.
        if (list == null || list.Count == 0)
        {
            return;
        }

        string sql = string.Join(" ", list.Select(t => GetInsertSql<T>(t, tableName)));
        ExecuteNonQuery(sql);
    }

    #region Private
    /// <summary>
    /// Builds <c>INSERT INTO table ([col],...) VALUES (N'value',...);</c> from the public
    /// properties of <paramref name="model"/>, skipping the property named <c>Id</c> (any casing).
    /// </summary>
    private static string GetInsertSql<T>(T model, string tableName)
    {
        if (model == null)
        {
            throw new ArgumentNullException(nameof(model));
        }

        List<string> fields = new List<string>();
        List<string> values = new List<string>();
        foreach (PropertyInfo p in model.GetType().GetProperties())
        {
            if (p.Name.Equals("id", StringComparison.OrdinalIgnoreCase))
            {
                continue;
            }

            fields.Add("[" + p.Name + "]");
            values.Add(ToSqlLiteral(p.GetValue(model)));
        }

        return string.Format("INSERT INTO {0} ({1}) VALUES ({2});", tableName, string.Join(",", fields), string.Join(",", values));
    }

    /// <summary>
    /// Renders a value as a T-SQL Unicode string literal.
    /// </summary>
    private static string ToSqlLiteral(object value)
    {
        string text;
        if (value == null)
        {
            // Null is still written as an empty string, as before, so stored data does not change shape.
            text = string.Empty;
        }
        else if (value is DateTime)
        {
            // ISO 8601 is parsed by SQL Server independently of its language/dateformat settings.
            text = ((DateTime)value).ToString("yyyy-MM-ddTHH:mm:ss.fff", System.Globalization.CultureInfo.InvariantCulture);
        }
        else
        {
            // Invariant culture keeps '.' as the decimal separator whatever the thread culture is.
            text = Convert.ToString(value, System.Globalization.CultureInfo.InvariantCulture) ?? string.Empty;
        }

        // Doubling the quote escapes it inside a T-SQL literal (previously quotes were stripped,
        // which altered the data). The N prefix keeps non-ASCII text intact regardless of the
        // database code page.
        return "N'" + text.Replace("'", "''") + "'";
    }

    /// <summary>
    /// Maps every remaining row of <paramref name="reader"/> to a new <typeparamref name="T"/>.
    /// </summary>
    private static List<T> TransList<T>(SqlDataReader reader) where T : new()
    {
        List<T> tList = new List<T>();
        PropertyInfo[] properties = typeof(T).GetProperties();
        while (reader.Read())
        {
            tList.Add(ReadRow<T>(reader, properties));
        }
        return tList;
    }

    /// <summary>
    /// Maps the rows of <paramref name="reader"/> to a single <typeparamref name="T"/>.
    /// When there are several rows the last one wins; with no rows a default-constructed
    /// instance is returned.
    /// </summary>
    private static T TransModel<T>(SqlDataReader reader) where T : new()
    {
        PropertyInfo[] properties = typeof(T).GetProperties();
        T t = new T();
        while (reader.Read())
        {
            t = ReadRow<T>(reader, properties);
        }
        return t;
    }

    /// <summary>
    /// Creates a <typeparamref name="T"/> from the reader's current row, reading each writable
    /// property from the column of the same name. A missing column still throws from the reader.
    /// </summary>
    private static T ReadRow<T>(SqlDataReader reader, PropertyInfo[] properties) where T : new()
    {
        // Boxed once so that reflection writes are not lost when T is a value type.
        object row = new T();
        foreach (PropertyInfo p in properties)
        {
            if (!p.CanWrite)
            {
                continue;
            }

            object raw = reader[p.Name];
            if (raw == null || raw is DBNull)
            {
                // Database NULL: keep the property's default (Convert.ChangeType cannot convert DBNull).
                continue;
            }

            // Convert.ChangeType cannot target Nullable<X>; convert to X instead.
            Type targetType = Nullable.GetUnderlyingType(p.PropertyType) ?? p.PropertyType;
            p.SetValue(row, Convert.ChangeType(raw, targetType));
        }
        return (T)row;
    }
    #endregion Private
}
namespace Ruanmou.Crawler.DataService
{
    /// <summary>
    /// Database schema initialization: drops and recreates the crawler's tables.
    /// Existing tables are detected with an explicit existence check instead of by
    /// waiting for (and parsing) the server's error message.
    /// </summary>
    public class DBInit
    {
        private static Logger logger = new Logger(typeof(DBInit));

        /// <summary>Number of commodity tables, named JD_Commodity_001 .. JD_Commodity_030.</summary>
        private const int CommodityTableCount = 30;

        /// <summary>
        /// Use with care: drops all JD_Commodity_NNN tables, including their data, and recreates them empty.
        /// </summary>
        /// <exception cref="Exception">Any failure is logged and rethrown unchanged.</exception>
        public static void InitCommodityTable()
        {
            #region Delete
            try
            {
                StringBuilder sb = new StringBuilder();
                for (int i = 1; i <= CommodityTableCount; i++)
                {
                    sb.Append(BuildDropIfExistsSql(string.Format("JD_Commodity_{0}", i.ToString("000"))));
                }
                SqlHelper.ExecuteNonQuery(sb.ToString());
            }
            catch (Exception ex)
            {
                // A missing table can no longer fail the drop, so anything arriving here is a real error.
                logger.Error("初始化数据库InitCommodityTable失败", ex);
                throw;
            }
            #endregion Delete

            #region Create
            try
            {
                StringBuilder sb = new StringBuilder();
                for (int i = 1; i <= CommodityTableCount; i++)
                {
                    sb.AppendFormat(@"CREATE TABLE [dbo].[JD_Commodity_{0}](
    [Id] [int] IDENTITY(1,1) NOT NULL,
    [ProductId] [bigint] NULL,
    [CategoryId] [int] NULL,
    [Title] [nvarchar](500) NULL,
    [Price] [decimal](18, 2) NULL,
    [Url] [varchar](1000) NULL,
    [ImageUrl] [varchar](1000) NULL,
    CONSTRAINT [PK_JD_Commodity_{0}] PRIMARY KEY CLUSTERED
    (
        [Id] ASC
    )WITH (PAD_INDEX = OFF, STATISTICS_NORECOMPUTE = OFF, IGNORE_DUP_KEY = OFF, ALLOW_ROW_LOCKS = ON, ALLOW_PAGE_LOCKS = ON) ON [PRIMARY]
) ON [PRIMARY];", i.ToString("000"));
                }
                SqlHelper.ExecuteNonQuery(sb.ToString());
            }
            catch (Exception ex)
            {
                logger.Error("InitCommodityTable创建异常", ex);
                throw;
            }
            #endregion Create
        }

        /// <summary>
        /// Use with care: drops the Category table, including its data, and recreates it empty.
        /// </summary>
        /// <exception cref="Exception">Any failure is logged and rethrown unchanged.</exception>
        public static void InitCategoryTable()
        {
            #region Delete
            try
            {
                SqlHelper.ExecuteNonQuery(BuildDropIfExistsSql("Category"));
            }
            catch (Exception ex)
            {
                logger.Error("初始化数据库InitCategoryTable失败", ex);
                throw;
            }
            #endregion Delete

            #region Create
            try
            {
                // Constant SQL: no format placeholders, so it is passed through without formatting.
                string createSql = @"CREATE TABLE [dbo].[Category](
    [Id] [int] IDENTITY(1,1) NOT NULL,
    [Code] [varchar](100) NULL,
    [ParentCode] [varchar](100) NULL,
    [CategoryLevel] [int] NULL,
    [Name] [nvarchar](50) NULL,
    [Url] [varchar](1000) NULL,
    [State] [int] NULL,
    CONSTRAINT [PK_Category] PRIMARY KEY CLUSTERED
    (
        [Id] ASC
    )WITH (PAD_INDEX = OFF, STATISTICS_NORECOMPUTE = OFF, IGNORE_DUP_KEY = OFF, ALLOW_ROW_LOCKS = ON, ALLOW_PAGE_LOCKS = ON) ON [PRIMARY]
) ON [PRIMARY];";

                SqlHelper.ExecuteNonQuery(createSql);
            }
            catch (Exception ex)
            {
                logger.Error("初始化数据库InitCategoryTable 创建失败", ex);
                throw;
            }
            #endregion Create
        }

        /// <summary>
        /// Builds a statement that drops <c>[dbo].[tableName]</c> only when it exists as a user table,
        /// so the result does not depend on the language of the server's error messages.
        /// </summary>
        /// <param name="tableName">Unqualified table name; only called with names built inside this class.</param>
        private static string BuildDropIfExistsSql(string tableName)
        {
            return string.Format("IF OBJECT_ID(N'[dbo].[{0}]', N'U') IS NOT NULL DROP TABLE [dbo].[{0}];", tableName);
        }
    }
}
/// <summary>
/// Minimal persistence contract: save one entity or a list of entities.
/// </summary>
/// <typeparam name="T">Entity type.</typeparam>
public interface IRepository<T> where T : class//, new()
{
    /// <summary>Saves a single entity.</summary>
    void Save(T entity);

    /// <summary>Saves a list of entities.</summary>
    void SaveList(List<T> entity);
}

/// <summary>
/// Persists commodities to the JD_Commodity_NNN tables and, separately, to text record files
/// below <c>Constant.DataPath</c>.
/// </summary>
public class CommodityRepository //: IRepository<Commodity>
{
    private Logger logger = new Logger(typeof(CommodityRepository));

    /// <summary>
    /// Inserts the commodities into their tables, one batch per target table.
    /// A null or empty list is a no-op; null elements are skipped.
    /// </summary>
    /// <param name="commodityList">Commodities to store.</param>
    public void SaveList(List<Commodity> commodityList)
    {
        if (commodityList == null || commodityList.Count == 0) return;

        IEnumerable<IGrouping<string, Commodity>> group = commodityList
            .Where(c => c != null)
            .GroupBy<Commodity, string>(c => GetTableName(c));

        foreach (var data in group)
        {
            SqlHelper.InsertList<Commodity>(data.ToList(), data.Key);
        }
    }

    /// <summary>
    /// Maps a commodity to one of the 30 tables JD_Commodity_001 .. JD_Commodity_030 by ProductId.
    /// </summary>
    private string GetTableName(Commodity commodity)
    {
        // '%' keeps the sign of the dividend; Math.Abs keeps a negative id inside 001..030.
        long shard = Math.Abs(commodity.ProductId % 30) + 1;
        return string.Format("JD_Commodity_{0}", shard.ToString("000"));
    }

    /// <summary>
    /// Appends the list, serialized as one JSON line, to the text record file
    /// <c>{CategoryLevel}/{ParentCode}/{Id}/{page}.txt</c> below <c>Constant.DataPath</c>.
    /// Failures are logged and not rethrown.
    /// </summary>
    /// <param name="commodityList">Commodities to record.</param>
    /// <param name="category">Category the page belongs to; determines the directory.</param>
    /// <param name="page">Page number; determines the file name.</param>
    public void SaveList(List<Commodity> commodityList, Category category, int page)
    {
        try
        {
            // Plain interpolation: wrapping it in string.Format would re-parse the result
            // and throw if a value contained '{' or '}'.
            string recordFileName = $"{category.CategoryLevel}/{category.ParentCode}/{category.Id}/{page}.txt";
            string totolPath = Path.Combine(Constant.DataPath, recordFileName);

            // CreateDirectory is a no-op for an existing directory; AppendText creates a missing file.
            Directory.CreateDirectory(Path.GetDirectoryName(totolPath));
            using (StreamWriter sw = File.AppendText(totolPath))
            {
                sw.WriteLine(JsonConvert.SerializeObject(commodityList));
            }
        }
        catch (Exception e)
        {
            logger.Error("CommodityRepository.SaveList出现异常", e);
        }
    }
}


/// <summary>
/// Persists categories to the Category table and to text record files below <c>Constant.DataPath</c>.
/// </summary>
public class CategoryRepository //: IRepository<Commodity>
{
    private Logger logger = new Logger(typeof(CategoryRepository));

    /// <summary>
    /// Inserts the categories into the Category table, then writes the text record on a
    /// background thread without waiting for it. A null or empty list is a no-op.
    /// </summary>
    /// <param name="categoryList">Categories to store.</param>
    public void Save(List<Category> categoryList)
    {
        if (categoryList == null || categoryList.Count == 0) return;

        SqlHelper.InsertList<Category>(categoryList, "Category");

        // Fire-and-forget on the thread pool. SaveList logs its own failures, so the task never
        // faults. Task.Run replaces delegate.BeginInvoke, which was never paired with EndInvoke
        // and is not supported on .NET Core and later.
        System.Threading.Tasks.Task.Run(() => SaveList(categoryList));
    }

    /// <summary>
    /// Returns the categories at the given level.
    /// </summary>
    /// <param name="level">Value matched against the categorylevel column.</param>
    /// <returns>Matching categories; empty when there are none.</returns>
    public List<Category> QueryListByLevel(int level)
    {
        string sql = string.Format("SELECT * FROM category WHERE categorylevel={0};", level);
        return SqlHelper.QueryList<Category>(sql);
    }


    /// <summary>
    /// Appends the list, serialized as one JSON line, to the text record file
    /// <c>{yyyyMMddHHmmss}_Category.txt</c> below <c>Constant.DataPath</c>.
    /// Failures are logged and not rethrown.
    /// </summary>
    /// <param name="categoryList">Categories to record.</param>
    public void SaveList(List<Category> categoryList)
    {
        try
        {
            string recordFileName = string.Format("{0}_Category.txt", DateTime.Now.ToString("yyyyMMddHHmmss"));
            string totolPath = Path.Combine(Constant.DataPath, recordFileName);

            // CreateDirectory is a no-op for an existing directory; AppendText creates a missing file.
            Directory.CreateDirectory(Path.GetDirectoryName(totolPath));
            using (StreamWriter sw = File.AppendText(totolPath))
            {
                sw.WriteLine(JsonConvert.SerializeObject(categoryList));
            }
        }
        catch (Exception e)
        {
            logger.Error("CategoryRepository.SaveList出现异常", e);
        }
    }
}

/// <summary>
/// Writes warning records for a category to text files below <c>Constant.DataPath</c>.
/// </summary>
public class WarnRepository //: IRepository<Commodity>
{
    private Logger logger = new Logger(typeof(WarnRepository));

    /// <summary>
    /// Appends <paramref name="msg"/> followed by the category as one JSON line to
    /// <c>warn/{CategoryLevel}/{ParentCode}/{Id}.txt</c> below <c>Constant.DataPath</c>.
    /// Failures are logged and not rethrown.
    /// </summary>
    /// <param name="category">Category the warning is about; determines the file.</param>
    /// <param name="msg">Warning text.</param>
    public void SaveWarn(Category category, string msg)
    {
        try
        {
            string recordFileName = string.Format("warn/{0}/{1}/{2}.txt", category.CategoryLevel, category.ParentCode, category.Id);
            string totolPath = Path.Combine(Constant.DataPath, recordFileName);

            // CreateDirectory is a no-op for an existing directory; AppendText creates a missing file.
            Directory.CreateDirectory(Path.GetDirectoryName(totolPath));
            using (StreamWriter sw = File.AppendText(totolPath))
            {
                sw.WriteLine(msg);
                // Serialized once: serializing the JSON text a second time produced an escaped
                // JSON string literal instead of a JSON object.
                sw.WriteLine(JsonConvert.SerializeObject(category));
            }
        }
        catch (Exception e)
        {
            logger.Error("SaveWarn出现异常", e);
        }
    }
}
京东爬虫使用说明:
- 配置app.config的DataPath(这个是文本数据存储的地址);
- 创建一个 SQL Server 数据库,配置app.config的数据库连接(连接字符串名称为 mvc5);
- 运行项目,可以直接用 Visual Studio 运行,或者使用 Crawler\bin\Debug 下面的 Crawler.exe;
- 数据库表结构是自动创建的,控制台需要输入Y才能开始初始化数据库结构,然后进行数据抓取;
- 看看控制台有无提示异常,看看数据库的数据即可。
类别大概是1300+,商品大概是800万+。