• 爬虫


      项目总览

      

      1,log4net

     1 <?xml version="1.0" encoding="utf-8"?>
     2 <log4net>
     3     <!-- Define some output appenders -->
     4     <appender name="rollingAppender" type="log4net.Appender.RollingFileAppender">
     5         <file value="loglog.txt" />
     6 
     7         <!-- Append to the existing log file -->
     8         <appendToFile value="true" />
     9 
    10         <!-- Avoids being unable to write the log under multi-threading; the official docs describe the appender as not thread-safe -->
    11         <lockingModel type="log4net.Appender.FileAppender+MinimalLock" />
    12 
    13         <!-- Allowed values: Once|Size|Date|Composite -->
    14         <!-- Composite is the combination of Size and Date -->
    15         <rollingStyle value="Composite" />
    16 
    17         <!-- Suffix added to the file name when a log file is backed up -->
    18         <datePattern value="yyyyMMdd.TXT" />
    19 
    20         <!-- Maximum number of log files kept; always the newest ones -->
    21         <!-- With rollingStyle Size, at most "value" log files exist -->
    22         <!-- With rollingStyle Composite, at most "value" log files exist per day -->
    23         <maxSizeRollBackups value="20" />
    24 
    25         <!-- Available units: KB|MB|GB -->
    26         <maximumFileSize value="3MB" />
    27 
    28         <!-- When true, the current (newest) log file always keeps the name given in the file element -->
    29         <staticLogFileName value="true" />
    30 
    31         <!-- Only output events whose level lies between INFO and FATAL (LevelMin/LevelMax below) -->
    32         <filter type="log4net.Filter.LevelRangeFilter">
    33             <param name="LevelMin" value="INFO" />
    34             <param name="LevelMax" value="FATAL" />
    35         </filter>
    36 
    37         <layout type="log4net.Layout.PatternLayout">
    38             <conversionPattern value="%date [%thread] %-5level %logger - %message%newline"/>
    39         </layout>
    40     </appender>
    41 
    42     <!-- levels: OFF > FATAL > ERROR > WARN > INFO > DEBUG  > ALL -->
    43     <root>
    44         <priority value="ALL"/>
    45         <level value="ALL"/>
    46         <appender-ref ref="rollingAppender" />
    47     </root>
    48 </log4net>
    View Code
     /// <summary>
     /// Thin wrapper around a log4net <see cref="ILog"/> that also echoes every
     /// message to the console before forwarding it to log4net.
     /// </summary>
     public class Logger
     {
         /// <summary>
         /// Loads the log4net configuration from "CfgFiles/log4net.cfg.xml" under the
         /// application base directory and writes one start-up entry.
         /// </summary>
         static Logger()
         {
             // The path segments are passed separately so that no directory separator has to be
             // written inside a string literal (a bare "\l" is not a valid C# escape sequence).
             string configPath = Path.Combine(AppDomain.CurrentDomain.BaseDirectory, "CfgFiles", "log4net.cfg.xml");
             XmlConfigurator.Configure(new FileInfo(configPath));

             ILog log = LogManager.GetLogger(typeof(Logger));
             log.Info("系统初始化Logger模块");
         }

         // log4net logger bound to the type passed to the constructor.
         private readonly ILog loger;

         /// <summary>
         /// Creates a logger whose log4net logger is obtained for <paramref name="type"/>.
         /// </summary>
         /// <param name="type">Type handed to <c>LogManager.GetLogger</c>.</param>
         public Logger(Type type)
         {
             loger = LogManager.GetLogger(type);
         }

         /// <summary>
         /// Prints the message to the console and logs it at ERROR level together with the exception.
         /// </summary>
         /// <param name="msg">Message text; defaults to "出现异常".</param>
         /// <param name="ex">Exception passed on to log4net; may be null.</param>
         public void Error(string msg = "出现异常", Exception ex = null)
         {
             Console.WriteLine(msg);
             loger.Error(msg, ex);
         }

         /// <summary>
         /// Prints the message to the console and logs it at WARN level.
         /// </summary>
         /// <param name="msg">Message text.</param>
         public void Warn(string msg)
         {
             Console.WriteLine(msg);
             loger.Warn(msg);
         }

         /// <summary>
         /// Prints the message to the console and logs it at INFO level.
         /// </summary>
         /// <param name="msg">Message text.</param>
         public void Info(string msg)
         {
             Console.WriteLine(msg);
             loger.Info(msg);
         }

         /// <summary>
         /// Prints the message to the console and logs it at DEBUG level.
         /// </summary>
         /// <param name="msg">Message text.</param>
         public void Debug(string msg)
         {
             Console.WriteLine(msg);
             loger.Debug(msg);
         }
     }
    View Code

      2,系统配置项

     /// <summary>
     /// Application settings read from the appSettings section of the configuration file.
     /// </summary>
     public class Constant
     {
         /// <summary>
         /// Path where data files are saved (appSettings key "DataPath").
         /// </summary>
         public static string DataPath = ConfigurationManager.AppSettings.Get("DataPath");

         /// <summary>
         /// Entry URL of the JD category page (appSettings key "JDCategoryUrl").
         /// </summary>
         public static string JDCategoryUrl = ConfigurationManager.AppSettings.Get("JDCategoryUrl");
     }
    View Code

      app.config

     1 <?xml version="1.0" encoding="utf-8" ?>
     2 <configuration>
     3     <startup> 
     4         <supportedRuntime version="v4.0" sku=".NETFramework,Version=v4.5" />
     5     </startup>
     6   <appSettings>
     7     <add key="DataPath" value="D:\ruanmou\online9\20170711Advanced9Course16Crawler\Ruanmou.Crawler\Ruanmou.Crawler\bin\Debug\Data"/>
     8     <add key="JDCategoryUrl" value="http://www.jd.com/allSort.aspx"/>
     9   </appSettings>
    10   <connectionStrings>
    11     <add name="mvc5" connectionString="Data Source=ElevenPC; Database=advanced9; User ID=sa; Password=Passw0rd; MultipleActiveResultSets=True" providerName="System.Data.SqlClient" />
    12   </connectionStrings>
    13 </configuration>
    View Code

      3,Program

     /// <summary>
     /// Console entry point of the crawler sample. Topics of the sample:
     /// 1. crawlers and anti-crawler measures
     /// 2. downloading HTML
     /// 3. parsing HTML with XPath, extracting data and deep crawling
     /// 4. differing attributes and fetching ajax data
     /// 5. multi-threaded crawling
     /// </summary>
     class Program
     {
         private static Logger logger = new Logger(typeof(Program));

         /// <summary>
         /// Runs a test download, a test crawl of one hard-coded category and then the
         /// interactive crawl in <c>CrawlerCenter.Handler</c>. Any exception is logged and
         /// the program waits for a key press before exiting.
         /// </summary>
         /// <param name="args">Command-line arguments; not used.</param>
         static void Main(string[] args)
         {
             try
             {
                 Console.WriteLine("欢迎来到.net高级班vip课程,今天是Eleven老师为大家带来的爬虫的学习");

                 #region 测试DownloadHtml
                 string html = HttpHelper.DownloadHtml(@"https://list.jd.com/list.html?cat=9987,653,655", Encoding.UTF8);
                 #endregion

                 #region 测试获取分类页
                 //string html1 = HttpHelper.DownloadHtml(Constant.JDCategoryUrl, Encoding.UTF8);
                 #endregion

                 #region 测试抓取商品列表
                 // Verbatim string: the JSON's inner quotes are doubled ("") so the literal is valid C#.
                 string testCategory = @"{""Id"":73,""Code"":""02f01s01T"",""ParentCode"":""02f01s"",""Name"":""烟机/灶具"",""Url"":""http://list.jd.com/list.html?cat=737,13297,1300"",""Level"":3}";
                 Category category = JsonConvert.DeserializeObject<Category>(testCategory);
                 ISearch search = new CommoditySearch(category);
                 search.Crawler();
                 #endregion

                 #region 抓取
                 CrawlerCenter.Handler();
                 #endregion
             }
             catch (Exception ex)
             {
                 logger.Error("异常啦,", ex);
                 Console.WriteLine("*****************木有成功**********************");
             }
             // Keep the console window open until the user presses Enter.
             Console.ReadLine();
         }
     }
    View Code

      4,HttpHelper

     /// <summary>
     /// Helper for downloading page content over HTTP.
     /// Based on http://tool.sufeinet.com/HttpHelper.aspx
     /// </summary>
     public class HttpHelper
     {
         private static Logger logger = new Logger(typeof(HttpHelper));

         /// <summary>
         /// Downloads the content of <paramref name="url"/> decoded as UTF-8 (earlier versions used GB2312).
         /// </summary>
         /// <param name="url">Address to download.</param>
         /// <returns>Same result as <see cref="DownloadHtml"/> called with <c>Encoding.UTF8</c>.</returns>
         public static string DownloadUrl(string url)
         {
             return DownloadHtml(url, Encoding.UTF8);
         }

         /// <summary>
         /// Downloads the response body of <paramref name="url"/> with an HttpWebRequest
         /// (30 second timeout, desktop Chrome user agent).
         /// </summary>
         /// <param name="url">Address to download.</param>
         /// <param name="encode">Encoding used to decode the response stream.</param>
         /// <returns>
         /// The response text on success. On failure the method never throws; it returns
         /// an empty string when the status code is not OK or a WebException other than
         /// a 306 error occurs, and null when reading the body fails, a 306 error is
         /// reported or any other exception is raised.
         /// </returns>
         public static string DownloadHtml(string url, Encoding encode)
         {
             string html = string.Empty;
             try
             {
                 HttpWebRequest request = HttpWebRequest.Create(url) as HttpWebRequest; // build the request
                 request.Timeout = 30 * 1000; // 30 second timeout
                 request.UserAgent = "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.106 Safari/537.36";
                 request.ContentType = "text/html; charset=utf-8";
                 // Further headers (Host, Cookie, Accept, Accept-Encoding, Referer) can be set here
                 // when the target site requires them.

                 using (HttpWebResponse response = request.GetResponse() as HttpWebResponse) // send the request
                 {
                     if (response.StatusCode != HttpStatusCode.OK)
                     {
                         logger.Warn(string.Format("抓取{0}地址返回失败,response.StatusCode为{1}", url, response.StatusCode));
                     }
                     else
                     {
                         try
                         {
                             // using guarantees the reader (and the stream it wraps) is released
                             // even when ReadToEnd throws.
                             using (StreamReader sr = new StreamReader(response.GetResponseStream(), encode))
                             {
                                 html = sr.ReadToEnd();
                             }
                         }
                         catch (Exception ex)
                         {
                             // Plain interpolation: passing the interpolated text through string.Format
                             // would throw FormatException for URLs containing '{' or '}'.
                             logger.Error($"DownloadHtml抓取{url}失败", ex);
                             html = null;
                         }
                     }
                 }
             }
             catch (System.Net.WebException ex)
             {
                 // Detect the 306 case by status code as well, because the exception message is
                 // localized and only matches on a Chinese runtime.
                 HttpWebResponse errorResponse = ex.Response as HttpWebResponse;
                 bool is306 = (errorResponse != null && (int)errorResponse.StatusCode == 306)
                     || ex.Message.Equals("远程服务器返回错误: (306)。");
                 if (is306)
                 {
                     logger.Error("远程服务器返回错误: (306)。", ex);
                     html = null;
                 }
                 else
                 {
                     // Previously swallowed silently; the return value (empty string) is unchanged.
                     logger.Error(string.Format("DownloadHtml抓取{0}出现异常", url), ex);
                 }
             }
             catch (Exception ex)
             {
                 logger.Error(string.Format("DownloadHtml抓取{0}出现异常", url), ex);
                 html = null;
             }
             return html;
         }
     }
    View Code

      5,爬虫

      /// <summary>
      /// Interactive driver of the crawl: optionally rebuilds and crawls the category
      /// table, then optionally rebuilds and crawls the commodity tables.
      /// </summary>
      public class CrawlerCenter
      {
          private static Logger logger = new Logger(typeof(CrawlerCenter));

          // Once more than this many tasks are tracked, wait for one of them before starting more.
          private const int MaxTrackedTasks = 15;

          // Commodity tables are named JD_Commodity_001 .. JD_Commodity_030.
          private const int CommodityTableCount = 30;

          /// <summary>
          /// Asks on the console whether to re-initialize and crawl categories, then whether to
          /// re-initialize and crawl commodities. Only the answer "Y" (case-insensitive) confirms;
          /// any other input, including end of input, skips the step.
          /// </summary>
          public static void Handler()
          {
              Console.WriteLine("请输入Y/N进行类别表初始化确认! Y 删除Category表然后重新创建,然后抓取类型数据,N(或者其他)跳过");
              string input = Console.ReadLine();
              // Static string.Equals tolerates null, which ReadLine returns when no more input is available.
              if (string.Equals(input, "Y", StringComparison.OrdinalIgnoreCase))
              {
                  DBInit.InitCategoryTable();
                  CrawlerCategory();
              }
              else
              {
                  Console.WriteLine("你选择不初始化类别数据");
              }
              Console.WriteLine("*****************^_^**********************");

              Console.WriteLine("请输入Y/N进行商品数据初始化确认! Y 删除全部商品表表然后重新创建,然后抓取商品数据,N(或者其他)跳过");
              input = Console.ReadLine();
              if (string.Equals(input, "Y", StringComparison.OrdinalIgnoreCase))
              {
                  DBInit.InitCommodityTable();
                  CrawlerCommodity();
              }
              Console.WriteLine("*****************^_^**********************");
          }

          /// <summary>
          /// Crawls the categories with a <c>CategorySearch</c>.
          /// </summary>
          private static void CrawlerCategory()
          {
              Console.WriteLine($"{ DateTime.Now} jd商品类别开始抓取 - -");
              ISearch search = new CategorySearch();
              search.Crawler();
          }

          /// <summary>
          /// Crawls the commodities of every level-3 category, one task per category,
          /// waits for all tasks and finally removes duplicate rows.
          /// </summary>
          private static void CrawlerCommodity()
          {
              Console.WriteLine($"{ DateTime.Now} jd商品开始抓取 - -");
              CategoryRepository categoryRepository = new CategoryRepository();
              List<Category> categoryList = categoryRepository.QueryListByLevel(3);

              List<Task> taskList = new List<Task>();
              TaskFactory taskFactory = new TaskFactory();
              foreach (Category category in categoryList)
              {
                  ISearch searcher = new CommoditySearch(category);
                  taskList.Add(taskFactory.StartNew(searcher.Crawler));
                  if (taskList.Count > MaxTrackedTasks)
                  {
                      // Drop finished tasks, then block until one of the remaining ones finishes
                      // so the number of running crawls stays bounded.
                      taskList = taskList.Where(t => !t.IsCompleted && !t.IsCanceled && !t.IsFaulted).ToList();
                      Task.WaitAny(taskList.ToArray());
                  }
              }
              Task.WaitAll(taskList.ToArray());
              Console.WriteLine($"{ DateTime.Now} jd商品抓取全部完成 - -");
              CleanAll();
          }

          /// <summary>
          /// Removes duplicate rows from every commodity table: for each (productid, CategoryId)
          /// group with more than one row, only the row with the highest ID is kept.
          /// Exceptions are logged and not rethrown.
          /// </summary>
          private static void CleanAll()
          {
              try
              {
                  Console.WriteLine($"{ DateTime.Now} 开始清理重复数据 - -");
                  StringBuilder sb = new StringBuilder();
                  for (int i = 1; i <= CommodityTableCount; i++)
                  {
                      // {0} is the three-digit table suffix (001, 002, ...).
                      sb.AppendFormat(@"DELETE FROM [dbo].[JD_Commodity_{0}] where productid IN(select productid from [dbo].[JD_Commodity_{0}] group by productid,CategoryId having count(0)>1)
                                AND ID NOT IN(select max(ID) as ID from [dbo].[JD_Commodity_{0}] group by productid,CategoryId having count(0)>1);", i.ToString("000"));
                  }
                  string sql = sb.ToString();
                  Console.WriteLine("执行清理sql:{0}", sql);
                  SqlHelper.ExecuteNonQuery(sql);
                  Console.WriteLine("{0} 完成清理重复数据 - -", DateTime.Now);
              }
              catch (Exception ex)
              {
                  logger.Error("CleanAll出现异常", ex);
              }
              finally
              {
                  Console.WriteLine("{0} 结束清理重复数据 - -", DateTime.Now);
              }
          }
      }
    View Code
      /// <summary>
      /// Contract for a crawl job that is started through a single parameterless call.
      /// </summary>
      public interface ISearch
      {
          /// <summary>
          /// Performs the crawl of the implementing class.
          /// </summary>
          void Crawler();
      }
      5 
      6 
      7     /// <summary>
      8     /// 商品抓取
      9     /// http://www.w3school.com.cn/xpath/index.asp XPATH语法
     10     /// </summary>
     11     public class CommoditySearch : ISearch
     12     {
     13         private Logger logger = new Logger(typeof(CommoditySearch));
     14         private WarnRepository warnRepository = new WarnRepository();
     15         private CommodityRepository commodityRepository = new CommodityRepository();
     16         private Category category = null;
     17 
     /// <summary>
     /// Creates a commodity crawl job for the given category.
     /// </summary>
     /// <param name="_category">Category whose Url is crawled by <c>Crawler</c>.</param>
     public CommoditySearch(Category _category)
     {
         this.category = _category;
     }
     22 
     /// <summary>
     /// Downloads the category's list page, reads the page count from the
     /// "J_topPage" element and fetches the commodity list of every page.
     /// Problems are recorded through the warn repository and the logger;
     /// the method itself does not throw for them.
     /// </summary>
     public void Crawler()
     {
         try
         {
             if (string.IsNullOrEmpty(category.Url))
             {
                 warnRepository.SaveWarn(category, string.Format("Url为空,Name={0} Level={1} Url={2}", category.Name, category.CategoryLevel, category.Url));
                 return;
             }

             string html = HttpHelper.DownloadUrl(category.Url); // download the list page
             if (string.IsNullOrEmpty(html))
             {
                 // DownloadUrl reports failure as null or an empty string. Record it explicitly
                 // instead of letting LoadHtml(null) throw or an empty page pass unnoticed.
                 warnRepository.SaveWarn(category, string.Format("下载内容为空,Name={0} Level={1} Url={2}", category.Name, category.CategoryLevel, category.Url));
                 return;
             }

             HtmlDocument doc = new HtmlDocument();
             doc.LoadHtml(html);
             string pageNumberPath = @"//*[@id='J_topPage']/span/i";
             HtmlNode pageNumberNode = doc.DocumentNode.SelectSingleNode(pageNumberPath);
             if (pageNumberNode == null)
             {
                 return;
             }

             // Parse the page count once instead of on every loop iteration.
             int pageCount = int.Parse(pageNumberNode.InnerText);
             for (int i = 1; i <= pageCount; i++)
             {
                 string pageUrl = string.Format("{0}&page={1}", category.Url, i);
                 try
                 {
                     // NOTE(review): the result is currently discarded because the SaveList call
                     // below is disabled; confirm whether persisting should be re-enabled.
                     List<Commodity> commodityList = GetCommodityList(category, pageUrl.Replace("&page=1&", string.Format("&page={0}&", i)));
                     //commodityRepository.SaveList(commodityList);
                 }
                 catch (Exception ex) // an error on one page must not affect the other pages
                 {
                     logger.Error("Crawler的commodityRepository.SaveList(commodityList)出现异常", ex);
                 }
             }
         }
         catch (Exception ex)
         {
             logger.Error("CrawlerMuti出现异常", ex);
             warnRepository.SaveWarn(category, string.Format("出现异常,Name={0} Level={1} Url={2}", category.Name, category.CategoryLevel, category.Url));
         }
     }
    101 
    102         private List<Commodity> GetCommodityList(Category category, string url)
    103         {
    104             string html = HttpHelper.DownloadUrl(url);
    105             List<Commodity> commodityList = new List<Commodity>();
    106             try
    107             {
    108                 if (string.IsNullOrEmpty(html)) return commodityList;
    109                 HtmlDocument doc = new HtmlDocument();
    110                 doc.LoadHtml(html);
    111                 string liPath = "//*[@id='plist']/ul/li";
    112                 HtmlNodeCollection noneNodeList = doc.DocumentNode.SelectNodes(liPath);
    113                 if (noneNodeList == null || noneNodeList.Count == 0)
    114                 {
    115                     warnRepository.SaveWarn(category, string.Format("GetCommodityList商品数据为空,Name={0} Level={1} category.Url={2} url={3}", category.Name, category.CategoryLevel, category.Url, url));
    116                     return commodityList;
    117                 }
    118                 foreach (var node in noneNodeList)
    119                 {
    120                     HtmlDocument docChild = new HtmlDocument();
    121                     docChild.LoadHtml(node.OuterHtml);
    122 
    123                     Commodity commodity = new Commodity()
    124                     {
    125                         CategoryId = category.Id
    126                     };
    127 
    128                     string urlPath = "//*[@class='p-name']/a";
    129                     HtmlNode urlNode = docChild.DocumentNode.SelectSingleNode(urlPath);
    130                     if (urlNode == null)
    131                     {
    132                         continue;
    133                     }
    134                     commodity.Url = urlNode.Attributes["href"].Value;
    135                     if (!commodity.Url.StartsWith("http:"))
    136                         commodity.Url = "http:" + commodity.Url;
    137 
    138                     string sId = Path.GetFileName(commodity.Url).Replace(".html", "");
    139                     commodity.ProductId = long.Parse(sId);
    140 
    141                     //*[@id="plist"]/ul/li[1]/div/div[3]/a/em
    142                     string titlePath = "//*[@class='p-name']/a/em";
    143                     HtmlNode titleNode = docChild.DocumentNode.SelectSingleNode(titlePath);
    144                     if (titleNode == null)
    145                     {
    146                         //Log.Error(titlePath);
    147                         continue;
    148                     }
    149                     commodity.Title = titleNode.InnerText;
    150 
    151                     string iamgePath = "//*[@class='p-img']/a/img";
    152                     HtmlNode imageNode = docChild.DocumentNode.SelectSingleNode(iamgePath);
    153                     if (imageNode == null)
    154                     {
    155                         continue;
    156                     }
    157                     //前后不一
    158                     if (imageNode.Attributes.Contains("src"))
    159                         commodity.ImageUrl = imageNode.Attributes["src"].Value;
    160                     else if (imageNode.Attributes.Contains("original"))
    161                         commodity.ImageUrl = imageNode.Attributes["original"].Value;
    162                     else if (imageNode.Attributes.Contains("data-lazy-img"))
    163                         commodity.ImageUrl = imageNode.Attributes["data-lazy-img"].Value;
    164                     else
    165                     {
    166                         continue;
    167                     }
    168                     if (!commodity.ImageUrl.StartsWith("http:"))
    169                         commodity.ImageUrl = "http:" + commodity.ImageUrl;
    170 
    171                     string pricePath = "//*[@class='p-price']/strong/i";
    172                     HtmlNode priceNode = docChild.DocumentNode.SelectSingleNode(pricePath);
    173                     if (priceNode == null)
    174                     {
    175                         continue;
    176                     }
    177                     else
    178                     {
    179                     }
    180                     commodityList.Add(commodity);
    181                 }
    182                 Console.WriteLine("{0}一共获取了{1}条数据", url, commodityList.Count);
    183             }
    184             catch (Exception ex)
    185             {
    186                 logger.Error(string.Format("GetCommodityList出现异常,url={0}", url), ex);
    187             }
    188             return GetCommodityPrice(category, commodityList);
    189         }
    190 
    191         /// <summary>
    192         /// 获取商品价格
    193         /// </summary>
    194         /// <param name="commodityList"></param>
    195         /// <returns></returns>
    196         private List<Commodity> GetCommodityPrice(Category category, List<Commodity> commodityList)
    197         {
    198             try
    199             {
    200                 if (commodityList == null || commodityList.Count() == 0)
    201                     return commodityList;
    202 
    203                 StringBuilder sb = new StringBuilder();
    204                 //sb.Append(@"http://p.3.cn/prices/mgets?my=list_price&type=1&area=1_72_4137&skuIds=");
    205                 //sb.Append(string.Join("%2C", commodityList.Select(c => string.Format("J_{0}", c.ProductId))));
    206                 //
    207                 sb.AppendFormat("http://p.3.cn/prices/mgets?callback=jQuery1069298&type=1&area=1_72_4137_0&skuIds={0}&pdbp=0&pdtk=&pdpin=&pduid=1945966343&_=1469022843655", string.Join("%2C", commodityList.Select(c => string.Format("J_{0}", c.ProductId))));
    208                 string html = HttpHelper.DownloadUrl(sb.ToString());
    209                 if (string.IsNullOrWhiteSpace(html))
    210                 {
    211                     logger.Warn(string.Format("获取url={0}时获取的html为空", sb.ToString()));
    212                 }
    213                 html = html.Substring(html.IndexOf("(") + 1);
    214                 html = html.Substring(0, html.LastIndexOf(")"));
    215                 List<CommodityPrice> priceList = JsonConvert.DeserializeObject<List<CommodityPrice>>(html);
    216                 commodityList.ForEach(c => c.Price = priceList.FirstOrDefault(p => p.id.Equals(string.Format("J_{0}", c.ProductId))).p);
    217                 //commodityList.ForEach(c => Console.WriteLine(" Title={0}  ImageUrl={1} Url={2} Price={3} Id={4}", c.Title, c.ImageUrl, c.Url, c.Price, c.Id));
    218             }
    219             catch (Exception ex)
    220             {
    221                 logger.Error("GetCommodityPrice出现异常", ex);
    222             }
    223             return commodityList;
    224         }
    225     }
    226 
    /// <summary>
    /// Crawls the category index page and stores the resulting three-level category tree.
    /// XPath syntax reference: http://www.w3school.com.cn/xpath/index.asp
    /// </summary>
    public class CategorySearch : ISearch
    {
        private static readonly Logger logger = new Logger(typeof(CategorySearch));

        // Next category Id to hand out. It is per instance and starts at 1, so a new
        // CategorySearch has to be created for every crawl.
        private int _Count = 1;

        /// <summary>
        /// Downloads the category page, parses all levels and saves them through
        /// <c>CategoryRepository</c>. Errors are logged; the number of parsed categories
        /// is always printed to the console.
        /// </summary>
        public void Crawler()
        {
            List<Category> categoryList = new List<Category>();
            try
            {
                string html = HttpHelper.DownloadUrl(Constant.JDCategoryUrl);

                HtmlDocument doc = new HtmlDocument();
                doc.LoadHtml(html);
                HtmlNodeCollection nodeList = doc.DocumentNode.SelectNodes("//*[@class='category-item m']");
                if (nodeList == null)
                {
                    // SelectNodes yields null when nothing matches; there is nothing to save.
                    logger.Warn("CategorySearch.Crawler未找到任何类别节点");
                    return;
                }

                int k = 1;
                foreach (HtmlNode node in nodeList)
                {
                    // First-level codes look like "01f", "02f", ... with "root" as parent.
                    categoryList.AddRange(this.First(node.InnerHtml, k++.ToString("00") + "f", "root"));
                }

                CategoryRepository categoryRepository = new CategoryRepository();
                categoryRepository.Save(categoryList);
            }
            catch (Exception ex)
            {
                logger.Error("CategorySearch.Crawler出现异常", ex);
            }
            finally
            {
                Console.WriteLine($"类型数据初始化完成,共抓取类别{categoryList.Count}个");
            }
        }

        /// <summary>
        /// Parses one first-level block: its heading(s) plus everything below it.
        /// </summary>
        /// <param name="html">Inner HTML of the first-level block.</param>
        /// <param name="code">Code assigned to this first-level category.</param>
        /// <param name="parentCode">Code of the parent.</param>
        /// <returns>The first-level categories followed by all their descendants.</returns>
        private List<Category> First(string html, string code, string parentCode)
        {
            List<Category> categoryList = new List<Category>();
            HtmlDocument doc = new HtmlDocument();
            doc.LoadHtml(html);
            HtmlNodeCollection nodeList = doc.DocumentNode.SelectNodes("//*[@class='mt']/h2/span");
            if (nodeList != null)
            {
                foreach (HtmlNode node in nodeList)
                {
                    categoryList.Add(new Category()
                    {
                        Id = _Count++,
                        State = 0,
                        CategoryLevel = 1,
                        Code = code,
                        ParentCode = parentCode,
                        Name = node.InnerText,
                        Url = "" // first-level headings carry no link
                    });
                }
            }
            categoryList.AddRange(this.Second(html, code));
            return categoryList;
        }

        /// <summary>
        /// Parses all second-level categories below one first-level category.
        /// </summary>
        /// <param name="html">Inner HTML of the first-level block.</param>
        /// <param name="parentCode">Code of the first-level category.</param>
        /// <returns>Each second-level category followed by its third-level categories.</returns>
        private List<Category> Second(string html, string parentCode)
        {
            List<Category> categoryList = new List<Category>();
            HtmlDocument doc = new HtmlDocument();
            doc.LoadHtml(html);
            HtmlNodeCollection nodeList = doc.DocumentNode.SelectNodes("//*[@class='items']/dl");
            if (nodeList == null)
            {
                return categoryList;
            }

            int k = 1;
            foreach (HtmlNode node in nodeList)
            {
                string secondHtml = node.InnerHtml;
                if (string.IsNullOrWhiteSpace(secondHtml))
                {
                    continue;
                }

                HtmlDocument secondDoc = new HtmlDocument();
                secondDoc.LoadHtml(secondHtml);

                // Some blocks have a plain <dt> without a link; fall back to it.
                HtmlNode secondNode = secondDoc.DocumentNode.SelectSingleNode("//dt/a")
                                      ?? secondDoc.DocumentNode.SelectSingleNode("//dt");
                if (secondNode == null)
                {
                    continue;
                }

                // Advance the counter as soon as the code is taken. It used to be advanced
                // only at the end of the loop body, so an entry without <dd> handed its
                // code to the next entry as well.
                string code = string.Format("{0}{1}s", parentCode, k.ToString("00"));
                k++;

                Category category = new Category()
                {
                    Id = _Count++,
                    State = 0,
                    CategoryLevel = 2,
                    Code = code,
                    ParentCode = parentCode,
                    Name = secondNode.InnerText
                };
                if (secondNode.Attributes["href"] != null)
                {
                    category.Url = EnsureHttpScheme(secondNode.Attributes["href"].Value);
                }
                categoryList.Add(category);

                HtmlNode thirdNode = secondDoc.DocumentNode.SelectSingleNode("//dd");
                if (thirdNode != null)
                {
                    categoryList.AddRange(this.Third(thirdNode.InnerHtml, code));
                }
            }
            return categoryList;
        }

        /// <summary>
        /// Parses all third-level categories (links) below one second-level category.
        /// </summary>
        /// <param name="html">Inner HTML of the second-level category's <c>dd</c> element.</param>
        /// <param name="parentCode">Code of the second-level category.</param>
        /// <returns>One category per link that has an href; links without one are skipped.</returns>
        private List<Category> Third(string html, string parentCode)
        {
            List<Category> categoryList = new List<Category>();
            HtmlDocument doc = new HtmlDocument();
            doc.LoadHtml(html);
            HtmlNodeCollection nodeList = doc.DocumentNode.SelectNodes("//a");
            if (nodeList == null || nodeList.Count == 0)
            {
                return categoryList;
            }

            int k = 1;
            foreach (HtmlNode node in nodeList)
            {
                var href = node.Attributes["href"];
                if (href == null)
                {
                    continue;
                }

                categoryList.Add(new Category()
                {
                    Id = _Count++,
                    State = 0,
                    CategoryLevel = 3,
                    Code = string.Format("{0}{1}t", parentCode, k.ToString("00")),
                    ParentCode = parentCode,
                    Name = node.InnerText,
                    Url = EnsureHttpScheme(href.Value)
                });
                k++;
            }
            return categoryList;
        }

        /// <summary>
        /// Prefixes "http:" to addresses that carry no scheme (for example "//host/path").
        /// Addresses that already start with http: or https: are returned unchanged.
        /// </summary>
        private static string EnsureHttpScheme(string url)
        {
            if (url.StartsWith("http:", StringComparison.OrdinalIgnoreCase) ||
                url.StartsWith("https:", StringComparison.OrdinalIgnoreCase))
            {
                return url;
            }
            return string.Concat("http:", url);
        }
    }
    View Code

      6,Model

     1     public class BaseModel
     2     {
     3         public int Id { get; set; }
     4     }
     5 
     6     public class Category:BaseModel
     7     {
     8         public string Code { get; set; }
     9         public string ParentCode { get; set; }
    10         public string Name { get; set; }
    11         public string Url { get; set; }
    12         public int CategoryLevel { get; set; }
    13         public int State { get; set; }
    14     }
    15 
    16     public class Commodity : BaseModel
    17     {
    18         public long ProductId { get; set; }
    19         public int CategoryId { get; set; }
    20         public string Title { get; set; }
    21         public decimal Price { get; set; }
    22         public string Url { get; set; }
    23         public string ImageUrl { get; set; }
    24     }
    25 
    26 
    // Sample response of the price service (shortened):
    // jQuery5427073([{"id":"J_1707419","p":"5149.00","m":"5499.00"},{"id":"J_1589214","p":"1999.00","m":"2999.00"}]);

    /// <summary>
    /// Shape of one element of the price response; exists only so the JSON can be
    /// deserialized. Property names match the JSON keys and must stay lower case.
    /// </summary>
    public class CommodityPrice
    {
        /// <summary>Value of the "id" key, e.g. "J_1707419".</summary>
        public string id { get; set; }

        /// <summary>Value of the "p" key.</summary>
        public decimal p { get; set; }

        /// <summary>Value of the "m" key.</summary>
        public decimal m { get; set; }
    }
    View Code

      7,DataService

      (1)SqlHelper

      1     public class SqlHelper
      2     {
      3         private static Logger logger = new Logger(typeof(SqlHelper));
      4         private static string _ConnStr = ConfigurationManager.ConnectionStrings["mvc5"].ConnectionString;
      5 
      6         /// <summary>
      7         /// 事务执行
      8         /// </summary>
      9         /// <param name="sql"></param>
     10         public static void ExecuteNonQuery(string sql)
     11         {
     12             using (SqlConnection sqlConn = new SqlConnection(_ConnStr))
     13             {
     14                 sqlConn.Open();
     15                 SqlCommand cmd = new SqlCommand(sql, sqlConn);
     16                 cmd.ExecuteNonQuery();//.ExecuteNonQueryAsync();//
     17             }
     18         }
     19 
     20         public static void ExecuteNonQueryWithTrans(string sql)
     21         {
     22             SqlTransaction trans = null;
     23             try
     24             {
     25                 using (SqlConnection sqlConn = new SqlConnection(_ConnStr))
     26                 {
     27                     sqlConn.Open();
     28                     trans = sqlConn.BeginTransaction();
     29                     SqlCommand cmd = new SqlCommand(sql, sqlConn, trans);
     30                     cmd.ExecuteNonQuery();//.ExecuteNonQueryAsync();//
     31                     trans.Commit();
     32                 }
     33             }
     34             catch (Exception ex)
     35             {
     36                 //logger.Error(string.Format("ExecuteNonQueryWithTrans出现异常,sql={0}", sql), ex);
     37                 if (trans != null && trans.Connection != null)
     38                     trans.Rollback();
     39                 throw ex;
     40             }
     41             finally
     42             {
     43             }
     44         }
     45 
     46         public static List<T> QueryList<T>(string sql) where T : new()
     47         {
     48             using (SqlConnection sqlConn = new SqlConnection(_ConnStr))
     49             {
     50                 sqlConn.Open();
     51                 SqlCommand cmd = new SqlCommand(sql, sqlConn);
     52                 return TransList<T>(cmd.ExecuteReader());
     53             }
     54         }
     55 
     56         public static void Insert<T>(T model, string tableName) where T : new()
     57         {
     58             string sql = GetInsertSql<T>(model, tableName);
     59             ExecuteNonQuery(sql);
     60         }
     61 
     62         public static void InsertList<T>(List<T> list, string tableName) where T : new()
     63         {
     64             string sql = string.Join(" ", list.Select(t => GetInsertSql<T>(t, tableName)));
     65             ExecuteNonQuery(sql);
     66         }
     67 
     68         #region Private
     69         private static string GetInsertSql<T>(T model, string tableName)
     70         {
     71             StringBuilder sbSql = new StringBuilder();
     72 
     73             StringBuilder sbFields = new StringBuilder();
     74             StringBuilder sbValues = new StringBuilder();
     75 
     76             Type type = model.GetType();
     77             var properties = type.GetProperties();
     78             foreach (PropertyInfo p in properties)
     79             {
     80                 string name = p.Name;
     81                 if (!name.Equals("id", StringComparison.OrdinalIgnoreCase))
     82                 {
     83                     sbFields.AppendFormat("[{0}],", name);
     84                     string sValue = null;
     85                     object oValue = p.GetValue(model);
     86                     if (oValue != null)
     87                         sValue = oValue.ToString().Replace("'", "");
     88                     sbValues.AppendFormat("'{0}',", sValue);
     89                 }
     90             }
     91             sbSql.AppendFormat("INSERT INTO {0} ({1}) VALUES ({2});", tableName, sbFields.ToString().TrimEnd(','), sbValues.ToString().TrimEnd(','));
     92             return sbSql.ToString();
     93         }
     94 
     95         private static List<T> TransList<T>(SqlDataReader reader) where T : new()
     96         {
     97             List<T> tList = new List<T>();
     98             Type type = typeof(T);
     99             var properties = type.GetProperties();
    100             if (reader.Read())
    101             {
    102                 do
    103                 {
    104                     T t = new T();
    105                     foreach (PropertyInfo p in properties)
    106                     {
    107                         p.SetValue(t, Convert.ChangeType(reader[p.Name], p.PropertyType));
    108                     }
    109                     tList.Add(t);
    110                 }
    111                 while (reader.Read());
    112             }
    113             return tList;
    114         }
    115 
    116         private static T TransModel<T>(SqlDataReader reader) where T : new()
    117         {
    118             T t = new T();
    119             if (reader.Read())
    120             {
    121                 do
    122                 {
    123                     Type type = typeof(T);
    124                     var properties = type.GetProperties();
    125                     foreach (PropertyInfo p in properties)
    126                     {
    127                         p.SetValue(t, Convert.ChangeType(reader[p.Name], p.PropertyType));
    128                     }
    129                 }
    130                 while (reader.Read());
    131             }
    132             return t;
    133         }
    134         #endregion Private
    135     }
    View Code
      1 namespace Ruanmou.Crawler.DataService
      2 {
      3     /// <summary>
      4     /// 数据库结构初始化
      5     /// 改进下:直接判断表是否存在,而不是等着异常
      6     /// </summary>
      7     public class DBInit
      8     {
      9         private static Logger logger = new Logger(typeof(DBInit));
     10 
     11         /// <summary>
     12         /// 谨慎使用  会全部删除数据库并重新创建!
     13         /// </summary>
     14         public static void InitCommodityTable()
     15         {
     16             #region Delete
     17             try
     18             {
     19                 StringBuilder sb = new StringBuilder();
     20                 for (int i = 1; i < 31; i++)
     21                 {
     22                     sb.AppendFormat("DROP TABLE [dbo].[JD_Commodity_{0}];", i.ToString("000"));
     23                 }
     24                 SqlHelper.ExecuteNonQuery(sb.ToString());
     25             }
     26             catch (Exception ex)
     27             {
     28                 if (ex.Message.Contains("因为它不存在,或者您没有所需的权限。"))
     29                 {
     30                     logger.Warn("初始化数据库InitCommodityTable删除的时候,原表不存在");
     31                 }
     32                 else
     33                 {
     34                     logger.Error("初始化数据库InitCommodityTable失败", ex);
     35                     throw ex;
     36                 }
     37             }
     38             #endregion Delete
     39 
     40             #region Create
     41             try
     42             {
     43                 StringBuilder sb = new StringBuilder();
     44                 for (int i = 1; i < 31; i++)
     45                 {
     46                     sb.AppendFormat(@"CREATE TABLE [dbo].[JD_Commodity_{0}](
     47                                         [Id] [int] IDENTITY(1,1) NOT NULL,
     48                                         [ProductId] [bigint] NULL,
     49                                         [CategoryId] [int] NULL,
     50                                         [Title] [nvarchar](500) NULL,
     51                                         [Price] [decimal](18, 2) NULL,
     52                                         [Url] [varchar](1000) NULL,
     53                                         [ImageUrl] [varchar](1000) NULL,
     54                              CONSTRAINT [PK_JD_Commodity_{0}] PRIMARY KEY CLUSTERED 
     55                             (
     56                                 [Id] ASC
     57                             )WITH (PAD_INDEX = OFF, STATISTICS_NORECOMPUTE = OFF, IGNORE_DUP_KEY = OFF, ALLOW_ROW_LOCKS = ON, ALLOW_PAGE_LOCKS = ON) ON [PRIMARY]
     58                             ) ON [PRIMARY];", i.ToString("000"));
     59                 }
     60                 SqlHelper.ExecuteNonQuery(sb.ToString());
     61             }
     62             catch (Exception ex)
     63             {
     64                 logger.Error("InitCommodityTable创建异常", ex);
     65                 throw ex;
     66             }
     67             #endregion Create
     68         }
     69 
     70         /// <summary>
     71         /// 谨慎使用  会全部删除数据库并重新创建!
     72         /// </summary>
     73         public static void InitCategoryTable()
     74         {
     75             #region Delete
     76             try
     77             {
     78                 StringBuilder sb = new StringBuilder();
     79                 sb.AppendFormat("DROP TABLE [dbo].[Category];");
     80                 SqlHelper.ExecuteNonQuery(sb.ToString());
     81             }
     82             catch (Exception ex)
     83             {
     84                 if (ex.Message.Equals("无法对 表 'dbo.Category' 执行 删除,因为它不存在,或者您没有所需的权限。"))
     85                 {
     86                     logger.Warn("初始化数据库InitCategoryTable删除的时候,原表不存在");
     87                 }
     88                 else
     89                 {
     90                     logger.Error("初始化数据库InitCategoryTable失败", ex);
     91                     throw ex;
     92                 }
     93             }
     94             #endregion Delete
     95 
     96             #region Create
     97             try
     98             {
     99                 StringBuilder sb = new StringBuilder();
    100                 sb.AppendFormat(@"CREATE TABLE [dbo].[Category](
    101                                         [Id] [int] IDENTITY(1,1) NOT NULL,
    102                                         [Code] [varchar](100) NULL,
    103                                         [ParentCode] [varchar](100) NULL,
    104                                         [CategoryLevel] [int] NULL,
    105                                         [Name] [nvarchar](50) NULL,
    106                                         [Url] [varchar](1000) NULL,
    107                                         [State] [int] NULL,
    108                                       CONSTRAINT [PK_Category] PRIMARY KEY CLUSTERED 
    109                                      (
    110                                          [Id] ASC
    111                                      )WITH (PAD_INDEX = OFF, STATISTICS_NORECOMPUTE = OFF, IGNORE_DUP_KEY = OFF, ALLOW_ROW_LOCKS = ON, ALLOW_PAGE_LOCKS = ON) ON [PRIMARY]
    112                                      ) ON [PRIMARY];");
    113 
    114                 SqlHelper.ExecuteNonQuery(sb.ToString());
    115             }
    116             catch (Exception ex)
    117             {
    118                 logger.Error("初始化数据库InitCategoryTable 创建失败", ex);
    119                 throw ex;
    120             }
    121             #endregion Create
    122 
    123         }
    124     }
    125 }
    View Code
      1     public interface IRepository<T> where T : class//, new()
      2     {
      3         void Save(T entity);
      4         void SaveList(List<T> entity);
      5     }
      6 
      7     public class CommodityRepository //: IRepository<Commodity>
      8     {
      9         private Logger logger = new Logger(typeof(CommodityRepository));
     10 
     11         public void SaveList(List<Commodity> commodityList)
     12         {
     13             if (commodityList == null || commodityList.Count == 0) return;
     14             IEnumerable<IGrouping<string, Commodity>> group = commodityList.GroupBy<Commodity, string>(c => GetTableName(c));
     15 
     16             foreach (var data in group)
     17             {
     18                 SqlHelper.InsertList<Commodity>(data.ToList(), data.Key);
     19             }
     20         }
     21 
     22         private string GetTableName(Commodity commodity)
     23         {
     24             return string.Format("JD_Commodity_{0}", (commodity.ProductId % 30 + 1).ToString("000"));
     25         }
     26 
     27         /// <summary>
     28         /// 保存文本记录
     29         /// </summary>
     30         /// <param name="commodityList"></param>
     31         /// <param name="category"></param>
     32         /// <param name="page"></param>
     33         public void SaveList(List<Commodity> commodityList, Category category, int page)
     34         {
     35             StreamWriter sw = null;
     36             try
     37             {
     38                 string recordFileName = string.Format($"{category.CategoryLevel}/{category.ParentCode}/{category.Id}/{page}.txt");
     39                 string totolPath = Path.Combine(Constant.DataPath, recordFileName);
     40                 if (!Directory.Exists(Path.GetDirectoryName(totolPath)))
     41                 {
     42                     Directory.CreateDirectory(Path.GetDirectoryName(totolPath));
     43                     sw = File.CreateText(totolPath);
     44                 }
     45                 else
     46                 {
     47                     sw = File.AppendText(totolPath);
     48                 }
     49                 sw.WriteLine(JsonConvert.SerializeObject(commodityList));
     50             }
     51             catch (Exception e)
     52             {
     53                 logger.Error("CommodityRepository.SaveList出现异常", e);
     54             }
     55             finally
     56             {
     57                 if (sw != null)
     58                 {
     59                     sw.Flush();
     60                     sw.Close();
     61                     sw.Dispose();
     62                 }
     63             }
     64         }
     65     }
     66 
     67 
     68     public class CategoryRepository //: IRepository<Commodity>
     69     {
     70         private Logger logger = new Logger(typeof(CategoryRepository));
     71 
     72         public void Save(List<Category> categoryList)
     73         {
     74             SqlHelper.InsertList<Category>(categoryList, "Category");
     75             new Action<List<Category>>(SaveList).BeginInvoke(categoryList, null, null);
     76         }
     77 
     78         /// <summary>
     79         /// 根据Level获取类别列表
     80         /// </summary>
     81         /// <param name="level"></param>
     82         /// <returns></returns>
     83         public List<Category> QueryListByLevel(int level)
     84         {
     85             string sql = string.Format("SELECT * FROM category WHERE categorylevel={0};", level);
     86             return SqlHelper.QueryList<Category>(sql);
     87         }
     88 
     89 
     90         /// <summary>
     91         /// Writes the category list as a single JSON line to a timestamped
     92         /// "{yyyyMMddHHmmss}_Category.txt" file under Constant.DataPath.
     93         /// </summary>
     94         /// <remarks>
     95         /// Best-effort: every exception, including one raised while the writer is
     96         /// flushed and closed, is logged through the class logger and not rethrown.
     97         /// The timestamp is taken from DateTime.Now, so the file name follows the
     98         /// machine's local clock.
     99         /// </remarks>
    100         /// <param name="categoryList">Categories to serialize with JsonConvert.</param>
    101         public void SaveList(List<Category> categoryList)
    102         {
    103             try
    104             {
    105                 // The name has one-second resolution, so saves within the same second share a file.
    106                 string recordFileName = string.Format("{0}_Category.txt", DateTime.Now.ToString("yyyyMMddHHmmss"));
    107                 string totalPath = Path.Combine(Constant.DataPath, recordFileName);
    108 
    109                 // CreateDirectory does nothing when the directory already exists,
    110                 // so no separate Directory.Exists check is needed.
    111                 Directory.CreateDirectory(Path.GetDirectoryName(totalPath));
    112 
    113                 // AppendText creates the file when it is missing, which covers both
    114                 // the first write and a repeated save within the same second.
    115                 // Disposing the writer inside the try means a failure while flushing
    116                 // or closing is also handled by the catch below.
    117                 using (StreamWriter writer = File.AppendText(totalPath))
    118                 {
    119                     writer.WriteLine(JsonConvert.SerializeObject(categoryList));
    120                 }
    121             }
    122             catch (Exception e)
    123             {
    124                 logger.Error("CategoryRepository.SaveList出现异常", e);
    125             }
    126         }
    127     }
    128 
    129     /// <summary>Appends warning records about a category to text files under Constant.DataPath.</summary>
    130     public class WarnRepository //: IRepository<Commodity>
    131     {
    132         private readonly Logger logger = new Logger(typeof(WarnRepository));
    133 
    134         /// <summary>
    135         /// Appends <paramref name="msg"/> and the category serialized as JSON to
    136         /// "warn/{CategoryLevel}/{ParentCode}/{Id}.txt" under Constant.DataPath.
    137         /// </summary>
    138         /// <remarks>
    139         /// Best-effort: any exception, including one from flushing or closing the writer, is logged and not rethrown.
    140         /// </remarks>
    141         /// <param name="category">Category the warning refers to; its CategoryLevel, ParentCode and Id form the file path.</param>
    142         /// <param name="msg">Warning text written on its own line before the JSON line.</param>
    143         public void SaveWarn(Category category, string msg)
    144         {
    145             try
    146             {
    147                 string recordFileName = string.Format("warn/{0}/{1}/{2}.txt", category.CategoryLevel, category.ParentCode, category.Id);
    148                 string totalPath = Path.Combine(Constant.DataPath, recordFileName);
    149 
    150                 // CreateDirectory does nothing when the directory already exists.
    151                 Directory.CreateDirectory(Path.GetDirectoryName(totalPath));
    152                 // AppendText creates the file when it is missing; disposing inside the try lets the catch see close failures.
    153                 using (StreamWriter writer = File.AppendText(totalPath))
    154                 {
    155                     writer.WriteLine(msg);
    156                     // Serialize exactly once so the line holds the category's JSON object, not a JSON string literal wrapping it.
    157                     writer.WriteLine(JsonConvert.SerializeObject(category));
    158                 }
    159             }
    160             catch (Exception e)
    161             {
    162                 logger.Error("SaveWarn出现异常", e);
    163             }
    164         }
    165     }
    View Code

       京东爬虫使用说明:

    • 配置app.config的DataPath(这个是文本数据存储的地址);
    • 创建一个SQL Server数据库,配置app.config的数据库连接;
    • 运行项目,可以直接在VS中运行,或者使用Crawler\bin\Debug下面的Crawler.exe;
    • 数据库表结构是自动创建的,控制台需要输入Y才能开始初始化数据库结构,然后进行数据抓取;
    • 看看控制台有无提示异常,看看数据库的数据即可。

    类别大概是1300+ 商品是800W+

  • 相关阅读:
    Python 集合 深浅copy
    python基础(基础数据类型)
    python基础一
    Asp.net获取网站绝对路径的几种方法
    Ajax请求被缓存的几种处理方式
    说说字符编码
    linux学习记录
    mysql基础
    【Android开发入门】关于ListView中按钮监听器设置的解决方案
    线程同步小结
  • 原文地址:https://www.cnblogs.com/shangec/p/9900030.html
Copyright © 2020-2023  润新知