using AnfleCrawler.Common;
using System;
using System.Collections.Concurrent;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Threading.Tasks;

namespace AnfleCrawler.DataAnalyzer
{
    /// <summary>
    /// Analyzer that seeds one URL per <c>#qrq option</c> value found on a fixed
    /// realestate.cei.gov.cn page, collects the table rows under <c>#str1</c> of
    /// every depth-1 page into an in-memory queue, and writes the queue to a text
    /// file whenever the lander raises its <c>Idle</c> event.
    /// </summary>
    public class ManualAnalyzer : AnalyzerBase
    {
        /// <summary>Page URL template; <c>{0}</c> is the value of the <c>rq</c> query parameter.</summary>
        private const string PageUrlFormat = "http://www.realestate.cei.gov.cn/traden/br2.aspx?rq={0}&lx=w6&r1=20140830";

        /// <summary>The <c>rq</c> value of the page that is fetched to discover all other values.</summary>
        private const string SeedDate = "20140601";

        /// <summary>
        /// Collected rows: the page date followed by the texts of the row's cells.
        /// A single-element array holding <see cref="Environment.NewLine"/> is
        /// appended after each processed page.
        /// </summary>
        private readonly ConcurrentQueue<string[]> _dict = new ConcurrentQueue<string[]>();

        /// <summary>
        /// Subscribes to the lander's <c>Idle</c> event, initializes the base analyzer,
        /// then downloads the seed page and pushes one depth-1 URL per option value.
        /// </summary>
        /// <param name="crawler">The crawler this analyzer is attached to.</param>
        public override void Init(PageCrawler crawler)
        {
            crawler.Lander.Idle += Lander_Idle;
            base.Init(crawler);

            var url = new Uri(string.Format(PageUrlFormat, SeedDate));
            var dom = Crawler.Lander.GetDocument(new PageContentHandler() { Url = url });
            foreach (var node in QueryNodes(dom.DocumentNode, "#qrq option"))
            {
                string val = node.GetAttributeValue("value", string.Empty);
                if (string.IsNullOrWhiteSpace(val))
                {
                    // An option without a value would produce "rq=" which can never be
                    // parsed as a date in AnalyzeInternal, so do not queue it at all.
                    continue;
                }

                Crawler.PushUrl(new Uri(string.Format(PageUrlFormat, val)), 1);
            }
        }

        /// <summary>
        /// Handler for the lander's <c>Idle</c> event: dumps every queued row to the
        /// output file as one comma-joined line, overwriting any previous content.
        /// </summary>
        void Lander_Idle(object sender, EventArgs e)
        {
            Crawler.OutWrite("Start step2...");
            App.LogInfo("Start step2...");

            // NOTE(review): "D:outdict.txt" is a drive-relative path (current directory
            // of drive D:), which looks like a lost backslash - confirm the intended path.
            using (var writer = new System.IO.StreamWriter(@"D:outdict.txt", false, Encoding.UTF8))
            {
                foreach (var set in _dict)
                {
                    writer.WriteLine(string.Join(",", set));
                }
            }
        }

        /// <summary>
        /// Processes a landed page. Only depth-1 pages are handled: the <c>rq</c> query
        /// parameter is parsed as the page date and every <c>tr</c> under <c>#str1</c>
        /// is queued as "date, cell texts...".
        /// </summary>
        /// <param name="current">The landed page to analyze.</param>
        /// <exception cref="ArgumentNullException">The URL has no <c>rq</c> parameter.</exception>
        /// <exception cref="FormatException">The <c>rq</c> parameter is not in <c>yyyyMMdd</c> form.</exception>
        protected override void AnalyzeInternal(PageLandEntity current)
        {
            Crawler.OutWrite("*Start step1...");
            var lander = Crawler.Lander;
            var pHandler = CreateContentHandler(current);
            switch (current.Depth)
            {
                case 1:
                    {
                        // The query value and the output column are machine-readable data,
                        // so they must not depend on the calendar of the current culture.
                        var invariant = System.Globalization.CultureInfo.InvariantCulture;

                        var query = System.Web.HttpUtility.ParseQueryString(current.Url.Query);
                        var dt = DateTime.ParseExact(query["rq"], "yyyyMMdd", invariant);
                        var dom = lander.GetDocument(pHandler);
                        var checkNode = QueryNode(dom.DocumentNode, "#str1");
                        if (checkNode == null || string.IsNullOrWhiteSpace(checkNode.InnerText))
                        {
                            // No data container, or an empty one: nothing to collect.
                            return;
                        }

                        checkNode.InnerHtml = CloseRowTags(checkNode.InnerHtml);
                        App.LogInfo("WTF CN:{0}", checkNode.InnerHtml);

                        string day = dt.ToString("yyyy-MM-dd", invariant);
                        int rowCount = 0;
                        foreach (var node in QueryNodes(checkNode, "tr"))
                        {
                            var x = new List<string>();
                            x.Add(day);
                            x.AddRange(QueryTexts(node, "td"));
                            _dict.Enqueue(x.ToArray());
                            // Count while iterating instead of enumerating the query twice.
                            rowCount++;
                        }

                        _dict.Enqueue(new string[] { Environment.NewLine });
                        Crawler.OutWrite("#Stop step1 {0} {1}", dt.ToShortDateString(), rowCount);
                    }
                    break;
            }
        }

        /// <summary>
        /// Inserts a closing <c>&lt;/tr&gt;</c> in front of every <c>&lt;tr</c> except the
        /// first one, so that consecutive row tags become sibling rows.
        /// </summary>
        /// <param name="html">Markup to repair.</param>
        /// <returns>
        /// The repaired markup, or <paramref name="html"/> unchanged when it contains no
        /// <c>&lt;tr</c>.
        /// </returns>
        private static string CloseRowTags(string html)
        {
            const string rowOpen = "<tr";

            int firstRow = html.IndexOf(rowOpen, StringComparison.Ordinal);
            if (firstRow < 0)
            {
                return html;
            }

            // Keep everything up to and including the first "<tr" untouched; this is
            // independent of whatever precedes the first row (whitespace, other tags).
            int restStart = firstRow + rowOpen.Length;
            return html.Substring(0, restStart)
                 + html.Substring(restStart).Replace(rowOpen, "</tr>" + rowOpen);
        }
    }
}