• Mytophome Deal


    using AnfleCrawler.Common;
    using System;
    using System.Collections.Generic;
    using System.Linq;
    using System.Text;
    using System.Threading.Tasks;
    
    namespace AnfleCrawler.DataAnalyzer
    {
        /// <summary>
        /// Analyzer for mytophome.com deal (sold-record) listing pages: walks the
        /// paged deal list, resolves each row's estate detail page, and persists
        /// the estate plus its transaction records.
        /// </summary>
        internal class Mytophome : AnalyzerBase
        {
            /// <summary>
            /// Parses the current page. Only depth 0 (the deal-list page) is
            /// handled; all other depths fall through and are ignored.
            /// </summary>
            /// <param name="current">The page being analyzed (URL + crawl depth).</param>
            protected override void AnalyzeInternal(PageLandEntity current)
            {
                var lander = Crawler.Lander;
                var pHandler = CreateContentHandler(current);
                switch (current.Depth)
                {
                    case 0:
                        {
                            var dom = lander.GetDocument(pHandler);
                            // Tag the pager container (parent of the <nobr> element) with a
                            // known id so DoPerPaging can locate it and enqueue the remaining pages.
                            var pagerNode = QueryNode(dom.DocumentNode, "nobr").ParentNode;
                            pagerNode.SetAttributeValue("id", PagingHack);
                            DoPerPaging(current, dom.DocumentNode, string.Format("#{0}", PagingHack));

                            foreach (var node in QueryNodes(dom.DocumentNode, ".deD_ctt li"))
                            {
                                var spans = QueryNodes(node, "span").ToArray();
                                if (spans.Length < 2)
                                {
                                    // Malformed row (e.g. header/separator item): skip instead of
                                    // throwing IndexOutOfRangeException on spans[1] below.
                                    continue;
                                }
                                var hUrl = GetHref(QueryNode(spans[1], "a"), current.Url);
                                // Rebuild a canonical detail URL from the estateId query parameter.
                                var query = System.Web.HttpUtility.ParseQueryString(hUrl.Query);
                                string estateId = query["estateId"];
                                hUrl = new Uri(string.Format("http://{0}/wiki/{1}/detail.html", hUrl.Authority, estateId));
                                Guid housesID;
                                try
                                {
                                    CheckHouses(hUrl, out housesID);
                                }
                                catch (HtmlNodeMissingException ex)
                                {
                                    // Detail page did not have the expected structure; log and
                                    // continue with the next row.
                                    App.LogError(ex, "EstateId={0} HousesUrl={1}", estateId, hUrl);
                                    continue;
                                }

                                var vals = spans.Select(p => p.InnerText.HtmlTrim()).ToArray();
                                if (vals.Length < 5)
                                {
                                    // Not enough cells to map to a listing record; skip instead
                                    // of indexing out of range below.
                                    continue;
                                }
                                // The last cell is the transaction date when it parses as one;
                                // otherwise the date is left unset.
                                DateTime? transactionDate = null;
                                DateTime parsed;
                                if (DateTime.TryParse(vals.Last(), out parsed))
                                {
                                    transactionDate = parsed;
                                }
                                // Rows come in two shapes: 6 cells (extra building-name cell at
                                // index 2) or 5 cells (no building name). In both shapes the
                                // trailing cells are area / total price / unit price, so a single
                                // index offset covers both.
                                int offset = vals.Length == 6 ? 1 : 0;
                                var listing = new HouselistingEntity()
                                {
                                    HousesID = housesID,
                                    TransactionDate = transactionDate,
                                    Area = string.Format("{0}平方", vals[2 + offset]),
                                    SoldPriceOrRent = string.Format("{0}万", vals[3 + offset]),
                                    UnitPriceOrLease = string.Format("{0}元/平方", vals[4 + offset]),
                                };
                                if (vals.Length == 6)
                                {
                                    listing.BuildingName = vals[2];
                                }
                                Repository.SaveHouselisting(listing);
                                Crawler.OutWrite("保存小区出售记录 {0}", housesID);
                            }
                        }
                        break;
                }
            }

            /// <summary>
            /// Loads (or creates and saves) the estate record behind
            /// <paramref name="housesUrl"/>. Propagates HtmlNodeMissingException
            /// from the node queries when the detail page lacks the expected nodes.
            /// </summary>
            /// <param name="housesUrl">Canonical estate detail URL.</param>
            /// <param name="housesID">
            /// Deterministic hash key derived from the URL; always assigned,
            /// even when the estate already exists.
            /// </param>
            private void CheckHouses(Uri housesUrl, out Guid housesID)
            {
                var pHandler = CreateContentHandler(new PageLandEntity()
                {
                    Url = housesUrl,
                    Depth = DataDepth.Houses
                });
                pHandler.AjaxBlocks.Add(HACK);
                var dom = Crawler.Lander.GetDocument(pHandler);
                var attrs = new AttributeFiller();

                attrs.Append(QueryTexts(dom.DocumentNode, ".xxjs_rbar_ct li"));

                housesID = GenHashKey(housesUrl.OriginalString);
                var bo = Crawler.Repository.LoadHouses(housesID);
                // A non-empty SiteID means this estate was already persisted; nothing to do.
                if (!string.IsNullOrEmpty(bo.SiteID))
                {
                    return;
                }
                bo.SiteID = "Mytophome.com";
                bo.PageUrl = housesUrl.OriginalString;
                bo.CityName = Crawler.Config.CityName;
                // Map the site's field labels (left) onto our entity's property names (right).
                attrs.FillEntity(bo, new Dictionary<string, string>()
                {
                    {"楼盘名称", "小区名称"},
                    {"楼盘地址", "小区地址"},
                    {"发展商", "开发商"},
                    {"物管公司", "物业公司"},
                    {"物管电话", "物业办公电话"},
                });
                MapMark(bo);
                Crawler.Repository.Save(bo);
                Crawler.OutWrite("保存楼盘 {0}", bo.小区名称);
            }
        }
    }
  • 相关阅读:
    c# -- 实现浏览功能(备忘)
    自己动手写中文分词解析器完整教程,并对出现的问题进行探讨和解决(附完整c#代码和相关dll文件、txt文件下载)
    爬虫技术 -- 进阶学习(九)使用HtmlAgilityPack获取页面链接(附c#代码及插件下载)
    爬虫技术 -- 进阶学习(八)模拟简单浏览器(附c#代码)
    爬虫技术 -- 进阶学习(七)简单爬虫抓取示例(附c#代码)
    c# -- 介绍File.AppendAllText 方法
    c# -- 解决FromsAuthentication上下文不存在
    c# -- Form1_Load()不被执行的三个解决方法
    爬虫技术 -- 基础学习(六)解析相对地址
    爬虫技术 -- 基础学习(五)解决页面编码识别(附c#代码)
  • 原文地址:https://www.cnblogs.com/Googler/p/4272703.html
Copyright © 2020-2023  润新知