• 处理父节点 子节点


    using HtmlAgilityPack;
    using Newtonsoft.Json;
    using Newtonsoft.Json.Linq;
    using System;
    using System.Collections;
    using System.Collections.Generic;
    using System.IO;
    
    namespace ReadZhihuThread
    {
        /// <summary>
        /// Reads topic hierarchies out of a saved topic-organize HTML page and writes the
        /// resulting topic paths to text files, one JSON array per line.
        /// </summary>
        public class ReadZhihu
        {
            // XPath attribute values are single-quoted so the queries stay valid C# string
            // literals; XPath treats single and double quotes identically.
            private const string ChildrenContainerXPath = ".//div[@id='zh-topic-organize-page-children']";
            private const string Level4TopicXPath = ".//ul/li/ul/li/ul/li/ul/li/a[@name='topic']";
            private const string ManageItemXPath = ".//div[@class='zm-topic-manage-item-inner']";
            private const string TopicTreeListXPath = ".//div[@class='zm-topic-tree']/ul";
            private const string ChildTopicXPath = ".//li/a[@name='topic']";
            private const string FirstAnchorXPath = ".//a[1]";

            // Debug dump of the parent paths produced by ReadSubTopic.
            // NOTE(review): this looks like it was meant to be D:\parentNode.json (a backslash
            // may have been lost when the listing was published) - confirm before relying on it.
            private const string ParentDebugPath = @"D:parentNode.json";

            // FormatDocument flushes its buffer to disk once it holds more than this many paths.
            private const int FlushThreshold = 100;

            // WriteData(ref ...) keeps exactly this many leading entries of every path.
            private const int PathDepth = 4;

            /// <summary>
            /// Collects the ancestor chain of every fourth-level topic link inside the
            /// children container and appends the chains to <paramref name="outPath"/>.
            /// </summary>
            /// <param name="document">HTML source of the page.</param>
            /// <param name="outPath">File the JSON lines are appended to.</param>
            public static void FormatDocument(string document, string outPath)
            {
                Console.WriteLine("Read document start");
                HtmlDocument htmlDocument = new HtmlDocument();
                htmlDocument.LoadHtml(document);

                var container = htmlDocument.DocumentNode.SelectSingleNode(ChildrenContainerXPath);
                if (container == null)
                {
                    // Without the container there is nothing to walk.
                    return;
                }

                // SelectNodes yields null (not an empty collection) when nothing matches.
                var level4 = container.SelectNodes(Level4TopicXPath);
                if (level4 == null)
                {
                    return;
                }

                List<JArray> list = new List<JArray>();
                int i = 0;
                foreach (var item in level4)
                {
                    Console.WriteLine("select nodes: {0}", i++);

                    Stack s = new Stack();
                    GetParentNode(item, ref s);

                    // A fresh array per link: sharing one array across iterations would make
                    // every buffered entry alias the same, ever-growing path.
                    list.Add(GenerateJArry(s));

                    if (list.Count > FlushThreshold)
                    {
                        WriteData(ref list, outPath);
                    }
                }

                if (list.Count != 0)
                {
                    WriteData(ref list, outPath);
                }
            }

            /// <summary>
            /// Pushes <paramref name="node"/> and then, recursively, the first anchor found
            /// under its great-grandparent onto <paramref name="s"/>. The walk stops when a node
            /// has no href or no inner HTML, when the topic equals the one on top of the stack,
            /// or when there is no further ancestor anchor.
            /// </summary>
            /// <param name="node">Anchor element to start from; null is ignored.</param>
            /// <param name="s">Stack receiving one JObject per visited anchor (start node first).</param>
            public static void GetParentNode(HtmlNode node, ref Stack s)
            {
                if (node == null)
                {
                    return;
                }

                string url = node.GetAttributeValue("href", string.Empty);
                string topic = node.InnerHtml;

                if (string.IsNullOrEmpty(url) || string.IsNullOrEmpty(topic))
                {
                    return;
                }

                // Seeing the same topic twice in a row means the ancestor lookup resolved to
                // the anchor that was just pushed; stop instead of recursing forever.
                if (s.Count > 0 && ((JObject)s.Peek())["topic"].ToString() == topic)
                {
                    return;
                }

                s.Push(GenereateObject(url, topic));

                HtmlNode parent = node.ParentNode;
                HtmlNode grandParent = parent == null ? null : parent.ParentNode;
                HtmlNode greatGrandParent = grandParent == null ? null : grandParent.ParentNode;
                if (greatGrandParent == null)
                {
                    return;
                }

                // Evaluate the query once and reuse the result.
                HtmlNode ancestorAnchor = greatGrandParent.SelectSingleNode(FirstAnchorXPath);
                if (ancestorAnchor != null)
                {
                    GetParentNode(ancestorAnchor, ref s);
                }
            }

            /// <summary>
            /// Builds the JSON object stored for one topic link.
            /// </summary>
            /// <param name="url">Value of the link's href attribute.</param>
            /// <param name="topic">Inner HTML of the link.</param>
            /// <returns>An object with the properties "topic" and "url", in that order.</returns>
            public static JObject GenereateObject(string url, string topic)
            {
                JObject obj = new JObject();
                obj.Add("topic", topic);
                obj.Add("url", url);
                return obj;
            }

            /// <summary>
            /// Appends the first four entries of every path that has at least four entries to
            /// <paramref name="fileName"/>, skipping lines the file already contains, and then
            /// clears <paramref name="list"/>.
            /// </summary>
            /// <param name="list">Buffered paths; emptied on return.</param>
            /// <param name="fileName">Target file; created when it does not exist.</param>
            public static void WriteData(ref List<JArray> list, string fileName)
            {
                Console.WriteLine("write data");

                if (list == null)
                {
                    return;
                }

                // Load the existing lines once instead of re-reading the file for every item.
                HashSet<string> knownLines = new HashSet<string>(StringComparer.Ordinal);
                if (File.Exists(fileName))
                {
                    foreach (string line in File.ReadAllLines(fileName))
                    {
                        knownLines.Add(line);
                    }
                }

                // File.AppendText creates a missing file, so no separate File.Create call is
                // needed; an undisposed File.Create stream would keep the file locked.
                using (StreamWriter sw = File.AppendText(fileName))
                {
                    foreach (JArray item in list)
                    {
                        if (item.Count < PathDepth)
                        {
                            continue;
                        }

                        JArray outArray = new JArray();
                        for (int i = 0; i < PathDepth; i++)
                        {
                            outArray.Add(item[i]);
                        }

                        string json = JsonConvert.SerializeObject(outArray);

                        // Add returns false for a line that is already on disk or was already
                        // written during this call.
                        if (knownLines.Add(json))
                        {
                            sw.WriteLine(json);
                        }
                    }
                }

                list.Clear();
            }

            /// <summary>
            /// Reads the two "zm-topic-manage-item-inner" sections of the page (the first is
            /// used as the parent section, the second as the child section), appends every
            /// remaining child path to every parent path and writes the combined paths to
            /// <paramref name="outputPath"/>.
            /// </summary>
            /// <param name="document">HTML source of the page.</param>
            /// <param name="outputPath">File the combined paths are appended to.</param>
            public static void ReadSubTopic(string document, string outputPath)
            {
                Console.WriteLine("read subject topic start");

                HtmlDocument htmlDocument = new HtmlDocument();
                htmlDocument.LoadHtml(document);

                var parentChildNodes = htmlDocument.DocumentNode.SelectNodes(ManageItemXPath);
                if (parentChildNodes == null || parentChildNodes.Count != 2)
                {
                    Console.WriteLine("this code has a bug");
                    return;
                }

                List<JArray> listParent = ReadParentPaths(parentChildNodes[0]);

                // debug parent nodes
                WriteData(listParent, ParentDebugPath);

                Console.WriteLine("get all child nodes");
                List<string> childlist = ReadChildPaths(parentChildNodes[1]);

                Console.Write("**********************distinct list");
                childlist = RemoveContainedPaths(childlist);

                // Parse every child path once; it is reused for each parent path below.
                List<JArray> childPaths = new List<JArray>();
                foreach (string childRecord in childlist)
                {
                    childPaths.Add(JArray.Parse(childRecord));
                }

                List<JArray> listAll = new List<JArray>();
                foreach (JArray parentPath in listParent)
                {
                    foreach (JArray childPath in childPaths)
                    {
                        JArray combined = (JArray)parentPath.DeepClone();
                        foreach (JToken element in childPath)
                        {
                            combined.Add(element);
                        }

                        Console.WriteLine("insert one record:{0}", JsonConvert.SerializeObject(combined));
                        listAll.Add(combined);
                    }
                }

                WriteData(listAll, outputPath);
            }

            // Builds one path per "zm-topic-tree" list found under the parent section.
            private static List<JArray> ReadParentPaths(HtmlNode parentSection)
            {
                List<JArray> listParent = new List<JArray>();
                var parentNodes = parentSection.SelectNodes(TopicTreeListXPath);
                if (parentNodes == null)
                {
                    return listParent;
                }

                int parentLevelCount = 0;
                foreach (var item in parentNodes)
                {
                    Console.WriteLine("deal with parent {0} level data", ++parentLevelCount);
                    JArray array = new JArray();
                    HtmlNode firstChild = item.FirstChild;

                    // GetChildNode ignores a null start node, leaving the array empty.
                    GetChildNode(firstChild == null ? null : firstChild.FirstChild, ref array);
                    listParent.Add(array);
                }

                return listParent;
            }

            // Serializes the ancestor chain of every topic link under the child section.
            private static List<string> ReadChildPaths(HtmlNode childSection)
            {
                List<string> childlist = new List<string>();
                var nodes = childSection.SelectNodes(ChildTopicXPath);
                if (nodes == null)
                {
                    return childlist;
                }

                int i = 0;
                foreach (var item in nodes)
                {
                    Stack s = new Stack();

                    Console.WriteLine("deal with {0} level", i++);
                    GetParentNode(item, ref s);
                    Console.WriteLine("generate jarry {0} level", i);
                    childlist.Add(JsonConvert.SerializeObject(GenerateJArry(s)));
                }

                return childlist;
            }

            // Keeps one copy of every path that is not textually contained in another path.
            // Exact duplicates are collapsed first; otherwise two identical paths would each
            // count as "contained in the other" and both would be dropped.
            private static List<string> RemoveContainedPaths(List<string> paths)
            {
                List<string> unique = new List<string>();
                HashSet<string> seen = new HashSet<string>(StringComparer.Ordinal);
                foreach (string path in paths)
                {
                    if (seen.Add(path))
                    {
                        unique.Add(path);
                    }
                }

                List<string> remaining = new List<string>();
                for (int j = 0; j < unique.Count; j++)
                {
                    Console.WriteLine("distinct {0} level data", j);

                    // Compare without the enclosing brackets so a path also matches when it
                    // occurs as part of a longer array.
                    string inner = unique[j].TrimStart('[').TrimEnd(']');
                    bool contained = false;
                    for (int k = 0; k < unique.Count; k++)
                    {
                        if (k != j && unique[k].Contains(inner))
                        {
                            contained = true;
                            break;
                        }
                    }

                    if (!contained)
                    {
                        remaining.Add(unique[j]);
                    }
                }

                return remaining;
            }

            /// <summary>
            /// Appends every string in <paramref name="list"/> to
            /// <paramref name="outputPath"/>, one per line.
            /// </summary>
            /// <param name="list">Lines to write; null or empty writes nothing.</param>
            /// <param name="outputPath">Target file; created when it does not exist.</param>
            public static void WriteData(List<string> list, string outputPath)
            {
                if (list == null || list.Count == 0)
                {
                    return;
                }

                // One writer for the whole batch instead of reopening the file per line.
                using (StreamWriter sw = File.AppendText(outputPath))
                {
                    foreach (string item in list)
                    {
                        sw.WriteLine(item);
                    }
                }
            }

            /// <summary>
            /// Appends every array in <paramref name="list"/> to
            /// <paramref name="outputPath"/> as one serialized JSON line each.
            /// </summary>
            /// <param name="list">Arrays to write; null or empty writes nothing.</param>
            /// <param name="outputPath">Target file; created when it does not exist.</param>
            public static void WriteData(List<JArray> list, string outputPath)
            {
                if (list == null || list.Count == 0)
                {
                    return;
                }

                // One writer for the whole batch instead of reopening the file per line.
                using (StreamWriter sw = File.AppendText(outputPath))
                {
                    foreach (JArray item in list)
                    {
                        sw.WriteLine(JsonConvert.SerializeObject(item));
                    }
                }
            }

            /// <summary>
            /// Drains <paramref name="s"/> into a new array in pop order, i.e. the entry pushed
            /// last comes first.
            /// </summary>
            /// <param name="s">Stack to drain; empty on return.</param>
            /// <returns>The popped entries as a JSON array.</returns>
            public static JArray GenerateJArry(Stack s)
            {
                JArray array = new JArray();
                while (s.Count > 0)
                {
                    array.Add(s.Pop());
                }
                return array;
            }

            /// <summary>
            /// Appends <paramref name="node"/> to <paramref name="array"/> and then follows
            /// node.NextSibling.NextSibling.FirstChild.FirstChild recursively for as long as
            /// that chain exists and the visited node has both an href and inner HTML.
            /// </summary>
            /// <param name="node">Element to start from; null is ignored.</param>
            /// <param name="array">Array receiving one JObject per visited element.</param>
            public static void GetChildNode(HtmlNode node, ref JArray array)
            {
                if (node == null)
                {
                    return;
                }

                string url = node.GetAttributeValue("href", string.Empty);
                string topic = node.InnerHtml;

                if (string.IsNullOrEmpty(url) || string.IsNullOrEmpty(topic))
                {
                    return;
                }

                array.Add(GenereateObject(url, topic));

                // NOTE(review): presumably the first sibling is a whitespace text node and the
                // second one the nested list - verify against the actual markup.
                HtmlNode next = node.NextSibling;
                HtmlNode secondNext = next == null ? null : next.NextSibling;
                HtmlNode nested = secondNext == null ? null : secondNext.FirstChild;
                HtmlNode nestedFirst = nested == null ? null : nested.FirstChild;
                if (nestedFirst != null)
                {
                    GetChildNode(nestedFirst, ref array);
                }
            }
        }
    }

    js

    // ==UserScript==
    // @name        知乎_话题_获取完整话题结构
    // @namespace   zhihu
    // @include     https://www.zhihu.com/topic/*/organize/entire
    // @version     3
    // @grant       none
    // @description 知乎_话题_每隔1毫秒点击“加载更多”和“显示子话题”
    // ==/UserScript==
    
    // Number of times clickitem has run.
    var count = 0;

    /**
     * Clicks one of the elements named "load": the one with the smallest
     * offsetLeft. When two candidates share the same offsetLeft, an element
     * whose text is "加载更多" replaces a current pick whose text is
     * "显示子话题". Does nothing when the page has no such element.
     */
    function clickitem() {
        var items = document.getElementsByName("load");
        var itemSel = null;
        var i;

        for (i = 0; i < items.length; i++) {
            var candidate = items[i];
            if (itemSel === null || candidate.offsetLeft < itemSel.offsetLeft) {
                itemSel = candidate;
            } else if (candidate.offsetLeft == itemSel.offsetLeft &&
                       itemSel.text == "显示子话题" && candidate.text == "加载更多") {
                itemSel = candidate;
            }
        }

        count++;

        // With no "load" element there is nothing to click; calling .click() on the
        // empty selection would throw a TypeError on every timer tick.
        if (itemSel !== null) {
            itemSel.click();
        }
    }
    // Slow clicker: calls clickitem every 2000 ms.
    var sss = setInterval(clickitem, 2000);

    // Handle of the 1 ms burst clicker; null while no burst is running. It has to be
    // declared: reading an undeclared variable throws a ReferenceError.
    var s1 = null;

    /**
     * Toggles the burst clicker: stops it when it is running, starts it otherwise.
     *
     * NOTE(review): the original tested `sss`, which is always set, so the start
     * branch could never run and the stop branch read the undeclared `s1`.
     * Toggling on `s1` is the assumed intent - confirm.
     */
    function Start() {
        if (s1 !== null) {
            window.clearInterval(s1);
            s1 = null;
        } else {
            s1 = setInterval(clickitem, 1);
        }
    }

    // Re-evaluate the toggle every 20000 ms.
    var sta = setInterval(Start, 20000);
     
     
     /**
      * Clicks the first element named "load", if the page has one.
      *
      * NOTE(review): an earlier function in this listing is also named clickitem;
      * if both are loaded as a single script, this later declaration is the one
      * that takes effect.
      */
     function clickitem() {
         var items = document.getElementsByName("load");

         // No "load" element left: items[0] would be undefined and .click() would
         // throw a TypeError on every tick.
         if (items.length === 0) {
             return;
         }

         items[0].click();
     }

     // Run the clicker every 1 ms.
     var sta = setInterval(clickitem, 1);
    I'm fine, it's ok
  • 相关阅读:
    基于小程序开发的藏书馆
    picker(级联)组件及组件封装经验
    秒杀组件开发-可实现多种倒计时功能
    async/await 与 generator、co 的对比
    nodejs项目总结
    小程序开发小结-线下服务器域名部署等
    性能提速:debounce(防抖)、throttle(节流/限频)
    vuex数据管理-数据模块化
    vue 项目其他规范
    vue路由管理-保留滚动位置功能、按需加载模块名自定义
  • 原文地址:https://www.cnblogs.com/skywss27/p/10009587.html
Copyright © 2020-2023  润新知