using HtmlAgilityPack;
using Newtonsoft.Json;
using Newtonsoft.Json.Linq;
using System;
using System.Collections;
using System.Collections.Generic;
using System.IO;

namespace ReadZhihuThread
{
    /// <summary>
    /// Extracts topic paths (arrays of <c>{"topic", "url"}</c> objects) from the HTML of a
    /// topic-organize page and appends them to text files, one JSON array per line.
    /// </summary>
    public class ReadZhihu
    {
        // XPath attribute values use single quotes so the expressions are valid C# string literals.
        private const string RootContainerXPath = ".//div[@id='zh-topic-organize-page-children']";
        private const string Level4TopicXPath = ".//ul/li/ul/li/ul/li/ul/li/a[@name='topic']";
        private const string ManageItemXPath = ".//div[@class='zm-topic-manage-item-inner']";
        private const string ParentTreeXPath = ".//div[@class='zm-topic-tree']/ul";
        private const string ChildTopicXPath = ".//li/a[@name='topic']";
        private const string FirstAnchorXPath = ".//a[1]";

        // Number of buffered paths after which FormatDocument flushes to disk.
        private const int FlushThreshold = 100;

        // Number of leading path elements kept by WriteData(ref List<JArray>, string).
        private const int OutputDepth = 4;

        // Debug dump of the parent chains written by ReadSubTopic.
        // NOTE(review): this is a drive-relative path; it possibly was meant to be D:\parentNode.json - confirm.
        private const string ParentDebugDumpPath = @"D:parentNode.json";

        /// <summary>
        /// Finds every anchor nested four list levels deep under the topic container, builds the
        /// root-to-leaf path for each, and appends the paths to <paramref name="outPath"/>.
        /// </summary>
        /// <param name="document">HTML text of the page.</param>
        /// <param name="outPath">File that receives one JSON array per line.</param>
        public static void FormatDocument(string document, string outPath)
        {
            Console.WriteLine("Read document start");
            HtmlDocument htmlDocument = new HtmlDocument();
            htmlDocument.LoadHtml(document);

            var singleNode = htmlDocument.DocumentNode.SelectSingleNode(RootContainerXPath);
            if (singleNode == null)
            {
                Console.WriteLine("topic container not found");
                return;
            }

            // HtmlAgilityPack returns null (not an empty collection) when nothing matches.
            var level4 = singleNode.SelectNodes(Level4TopicXPath);
            if (level4 == null)
            {
                return;
            }

            List<JArray> list = new List<JArray>();
            int i = 0;
            foreach (var item in level4)
            {
                Console.WriteLine("select nodes: {0}", i++);
                Stack s = new Stack();
                GetParentNode(item, ref s);

                // A fresh array per anchor: sharing one array across iterations would make every
                // buffered entry the same ever-growing list.
                list.Add(GenerateJArry(s));

                if (list.Count > FlushThreshold)
                {
                    WriteData(ref list, outPath);
                }
            }

            if (list.Count != 0)
            {
                WriteData(ref list, outPath);
            }
        }

        /// <summary>
        /// Pushes the topic object for <paramref name="node"/> onto <paramref name="s"/>, then keeps
        /// climbing: from the current anchor it goes three parents up and continues with the first
        /// anchor found there. Climbing stops when an anchor has no href or no inner HTML, when its
        /// topic equals the topic on top of the stack, or when no further anchor is found.
        /// </summary>
        /// <param name="node">Anchor to start from; a null node is ignored.</param>
        /// <param name="s">Stack receiving <see cref="JObject"/> entries, leaf first.</param>
        public static void GetParentNode(HtmlNode node, ref Stack s)
        {
            HtmlNode current = node;
            while (current != null)
            {
                string url = current.GetAttributeValue("href", string.Empty);
                string topic = current.InnerHtml;
                if (string.IsNullOrEmpty(url) || string.IsNullOrEmpty(topic))
                {
                    return;
                }

                if (s.Count > 0 && ((JObject)s.Peek())["topic"].ToString() == topic)
                {
                    return;
                }

                s.Push(GenereateObject(url, topic));

                HtmlNode ancestor = ThirdAncestor(current);
                current = ancestor == null ? null : ancestor.SelectSingleNode(FirstAnchorXPath);
            }
        }

        // Returns the node three parent links above the given node, or null if the chain is shorter.
        private static HtmlNode ThirdAncestor(HtmlNode node)
        {
            HtmlNode ancestor = node;
            for (int step = 0; step < 3 && ancestor != null; step++)
            {
                ancestor = ancestor.ParentNode;
            }

            return ancestor;
        }

        /// <summary>Creates a JSON object with the properties "topic" and "url".</summary>
        /// <param name="url">Value stored under "url".</param>
        /// <param name="topic">Value stored under "topic".</param>
        /// <returns>The new object.</returns>
        public static JObject GenereateObject(string url, string topic)
        {
            JObject obj = new JObject();
            obj.Add("topic", topic);
            obj.Add("url", url);
            return obj;
        }

        /// <summary>
        /// Appends the first four elements of every path that has at least four elements to
        /// <paramref name="fileName"/>, skipping JSON text the file already contains, then clears
        /// <paramref name="list"/>. The file is created if it does not exist.
        /// </summary>
        /// <param name="list">Buffered paths; emptied on return.</param>
        /// <param name="fileName">Output file path.</param>
        public static void WriteData(ref List<JArray> list, string fileName)
        {
            Console.WriteLine("write data");

            // Read the current content once instead of once per item. A missing file counts as
            // empty; File.AppendText below creates it, so no undisposed File.Create handle is left
            // behind to block later reads.
            string existing = File.Exists(fileName) ? File.ReadAllText(fileName) : string.Empty;

            // JSON lines appended during this call, so they are not written twice either.
            HashSet<string> appended = new HashSet<string>(StringComparer.Ordinal);

            using (StreamWriter sw = File.AppendText(fileName))
            {
                foreach (var item in list)
                {
                    if (item.Count < OutputDepth)
                    {
                        continue;
                    }

                    JArray outArray = new JArray();
                    for (int i = 0; i < OutputDepth; i++)
                    {
                        outArray.Add(item[i]);
                    }

                    string json = JsonConvert.SerializeObject(outArray);
                    if (!existing.Contains(json) && appended.Add(json))
                    {
                        sw.WriteLine(json);
                    }
                }
            }

            list.Clear();
        }

        /// <summary>
        /// Reads the two "zm-topic-manage-item-inner" sections of the page (parent chains first,
        /// child tree second), reduces the child paths to those not contained in another path, and
        /// appends every parent-chain/child-path combination to <paramref name="outputPath"/>.
        /// The parent chains are also dumped to a fixed debug file.
        /// </summary>
        /// <param name="document">HTML text of the page.</param>
        /// <param name="outputPath">File that receives one JSON array per line.</param>
        public static void ReadSubTopic(string document, string outputPath)
        {
            Console.WriteLine("read subject topic start");
            HtmlDocument htmlDocument = new HtmlDocument();
            htmlDocument.LoadHtml(document);

            var parentChildNodes = htmlDocument.DocumentNode.SelectNodes(ManageItemXPath);
            if (parentChildNodes == null || parentChildNodes.Count != 2)
            {
                Console.WriteLine("this code has a bug");
                return;
            }

            List<JArray> listParent = ReadParentChains(parentChildNodes[0]);

            // debug parent nodes
            WriteData(listParent, ParentDebugDumpPath);

            List<string> childlist = ReadChildPaths(parentChildNodes[1]);

            Console.Write("**********************distinct list");
            RemoveContainedPaths(childlist);

            WriteData(JoinParentAndChildPaths(listParent, childlist), outputPath);
        }

        // Builds one topic chain per <ul> under a "zm-topic-tree" div of the parent section.
        private static List<JArray> ReadParentChains(HtmlNode parentNode)
        {
            List<JArray> chains = new List<JArray>();
            var parentNodes = parentNode.SelectNodes(ParentTreeXPath);
            if (parentNodes == null)
            {
                return chains;
            }

            int parentLevelCount = 0;
            foreach (var item in parentNodes)
            {
                Console.WriteLine("deal with parent {0} level data", ++parentLevelCount);
                JArray array = new JArray();
                HtmlNode firstItem = item.FirstChild;
                GetChildNode(firstItem == null ? null : firstItem.FirstChild, ref array);
                chains.Add(array);
            }

            return chains;
        }

        // Serializes the root-to-leaf path of every topic anchor in the child section.
        private static List<string> ReadChildPaths(HtmlNode childNode)
        {
            Console.WriteLine("get all child nodes");
            List<string> paths = new List<string>();
            var nodes = childNode.SelectNodes(ChildTopicXPath);
            if (nodes == null)
            {
                return paths;
            }

            int i = 0;
            foreach (var item in nodes)
            {
                Stack s = new Stack();
                Console.WriteLine("deal with {0} level", i++);
                GetParentNode(item, ref s);
                Console.WriteLine("generate jarry {0} level", i);
                paths.Add(JsonConvert.SerializeObject(GenerateJArry(s)));
            }

            return paths;
        }

        // Removes, in place, every serialized path whose content (without the outer brackets) occurs
        // inside another path. Exact duplicates are collapsed to one entry first; comparing them
        // against each other would otherwise remove both copies.
        private static void RemoveContainedPaths(List<string> paths)
        {
            HashSet<string> seen = new HashSet<string>(StringComparer.Ordinal);
            List<string> unique = new List<string>(paths.Count);
            foreach (string path in paths)
            {
                if (seen.Add(path))
                {
                    unique.Add(path);
                }
            }

            // Compare against an unchanging snapshot. Removing from the back keeps the indices of
            // the not-yet-visited entries aligned with the snapshot.
            string[] snapshot = unique.ToArray();
            for (int j = snapshot.Length - 1; j > -1; j--)
            {
                Console.WriteLine("distinct {0} level data", j);
                string inner = snapshot[j].TrimStart('[').TrimEnd(']');
                for (int k = snapshot.Length - 1; k > -1; k--)
                {
                    if (j == k)
                    {
                        continue;
                    }

                    if (snapshot[k].Contains(inner))
                    {
                        unique.RemoveAt(j);
                        break;
                    }
                }
            }

            paths.Clear();
            paths.AddRange(unique);
        }

        // Produces one record per (parent chain, child path) pair: a copy of the parent chain
        // followed by the elements of the child path.
        private static List<JArray> JoinParentAndChildPaths(List<JArray> parents, List<string> children)
        {
            // Parse each child path once rather than once per parent chain.
            List<JArray> parsedChildren = new List<JArray>(children.Count);
            foreach (string childRecord in children)
            {
                parsedChildren.Add(JArray.Parse(childRecord));
            }

            List<JArray> joined = new List<JArray>();
            foreach (JArray parent in parents)
            {
                foreach (JArray child in parsedChildren)
                {
                    JArray record = (JArray)parent.DeepClone();
                    foreach (JToken element in child)
                    {
                        // Clone explicitly so the parsed child path is never shared between records.
                        record.Add(element.DeepClone());
                    }

                    Console.WriteLine("insert one record:{0}", JsonConvert.SerializeObject(record));
                    joined.Add(record);
                }
            }

            return joined;
        }

        /// <summary>Appends every string in <paramref name="list"/> as a line to <paramref name="outputPath"/>.</summary>
        /// <param name="list">Lines to append; null or empty leaves the file untouched.</param>
        /// <param name="outputPath">Output file path.</param>
        public static void WriteData(List<string> list, string outputPath)
        {
            if (list == null || list.Count == 0)
            {
                return;
            }

            // One writer for the whole batch instead of reopening the file per line.
            using (StreamWriter sw = File.AppendText(outputPath))
            {
                foreach (var item in list)
                {
                    sw.WriteLine(item);
                }
            }
        }

        /// <summary>Appends every array in <paramref name="list"/> as one JSON line to <paramref name="outputPath"/>.</summary>
        /// <param name="list">Arrays to serialize; null or empty leaves the file untouched.</param>
        /// <param name="outputPath">Output file path.</param>
        public static void WriteData(List<JArray> list, string outputPath)
        {
            if (list == null || list.Count == 0)
            {
                return;
            }

            // One writer for the whole batch instead of reopening the file per line.
            using (StreamWriter sw = File.AppendText(outputPath))
            {
                foreach (var item in list)
                {
                    sw.WriteLine(JsonConvert.SerializeObject(item));
                }
            }
        }

        /// <summary>
        /// Empties <paramref name="s"/> into a new array in pop order, i.e. the entry pushed last
        /// comes first.
        /// </summary>
        /// <param name="s">Stack to drain.</param>
        /// <returns>The popped entries as a JSON array.</returns>
        public static JArray GenerateJArry(Stack s)
        {
            JArray array = new JArray();
            while (s.Count > 0)
            {
                array.Add(s.Pop());
            }

            return array;
        }

        /// <summary>
        /// Appends the topic object for <paramref name="node"/> to <paramref name="array"/> and then
        /// follows the chain "second next sibling, its first child, that child's first child" for as
        /// long as every link exists and the reached node has both an href and inner HTML.
        /// </summary>
        /// <param name="node">Anchor to start from; a null node is ignored.</param>
        /// <param name="array">Array receiving the topic objects in visiting order.</param>
        public static void GetChildNode(HtmlNode node, ref JArray array)
        {
            HtmlNode current = node;
            while (current != null)
            {
                string url = current.GetAttributeValue("href", string.Empty);
                string topic = current.InnerHtml;
                if (string.IsNullOrEmpty(url) || string.IsNullOrEmpty(topic))
                {
                    return;
                }

                array.Add(GenereateObject(url, topic));

                HtmlNode sibling = current.NextSibling;
                HtmlNode nested = sibling == null ? null : sibling.NextSibling;
                HtmlNode firstItem = nested == null ? null : nested.FirstChild;
                current = firstItem == null ? null : firstItem.FirstChild;
            }
        }
    }
}
js
// ==UserScript==
// @name        知乎_话题_获取完整话题结构
// @namespace   zhihu
// @include     https://www.zhihu.com/topic/*/organize/entire
// @version     3
// @grant       none
// @description 知乎_话题_每隔1毫秒点击“加载更多”和“显示子话题”
// ==/UserScript==

// The metadata values above are left untranslated because the userscript manager reads them.
// @name:        "Zhihu topic: get the complete topic structure"
// @description: "Zhihu topic: click 'load more' and 'show subtopics' every 1 ms"
//
// The script repeatedly clicks the elements named "load" so the whole topic tree gets expanded.
// Two clickers run side by side, each on its own timer. They have distinct names here: declaring
// two functions with the same name makes the later one silently replace the earlier one.
// NOTE(review): the two clickers may have been meant as alternative versions of the script -
// confirm whether both timers are wanted.

// Number of clicks issued by clickitem(); only incremented, handy while debugging.
var count = 0;

/**
 * Clicks the "load" element with the smallest offsetLeft. When two candidates share the same
 * offsetLeft, one labelled "加载更多" ("load more") replaces a current pick labelled
 * "显示子话题" ("show subtopics"). Does nothing when no "load" element exists.
 */
function clickitem() {
    var items = document.getElementsByName("load");
    if (items.length === 0) {
        return; // nothing to click
    }

    var itemSel = items[0];
    for (var i = 1; i < items.length; i++) {
        var candidate = items[i];
        if (itemSel.offsetLeft > candidate.offsetLeft) {
            itemSel = candidate;
        } else if (itemSel.offsetLeft == candidate.offsetLeft &&
                   itemSel.text == "显示子话题" && candidate.text == "加载更多") {
            itemSel = candidate;
        }
    }

    count++;
    itemSel.click();
}

var sss = setInterval(clickitem, 2000);

// Handle of the optional 1 ms timer managed by Start(). Declared up front so that reading it
// before it was ever assigned no longer throws a ReferenceError.
var s1;

/**
 * Timer check run every 20 seconds: while the 2-second timer handle exists, the 1 ms timer is
 * cleared; otherwise the 1 ms timer is started.
 * NOTE(review): sss is assigned once and never reset, so only the first branch can run - confirm intent.
 */
function Start() {
    if (sss != null) {
        window.clearInterval(s1);
    } else {
        s1 = setInterval(clickitem, 1);
    }
}

var sta = setInterval(Start, 20000);

/**
 * Clicks the first "load" element, if there is one.
 */
function clickFirstItem() {
    var items = document.getElementsByName("load");
    if (items.length > 0) {
        items[0].click();
    }
}

// Separate handle so the Start() timer handle in `sta` is not overwritten.
var fastTimer = setInterval(clickFirstItem, 1);