using HtmlAgilityPack;
using Newtonsoft.Json;
using Newtonsoft.Json.Linq;
using System.Collections;
using System.IO;

namespace EasySpider
{
    /// <summary>
    /// Extracts the nested topic list found under the
    /// <c>zh-topic-organize-page-children</c> element of an HTML page and appends,
    /// for every fourth-level topic link, its chain of ancestor topics to a file
    /// as one JSON array per line.
    /// </summary>
    public class ReadZhihu
    {
        // Destination of the extracted topic chains.
        // NOTE(review): "D:" without a separator is a drive-relative path; a backslash
        // may have been lost from this literal (e.g. D:\...) - confirm before running.
        private const string StructureFilePath = @"D:学科Struct.json";

        /// <summary>
        /// Parses <paramref name="document"/> as HTML, finds every topic anchor nested
        /// four list levels deep, and appends each anchor's ancestor chain
        /// (outermost first) to the structure file.
        /// </summary>
        /// <param name="document">Raw HTML of the page to process.</param>
        public static void FormatDocument(string document)
        {
            HtmlDocument htmlDocument = new HtmlDocument();
            htmlDocument.LoadHtml(document);

            HtmlNode container = htmlDocument.DocumentNode.SelectSingleNode(
                ".//div[@id=\"zh-topic-organize-page-children\"]");
            if (container == null)
            {
                // Page does not contain the topic tree; nothing to extract.
                return;
            }

            // HtmlAgilityPack returns null (not an empty collection) when nothing matches.
            HtmlNodeCollection level4 = container.SelectNodes(
                ".//ul/li/ul/li/ul/li/ul/li/a[@name=\"topic\"]");
            if (level4 == null)
            {
                return;
            }

            foreach (HtmlNode item in level4)
            {
                // GetParentNode pushes leaf first, so popping yields outermost-first order.
                Stack chain = new Stack();
                GetParentNode(item, ref chain);

                JArray structure = new JArray();
                while (chain.Count > 0)
                {
                    structure.Add(chain.Pop());
                }

                WriteData(structure, StructureFilePath);
            }
        }

        /// <summary>
        /// Pushes <paramref name="node"/> and then, repeatedly, the first anchor found
        /// under the node three levels above the current anchor, onto
        /// <paramref name="s"/> as { topic, url } objects.
        /// </summary>
        /// <remarks>
        /// The walk stops when an anchor has an empty href or empty inner HTML, when
        /// the anchor's topic equals the topic currently on top of the stack (the
        /// lookup landed on the same entry again), or when there is no node three
        /// levels up or it contains no matching anchor.
        /// </remarks>
        /// <param name="node">Anchor to start from; a null node is ignored.</param>
        /// <param name="s">Stack receiving the <see cref="JObject"/> entries.</param>
        public static void GetParentNode(HtmlNode node, ref Stack s)
        {
            HtmlNode current = node;
            while (current != null)
            {
                string url = current.GetAttributeValue("href", string.Empty);
                string topic = current.InnerHtml;
                if (string.IsNullOrEmpty(url) || string.IsNullOrEmpty(topic))
                {
                    return;
                }

                if (s.Count > 0 && ((JObject)s.Peek())["topic"].ToString() == topic)
                {
                    return;
                }

                s.Push(GenereateObject(url, topic));

                // Three levels up: for the anchors selected in FormatDocument this is
                // a -> li -> ul -> enclosing li (or the container at the top level).
                HtmlNode parent = current.ParentNode;
                HtmlNode grandParent = parent == null ? null : parent.ParentNode;
                HtmlNode enclosing = grandParent == null ? null : grandParent.ParentNode;
                if (enclosing == null)
                {
                    return;
                }

                // A null result ends the loop, matching the original's final "else return".
                current = enclosing.SelectSingleNode(".//a[1]");
            }
        }

        /// <summary>
        /// Builds the JSON object stored for one topic.
        /// </summary>
        /// <param name="url">Value for the "url" property.</param>
        /// <param name="topic">Value for the "topic" property.</param>
        /// <returns>A <see cref="JObject"/> with "topic" and "url" properties, in that order.</returns>
        public static JObject GenereateObject(string url, string topic)
        {
            JObject obj = new JObject();
            obj.Add("topic", topic);
            obj.Add("url", url);
            return obj;
        }

        /// <summary>
        /// Serializes <paramref name="obj"/> to JSON and appends it as one line to
        /// <paramref name="fileName"/>.
        /// </summary>
        /// <param name="obj">Object to serialize with <see cref="JsonConvert"/>.</param>
        /// <param name="fileName">Path of the file to append to.</param>
        public static void WriteData(object obj, string fileName)
        {
            string json = JsonConvert.SerializeObject(obj);

            // Append mode creates the file when it is missing, so no separate
            // File.Create call is needed; the previous one left its FileStream
            // undisposed, which kept the file open while the writer below tried
            // to open the same path.
            using (StreamWriter sw = new StreamWriter(fileName, true))
            {
                sw.WriteLine(json);
            }
        }
    }
}
namespace FormatDocument
{
    // The using directives are placed inside the namespace so this unit stays valid
    // even when it follows another namespace declaration in the same file.
    using Newtonsoft.Json;
    using Newtonsoft.Json.Linq;
    using System;
    using System.Collections.Generic;
    using System.IO;
    using System.Linq;
    using System.Text;
    using System.Threading.Tasks;

    /// <summary>
    /// Reads a file of JSON arrays (one per line), truncates each array to its first
    /// four elements, and appends every truncated array not already present to an
    /// output file.
    /// </summary>
    class Program
    {
        // Input file, one JSON array per line.
        // NOTE(review): "D:" without a separator is a drive-relative path; a backslash
        // may have been lost from this literal - confirm.
        private const string InputPath = @"D:学科Struct.json";

        // Output file for the de-duplicated four-element arrays.
        // NOTE(review): this looks like D:\topic.json whose "\t" was turned into
        // whitespace - confirm and restore if so.
        private const string OutputPath = @"D: opic.json";

        /// <summary>
        /// Processes every line of the input file and logs progress to the console.
        /// </summary>
        /// <param name="args">Command-line arguments (unused).</param>
        static void Main(string[] args)
        {
            int lineNumber = 0;
            int inserted = 0;

            // Dispose the reader so the input file handle is released deterministically.
            using (StreamReader sr = new StreamReader(InputPath, Encoding.UTF8))
            {
                string line;
                while ((line = sr.ReadLine()) != null)
                {
                    Console.WriteLine("------------------readline: {0}------------------", ++lineNumber);

                    // Keep the running total here; a by-value counter would stay at
                    // zero and every insert would be logged as row 1.
                    inserted = AppendRow(line, OutputPath, inserted);
                }
            }
        }

        /// <summary>
        /// Appends the first four elements of the JSON array in <paramref name="row"/>
        /// to <paramref name="fileName"/> unless that text is already in the file.
        /// </summary>
        /// <param name="row">One line containing a JSON array.</param>
        /// <param name="fileName">Path of the output file.</param>
        /// <param name="j">Number of rows inserted so far; used only for the log message.</param>
        public static void WriteData(string row, string fileName, int j)
        {
            AppendRow(row, fileName, j);
        }

        // Performs the work of WriteData and returns the updated insert count so
        // callers can carry it across rows.
        private static int AppendRow(string row, string fileName, int insertedSoFar)
        {
            Console.WriteLine("-----------write data begin -----------");

            // A blank line is not a JSON array; skip it instead of failing in Parse.
            if (string.IsNullOrWhiteSpace(row))
            {
                return insertedSoFar;
            }

            JArray item = JArray.Parse(row);
            if (item.Count < 4)
            {
                return insertedSoFar;
            }

            JArray outArray = new JArray();
            for (int i = 0; i < 4; i++)
            {
                outArray.Add(item[i]);
            }

            string json = JsonConvert.SerializeObject(outArray);

            // On the first run the output file may not exist yet; treat it as empty.
            string existing = File.Exists(fileName) ? File.ReadAllText(fileName) : string.Empty;
            if (existing.Contains(json))
            {
                return insertedSoFar;
            }

            using (StreamWriter sw = File.AppendText(fileName))
            {
                Console.WriteLine("-----------insert {0} row -----------", ++insertedSoFar);
                sw.WriteLine(json);
            }

            return insertedSoFar;
        }
    }
}