自己怎么了,改变自己能改变的,比以前好就可以了。下面是一个下载小说的小程序。
命名空间引入(需要导入包HtmlAgilityPack):
using System; using System.Collections.Generic; using System.IO; using System.Linq; using System.Net; using System.Net.Http; using System.Text; using System.Threading; using System.Threading.Tasks; using HtmlAgilityPack;
抓取内容:
// Output file that the text of every downloaded chapter is appended to.
// A verbatim string is required: in a regular literal "\3" is not a valid escape sequence.
static string m_strTextPath = @"D:\333.txt";

/// <summary>
/// Entry point: downloads the novel, prints any failure to the console and rethrows it,
/// then waits for a key press before exiting.
/// </summary>
static void Main(string[] args)
{
    try
    {
        LoadNovel();
    }
    catch (Exception e)
    {
        Console.WriteLine(e.ToString());
        throw;
    }

    Console.ReadKey();
}

/// <summary>
/// Downloads the chapter index page, collects the chapter links from it and appends
/// the paragraphs of every chapter page to <c>m_strTextPath</c>.
/// </summary>
/// <remarks>
/// Deliberately synchronous (not <c>async void</c>): the method awaits nothing, and an
/// exception thrown from an <c>async void</c> method would not reach the try/catch in Main.
/// </remarks>
/// <exception cref="InvalidOperationException">
/// The index page or a chapter page does not contain the expected nodes.
/// </exception>
static void LoadNovel()
{
    string l_strURL = "http://www.vodtw.com/html/book/34/34009/";

    using (WebClient wc = new WebClient())
    {
        // Relative addresses passed to DownloadString are resolved against BaseAddress.
        wc.BaseAddress = l_strURL;
        wc.Encoding = Encoding.GetEncoding("gb2312");

        HtmlDocument doc = new HtmlDocument();
        string html = wc.DownloadString("index.html");
        doc.LoadHtml(html);

        // The XPath queries yield null when nothing matches, so check before dereferencing.
        HtmlNode navNode1 = doc.DocumentNode.SelectSingleNode("/html/body/div[7]/div[5]/dl/dd/ul");
        HtmlNodeCollection CNodes1 = navNode1 == null ? null : navNode1.SelectNodes("child::li");
        if (CNodes1 == null)
        {
            throw new InvalidOperationException("Chapter list not found on index page: " + l_strURL + "index.html");
        }

        // Collect the href of the first child of every list item.
        List<string> list = new List<string>();
        foreach (HtmlNode item in CNodes1)
        {
            HtmlNode link = item.FirstChild;
            HtmlAttribute href = link == null ? null : link.Attributes["href"];
            if (href == null || string.IsNullOrEmpty(href.Value))
            {
                // List item without a link: nothing to download for it.
                continue;
            }

            list.Add(href.Value);
        }

        // Download every chapter and append its text to the output file.
        foreach (string l_str in list)
        {
            string html1 = wc.DownloadString(l_str);

            // Throttle: requests that are too frequent cause problems.
            Thread.Sleep(100);
            doc.LoadHtml(html1);

            // The chapter title is not fetched separately; it already appears in the content.
            HtmlNode navNode2 = doc.DocumentNode.SelectSingleNode("html/body/div[4]/div[4]");
            HtmlNodeCollection CNodes2 = navNode2 == null ? null : navNode2.SelectNodes("child::p");
            if (CNodes2 == null)
            {
                throw new InvalidOperationException("Chapter content not found on page: " + l_str);
            }

            // Buffer the chapter so the output file is opened once per chapter, not once per paragraph.
            StringBuilder chapterText = new StringBuilder();
            foreach (HtmlNode ddd in CNodes2)
            {
                // Paragraphs from this separator line onwards are not part of the chapter text.
                if (ddd.InnerText == "===========================")
                {
                    break;
                }

                // NOTE(review): paragraphs are separated by a single space, as in the original;
                // this may have been a line break lost in extraction -- confirm.
                chapterText.Append(ddd.InnerText).Append(" ");
            }

            if (chapterText.Length > 0)
            {
                File.AppendAllText(m_strTextPath, chapterText.ToString());
            }
        }
    }
}
后续需要校验抓取数据的完整性,并检查其中是否混入了杂乱的内容。