x
看到程序猿写爬虫的故事……一个无聊的周末……也想用 Jumony 爬点书,囤起来……仓鼠症……
using Ivony.Html;
using System;
using System.Collections.Generic;
using System.IO;
using System.Text;
using System.Windows.Forms;

namespace BookGet
{
    /// <summary>
    /// Main form of the book crawler. Clicking the button walks the paged book
    /// listing of the target site with the Jumony HTML parser, then downloads
    /// every listed book chapter by chapter into one text file per book.
    /// </summary>
    public partial class Form1 : Form
    {
        // Site root; prepended to the "start reading" link found on a book's main page.
        private const string SiteHost = "https://m.xxx.net/";

        // Folder URL of the paged book listing.
        private const string ListingBaseUrl = "https://m.xxx.net/wapsort/";

        // First listing page, relative to ListingBaseUrl.
        private const string FirstListingPage = "11_1.html";

        // Upper bound on listing pages visited in one run (the original loop's cap).
        private const int MaxListingPages = 20;

        // Output file pattern; {0} is the file-system-safe book name.
        // Verbatim string: in a regular literal "\y" and "\{" are invalid escape sequences.
        private const string BookPathFormat = @"D:\yuzhaiwu\{0}.txt";

        public Form1()
        {
            InitializeComponent();
        }

        /// <summary>
        /// Button handler: collects the book links from the listing pages,
        /// downloads each book, then shows a completion message.
        /// </summary>
        /// <param name="sender">The button that raised the event.</param>
        /// <param name="e">Event data (unused).</param>
        private void button1_Click(object sender, EventArgs e)
        {
            Dictionary<string, string> bookInfoDic = CollectBookLinks();

            foreach (KeyValuePair<string, string> item in bookInfoDic)
            {
                DownloadBook(item.Key, item.Value);
            }

            // Shown after the loop regardless of per-book failures, which are
            // reported individually by DownloadBook.
            MessageBox.Show("全部成功!");
        }

        /// <summary>
        /// Walks the listing pages, following the "next" link, and gathers every
        /// book found on them.
        /// </summary>
        /// <returns>
        /// Book name mapped to the href of its main page. When a name appears
        /// more than once, the first occurrence wins.
        /// </returns>
        private static Dictionary<string, string> CollectBookLinks()
        {
            var bookInfoDic = new Dictionary<string, string>();
            string nextUrl = ListingBaseUrl + FirstListingPage;
            int pagesVisited = 0;

            while (!string.IsNullOrEmpty(nextUrl) && pagesVisited < MaxListingPages)
            {
                pagesVisited++;
                string url = nextUrl;

                try
                {
                    var doc = new Ivony.Html.Parser.JumonyParser().LoadDocument(url);

                    // Every book link on this listing page.
                    IEnumerable<IHtmlElement> bookList = doc.Find("#nr_body div div.common-bookele h3 a");
                    foreach (var bookItem in bookList)
                    {
                        string bookName = bookItem.InnerText();
                        string bookUrl = bookItem.Attribute("href").Value();

                        if (!bookInfoDic.ContainsKey(bookName))
                        {
                            bookInfoDic.Add(bookName, bookUrl);
                        }
                    }

                    var domNext = doc.FindFirst("#nr_body div#page a.next");
                    nextUrl = domNext.Attribute("href").Value();

                    // NOTE(review): presumably the site marks the link with class
                    // "prev none" on the last page; check kept exactly as in the original.
                    if (domNext.Attribute("class").Value() == "prev none")
                    {
                        nextUrl = "";
                    }
                }
                catch (Exception ex)
                {
                    // Any failure on a listing page ends the walk; books gathered so
                    // far are still returned. The reason is logged instead of dropped.
                    Console.WriteLine(string.Format("{0}没有成功", url));
                    Console.WriteLine(ex.Message);
                    nextUrl = "";
                }
            }

            return bookInfoDic;
        }

        /// <summary>
        /// Downloads one book and appends its text to the book's output file.
        /// </summary>
        /// <remarks>
        /// The file is opened before downloading starts, and whatever text was
        /// fetched is written even when the download fails part-way. The writer
        /// and stream are disposed per book, so buffered text is always flushed.
        /// Failures are reported to the user with a message box.
        /// </remarks>
        /// <param name="bookName">Book title, used (sanitised) as the file name.</param>
        /// <param name="bookUrl">URL of the book's main page.</param>
        private static void DownloadBook(string bookName, string bookUrl)
        {
            var bookText = new StringBuilder();

            try
            {
                string bookPath = string.Format(BookPathFormat, ToSafeFileName(bookName));
                Directory.CreateDirectory(Path.GetDirectoryName(bookPath));

                // FileMode.Append creates the file when it does not exist yet, so no
                // separate File.Exists check is needed.
                using (var fs = new FileStream(bookPath, FileMode.Append))
                using (var sw = new StreamWriter(fs, Encoding.UTF8))
                {
                    try
                    {
                        AppendBookText(bookUrl, bookText);
                    }
                    finally
                    {
                        // Keep the chapters fetched so far even if a later page failed.
                        sw.WriteLine(bookText.ToString());
                    }
                }
            }
            catch (Exception ex)
            {
                MessageBox.Show(ex.Message);
            }
        }

        /// <summary>
        /// Opens the book's main page, follows its "start reading" link and then
        /// each "next" link, appending every page's title and text to
        /// <paramref name="bookText"/>.
        /// </summary>
        /// <param name="bookUrl">URL of the book's main page.</param>
        /// <param name="bookText">Buffer that receives the text; filled incrementally.</param>
        private static void AppendBookText(string bookUrl, StringBuilder bookText)
        {
            var mainPage = new Ivony.Html.Parser.JumonyParser().LoadDocument(bookUrl);

            var beginReadEle = mainPage.FindFirst("#novelMain a.btn");
            string nextTextPage = SiteHost + beginReadEle.Attribute("href").Value();

            while (!string.IsNullOrEmpty(nextTextPage))
            {
                var page = new Ivony.Html.Parser.JumonyParser().LoadDocument(nextTextPage, Encoding.UTF8, true);

                // The original author noted a fallback for pages whose whole markup
                // ends up inside <title>: parse the title's InnerHtml and read #nr1
                // from the result. It is not needed for the normal page layout.
                string bookTitlePage = page.FindFirst("#nr_title").InnerText();

                // Empty paragraphs are removed first; once "<p>" and "</p>" have been
                // replaced individually, "<p></p>" can no longer match.
                // NOTE(review): the separators are single spaces as found in the
                // source; if line breaks were intended, change them here.
                string bookTextPage = page.FindFirst("#nr1").InnerHtml()
                    .Replace("<p></p>", "")
                    .Replace("<p>", " ")
                    .Replace("</p>", " ");

                bookText.AppendFormat(" {0} {1}", bookTitlePage, bookTextPage);

                var nextPageEle = page.FindFirst("#nr_body a#pb_next");
                nextTextPage = nextPageEle.Attribute("href").Value();

                // A "next" link pointing back at the book's main page means this
                // was the last page.
                if (nextTextPage == bookUrl)
                {
                    nextTextPage = "";
                }
            }
        }

        /// <summary>
        /// Replaces characters that are not allowed in file names with '_', so a
        /// title such as "Who?" does not make the file open fail.
        /// </summary>
        /// <param name="bookName">Raw book title.</param>
        /// <returns>The title with every invalid file-name character replaced.</returns>
        private static string ToSafeFileName(string bookName)
        {
            var safe = new StringBuilder(bookName);
            foreach (char invalid in Path.GetInvalidFileNameChars())
            {
                safe.Replace(invalid, '_');
            }

            return safe.ToString();
        }
    }
}
x