学生时代的小玩具
用 C#(.NET WinForms)开发的小工具,用于抓取当当网计算机类图书的信息。
Program.cs
using System;
using System.Collections.Generic;
using System.Linq;
using System.Windows.Forms;

namespace spider
{
    /// <summary>
    /// Hosts the process entry point of the application.
    /// </summary>
    static class Program
    {
        /// <summary>
        /// The main entry point for the application.
        /// </summary>
        [STAThread]
        static void Main()
        {
            // Rendering options are configured before the first form is constructed.
            Application.EnableVisualStyles();
            Application.SetCompatibleTextRenderingDefault(false);

            // Create the main window and run the message loop on it.
            Form1 mainForm = new Form1();
            Application.Run(mainForm);
        }
    }
}
Form1.cs
using System;
using System.Collections.Generic;
using System.ComponentModel;
using System.Data;
using System.Drawing;
using System.Linq;
using System.Text;
using System.Windows.Forms;
using System.IO;

namespace spider
{
    /// <summary>
    /// Main window. Loads a listing page in the embedded browser, parses the
    /// loaded document into book rows, shows them in the grid and can append
    /// them to a text file.
    /// </summary>
    public partial class Form1 : Form
    {
        // Lowest page number the UI will navigate to (the "start" button resets to it).
        private const int FirstPage = 1;

        // Listing URL prefix; the current page number is appended to it.
        private string url = @"http://category.dangdang.com/all/?category_path=01.54.26.00.00.00&page_index=";
        private static int page = FirstPage;

        // Result of the most recent parse; null until a page has finished loading.
        private Parse p;

        public Form1()
        {
            InitializeComponent();
        }

        /// <summary>Resets to the first page and loads it.</summary>
        private void buttonstart_Click(object sender, EventArgs e)
        {
            page = FirstPage;
            Execute();
        }

        /// <summary>Loads the previous page; does nothing when already on the first page.</summary>
        private void buttonprev_Click(object sender, EventArgs e)
        {
            // Never step below the first page: page 0 / negative pages are not valid targets.
            if (page <= FirstPage)
            {
                return;
            }

            page--;
            Execute();
        }

        /// <summary>Loads the next page.</summary>
        private void buttonnext_Click(object sender, EventArgs e)
        {
            page++;
            Execute();
        }

        /// <summary>Loads the page whose number was typed into <c>textBox2</c>.</summary>
        private void buttonjump_Click(object sender, EventArgs e)
        {
            // TryParse instead of Parse: arbitrary user text must not crash the handler.
            int target;
            if (!int.TryParse(textBox2.Text, out target) || target < FirstPage)
            {
                MessageBox.Show("请输入有效的页码");
                return;
            }

            page = target;
            Execute();
        }

        /// <summary>
        /// Navigates the browser to the current page, mirrors the address into
        /// <c>textBox1</c> and switches to the wait cursor.
        /// </summary>
        private void Execute()
        {
            string address = url + page.ToString();
            webBrowser1.Navigate(address);
            textBox1.Text = address;
            Cursor.Current = Cursors.WaitCursor;
        }

        /// <summary>
        /// Parses the loaded document and binds the resulting table to the grid.
        /// </summary>
        private void webBrowser1_DocumentCompleted(object sender, WebBrowserDocumentCompletedEventArgs e)
        {
            // The event can be raised once per frame; only act when the whole page is
            // ready so that parsing and the notification happen a single time.
            if (webBrowser1.ReadyState != WebBrowserReadyState.Complete)
            {
                return;
            }

            Cursor.Current = Cursors.Default;

            HtmlDocument doc = webBrowser1.Document;
            if (doc == null)
            {
                return;
            }

            p = new Parse(doc);
            dataGridView1.DataSource = p.dt;

            // Fixed widths for: title, author, publisher, publication date, description.
            int[] widths = { 150, 150, 150, 80, 450 };
            for (int i = 0; i < widths.Length && i < dataGridView1.Columns.Count; i++)
            {
                dataGridView1.Columns[i].Width = widths[i];
            }

            MessageBox.Show("解析完成");
        }

        /// <summary>
        /// Appends the text form of every parsed book to a file chosen by the user.
        /// </summary>
        private void buttonsave_Click(object sender, EventArgs e)
        {
            // Nothing has been parsed yet (or the result was cleared): there is nothing to save.
            if (p == null || p.list == null)
            {
                MessageBox.Show("没有可保存的数据,请先抓取页面");
                return;
            }

            // The dialog is a disposable component; release it deterministically.
            using (SaveFileDialog sfd = new SaveFileDialog())
            {
                sfd.DefaultExt = "txt";
                if (sfd.ShowDialog() != DialogResult.OK)
                {
                    return;
                }

                string path = sfd.FileName;
                StringBuilder sb = new StringBuilder();
                foreach (Book book in p.list)
                {
                    sb.Append(book.ToString());
                }

                try
                {
                    // Text is appended, so saving several pages to the same file accumulates them.
                    // NOTE(review): the dialog's overwrite prompt still talks about replacing the file.
                    File.AppendAllText(path, sb.ToString(), Encoding.Default);
                }
                catch (IOException ex)
                {
                    MessageBox.Show("保存失败\n" + ex.Message);
                    return;
                }
                catch (UnauthorizedAccessException ex)
                {
                    MessageBox.Show("保存失败\n" + ex.Message);
                    return;
                }

                MessageBox.Show("保存成功\n" + path);
            }
        }
    }
}
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;

namespace spider
{
    /// <summary>
    /// Plain data holder for one book entry: title, author, publisher,
    /// publication time and description, all stored as strings.
    /// </summary>
    class Book
    {
        public string name { get; set; }
        public string author { get; set; }
        public string pub { get; set; }
        public string time { get; set; }
        public string describ { get; set; }

        /// <summary>Creates a book with every field left unset (null).</summary>
        public Book()
        {
        }

        /// <summary>Creates a book with all five fields supplied.</summary>
        public Book(string name, string author, string pub, string time, string describ)
            : this()
        {
            this.name = name;
            this.author = author;
            this.pub = pub;
            this.time = time;
            this.describ = describ;
        }

        /// <summary>
        /// Renders the book as five labelled lines separated by CRLF, followed by
        /// one blank line. Unset fields render as empty text.
        /// </summary>
        public override string ToString()
        {
            StringBuilder text = new StringBuilder();
            text.Append("书名:").Append(name).Append("\r\n");
            text.Append("作者:").Append(author).Append("\r\n");
            text.Append("出版商:").Append(pub).Append("\r\n");
            text.Append("出版时间:").Append(time).Append("\r\n");
            text.Append("描述:").Append(describ).Append("\r\n\r\n");
            return text.ToString();
        }
    }
}
Parse.cs
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Windows.Forms;
using System.Data;
using System.Text.RegularExpressions;

namespace spider
{
    /// <summary>
    /// Extracts book entries from a loaded HTML document and exposes them both
    /// as a <see cref="DataTable"/> and as a list of <see cref="Book"/> objects.
    /// Parsing runs once from the constructor.
    /// </summary>
    class Parse
    {
        // Matches a yyyy-MM-dd style date. Built once and shared, rather than
        // constructing a new Regex for every publisher-info item.
        private static readonly Regex DatePattern = new Regex(@"(\d{4})\-(\d{2})\-(\d{2})");

        private HtmlDocument dom;

        public DataTable dt { get; set; }
        public List<Book> list { get; set; }

        /// <summary>
        /// Creates the result containers, defines the five table columns and
        /// immediately parses <paramref name="dom"/>.
        /// </summary>
        /// <param name="dom">The loaded document to scan.</param>
        /// <exception cref="ArgumentNullException"><paramref name="dom"/> is null.</exception>
        public Parse(HtmlDocument dom)
        {
            // Fail fast with a clear error instead of a NullReferenceException inside Execute.
            if (dom == null)
            {
                throw new ArgumentNullException("dom");
            }

            this.dom = dom;
            dt = new DataTable();
            list = new List<Book>();

            dt.Columns.Add("书名");
            dt.Columns.Add("作者");
            dt.Columns.Add("出版社");
            dt.Columns.Add("出版时间");
            dt.Columns.Add("描述");

            Execute();
        }

        /// <summary>
        /// Scans every div of the document; each div whose class is exactly
        /// "listitem detail" becomes one <see cref="Book"/>, appended to both
        /// <see cref="list"/> and <see cref="dt"/>.
        /// </summary>
        public void Execute()
        {
            foreach (HtmlElement el in dom.GetElementsByTagName("div"))
            {
                // Only book-entry containers are of interest.
                if (el.GetAttribute("classname") != "listitem detail")
                {
                    continue;
                }

                Book book = ReadBook(el);
                AddRow(book);
                list.Add(book);
            }
        }

        // Builds a Book from the li children of one book-entry container.
        private static Book ReadBook(HtmlElement item)
        {
            Book book = new Book();
            foreach (HtmlElement li in item.GetElementsByTagName("li"))
            {
                // Read the class once; the three cases are mutually exclusive.
                string cssClass = li.GetAttribute("classname");
                switch (cssClass)
                {
                    case "maintitle": // title
                        book.name = li.OuterText;
                        break;
                    case "publisher_info": // author(s), publisher, publication date
                        ReadPublisherInfo(li, book);
                        break;
                    case "describ": // description
                        book.describ = li.OuterText;
                        break;
                }
            }
            return book;
        }

        // Fills author, publisher and publication date from a publisher_info item.
        private static void ReadPublisherInfo(HtmlElement info, Book book)
        {
            StringBuilder authors = new StringBuilder();
            foreach (HtmlElement link in info.GetElementsByTagName("a"))
            {
                string linkName = link.GetAttribute("name");
                if (linkName == "Author") // author; several links are joined with commas
                {
                    if (authors.Length > 0)
                    {
                        authors.Append(",");
                    }
                    authors.Append(link.OuterText);
                }
                else if (linkName == "Pub") // publisher
                {
                    book.pub = link.OuterText;
                }
            }
            book.author = authors.ToString();

            // The date is not in its own element; pull it out of the item's text.
            // Regex.Match rejects null input, so fall back to an empty string.
            Match m = DatePattern.Match(info.OuterText ?? string.Empty);
            if (m.Success) // publication date
            {
                book.time = m.Value;
            }
        }

        // Appends one table row mirroring the given book.
        private void AddRow(Book book)
        {
            DataRow dr = dt.NewRow();
            dr["书名"] = book.name;
            dr["作者"] = book.author;
            dr["出版社"] = book.pub;
            dr["出版时间"] = book.time;
            dr["描述"] = book.describ;
            dt.Rows.Add(dr);
        }
    }
}