• C#制作网盘搜索工具(简单的爬虫)


          最近学习C#编程,在网上发现一篇winform下制作百度网盘搜索器的文章,故而下载源码学习一二。无奈原博所用的网址失效,故而自己改写了网址和相关源代码,也进行了实现。因为初学,接触的知识较多,为免忘记,进行整理复习。

    1.知识点:

          思路:主要是利用HttpWebRequest、HttpWebResponse模拟HTTP请求,然后利用HtmlAgilityPack配合XPath语法从HTML DOM中提取元素,将提取到的相关内容在DataGridView中展示,最后利用Process.Start()方法在浏览器中打开所点击的链接。

    2.具体实现:

    2.1关于请求头的获取:

    本例子使用网址为:http://www.pansoso.com/

    分析上述网址的请求头进行模拟:


    查看具体请求头信息:


    根据获取的request url分析出其请求地址的规律为:所搜索的关键字(例如hello)通过GET方法直接附加在url的末尾,页码规律为hello_1、hello_2……(每页十条记录)

    2.2关于结果的获取:

    结果的获取,直接利用对response网页的分析截取关键信息即可。

    3.代码实现:

    using System;
    using System.Collections.Generic;
    using System.IO;
    using System.Linq;
    using System.Net;
    using System.Text;
    
    namespace 百度网盘资源搜索
    {
    class HttpHelper
    {
       
        static readonly string urlTemplate = "http://www.pansoso.com/zh/{0}";
        /// <summary>
        /// Downloads the pansoso.com search page for <paramref name="key"/> and returns the
        /// response body as text (the site answers with HTML, which the caller parses).
        /// </summary>
        /// <param name="key">
        /// Search keyword, optionally followed by a page suffix such as <c>_2</c>. It is
        /// percent-escaped and substituted into the URL template.
        /// </param>
        /// <returns>The response body, or <c>null</c> when the request or the read fails.</returns>
        public static string Requset(string key)
        {
            // Escape once and use the same value for the URL and the Referer. The original
            // escaped only the Referer, so reserved characters in the keyword (?, #, /)
            // could change the shape of the request URL.
            string escapedKey = Uri.EscapeDataString(key);
            string url = string.Format(urlTemplate, escapedKey);

            HttpWebRequest httpRequest = (HttpWebRequest)WebRequest.Create(url);
            httpRequest.Accept = "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8";
            httpRequest.UserAgent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.181 Safari/537.36";
            httpRequest.Host = "www.pansoso.com";
            httpRequest.Referer = "http://www.pansoso.com/zh/" + escapedKey;

            try
            {
                // Dispose response, stream and reader deterministically so the underlying
                // connection is released even when reading fails part-way.
                using (HttpWebResponse httpResponse = (HttpWebResponse)httpRequest.GetResponse())
                using (Stream body = httpResponse.GetResponseStream())
                using (StreamReader reader = new StreamReader(body))
                {
                    return reader.ReadToEnd();
                }
            }
            catch (Exception ex)
            {
                // Best-effort contract kept from the original: any failure yields null.
                // The reason is at least traced instead of being dropped silently.
                System.Diagnostics.Debug.WriteLine("Requset failed for " + url + ": " + ex.Message);
                return null;
            }
        }
        /// <summary>
        /// Deserializes a JSON document into a <see cref="SearchResult"/>.
        /// </summary>
        /// <param name="str">JSON text passed unchanged to <c>UtilityClass.GetObject</c>.</param>
        /// <returns>The object produced by <c>UtilityClass.GetObject&lt;SearchResult&gt;</c>.</returns>
        public static SearchResult dodata(string str)
        {
            return UtilityClass.GetObject<SearchResult>(str);
        }
       
            //if (doc.DocumentNode.SelectNodes("//comment()") != null)
            //{
            //    foreach (var commet in doc.DocumentNode.SelectNodes("//comment"))
            //    {
            //        commet.Remove();
            //    }
            //}
    
       
        /// <summary>
        /// Extracts the payload that follows the first '(' in <paramref name="jsonString"/>,
        /// drops its last three characters and wraps the rest as <c>{"resources": payload}</c>.
        /// </summary>
        /// <param name="jsonString">
        /// Raw response text. NOTE(review): presumably a JSONP-style reply such as
        /// <c>callback([...]);</c> followed by one more character — confirm against a real response.
        /// </param>
        /// <returns>
        /// The wrapped JSON text, or <c>null</c> when the input is null/empty, contains no '(' or
        /// is too short to carry the three-character trailer.
        /// </returns>
        public static string JsonPreProcessing(string jsonString)
        {
            // Number of characters cut from the end of the payload (kept from the original code).
            const int trailerLength = 3;

            if (string.IsNullOrEmpty(jsonString))
            {
                return null;
            }

            // IndexOf reports "not found" as -1, so index 0 is a valid match; the original
            // "> 0" test rejected it.
            int startIndex = jsonString.IndexOf('(');
            if (startIndex < 0)
            {
                return null;
            }

            string json = jsonString.Substring(startIndex + 1);
            if (json.Length < trailerLength)
            {
                // Remove() would throw ArgumentOutOfRangeException on a negative start index.
                return null;
            }

            // The quotes inside the literal must be escaped; the original line did not compile.
            return "{\"resources\":" + json.Remove(json.Length - trailerLength) + "}";
        }
    }
    }
    

    UtilityClass.cs

    using System;
    using System.Collections.Generic;
    using System.IO;
    //using System.Linq;
    using System.Runtime.Serialization.Json;
    using System.Text;
    
    namespace 百度网盘资源搜索
    {
        /// <summary>
        /// JSON helper built on <see cref="DataContractJsonSerializer"/>.
        /// </summary>
        class UtilityClass
        {
            /// <summary>
            /// Deserializes UTF-8 encoded JSON text into an instance of <typeparamref name="T"/>.
            /// </summary>
            /// <typeparam name="T">Target type handed to the serializer.</typeparam>
            /// <param name="json">JSON text to read.</param>
            /// <returns>The object read by <see cref="DataContractJsonSerializer.ReadObject(Stream)"/>, cast to <typeparamref name="T"/>.</returns>
            public static T GetObject<T>(string json)
            {
                DataContractJsonSerializer serializer = new DataContractJsonSerializer(typeof(T));

                // The stream is only a temporary buffer for the serializer; dispose it when done.
                using (MemoryStream ms = new MemoryStream(Encoding.UTF8.GetBytes(json)))
                {
                    return (T)serializer.ReadObject(ms);
                }
            }
        }
    }
    

    JSontoObject.cs

    using System;
    using System.Collections.Generic;
    using System.Linq;
    using System.Text;
    
    namespace 百度网盘资源搜索
    {
    /// <summary>
    /// Root object of a deserialized search response. The property name matches the
    /// "resources" key that HttpHelper.JsonPreProcessing wraps around the payload.
    /// </summary>
    public class SearchResult
    {
        /// <summary>The individual hits of the response.</summary>
        public BDWPResource[] resources { get; set; }
    }
    
    /// <summary>
    /// One search hit. NOTE(review): the lower-case property names presumably mirror the
    /// JSON keys of the upstream response — confirm before renaming.
    /// </summary>
    public class BDWPResource
    {
        /// <summary>Hit title; FrmMain.BindResource strips &lt;b&gt; tags from it before display.</summary>
        public string title { get; set; }
        /// <summary>Hit description; FrmMain.BindResource strips &lt;b&gt; tags from it before display.</summary>
        public string content { get; set; }
        /// <summary>Link of the hit; FrmMain.BindResource puts it in the third grid column.</summary>
        public string unescapedUrl { get; set; }
    }
    
    }
    using System;
    using System.Collections.Generic;
    using System.ComponentModel;
    using System.Data;
    using System.Drawing;
    using System.Linq;
    using System.Text;
    using System.Windows.Forms;
    using System.Threading;
    using System.Diagnostics;
    
    namespace 百度网盘资源搜索
    {//主窗体
        public partial class FrmMain : Form
        {
            // Cooperative "keep searching" flag: cleared by btnStop_Click on the UI thread and
            // polled by the search worker thread between pages. It is volatile so the worker
            // always observes the latest value written by the UI thread.
            volatile bool isSearch = true;
            // Site root; the href of each result link is appended to it in gethtml.
            string url = "http://www.pansoso.com";
            /// <summary>
            /// Creates the main window. Control setup is delegated to InitializeComponent(),
            /// which is defined in another part of this partial class.
            /// </summary>
            public FrmMain()
            {
                InitializeComponent();
            }
    
            /// <summary>
            /// Starts a search for the keyword typed into txtKey. The UI is reset, then a
            /// background thread downloads the result pages one by one and hands each page to
            /// <see cref="gethtml"/>; <see cref="SearchOver"/> restores the buttons at the end.
            /// </summary>
            /// <param name="sender">Event source (unused).</param>
            /// <param name="e">Event data (unused).</param>
            private void btnSearch_Click(object sender, EventArgs e)
            {
                // Number of result pages requested per search (key_1 .. key_10).
                const int pageCount = 10;

                string key = this.txtKey.Text;
                if (string.IsNullOrEmpty(key))
                {
                    return;
                }

                this.dataGridView1.Rows.Clear();
                this.lblResult.Text = "0";
                this.pgsBar.Value = 0;
                this.btnSearch.Text = "正在搜索";
                this.btnSearch.Enabled = false;
                this.btnStop.Enabled = true;

                Thread thread = new Thread(() =>
                {
                    // isSearch is cleared by the Stop button; it is checked before every page.
                    for (int page = 1; page <= pageCount && isSearch; page++)
                    {
                        string html = HttpHelper.Requset(key + "_" + page);
                        gethtml(html);

                        if (html == null)
                        {
                            // The download failed and gethtml has just reported it (it shows its
                            // message box when it cannot load the page). Stop here instead of
                            // repeating the failure and the dialog for every remaining page.
                            break;
                        }
                    }

                    // Search finished (or was stopped): restore the idle UI state.
                    SearchOver();
                });
                thread.IsBackground = true;
                thread.Start();
            }
    
            /// <summary>
            /// Parses one result page and, for every hit, appends a line to the rich text box
            /// (via <see cref="SearchOver1"/>) and a row to the grid, bumping the result counter
            /// and the progress bar.
            /// </summary>
            /// <param name="docs">
            /// HTML of a result page. Any failure while loading or parsing it (including a
            /// <c>null</c> argument) ends in the "no resources" message box.
            /// </param>
            public void gethtml(string docs)
            {
                try
                {
                    HtmlAgilityPack.HtmlDocument doc = new HtmlAgilityPack.HtmlDocument();
                    doc.LoadHtml(docs);

                    // Strip <script> elements first. SelectNodes returns null when nothing
                    // matches, and the query is now run once instead of twice.
                    HtmlAgilityPack.HtmlNodeCollection scripts = doc.DocumentNode.SelectNodes("//script");
                    if (scripts != null)
                    {
                        foreach (var script in scripts)
                        {
                            script.Remove();
                        }
                    }

                    // Extraction no longer sits inside the script check, so a page without any
                    // <script> element is still parsed. The title link and the href come from
                    // the same node, so a single query replaces the two identical ones.
                    HtmlAgilityPack.HtmlNodeCollection links = doc.DocumentNode.SelectNodes(".//h2/a[@href]");
                    HtmlAgilityPack.HtmlNodeCollection descriptions = doc.DocumentNode.SelectNodes(".//div[@class='des']");
                    if (links == null || descriptions == null)
                    {
                        return;
                    }

                    // Walk only the pairs that exist in both lists; the original looped over the
                    // description count and could index past the end of the link list.
                    int hitCount = Math.Min(links.Count, descriptions.Count);
                    for (int i = 0; i < hitCount; i++)
                    {
                        string title = links[i].InnerText;
                        string content = descriptions[i].InnerText;
                        string url1 = url + links[i].Attributes["href"].Value;

                        SearchOver1("title:" + title + "content:" + content + "unescapedUrl:" + "【" + url1 + "】");

                        // Controls may only be touched on the UI thread, hence Invoke.
                        this.Invoke(new Action<string, string, string>((tle, ctt, url3) =>
                        {
                            this.dataGridView1.Rows.Add(tle, ctt, url3);
                            this.lblResult.Text = (Int32.Parse(this.lblResult.Text) + 1).ToString();
                            if (this.pgsBar.Value < this.pgsBar.Maximum)
                            {
                                this.pgsBar.Value++;
                            }
                        }), title, content, url1);
                    }
                }
                catch (Exception)
                {
                    MessageBox.Show("该关键字没有收录资源!!!");
                }
            }
                //if (doc.DocumentNode.SelectNodes("//style") != null)
                //{
                //    foreach (var style in doc.DocumentNode.SelectNodes("style"))
                //    {
                //        style.Remove();
                //    }
                //}
    
        
    
            /// <summary>
            /// Adds one resource to the result grid on the UI thread after removing the
            /// &lt;b&gt; / &lt;/b&gt; tags from its title and content, then increments the
            /// result counter and advances the progress bar if it is not yet full.
            /// </summary>
            /// <param name="resource">Resource whose title, content and link are displayed.</param>
            private void BindResource(BDWPResource resource)
            {
                Func<string, string> stripBold = text => text.Replace("</b>", "").Replace("<b>", "");

                string plainTitle = stripBold(resource.title);
                string plainContent = stripBold(resource.content);
                string link = resource.unescapedUrl;

                Action addRow = delegate
                {
                    dataGridView1.Rows.Add(plainTitle, plainContent, link);

                    int shown = Int32.Parse(lblResult.Text) + 1;
                    lblResult.Text = shown.ToString();

                    if (pgsBar.Value < pgsBar.Maximum)
                    {
                        pgsBar.Value++;
                    }
                };
                Invoke(addRow);
            }
    
            /// <summary>
            /// Puts the form back into its idle state on the UI thread: the search button gets
            /// its start caption and is enabled, the stop button is disabled, and the
            /// isSearch flag is set to true again.
            /// </summary>
            private void SearchOver()
            {
                Action restoreIdleState = delegate
                {
                    btnSearch.Text = "开始搜索";
                    btnSearch.Enabled = true;
                    btnStop.Enabled = false;
                    isSearch = true;
                };
                Invoke(restoreIdleState);
            }
            /// <summary>
            /// Appends <paramref name="str"/> followed by a line break to richTextBox1 on the UI thread.
            /// </summary>
            /// <param name="str">Text of the line to append.</param>
            public void SearchOver1(string str)
            {
                Action appendLine = delegate
                {
                    string line = str + Environment.NewLine;
                    richTextBox1.Text = richTextBox1.Text + line;
                };
                Invoke(appendLine);
            }
            /// <summary>
            /// Draws the 1-based row number next to each row after the grid has painted it,
            /// then paints a white square over the start of the row.
            /// </summary>
            /// <param name="sender">Event source (unused).</param>
            /// <param name="e">Supplies the row index, the row bounds and the Graphics to draw on.</param>
            private void dataGridView1_RowPostPaint(object sender, DataGridViewRowPostPaintEventArgs e)
            {
                string rowNumber = (e.RowIndex + 1).ToString(System.Globalization.CultureInfo.CurrentUICulture);

                // This handler runs for every painted row; the brush wraps a GDI+ handle and
                // must be disposed, otherwise one handle is leaked per paint.
                using (SolidBrush b = new SolidBrush(this.dataGridView1.RowHeadersDefaultCellStyle.ForeColor))
                {
                    e.Graphics.DrawString(rowNumber, this.dataGridView1.DefaultCellStyle.Font, b, e.RowBounds.Location.X + 20, e.RowBounds.Location.Y + 6);
                }

                // Hide the icon in front of each row.
                e.Graphics.FillRectangle(Brushes.White, new Rectangle(new Point(e.RowBounds.Location.X + 2, e.RowBounds.Location.Y + 2), new Size(20, 20)));
            }
    
            /// <summary>
            /// Opens the link stored in the third column of the double-clicked row.
            /// </summary>
            /// <param name="sender">Event source (unused).</param>
            /// <param name="e">Supplies the index of the double-clicked row.</param>
            private void dataGridView1_CellDoubleClick(object sender, DataGridViewCellEventArgs e)
            {
                // Negative row indexes do not address a data row (e.g. a click on the header).
                if (e.RowIndex < 0)
                {
                    return;
                }

                // A cell without a value holds null; the original called ToString() on it and
                // crashed with a NullReferenceException. Convert.ToString tolerates null.
                object cellValue = this.dataGridView1.Rows[e.RowIndex].Cells[2].Value;
                string link = Convert.ToString(cellValue);
                if (string.IsNullOrEmpty(link))
                {
                    return;
                }

                // Hands the link to the shell, which opens it in the default browser.
                Process.Start(link);
            }
    
            /// <summary>
            /// Asks the running search to stop after the page it is currently processing.
            /// </summary>
            /// <param name="sender">Event source (unused).</param>
            /// <param name="e">Event data (unused).</param>
            private void btnStop_Click(object sender, EventArgs e)
            {
                // btnSearch is disabled for the whole duration of a search (btnSearch_Click) and
                // re-enabled by SearchOver. If it is enabled, nothing is running; clearing the
                // flag now would only make the next search exit before its first page.
                if (this.btnSearch.Enabled)
                {
                    return;
                }

                isSearch = false;

                // Do not re-enable btnSearch here: the worker may still be inside a request, and
                // a second search started now would overlap with it. SearchOver re-enables the
                // button once the worker has really finished.
                this.btnStop.Enabled = false;
            }
    
            /// <summary>
            /// Opens the link that was clicked inside richTextBox1.
            /// </summary>
            /// <param name="sender">Event source (unused).</param>
            /// <param name="e">Supplies the text of the clicked link.</param>
            private void richTextBox1_LinkClicked(object sender, LinkClickedEventArgs e)
            {
                string clickedLink = e.LinkText;
                Process.Start(clickedLink);
            }
        }
    }
    

    4.效果实现:




  • 相关阅读:
    银行数据仓库体系实践(6)--调度系统
    银行数据仓库体系实践(5)--数据转换
    银行数据仓库体系实践(4)--数据抽取和加载
    银行数据仓库体系实践(3)--数据架构
    银行数据仓库体系实践(2)--系统架构
    银行数据仓库体系实践(1)--银行数据仓库简介
    敏捷开发和传统开发区别
    source ~/.bash_profile是什么意思
    oracle里的DBLINK是做什么的
    flex-direction的值及使用效果
  • 原文地址:https://www.cnblogs.com/cache-yuan/p/10104249.html
Copyright © 2020-2023  润新知