• C#/winform采集百度hi文章


    /// <summary>
    /// Main window of the collector. Downloads every list page of one Baidu Hi blog,
    /// fetches each article linked from those pages, writes one XML file per article
    /// plus one XML file holding the whole article list, and reports progress on the form.
    /// </summary>
    public partial class Form1 : Form
    {
        // Base address of the paged article list; the zero-based page index is appended.
        private const string ListUrl = "http://hi.baidu.com/306759613/blog/index/";

        // Base address of a single article; "<id>.html" is appended.
        private const string ArticleUrl = "http://hi.baidu.com/306759613/blog/item/";

        // Folder that receives the generated XML files.
        private const string OutputDir = "f:\\p\\";

        // Per-request timeout in milliseconds.
        private const int RequestTimeoutMs = 3000;

        // Opening tag of the element that wraps the article body on an article page.
        private const string BodyOpenTag = "<div id=\"blog_text\" class=\"cnt\">";

        // Worker thread running the crawl; kept so a second click can detect a running crawl.
        Thread newth;

        public Form1()
        {
            InitializeComponent();
        }

        /// <summary>
        /// Starts the crawl on a worker thread. A click while a crawl is still running is ignored.
        /// </summary>
        private void buttonGo_Click(object sender, EventArgs e)
        {
            if (newth != null && newth.IsAlive)
            {
                return;
            }

            newth = new Thread(new ThreadStart(doit));
            // Background thread: closing the form must not leave the process alive.
            newth.IsBackground = true;
            newth.Start();
        }

        /// <summary>
        /// Worker-thread entry point. Runs the crawl and appends any failure to the debug
        /// text box instead of letting an unhandled exception terminate the process.
        /// </summary>
        void doit()
        {
            try
            {
                Crawl();
            }
            catch (Exception ex)
            {
                // Thread boundary: nothing above this frame can handle the exception.
                string message = ex.Message;
                RunOnUi(delegate
                {
                    textBoxDebug.Text = textBoxDebug.Text + "采集失败: " + message + "\r\n";
                });
            }
        }

        /// <summary>
        /// Downloads all list pages and all articles and writes the XML output.
        /// </summary>
        private void Crawl()
        {
            DataTable articles = CreateArticleTable(); // every article found on the list pages
            DataTable single = CreateArticleTable();   // scratch table holding one full article

            // The first list page (index 0) carries the pager links used to count pages.
            int pagecount = CountPages(new html(DownloadString(ListUrl + 0)));

            SetProgress(0);

            for (int i = 0; i < pagecount; i++)
            {
                DataTable dti = new html(DownloadString(ListUrl + i)).getAritcleTable();
                SetProgress(i * 100 / pagecount);

                int count = dti.Rows.Count;
                for (int j = 0; j < count; j++)
                {
                    DataRow row = dti.Rows[j];
                    articles.Rows.Add(row.ItemArray);

                    string body = ExtractBody(DownloadString(ArticleUrl + row[1] + ".html"));

                    // Same columns as the list row, with the summary replaced by the full body.
                    single.Rows.Add(new object[] { row[0], row[1], body, row[3], row[4] });
                    writeXML(single, OutputDir + SafeFileName(Convert.ToString(row[1])) + ".xml");
                    single.Rows.Clear();

                    // Overall percentage across all pages; never exceeds 100.
                    SetProgress((i * count + j + 1) * 100 / (pagecount * count));
                }
            }

            // Write before binding: writeXML renames the table, which must not happen on the
            // worker thread once the grid is observing it.
            writeXML(articles, OutputDir + "Articel.xml");

            RunOnUi(delegate
            {
                progressBar1.Value = progressBar1.Maximum;
                label1.Text = progressBar1.Value + "%";
                dataGridView1.DataSource = articles;
                textBoxDebug.Text = textBoxDebug.Text + "写入完毕\r\n";
            });
        }

        /// <summary>Creates an empty table with the five article columns.</summary>
        private static DataTable CreateArticleTable()
        {
            DataTable table = new DataTable();
            table.Columns.Add(new DataColumn("title"));       // title
            table.Columns.Add(new DataColumn("link"));        // article id taken from the link
            table.Columns.Add(new DataColumn("description")); // summary or full body
            table.Columns.Add(new DataColumn("pubDate"));     // publication date
            table.Columns.Add(new DataColumn("category"));    // category
            return table;
        }

        /// <summary>
        /// Derives the number of list pages from the pager links on a list page.
        /// Pages are numbered from 0, so the count is the highest index plus one.
        /// Returns 1 when the page has no pager links.
        /// </summary>
        private static int CountPages(html page)
        {
            int last = 0;
            List<string> links = page.getElementsByRegex(@"/blog/index/[\d]+");
            if (links != null)
            {
                foreach (string link in links)
                {
                    int index;
                    if (int.TryParse(link.Replace("/blog/index/", ""), out index) && index > last)
                    {
                        last = index;
                    }
                }
            }

            return last + 1;
        }

        /// <summary>
        /// Downloads <paramref name="address"/> and decodes the response as gb2312.
        /// The response and reader are disposed even when reading fails.
        /// </summary>
        private static string DownloadString(string address)
        {
            HttpWebRequest request = (HttpWebRequest)WebRequest.Create(address);
            request.Timeout = RequestTimeoutMs;

            using (WebResponse response = request.GetResponse())
            using (StreamReader reader = new StreamReader(
                response.GetResponseStream(), System.Text.Encoding.GetEncoding("gb2312")))
            {
                return reader.ReadToEnd();
            }
        }

        /// <summary>
        /// Returns the markup between the article-body opening tag and the next "&lt;/div".
        /// Returns an empty string when the opening tag is absent, and everything after the
        /// opening tag when no closing tag follows.
        /// </summary>
        private static string ExtractBody(string page)
        {
            int open = page.IndexOf(BodyOpenTag, StringComparison.Ordinal);
            if (open < 0)
            {
                return string.Empty;
            }

            int start = open + BodyOpenTag.Length;
            int end = page.IndexOf("</div", start, StringComparison.Ordinal);
            return end < 0 ? page.Substring(start) : page.Substring(start, end - start);
        }

        /// <summary>
        /// Replaces characters that are not valid in a file name with '_'. The name comes
        /// from downloaded markup, so it must not be able to escape the output folder.
        /// </summary>
        private static string SafeFileName(string name)
        {
            foreach (char invalid in Path.GetInvalidFileNameChars())
            {
                name = name.Replace(invalid, '_');
            }

            return name;
        }

        /// <summary>
        /// Shows a percentage on the progress bar and label, clamped to the bar's range.
        /// </summary>
        private void SetProgress(int percent)
        {
            RunOnUi(delegate
            {
                int value = Math.Max(progressBar1.Minimum, Math.Min(percent, progressBar1.Maximum));
                progressBar1.Value = value;
                label1.Text = value + "%";
            });
        }

        /// <summary>
        /// Runs <paramref name="action"/> on the UI thread. Does nothing once the form has
        /// been closed, so a crawl still in flight cannot crash on a disposed control.
        /// </summary>
        private void RunOnUi(MethodInvoker action)
        {
            if (IsDisposed || !IsHandleCreated)
            {
                return;
            }

            try
            {
                if (InvokeRequired)
                {
                    Invoke(action);
                }
                else
                {
                    action();
                }
            }
            catch (InvalidOperationException)
            {
                // Expected only when the form was closed between the check and the call;
                // anything else is a real failure and is rethrown.
                if (!IsDisposed && IsHandleCreated)
                {
                    throw;
                }
            }
        }

        /// <summary>
        /// Writes a data table to an XML file encoded as UTF-8. Each row becomes an
        /// "articels" element and each column a child element whose value is wrapped in CDATA.
        /// </summary>
        /// <param name="dt">Table to write. Its TableName is set to "articels".</param>
        /// <param name="fileName">Destination path; the folder is created when missing.</param>
        public void writeXML(DataTable dt, string fileName)
        {
            dt.TableName = "articels";

            System.Text.StringBuilder xml = new System.Text.StringBuilder();
            xml.Append("<?xml version=\"1.0\" encoding=\"utf-8\"?>\r\n");
            xml.Append("<?xml-stylesheet href=\"t.xsl\" type=\"text/xsl\"?>\r\n");
            xml.Append("<root>\r\n");

            foreach (DataRow row in dt.Rows)
            {
                xml.Append('<').Append(dt.TableName).Append(">\r\n");

                foreach (DataColumn column in dt.Columns)
                {
                    // "]]>" inside the value would end the CDATA section early; split it
                    // across two sections so the document stays well-formed.
                    string text = Convert.ToString(row[column]).Replace("]]>", "]]]]><![CDATA[>");

                    xml.Append('<').Append(column.ColumnName).Append(">\r\n<![CDATA[\r\n");
                    xml.Append(text).Append("\r\n");
                    xml.Append("]]>\r\n</").Append(column.ColumnName).Append(">\r\n");
                }

                xml.Append("</").Append(dt.TableName).Append(">\r\n");
            }

            xml.Append("</root>\r\n");

            string folder = Path.GetDirectoryName(fileName);
            if (!string.IsNullOrEmpty(folder))
            {
                Directory.CreateDirectory(folder);
            }

            using (StreamWriter w = new StreamWriter(fileName, false, System.Text.Encoding.UTF8))
            {
                w.Write(xml.ToString());
            }
        }
    }

        class html

        {

            string htmltext=string.Empty;

            /// <summary>

            /// 构造函数

            /// </summary>

            /// <param name="htmltext"></param>

            public html( string htmltext) {

                this.htmltext = htmltext;       

            }

            /// <summary>

            /// 获取文章列表

            /// </summary>

            /// <returns></returns>

            public DataTable getAritcleTable(){

                DataTable dt = new DataTable();

                int start = htmltext.IndexOf("div id=\"m_blog\" class=\"modbox\">");//起始位置

                int end = htmltext.IndexOf("<div id=\"mod_artclg\" class=\"mod\">");//结束位置

                string htm = htmltext.Substring(start-1, end - start -1 );

                dt.Columns.Add(new DataColumn("title"));

                dt.Columns.Add(new DataColumn("link"));

                dt.Columns.Add(new DataColumn("description"));

                dt.Columns.Add(new DataColumn("pubDate"));

                dt.Columns.Add(new DataColumn("category"));

                string title, link, description, pubDate, category,temp;

                int nstart, nend;//记录上次提取位置

                start = 0;

                do

                {//遍历html文档 提取文章信息

                    nstart = htm.IndexOf("<div class=\"tit\">",start) + "<div class=\"tit\">".Length;

                    if (nstart < start) break;

                    start = nstart;

                    nend = htm.IndexOf("</div>",start);

                    start = nend + 5;

                    temp = htm.Substring(nstart, nend - nstart );

                    nstart = temp.IndexOf(">");

                    nend =temp.IndexOf("</a>");

                    title = temp.Substring(nstart + 1, nend-nstart-1 );//文章标题

                    nstart = temp.IndexOf("\"");

                    nend = temp.IndexOf("\"", nstart + 1);

                    link = temp.Substring(nstart + 1, nend - nstart-1 );//链接

                    nstart = link.IndexOf("item/")+"item/".Length;

                    nend = link.IndexOf(".html");

                    link = link.Substring(nstart, nend - nstart);//取文件名(去除扩展名)

                    nstart = htm.IndexOf("<div class=\"date\">", start)+ "<div class=\"date\">".Length;

                    start = nstart;

                    nend = htm.IndexOf("</div>", start);

                    pubDate = htm.Substring(nstart , nend - nstart);//发表日期

                    start = nend + 5;

                    nstart = htm.IndexOf("<div class=\"cnt\">", start) + "<div class=\"cnt\">".Length;

                    start = nstart;

                    nend = htm.IndexOf("</div>", start);

                    start = nend + 5;

                    description = htm.Substring(nstart, nend - nstart );//文章内容

                    nstart = htm.IndexOf("<div class=\"opt\">", start) + "<div class=\"opt\">".Length;

                    start = nstart;

                    nend = htm.IndexOf("</div>", start);

                    start = nend + 5;

                    temp = htm.Substring(nstart, nend - nstart );

                    nstart = temp.IndexOf("");

                    nend =temp.IndexOf("</a>");

                    category=temp.Substring(nstart + 1, nend - nstart - 1); //文章分类            

    dt.Rows.Add(new string[] { title, link, description, pubDate, category });

                } while (nstart > 0);

             

                return dt;

            }

        }

  • 相关阅读:
    将文件导入到SQL server数据库表中的字段中
    查看端口是否启用
    JS去除字符串左右两端的空格
    css常见问题
    iframe之局部刷新
    iframe局部刷新的二种实现方法
    模式识别复习目录
    linux下文件内容查找 转
    LaTeX技巧10:LaTeX数学公式输入初级入门
    matlab中高维数组怎么做PCA?
  • 原文地址:https://www.cnblogs.com/top5/p/1542578.html
Copyright © 2020-2023  润新知