• 想看小说,自己写个采集类,读网页文章写入txt文件


    using System;
    using System.Collections.Generic;
    using System.Linq;
    using System.Text;
    using System.Net;
    using System.IO;
    using System.Text.RegularExpressions;
    
    namespace allen
    {
        /// <summary>
        /// Console scraper: downloads every page of one paginated article from a fixed URL,
        /// extracts the body text of each page and appends it to a text file.
        /// </summary>
        class Program
        {
            /// <summary>Literal text that immediately precedes the article body (after whitespace removal).</summary>
            private const string BodyStartMarker = "点此下载封神演义.txt</font></font></a></div></td>";

            /// <summary>Literal text that immediately follows the article body (after whitespace removal).</summary>
            private const string BodyEndMarker = "</div></td></tr><tr><tdclass=";

            /// <summary>Format string for a page URL; {0} is the 1-based page number.</summary>
            private const string PageUrlFormat = "http://www.txt66.com/read2.asp?id=8480&PageNum={0}";

            /// <summary>File the extracted text is appended to.</summary>
            private const string OutputPath = @"F:\g.txt";

            private const int FirstPage = 1;

            /// <summary>Last page fetched (the original loop ran while page &lt; 124).</summary>
            private const int LastPage = 123;

            /// <summary>Pause between two page requests, in milliseconds.</summary>
            private const int DelayBetweenPagesMs = 600;

            // Both patterns are constant, so they are built once instead of on every call.
            private static readonly Regex WhitespacePattern = new Regex(@"\s+", RegexOptions.Compiled);

            // Greedy ".*" between the two markers. The input has had all whitespace (including
            // line breaks) removed first, so "." not matching '\n' is irrelevant here.
            // NOTE: the '.' inside ".txt" of the start marker is an unescaped regex wildcard;
            // it is kept as-is so the pattern matches exactly what it always matched.
            private static readonly Regex BodyPattern =
                new Regex(BodyStartMarker + ".*" + BodyEndMarker, RegexOptions.Compiled);

            /// <summary>
            /// Downloads the document at <paramref name="url"/> and returns it as a string.
            /// </summary>
            /// <param name="url">Absolute HTTP(S) URL to fetch.</param>
            /// <returns>The full response body decoded as text.</returns>
            /// <exception cref="InvalidCastException">The URL scheme does not yield an HTTP request.</exception>
            /// <exception cref="WebException">The request failed.</exception>
            static string GetHtml(string url)
            {
                // Direct casts: a non-HTTP URL now fails here with a clear InvalidCastException
                // instead of a NullReferenceException on the next line.
                HttpWebRequest request = (HttpWebRequest)WebRequest.Create(url);

                // The using blocks release the response (and its pooled connection), the stream
                // and the reader even when reading throws.
                using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
                using (Stream stream = response.GetResponseStream())
                // NOTE(review): Encoding.Default depends on the machine/runtime; presumably the
                // page is served in the local ANSI code page of the author's machine - confirm.
                using (StreamReader reader = new StreamReader(stream, Encoding.Default))
                {
                    return reader.ReadToEnd();
                }
            }

            /// <summary>
            /// Extracts the article body from a page's HTML.
            /// </summary>
            /// <param name="htmlStr">Raw HTML of one page; may be null or empty.</param>
            /// <returns>
            /// The text between the start and end markers with all whitespace removed, leftover
            /// table tags and the site watermark stripped, and each <c>&lt;br&gt;</c> turned into
            /// <see cref="Environment.NewLine"/>. Returns an empty string when the input is null
            /// or empty, or when the markers are not found.
            /// </returns>
            static string MyFilter(string htmlStr)
            {
                if (string.IsNullOrEmpty(htmlStr))
                {
                    return string.Empty;
                }

                // Remove every run of whitespace first so the markers match regardless of
                // how the page is formatted.
                string compact = WhitespacePattern.Replace(htmlStr, "");

                // On no match, Match.Value is the empty string and the replacements below are no-ops.
                string result = BodyPattern.Match(compact).Value;

                result = result.Replace(BodyStartMarker, "");
                result = result.Replace(BodyEndMarker, "");
                result = result.Replace("</tr></table>", "");
                result = result.Replace("本文章下载于www.Txt66.com", "");
                result = result.Replace("<br>", Environment.NewLine);
                return result;
            }

            /// <summary>
            /// Fetches pages <c>FirstPage</c> through <c>LastPage</c> in order, filters each one
            /// and appends the extracted text to <c>OutputPath</c>, pausing between requests.
            /// </summary>
            static void WriteFile()
            {
                // Append mode, UTF-16 output. The using block flushes and closes the file even
                // if a download or filter step throws part-way through.
                using (StreamWriter writer = new StreamWriter(OutputPath, true, Encoding.Unicode))
                {
                    for (int pageNum = FirstPage; pageNum <= LastPage; pageNum++)
                    {
                        string pageUrl = string.Format(PageUrlFormat, pageNum);
                        string text = MyFilter(GetHtml(pageUrl));
                        writer.Write(text);
                        Console.WriteLine("写入第{0}页", pageNum);

                        // Throttle requests to the remote site.
                        System.Threading.Thread.Sleep(DelayBetweenPagesMs);
                    }
                }
            }

            /// <summary>
            /// Entry point: runs the scrape, then waits for a key press before exiting.
            /// </summary>
            /// <param name="args">Command-line arguments (unused).</param>
            static void Main(string[] args)
            {
                WriteFile();
                Console.ReadKey();
            }
        }
    }
    
    
  • 相关阅读:
    HTML中为何P标签内不可包含块元素?
    js判断鼠标位置是否在某个div中
    拒绝图片延迟加载,爽爽的看美图
    PHP为什么会被认为是草根语言?
    宜信开源微服务任务调度平台(SIA-TASK)
    JSBridge框架解决通信问题实现移动端跨平台开发
    如何运用多阶构建编写优雅的Dockerfile
    Sharding-JDBC 使用入门和基本配置
    程序员笔记|详解Eureka 缓存机制
    程序员笔记|常见的Spring异常分析及处理
  • 原文地址:https://www.cnblogs.com/liulun/p/1679690.html
Copyright © 2020-2023  润新知