前几天看到同事在网上一遍一遍地重复复制、粘贴管理方面的文章，这让我想到：可不可以写一个程序来自动完成呢？于是上网查资料，终于帮他解决了，代码如下：
using System;
using System.Collections.Generic;
using System.IO;
using System.Linq;
using System.Net;
using System.Text;
using System.Text.RegularExpressions;
using System.Web;
using System.Web.UI;
using System.Web.UI.WebControls;

namespace WebUI
{
    /// <summary>
    /// Web Forms page that downloads a range of article pages, strips the HTML
    /// markup from each one and appends the remaining text to a local file.
    /// </summary>
    public partial class TestWebClient : System.Web.UI.Page
    {
        /// <summary>First article number to download (inclusive).</summary>
        private const int FirstArticleId = 1507;

        /// <summary>Last article number to download (inclusive).</summary>
        private const int LastArticleId = 1507;

        /// <summary>URL prefix; the article number and ".htm" are appended to it.</summary>
        private const string ArticleUrlPrefix = "http://www.ccmcsz.com/management/";

        /// <summary>Directory that receives one text file per downloaded article.</summary>
        private const string OutputDirectory = @"D:\Test163\";

        /// <summary>Name of the encoding used both to read the pages and to write the files.</summary>
        private const string PageEncodingName = "gb2312";

        /// <summary>A CR or LF together with the whitespace run that follows it.</summary>
        private static readonly Regex LineBreakRunPattern = new Regex(@"([\r\n])[\s]+");

        /// <summary>
        /// A whole script element. Singleline lets '.' cross line breaks so that
        /// multi-line script bodies are removed together with their tags.
        /// </summary>
        private static readonly Regex ScriptPattern = new Regex(
            @"<script[^>]*?>.*?</script>",
            RegexOptions.IgnoreCase | RegexOptions.Singleline);

        /// <summary>A complete HTML comment, possibly spanning several lines.</summary>
        private static readonly Regex CommentPattern = new Regex(@"<!--.*?-->", RegexOptions.Singleline);

        /// <summary>Any remaining tag: '&lt;', at least one character, then everything up to '&gt;'.</summary>
        private static readonly Regex TagPattern = new Regex(@"<(.[^>]*)>");

        /// <summary>An unterminated comment opener and the rest of its line.</summary>
        private static readonly Regex CommentTailPattern = new Regex(@"<!--.*");

        /// <summary>The supported named entities plus any decimal numeric entity.</summary>
        private static readonly Regex EntityPattern = new Regex(
            @"&(#\d+|quot|amp|lt|gt|nbsp|iexcl|cent|pound|copy);",
            RegexOptions.IgnoreCase);

        /// <summary>
        /// Replacement text for each supported entity, keyed by the text between
        /// '&amp;' and ';'. Lookups ignore case, matching <see cref="EntityPattern"/>.
        /// </summary>
        private static readonly Dictionary<string, string> EntityReplacements =
            new Dictionary<string, string>(StringComparer.OrdinalIgnoreCase)
            {
                { "quot", "\"" }, { "#34", "\"" },
                { "amp", "&" }, { "#38", "&" },
                { "lt", "<" }, { "#60", "<" },
                { "gt", ">" }, { "#62", ">" },
                { "nbsp", " " }, { "#160", " " },
                { "iexcl", "\u00a1" }, { "#161", "\u00a1" },
                { "cent", "\u00a2" }, { "#162", "\u00a2" },
                { "pound", "\u00a3" }, { "#163", "\u00a3" },
                { "copy", "\u00a9" }, { "#169", "\u00a9" }
            };

        /// <summary>Page load handler; intentionally does nothing.</summary>
        protected void Page_Load(object sender, EventArgs e)
        {
        }

        /// <summary>
        /// Click handler of the download button: writes each article URL to the
        /// response and saves the article's text through <see cref="SetLog"/>.
        /// </summary>
        protected void btnDownLoad_Click(object sender, EventArgs e)
        {
            for (int articleId = FirstArticleId; articleId <= LastArticleId; articleId++)
            {
                string url = ArticleUrlPrefix + articleId + ".htm";
                Response.Write(url);
                SetLog(url, articleId.ToString());
                Response.Write("<br/>");
            }
        }

        /// <summary>
        /// Downloads <paramref name="url"/>, strips its markup with
        /// <see cref="DelHTML"/> and appends the text to "<paramref name="name"/>.txt"
        /// inside the output directory.
        /// </summary>
        /// <param name="url">Address of the page to download.</param>
        /// <param name="name">File name (without extension) for the saved text.</param>
        /// <remarks>
        /// Failures are not propagated: the URL and the error message are written
        /// to the response so that one bad page does not abort the whole batch.
        /// </remarks>
        public void SetLog(string url, string name)
        {
            try
            {
                Encoding pageEncoding = Encoding.GetEncoding(PageEncodingName);
                string targetPath = Path.Combine(OutputDirectory, name + ".txt");

                // CreateDirectory does nothing when the directory already exists.
                Directory.CreateDirectory(OutputDirectory);

                string pageText;

                // using blocks release the connection and stream even when the download fails.
                using (WebClient client = new WebClient())
                using (Stream body = client.OpenRead(url))
                using (StreamReader reader = new StreamReader(body, pageEncoding))
                {
                    pageText = reader.ReadToEnd();
                }

                pageText = DelHTML(pageText);

                // "true" appends, so repeated runs add to an existing file.
                using (StreamWriter writer = new StreamWriter(targetPath, true, pageEncoding))
                {
                    writer.Write(pageText);
                }
            }
            catch (Exception ex)
            {
                // Best effort: report the failing URL with the reason and carry on.
                this.Response.Write(url + " - " + HttpUtility.HtmlEncode(ex.Message) + "<br/>");
            }
        }

        /// <summary>
        /// Reduces an HTML document to plain text: removes scripts, comments and
        /// tags, decodes a small set of character entities, and finally drops
        /// every remaining angle bracket and CR/LF pair.
        /// </summary>
        /// <param name="Htmlstring">HTML source; may be null or empty.</param>
        /// <returns>The extracted text, or an empty string for null or empty input.</returns>
        public static string DelHTML(string Htmlstring)
        {
            if (string.IsNullOrEmpty(Htmlstring))
            {
                return string.Empty;
            }

            string text = LineBreakRunPattern.Replace(Htmlstring, "");

            // Remove scripts and comments before generic tag stripping, so their
            // content is discarded instead of being left behind as text.
            text = ScriptPattern.Replace(text, "");
            text = CommentPattern.Replace(text, "");

            // Remove the remaining HTML tags.
            text = TagPattern.Replace(text, "");
            text = LineBreakRunPattern.Replace(text, "");

            // Clean up fragments of broken or unterminated comments.
            text = text.Replace("-->", "");
            text = CommentTailPattern.Replace(text, "");

            // Decode entities in a single pass so that already-decoded text is
            // never decoded a second time (e.g. "&amp;lt;" becomes "&lt;", not "<").
            text = EntityPattern.Replace(text, DecodeEntity);

            // string.Replace returns a new string; the result has to be kept.
            text = text.Replace("<", "").Replace(">", "").Replace("\r\n", "");

            return text;
        }

        /// <summary>
        /// Maps one entity match to its replacement text; numeric entities
        /// without a mapping are removed.
        /// </summary>
        private static string DecodeEntity(Match entity)
        {
            string replacement;
            return EntityReplacements.TryGetValue(entity.Groups[1].Value, out replacement)
                ? replacement
                : string.Empty;
        }
    }
}
等待更新...