• C#版采集程序源码介绍


    因为工作需要,自己写了一个采集程序,如果冒犯了你的网站,我在这里说一声对不起 !!
      哎~!我只是一个普通的程序员.
      namespace CJ
      {
       public partial class Form1 : Form
       { 
       // Index into `al` of the proxy address currently used by DownFile/ToServer.
       public int proxy = 0;
       // Running row keys: keyi is incremented in HtmlSource (mzinedl rows),
       // keyj in two() (mzinexl rows), keym and keyn in three() (mzineinfo / mzinewz rows).
       public int keyi = 0;
       public int keyj = 0;
       public int keym = 0;
       public int keyn = 0;
       // Incremented once per page processed; shown in textBox1 when the crawl ends.
       public int sum = 0;
       // Scratch values overwritten while iterating regex matches at every crawl level.
       // NOTE: two() reads `dirname` as left by one() when it builds its INSERT statement.
       public string newurl = "";
       public string cururl = "";
       public string dirname = "";
       public string curdir = "";
       // Text of the page most recently loaded (from the local cache file or via ToServer).
       public string responseFromServer = "";
       // Path of the local cache file for the page currently being processed.
       public string filename = "";
       // The INSERT statement built most recently.
       public string sql = "";
       // Last section name seen by three(); carried over to entries that have none of their own.
       public string mulu = "";
       // Accumulates every generated SQL statement (SaveSqls) until Executesql runs them.
       StringBuilder sbs = new StringBuilder();
       // Items queued by four() and written to disk by SaveXmls.
       List<Class1> cls = new List<Class1>();
       // Proxy addresses, one per entry, filled by ReadIPproxy.
       public ArrayList al = new ArrayList();
       // Prefixes of the INSERT statements for the four target tables.
       public string insertdl = "insert into mzinedl values(";
       public string insertxl = "insert into mzinexl values(";
       public string insertinfo = "insert into mzineinfo values(";
       public string insertwz = "insert into mzinewz values(";
       /// <summary>
       /// Creates the form and initializes its designer-generated components.
       /// </summary>
       public Form1()
       {
       InitializeComponent();
       }
       /// <summary>
       /// Saves page text to a new file; an existing file is left untouched.
       /// </summary>
       /// <param name="FILE_NAME">Path of the file to create.</param>
       /// <param name="data">Text to write.</param>
       public void TextToFile(string FILE_NAME, string data)
       {
           // A page that is already cached on disk is never overwritten.
           bool alreadySaved = File.Exists(FILE_NAME);
           if (!alreadySaved)
           {
               File.WriteAllText(FILE_NAME, data);
           }
       }
       /// <summary>
       /// Downloads a remote file through the current proxy into a local directory,
       /// switching to the next proxy on connection failures and timeouts.
       /// </summary>
       /// <param name="PageUrl">URL of the file; the text after its last '/' becomes the local file name.</param>
       /// <param name="filename">Directory to save into (created if missing) -- despite the name, not a file path.</param>
       public void DownFile(string PageUrl, string filename)
       {
           if (!Directory.Exists(filename))
           {
               Directory.CreateDirectory(filename);
           }

           string remoteName = PageUrl.Substring(PageUrl.LastIndexOf("/") + 1);
           string localPath = filename + "\\" + remoteName;
           if (File.Exists(localPath))
           {
               // Already downloaded on an earlier run.
               return;
           }

           // Retry in a loop rather than by recursion: a long run of dead proxies
           // would otherwise grow the call stack until it overflows.
           // NOTE: as before, the number of retries is unbounded.
           while (true)
           {
               try
               {
                   // WebClient is disposable; release it after every attempt.
                   using (WebClient wc = new WebClient())
                   {
                       wc.Proxy = new WebProxy(al[proxy].ToString(), true);
                       wc.DownloadFile(PageUrl, localPath);
                   }
                   return;
               }
               catch (WebException ex)
               {
                   bool proxyFailed = ex.Status == WebExceptionStatus.ConnectFailure
                       || ex.Status == WebExceptionStatus.Timeout;
                   if (!proxyFailed)
                   {
                       // ProtocolError (e.g. file not found) and every other status:
                       // give up on this file, best effort.
                       return;
                   }
               }

               // The proxy is unreachable or too slow: move on to the next one.
               proxy++;
               if (proxy >= al.Count)
               {
                   // ReadIPproxy appends the list again, which makes the new index valid.
                   al = ReadIPproxy("e:\\test.txt");
               }
           }
       }
       /// <summary>
       /// Reads a text file line by line and appends every line to the shared proxy list.
       /// </summary>
       /// <param name="FILE_NAME">Path of the proxy list file.</param>
       /// <returns>The shared list <c>al</c>, with the newly read lines appended.</returns>
       public ArrayList ReadIPproxy(string FILE_NAME)
       {
           using (StreamReader reader = File.OpenText(FILE_NAME))
           {
               // Entries are added to the existing list; nothing is cleared first.
               for (string line = reader.ReadLine(); line != null; line = reader.ReadLine())
               {
                   al.Add(line);
               }
           }
           return al;
       }
       /// <summary>
       /// Runs every SQL statement accumulated by SaveSqls as one text command,
       /// then empties the buffer.
       /// </summary>
       public void Executesql()
       {
           // Nothing was collected (e.g. the index page had no matches):
           // do not send an empty command to the database.
           if (sbs.Length == 0)
           {
               return;
           }

           SqlHelper.ExecuteNonQuery(SqlHelper.sqlstr, CommandType.Text, sbs.ToString(), null);

           // Clear the buffer so a later run does not replay these statements.
           sbs.Length = 0;
       }
       /// <summary>
       /// Reads a whole text file into a string.
       /// </summary>
       /// <param name="FILE_NAME">Path of the file to read.</param>
       /// <returns>The complete file content.</returns>
       public string FileToText(string FILE_NAME)
       {
           string content = File.ReadAllText(FILE_NAME);
           return content;
       }
       /// <summary>
       /// Queues one SQL statement for later execution.
       /// </summary>
       /// <param name="sql">Statement to append to the pending batch.</param>
       public void SaveSqls(string sql)
       {
           sbs.Append(sql);
           // A single space separates consecutive statements in the batch.
           sbs.Append(" ");
       }
       /// <summary>
       /// Fetches a page through the current proxy and keeps switching proxies
       /// until a usable response is received.
       /// </summary>
       /// <param name="PageUrl">URL of the page to fetch.</param>
       /// <returns>
       /// The page text, or an empty string when the server answers with a
       /// protocol error (for example, page not found).
       /// </returns>
       public string ToServer(string PageUrl)
       {
           // A loop replaces the former recursive retries: those threw away the
           // result of the retry (so "" was returned even after a success) and
           // could exhaust the stack. NOTE: as before, retries are unbounded.
           while (true)
           {
               string body = null;
               try
               {
                   WebRequest request = WebRequest.Create(PageUrl);
                   request.Proxy = new WebProxy(al[proxy].ToString(), true);
                   request.Timeout = 1000 * 60;

                   // using-blocks release the response, stream and reader even
                   // when reading fails part-way through.
                   using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
                   using (Stream dataStream = response.GetResponseStream())
                   using (StreamReader reader = new StreamReader(dataStream, System.Text.Encoding.Default))
                   {
                       body = reader.ReadToEnd();
                   }
               }
               catch (WebException ex)
               {
                   if (ex.Status == WebExceptionStatus.ProtocolError)
                   {
                       // The server itself rejected the request; another proxy will not help.
                       return "";
                   }
                   // Any other network failure: treat the proxy as bad and retry.
               }
               catch (System.IO.IOException)
               {
                   // The connection broke while the body was being read: retry.
               }

               // An empty page or one containing "refresh" is treated as a
               // proxy-side placeholder rather than the real content.
               if (!string.IsNullOrEmpty(body) && !body.Contains("refresh"))
               {
                   return body;
               }

               proxy++;
               if (proxy >= al.Count)
               {
                   // ReadIPproxy appends the list again, which makes the new index valid.
                   al = ReadIPproxy("e:\\test.txt");
               }
           }
       }
       /// <summary>
       /// Serializes every queued Class1 item to the XML file named by its
       /// <c>address</c> member; files that already exist are skipped.
       /// </summary>
       public void SaveXmls()
       {
           // Created on first use and then reused: building an XmlSerializer is costly.
           XmlSerializer serializer = null;

           foreach (Class1 item in cls)
           {
               string pathxml = item.address;
               if (File.Exists(pathxml))
               {
                   continue;
               }

               if (serializer == null)
               {
                   serializer = new XmlSerializer(typeof(Class1));
               }

               // using guarantees the file handle is released even if Serialize throws.
               using (Stream stream = new FileStream(pathxml, FileMode.Create, FileAccess.Write, FileShare.ReadWrite))
               {
                   serializer.Serialize(stream, item);
               }
           }
       }
       /// <summary>
       /// Strips HTML tags from a string.
       /// </summary>
       /// <param name="Html">Markup to clean.</param>
       /// <returns>The input with every tag removed; text between tags is kept.</returns>
       public static string Remove(string Html)
       {
           // [^>]* also crosses line breaks, so a tag whose attributes are
           // wrapped onto several lines is removed too ('.' stops at '\n').
           const string tagPattern = "<[^>]*>";
           return Regex.Replace(Html, tagPattern, string.Empty);
       }
       /// <summary>
       /// Removes every script element, including its content, from a piece of HTML.
       /// </summary>
       /// <param name="content">Markup to clean.</param>
       /// <returns>The input without script elements; all other text is kept.</returns>
       public static string FilterScript(string content)
       {
           // The lazy [\s\S]*? stops at the FIRST closing tag, so text lying
           // between two separate script blocks survives, and it spans line
           // breaks so multi-line scripts are removed as well.
           const string scriptPattern = @"<script\b[^>]*>[\s\S]*?</script\s*>";
           return Regex.Replace(content, scriptPattern, string.Empty, RegexOptions.IgnoreCase);
       }
       /// <summary>
       /// Removes or neutralizes potentially dangerous markup: script blocks,
       /// script-scheme links, inline event handlers, iframes and framesets.
       /// </summary>
       /// <param name="html">Markup to clean.</param>
       /// <returns>The cleaned markup.</returns>
       public string wipeScript(string html)
       {
           // Every pattern is lazy or bounded. The former greedy [\s\S]* forms
           // matched from the first occurrence to the LAST terminator in the
           // document and so deleted legitimate content in between.
           Regex scriptBlock = new Regex(@"<script[^<>]*>[\s\S]*?</script\s*>", RegexOptions.IgnoreCase);
           Regex scriptHref = new Regex(@"href *= *[""']? *\w*script *:", RegexOptions.IgnoreCase);
           // \b keeps words that merely contain "on" (font, action, ...) from matching.
           Regex eventAttribute = new Regex(@"\bon\w+\s*=", RegexOptions.IgnoreCase);
           Regex iframeBlock = new Regex(@"<iframe[\s\S]+?</iframe*>", RegexOptions.IgnoreCase);
           Regex framesetBlock = new Regex(@"<frameset[\s\S]+?</frameset*>", RegexOptions.IgnoreCase);

           html = scriptBlock.Replace(html, "");                     // script elements
           html = scriptHref.Replace(html, "");                      // href=javascript: and similar
           html = eventAttribute.Replace(html, " _disibledevent=");  // on... event attributes
           html = iframeBlock.Replace(html, "");                     // iframe elements
           html = framesetBlock.Replace(html, "");                   // frameset elements
           return html;
       }
       /// <summary>
       /// Entry point of the crawl: loads the magazine index page (from the local
       /// cache when present, otherwise from the network), queues one insert per
       /// category link, descends into every category, then writes the queued XML
       /// files and runs the queued SQL.
       /// </summary>
       /// <param name="urlpri">URL of the index page; also the base for category links.</param>
       public void HtmlSource(string urlpri)
       {
           // Local cache file for the index page.
           filename = "E:\\观2\\magazine.html";
           if (!Directory.Exists("E:\\观2"))
           {
               Directory.CreateDirectory("E:\\观2");
           }
           if (File.Exists(filename))
           {
               responseFromServer = FileToText(filename); // cached copy
           }
           else
           {
               responseFromServer = ToServer(urlpri); // fetch it
           }
           sum++;
           if (responseFromServer != "")
           {
               TextToFile(filename, responseFromServer);
               MatchCollection mc = Regex.Matches(responseFromServer, @"href=""/magazine/(.*)""><b>(.*)</b>", RegexOptions.IgnoreCase);
               foreach (Match m in mc)
               {
                   newurl = m.Groups[1].Value;
                   dirname = m.Groups[2].Value;
                   int key = ++keyi;
                   // The name comes from a scraped page: double any single quote
                   // so it cannot terminate the SQL string literal.
                   sql = insertdl + key + ",'" + dirname.Replace("'", "''") + "')";
                   SaveSqls(sql);
                   cururl = urlpri + newurl;
                   curdir = "E:\\观2\\" + dirname;
                   one(cururl, curdir, key);
               }
               SaveXmls();
               Executesql();

               this.textBox1.Text = sum.ToString();
               MessageBox.Show("采集成功!");
           }
       }
       /// <summary>
       /// Processes one category page: loads it (local cache first, network
       /// otherwise), caches it, and descends into every magazine list link found.
       /// </summary>
       /// <param name="urlpri">URL of the category page.</param>
       /// <param name="_dirname">Local directory for this category (created if missing).</param>
       /// <param name="_key">Key of the category row, handed down to the next level.</param>
       public void one(string urlpri, string _dirname, int _key)
       {
           // The cache file is named after the last segment of the URL.
           string pageName = urlpri.Substring(urlpri.LastIndexOf("/") + 1);
           filename = _dirname + "\\" + pageName;

           bool haveDirectory = Directory.Exists(_dirname);
           if (!haveDirectory)
           {
               Directory.CreateDirectory(_dirname);
           }

           responseFromServer = File.Exists(filename)
               ? FileToText(filename)
               : ToServer(urlpri);
           sum++;

           if (responseFromServer == "")
           {
               return;
           }

           TextToFile(filename, responseFromServer);
           MatchCollection listLinks = Regex.Matches(responseFromServer, @"href=""\.\./(.*list.html)""[\s\S]*?《(.*?)》", RegexOptions.IgnoreCase);
           foreach (Match link in listLinks)
           {
               newurl = link.Groups[1].Value;
               // two() reads this field as the magazine title for its insert.
               dirname = link.Groups[2].Value;
               cururl = "http://www.zydg.net/magazine/" + newurl;
               curdir = _dirname + "\\" + dirname;
               two(cururl, curdir, _key);
           }
       }
       /// <summary>
       /// Processes one magazine page: loads it (local cache first, network
       /// otherwise), queues an insert with the magazine's details and
       /// introduction, and descends into every issue link.
       /// </summary>
       /// <param name="urlpri">URL of the magazine's issue-list page.</param>
       /// <param name="_dirname">Local directory for this magazine (created if missing).</param>
       /// <param name="_key">Key of the parent category row.</param>
       public void two(string urlpri, string _dirname, int _key)
       {
           // The cache file is named after the parent folder of the URL, plus ".html".
           filename = urlpri.Substring(0, urlpri.LastIndexOf("/"));
           filename = filename.Substring(filename.LastIndexOf("/") + 1) + ".html";
           filename = _dirname + "\\" + filename;
           if (!Directory.Exists(_dirname))
           {
               Directory.CreateDirectory(_dirname);
           }
           if (File.Exists(filename))
           {
               responseFromServer = FileToText(filename);
           }
           else
           {
               responseFromServer = ToServer(urlpri);
           }
           sum++;
           if (responseFromServer != "")
           {
               TextToFile(filename, responseFromServer);
               Match mc = Regex.Match(responseFromServer, @"刊\s+期:(.*?)<br>[\s\S]*?编\s+辑:(.*?)<br>[\s\S]*?出\s+版: (.*?)<br>[\s\S]*?联系电话:(.*?)<br>[\s\S]*?E-mail: (.*?)<br>[\s\S]*?社\s+址:(.*?)<br>[\s\S]*?邮\s+编: (.*?)<br>[\s\S]*?邮发代号:(.*?)<br>[\s\S]*?国外发行代号: (.*?)<br>[\s\S]*?国际标准刊号:(.*?)<br>[\s\S]*?国内统一刊号: (.*?)</td>", RegexOptions.IgnoreCase);
               Match content = Regex.Match(responseFromServer, @"刊\s+物\s+简\s+介\s+:::...([\s\S]*?)...:::\s+收录期号列表", RegexOptions.Multiline);
               int key = ++keyj;

               // Every text value comes from a scraped page: double single quotes
               // so a value cannot terminate its SQL string literal.
               StringBuilder insert = new StringBuilder(insertxl);
               insert.Append(key).Append(",").Append(_key);
               // `dirname` still holds the magazine title set by the caller, one().
               insert.Append(",'").Append(dirname.Replace("'", "''")).Append("'");
               // Groups 1..11 are the eleven detail fields captured above, in page order.
               for (int g = 1; g <= 11; g++)
               {
                   insert.Append(",'").Append(mc.Groups[g].Value.Replace("'", "''")).Append("'");
               }
               insert.Append(",'").Append(Remove(content.Groups[1].Value).Replace("'", "''")).Append("')");
               sql = insert.ToString();
               SaveSqls(sql);

               MatchCollection mc2 = Regex.Matches(responseFromServer, @"href='(.*?)'\s+target.*>(.*?)</a>", RegexOptions.IgnoreCase);
               foreach (Match m2 in mc2)
               {
                   newurl = m2.Groups[1].Value;
                   // Turn an issue label such as "2008年第3期" into "2008-3".
                   dirname = m2.Groups[2].Value.Replace("年", "-").Replace("第", "").Replace("期", "");
                   cururl = urlpri.Substring(0, urlpri.LastIndexOf("/") + 1) + newurl;
                   curdir = _dirname + "\\" + dirname;
                   three(cururl, curdir, key, dirname);
               }
           }
       }
       /// <summary>
       /// Processes one issue page: loads it (local cache first, network otherwise),
       /// downloads the cover image, queues an insert for the issue and one per
       /// article in its table of contents, and descends into every article.
       /// </summary>
       /// <param name="urlpri">URL of the issue page.</param>
       /// <param name="_dirname">Local directory for this issue (created if missing).</param>
       /// <param name="_key">Key of the parent magazine row.</param>
       /// <param name="qishu">Issue label stored with the issue row.</param>
       public void three(string urlpri, string _dirname, int _key, string qishu)
       {
           // The cache file is named after the last segment of the URL.
           filename = _dirname + "\\" + urlpri.Substring(urlpri.LastIndexOf("/") + 1);
           if (!Directory.Exists(_dirname))
           {
               Directory.CreateDirectory(_dirname);
           }
           if (File.Exists(filename))
           {
               responseFromServer = FileToText(filename);
           }
           else
           {
               responseFromServer = ToServer(urlpri);
           }
           sum++;
           if (responseFromServer != "")
           {
               TextToFile(filename, responseFromServer);
               string baseUrl = urlpri.Substring(0, urlpri.LastIndexOf("/") + 1);

               Match m = Regex.Match(responseFromServer, @"src='face_(.*?)'", RegexOptions.IgnoreCase);
               string coverSuffix = m.Groups[1].Value;
               if (coverSuffix.Trim() != "")
               {
                   DownFile(baseUrl + "face_" + coverSuffix, _dirname);
               }

               int key = ++keym;
               // Values come from scraped pages: double single quotes so a value
               // cannot terminate its SQL string literal.
               string coverPath = _dirname + "\\" + "face_" + coverSuffix;
               sql = insertinfo + key + "," + _key + ",'" + qishu.Replace("'", "''") + "','" + coverPath.Replace("'", "''") + "')";
               SaveSqls(sql);

               // First alternative: an article link (group 1 = page, group 2 = title text).
               // Second alternative: a "[section]" heading (group 3 = section name).
               // NOTE(review): the second alternative was entity-garbled ("& lt;font",
               // unescaped brackets) and defined no third group, so the section name
               // read below was always empty; restored to what the code appears to
               // intend -- confirm against a real issue page.
               MatchCollection mc2 = Regex.Matches(responseFromServer, @"href='(\d+.html?)'[\s\S]*?<font\s+color=black>(.*?)</a>|<font[^>]*?>\[(.+?)\]", RegexOptions.IgnoreCase);
               foreach (Match m2 in mc2)
               {
                   newurl = m2.Groups[1].Value;
                   string muName = m2.Groups[3].Value;
                   if (muName == "")
                   {
                       // An article entry: it belongs to the last section seen.
                       muName = mulu;
                   }
                   string lstr = m2.Groups[2].Value;
                   if (lstr != "")
                   {
                       string s1;
                       string s2;
                       if (lstr.Contains("."))
                       {
                           // Text before the first dot and text after the last dot.
                           s1 = lstr.Substring(0, lstr.IndexOf("."));
                           s2 = lstr.Substring(lstr.LastIndexOf(".") + 1);
                       }
                       else
                       {
                           s1 = lstr;
                           s2 = "";
                       }
                       int k2 = ++keyn;
                       sql = insertwz + k2 + "," + key + ",'" + muName.Replace("'", "''") + "','" + s1.Replace("'", "''") + "','" + s2.Replace("'", "''") + "')";
                       SaveSqls(sql);
                       cururl = baseUrl + newurl;
                       curdir = _dirname;
                       four(cururl, curdir, k2);
                   }
                   mulu = muName;
               }
           }
       }
       /// <summary>
       /// Processes one article page: loads it (local cache first, network
       /// otherwise), queues its cleaned body text for XML serialization, and
       /// downloads the images and PDF files it references.
       /// </summary>
       /// <param name="urlpri">URL of the article page.</param>
       /// <param name="_dirname">Local directory for the article's files (created if missing).</param>
       /// <param name="_key">Key of the article row, stored with the queued item.</param>
       public void four(string urlpri, string _dirname, int _key)
       {
           string pageName = urlpri.Substring(urlpri.LastIndexOf("/") + 1);
           filename = _dirname + "\\" + pageName;
           if (!Directory.Exists(_dirname))
           {
               Directory.CreateDirectory(_dirname);
           }
           if (File.Exists(filename))
           {
               responseFromServer = FileToText(filename);
           }
           else
           {
               responseFromServer = ToServer(urlpri);
           }
           sum++;
           if (responseFromServer != "")
           {
               TextToFile(filename, responseFromServer);

               // The article body lies between two marker comments in the page.
               Match m = Regex.Match(responseFromServer, @"正文开始-->(?<text>[\s\S]*?)<!--正文结束", RegexOptions.IgnoreCase);
               string content = m.Groups["text"].Value;
               string c = FilterScript(content);
               c = Remove(c); // body text with scripts and tags stripped

               // The XML file is named after the page: "123.html" becomes "123.xml".
               // Working on the last URL segment only avoids a negative Substring
               // length when that segment contains no dot.
               int dot = pageName.LastIndexOf(".");
               string baseName = dot >= 0 ? pageName.Substring(0, dot) : pageName;
               string pathxml = _dirname + "\\" + baseName + ".xml";
               Class1 cs = new Class1(_key, c, pathxml);
               cls.Add(cs);

               // The PDF group is now named "pdfs"; it was unnamed before, so the
               // lookup by name below always came back empty and PDFs were skipped.
               MatchCollection mc = Regex.Matches(responseFromServer, @"(<img\s+src=""(?<imgs>.*)""\s+hspace|HreF=""(?<pdfs>[^>]*PDF)"")", RegexOptions.IgnoreCase);
               string zhuurl = urlpri.Substring(0, urlpri.LastIndexOf("/") + 1);
               foreach (Match m2 in mc)
               {
                   string imgurl = m2.Groups["imgs"].Value.Trim(); // relative image name
                   if (imgurl != "")
                   {
                       DownFile(zhuurl + imgurl, _dirname);
                   }
                   string pdfurl = m2.Groups["pdfs"].Value.Trim(); // relative PDF name
                   if (pdfurl != "")
                   {
                       DownFile(zhuurl + pdfurl, _dirname);
                   }
               }
           }
       }
       /// <summary>
       /// Start button: loads the proxy list, then crawls the magazine index.
       /// </summary>
       private void btnOK_Click(object sender, EventArgs e)
       {
           const string proxyListPath = "e:\\test.txt";
           const string startUrl = "http://www.zydg.net/magazine/";

           al = ReadIPproxy(proxyListPath);
           HtmlSource(startUrl);
       }
       /// <summary>
       /// Exit button: closes the application.
       /// </summary>
       private void button1_Click(object sender, EventArgs e)
       {
       Application.Exit(); 
       }
       
       }
      }
  • 相关阅读:
    css flex布局实现后台页面
    html5 css iframe实现后台框架,仅用于学习案例
    nginx 多个网站配置
    nginx 负载 访问时 去掉端口
    nginx 负载
    解标准数独算法
    C++ execute linux cmd and retrieve the output
    C++ generate in Ubuntu
    shell操作典型案例--FTP操作
    PHP7 新写法
  • 原文地址:https://www.cnblogs.com/top5/p/1904822.html
Copyright © 2020-2023  润新知