这篇文章主要展示如何使用C#语言抓取网站中的图片。实现原理是基于HTTP请求:C#为我们提供了HttpWebRequest和WebClient两个类,方便发送请求并获取数据。下面看如何实现。
1、HttpGetAction方法:发送请求获取页面内容,然后从返回的字符串中提取图片地址。
/// <summary>
/// Downloads the page at <paramref name="url"/>, extracts the <c>src</c> address of
/// every <c>&lt;img&gt;</c> tag and hands each absolute, non-SVG address to
/// <c>HttpGetImg</c> for download.
/// </summary>
/// <param name="url">Address of the page to crawl.</param>
/// <param name="path">Directory passed through to <c>HttpGetImg</c> as the download target.</param>
/// <param name="name">
/// Number used as the file name of the first image; it is incremented by one for
/// every further image that is downloaded.
/// </param>
public static void HttpGetAction(string url, string path, int name)
{
    Stopwatch sw = Stopwatch.StartNew();
    Console.WriteLine("抓取地址:" + url);

    string result;
    HttpWebRequest webRequest = WebRequest.CreateHttp(url);
    webRequest.Method = "GET";

    // Dispose the response together with the reader so the underlying
    // connection is released even if reading the body throws.
    using (HttpWebResponse response = (HttpWebResponse)webRequest.GetResponse())
    using (StreamReader reader = new StreamReader(response.GetResponseStream(), Encoding.UTF8))
    {
        result = reader.ReadToEnd();
    }

    if (string.IsNullOrEmpty(result))
    {
        Console.WriteLine("请求地址错误");
        Console.ReadKey();
        return;
    }

    // Extract the src address of each <img> tag. The whitespace classes must be
    // written as \s: a bare "s" inside the character class would forbid the
    // letter 's' in the captured URL and cut "https://..." down to "http".
    Regex regImg = new Regex(
        @"<img\b[^<>]*?\bsrc\s*=\s*[""']?\s*(?<imgUrl>[^\s""'<>]*)[^<>]*?/?\s*>",
        RegexOptions.IgnoreCase);

    // Find every matching tag in the page.
    MatchCollection matches = regImg.Matches(result);

    // Number of images handed to the downloader.
    int i = 0;

    // One WebClient is shared by all downloads and disposed when the loop ends.
    using (WebClient web = new WebClient())
    {
        foreach (Match match in matches)
        {
            string imgsrc = match.Groups["imgUrl"].Value;

            // Only absolute http/https addresses can be fetched directly; a
            // relative address that merely contains "http" somewhere cannot.
            if (imgsrc.StartsWith("http", StringComparison.OrdinalIgnoreCase) && !imgsrc.Contains(".svg"))
            {
                i++;
                HttpGetImg(web, imgsrc, path, name);
                name++; // next image file name
            }
        }
    }

    sw.Stop();
    Console.WriteLine("爬取完成!总共爬取了" + i + "张图片!");
    Console.WriteLine("爬取图片耗时:" + sw.ElapsedMilliseconds / 1000 + "秒");
}
2、HttpGetImg方法:将图片下载到指定目录。
/// <summary>
/// Downloads a single image into the directory <paramref name="path"/> and saves it
/// as <c>&lt;name&gt;.jpg</c>.
/// </summary>
/// <param name="web">Client used to perform the download.</param>
/// <param name="src">Address of the image to download.</param>
/// <param name="path">Existing directory the image is saved into.</param>
/// <param name="name">Number used as the file name (without extension).</param>
public static void HttpGetImg(WebClient web, string src, string path, int name)
{
    Console.WriteLine("爬取图片:" + src);

    if (!Directory.Exists(path))
    {
        Console.WriteLine("路径错误!");
        Console.ReadKey();
        return;
    }

    // The file is always given a .jpg extension, whatever the source format is.
    string fileName = name + ".jpg";

    // Path.Combine inserts the directory separator when path does not end with
    // one; plain concatenation would write "<dir><name>.jpg" next to the
    // directory instead of inside it.
    web.DownloadFile(src, Path.Combine(path, fileName));
    Console.WriteLine("爬取图片成功:" + fileName);
}
3,控制台调用
/// <summary>
/// Console entry point: crawls the images of one page into a local directory and
/// waits for a key press before exiting.
/// </summary>
/// <param name="args">Command-line arguments (unused).</param>
static void Main(string[] args)
{
    string url = "https://www.xxxxxx.com/";

    // NOTE(review): the directory separators of this example path were lost in
    // the published listing and have been restored here; replace it with your
    // own target directory. Path.Combine with a single argument was a no-op and
    // has been dropped.
    string path = @"D:\word\资料\img\冬天";

    // Image file names start at 1.
    HttpHelper.HttpGetAction(url, path, 1);
    Console.ReadKey();
}
效果图:
这样,一个简单的C#爬虫程序就完成了。如有错误之处,还望各位指正。
原文来自:一个简单的C#程序-曾亚平个人博客