前言:
本人一直使用网易云音乐播放器,对网易云音乐十分的热衷,里面的歌单功能非常便捷,能快速找到符合自己喜好的歌曲信息。此文章如有侵权,请留言即刻删除文章。
请求数据说明:以web请求的方式获取网易云音乐的歌单列表,包括歌单名称、链接、播放量及歌单创建者名称。
一. 了解request请求
什么是request请求:当我们访问一个网站时,所输入的网站地址就是一种request请求。请求的格式应遵照被请求方的格式要求。
例如用谷歌浏览器访问网易云音乐网址时:
按f12进入控制台 如图选中NetWork选项
请求头信息:包含请求地址、请求方式(Get/Post)、请求网页类型Accept,以及最重要的User-Agent(访问所用的浏览器及其版本)。
二. 提供信息支持
打开Visual Studio=>文件=>新建项目=>新建控制台应用程序
选择控制台应用程序=>
选中项目右击=>添加=>类取名为RequestOptions:
RequestOptions:设置请求头信息参数
/// <summary>
/// Settings describing a single HTTP request: target, method, headers,
/// cookies, timeout and an optional request body.
/// </summary>
/// <remarks>
/// Member names (including the misspelled <c>ConntectionLimit</c>) and the
/// public fields are kept as-is so existing callers keep compiling.
/// </remarks>
public class RequestOptions
{
    /// <summary>
    /// HTTP method, e.g. "GET" or "POST".
    /// </summary>
    public string Method { get; set; }

    /// <summary>
    /// Address the request is sent to.
    /// </summary>
    public Uri Uri { get; set; }

    /// <summary>
    /// Value for the Referer request header (the page the request claims to come from).
    /// </summary>
    public string Referer { get; set; }

    /// <summary>
    /// Request timeout in milliseconds.
    /// </summary>
    public int TimeOut = 5000;

    /// <summary>
    /// Whether to keep the connection alive (persistent connection).
    /// </summary>
    public bool KeepAlive = true;

    /// <summary>
    /// Whether redirects are followed automatically; disabled by default.
    /// </summary>
    public bool AllowAutoRedirect = false;

    /// <summary>
    /// Maximum number of concurrent connections.
    /// </summary>
    public int ConntectionLimit = int.MaxValue;

    /// <summary>
    /// Number of request attempts.
    /// NOTE(review): no code visible in this file reads this value - confirm it is used.
    /// </summary>
    public int RequestNum = 3;

    /// <summary>
    /// Value of the Accept request header (media types acceptable in the response).
    /// </summary>
    public string Accept = "*/*";

    /// <summary>
    /// Value of the Content-Type request header.
    /// </summary>
    public string ContentType = "application/x-www-form-urlencoded";

    /// <summary>
    /// Backing store for additional request headers.
    /// </summary>
    public WebHeaderCollection header = new WebHeaderCollection();

    /// <summary>
    /// Additional request headers; reads and writes <see cref="header"/>.
    /// </summary>
    public WebHeaderCollection webHeader
    {
        get { return header; }
        set { header = value; }
    }

    /// <summary>
    /// Raw cookie string for the request.
    /// </summary>
    public string RequestCookies { get; set; }

    /// <summary>
    /// Request body payload (presumably form data for an XHR-style request).
    /// </summary>
    public string XHRParams { get; set; }
}
新建RequestHelper类:编写请求方法,并对返回的数据信息进行处理
/// <summary>
/// Sends an HTTP request described by a <see cref="RequestOptions"/> instance
/// and returns the response body as text.
/// </summary>
public class RequestHelper
{
    /// <summary>
    /// User-Agent sent with every request; imitates a desktop Chrome browser.
    /// </summary>
    private const string BrowserUserAgent = "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.75 Safari/537.36";

    /// <summary>
    /// Builds the request from <paramref name="options"/>, sends it (writing
    /// <c>XHRParams</c> as a UTF-8 body when present) and reads the response
    /// body as UTF-8 text, decompressing gzip/deflate content.
    /// </summary>
    /// <param name="options">Request settings; must not be null.</param>
    /// <returns>The response body decoded as UTF-8.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="options"/> is null.</exception>
    /// <exception cref="WebException">The request fails or times out.</exception>
    public static string RequestAction(RequestOptions options)
    {
        if (options == null)
        {
            throw new ArgumentNullException("options");
        }

        IWebProxy proxy = null; // GetWebProxy();
        bool hasBody = !string.IsNullOrEmpty(options.XHRParams);

        var request = (HttpWebRequest)WebRequest.Create(options.Uri);
        request.Accept = options.Accept;
        request.ServicePoint.Expect100Continue = false;
        // Disable the Nagle algorithm to reduce latency.
        request.ServicePoint.UseNagleAlgorithm = false;
        // Write buffering is only needed when a body is sent.
        request.AllowWriteStreamBuffering = hasBody;
        // Advertise compression support; the response is decompressed in ReadBody.
        request.Headers.Add(HttpRequestHeader.AcceptEncoding, "gzip,deflate");
        request.ContentType = options.ContentType;
        request.AllowAutoRedirect = options.AllowAutoRedirect;
        request.UserAgent = BrowserUserAgent;
        request.Timeout = options.TimeOut;
        request.KeepAlive = options.KeepAlive;
        if (!string.IsNullOrEmpty(options.Referer))
        {
            request.Referer = options.Referer;
        }

        request.Method = options.Method;
        if (proxy != null)
        {
            request.Proxy = proxy;
        }

        if (!string.IsNullOrEmpty(options.RequestCookies))
        {
            request.Headers[HttpRequestHeader.Cookie] = options.RequestCookies;
        }

        request.ServicePoint.ConnectionLimit = options.ConntectionLimit;
        if (options.webHeader != null && options.webHeader.Count > 0)
        {
            request.Headers.Add(options.webHeader);
        }

        if (hasBody)
        {
            byte[] payload = Encoding.UTF8.GetBytes(options.XHRParams);
            request.ContentLength = payload.Length;
            // Dispose the request stream so the body is flushed and the
            // connection is released before the response is requested.
            using (Stream body = request.GetRequestStream())
            {
                body.Write(payload, 0, payload.Length);
            }
        }

        try
        {
            using (var response = (HttpWebResponse)request.GetResponse())
            {
                return ReadBody(response);
            }
        }
        finally
        {
            // Release the request's resources on failure as well as on success.
            request.Abort();
        }
    }

    /// <summary>
    /// Reads the whole response body as UTF-8 text, wrapping the stream in a
    /// gzip or deflate decompressor when the Content-Encoding header names one.
    /// </summary>
    private static string ReadBody(HttpWebResponse response)
    {
        string contentEncoding = response.ContentEncoding ?? string.Empty;
        Stream content = response.GetResponseStream();

        if (contentEncoding.IndexOf("gzip", StringComparison.OrdinalIgnoreCase) >= 0)
        {
            content = new GZipStream(content, CompressionMode.Decompress);
        }
        else if (contentEncoding.IndexOf("deflate", StringComparison.OrdinalIgnoreCase) >= 0)
        {
            content = new DeflateStream(content, CompressionMode.Decompress);
        }

        // Disposing the reader also disposes the (possibly wrapped) stream.
        using (var reader = new StreamReader(content, Encoding.UTF8))
        {
            return reader.ReadToEnd();
        }
    }

    /// <summary>
    /// Builds the proxy to route requests through, or returns null when no
    /// proxy host/port is configured or the address is not a valid URI.
    /// </summary>
    private static IWebProxy GetWebProxy()
    {
        // Proxy address and port. The host presumably needs to include the
        // scheme (e.g. "http://192.168.1.1") for Uri parsing - TODO confirm.
        string proxyHost = "";
        string proxyPort = "";
        // Proxy credentials, if the proxy requires authentication.
        //string proxyUser = "xxx";
        //string proxyPass = "xxx";

        if (string.IsNullOrEmpty(proxyHost) || string.IsNullOrEmpty(proxyPort))
        {
            return null;
        }

        try
        {
            var webProxy = new System.Net.WebProxy();
            webProxy.Address = new Uri(string.Format("{0}:{1}", proxyHost, proxyPort));
            // Uncomment when the proxy requires authentication.
            //webProxy.Credentials = new System.Net.NetworkCredential(proxyUser, proxyPass);
            return webProxy;
        }
        catch (UriFormatException ex)
        {
            // Placeholders added: without them the timestamp and message were never printed.
            Console.WriteLine("获取代理信息异常 {0} {1}", DateTime.Now.ToString(), ex.Message);
            return null;
        }
    }
}
现在通用的请求方法已经编写完毕,下面在Main方法中调用上面编写的方法:
项目添加引用:HtmlAgilityPack支持
选择项目点击右键=>选择NuGet包管理:
搜索HtmlAgilityPack
选择安装即可,
此项目主要使用的Xpath选择匹配:参考文档:http://www.w3school.com.cn/xpath/xpath_syntax.asp
xpath可视化工具:HtmlAgilityPack Tester
链接:https://pan.baidu.com/s/1_in8Y9qFYzKQtnc-eLrb2w
提取码:co03
Main方法:
/// <summary>
/// Downloads the NetEase Cloud Music hot-playlist page, extracts each
/// playlist's name, link, play count and creator, and prints them to the console.
/// </summary>
/// <param name="args">Command-line arguments (unused).</param>
static void Main(string[] args)
{
    // Hot playlists across all categories ("cat" is the URL-encoded form of "全部").
    var uri = new Uri(@"https://music.163.com/discover/playlist/?cat=%E5%85%A8%E9%83%A8&order=hot");

    // Fetch the page HTML.
    var pageHtml = RequestHelper.RequestAction(new RequestOptions() { Uri = uri, Method = "Get" });

    HtmlDocument htmlDoc = new HtmlDocument();
    htmlDoc.LoadHtml(pageHtml);

    // Locate the playlist <ul>; it is missing when the page layout changes
    // or the site serves a different page, so check before dereferencing.
    HtmlNode listRoot = htmlDoc.DocumentNode.SelectSingleNode(@"/html[1]/body[1]/div[3]/div[1]/ul[1]");
    HtmlNodeCollection playList = listRoot == null ? null : listRoot.SelectNodes("li");

    if (playList == null)
    {
        Console.WriteLine("未找到歌单列表,页面结构可能已变化。");
    }
    else
    {
        foreach (HtmlNode item in playList)
        {
            HtmlNode nameLink = item.SelectSingleNode("p[1]/a[1]");
            HtmlNode countNode = item.SelectSingleNode("div[1]/div[1]/span[2]");
            HtmlNode creatorLink = item.SelectSingleNode("p[2]/a[1]");

            string playName = TitleOf(nameLink);       // playlist name
            string playHref = HrefOf(nameLink);        // playlist link
            string playCount = countNode == null ? string.Empty : countNode.InnerText; // play count
            string createBy = TitleOf(creatorLink);    // creator name
            string createHref = HrefOf(creatorLink);   // creator link

            Console.WriteLine(string.Format(
                "歌单:{0} 链接:{1} 播放量:{2} 创建者:{3} 创建者链接:{4}",
                playName, playHref, playCount, createBy, createHref));
            Console.WriteLine("==============================");
        }
    }

    // Keep the console window open until a key is pressed.
    Console.Read();
}

/// <summary>
/// Returns the link's title attribute, falling back to its inner text when
/// the attribute is absent; returns an empty string for a missing node.
/// </summary>
/// <remarks>
/// NOTE(review): node selection in HtmlAgilityPack appears to yield element
/// nodes, so an "@title" XPath step followed by InnerText reads the anchor
/// text rather than the attribute - confirm against the package version in use.
/// </remarks>
private static string TitleOf(HtmlNode link)
{
    return link == null ? string.Empty : link.GetAttributeValue("title", link.InnerText);
}

/// <summary>
/// Returns the link's href attribute, or an empty string when the node or
/// the attribute is missing.
/// </summary>
private static string HrefOf(HtmlNode link)
{
    return link == null ? string.Empty : link.GetAttributeValue("href", "");
}
运行结果展示:
总结:
爬虫是批量获取信息的一种工具,方便快捷,可以获取大量数据。但有的网站进行了反爬虫处理,如果请求量过大,可能会导致ip被封;还有的网站进行了请求验证,需做验证,自行体会。爬虫虽好,但不要侵犯他人隐私哦!