基础知识:
HTTP数据传输基本知识
想要制作一个具有登录功能的爬虫,就要了解HTTP Header的重要性,还要对目标站点有一定的了解。
什么是登录?
将 “帐号”“密码” 发送至站点的登录URL地址,即为登录。
登录多为POST请求,如果想研究目标站点,请多留意该站点的POST请求。
登录信息多存在于 POST DATA 中,但也可以保存在Cookies中
登录信息不只限于 “帐号”“密码” 还会有 “登录时间”“验证码” 或更多特有内容,请多留意。
HTTP Header的重要性
HTTP Header中记录了很多数据,其中包括请求类型、编码格式、浏览器标识、主机标识等等。更重要的是HTTP Header可以用来区分客户端(比如是否是个爬虫)
尽量将目标URL中的HTTP Header都写入HttpWebRequest对象中。
HTTP Header 存放在HttpWebRequest中。
特定的HTTP Header可以手动添加
requestPost.Headers.Add("X-Requested-With", "XMLHttpRequest");
HTTP Header作用(具体解释请自行查找)
Connection:需要根据目标站点选择是否保留
ContentLength:不可用于GET请求
UserAgent:用户标识,用来标识是否是同一用户(具有验证码站点需要注意该键统一)
Referer:用来记录前一个URL地址(多用于跳转统计或跳转后验证)
Host:域名
POST DATA:
// Write the POST data into the request body.
byte[] postdatabytes = encoding.GetBytes(postdata);
//request.ContentLength = postdatabytes.Length;
// `using` closes the request stream even when Write throws.
using (Stream stream = request.GetRequestStream())
{
    stream.Write(postdatabytes, 0, postdatabytes.Length);
}
Cookies
Cookies分为请求Cookies和写入Cookies,多数由服务端写入本地或JS写入本地,再提交服务端。
Cookies多为动态,可用于检测爬虫
服务端写入的Cookies可以从HttpWebResponse中获取
HttpWebRequest请求
当你解决了以上所有问题后,就需要使用HttpWebRequest来发送你的请求。
HttpWebRequest可以承载HTTP Header、Cookies和POST DATA
HttpWebRequest 可以设置请求类型(POST/GET)
HttpWebRequest 可以设置是否保持长连接
HttpWebRequest 可以设置是否跟随目标地址跳转
HttpWebRequest可以创建HttpWebResponse
// Request type: "POST" or "GET".
request.Method = "POST";
// Keep the connection alive: true or false.
request.KeepAlive = true;
// Follow redirects automatically: true or false.
request.AllowAutoRedirect = false;
HttpWebResponse接收对象
HttpWebResponse存放了HttpWebRequest请求后接收到的Cookies
HttpWebResponse存放了HttpWebRequest请求后接收到的HTML
至此一个模拟登录的爬虫必备条件全部介绍完毕。
下面放几个辅助开发的工具介绍
Html Agility Pack:一个可以将HTML作为XML处理的工具。支持远程加载、本地加载、文件加载,支持HTML、支持XML
Firebug:可以捕捉所有页面请求,便于分析站点、查看HTTP Header
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Net;
using System.IO;
using System.Xml;
using System.IO.Compression;
using System.Drawing;

namespace Common
{
    /// <summary>
    /// Helper around <see cref="HttpWebRequest"/> / <see cref="HttpWebResponse"/> for
    /// simulated-login crawling: it sends a caller-prepared request, carries a
    /// <see cref="System.Net.CookieContainer"/> across calls and exposes the response body.
    /// </summary>
    public class HttpWebUtility
    {
        #region Properties
        /// <summary>
        /// Response body (decoded text) of the last successful <see cref="Request"/> call.
        /// </summary>
        public string ResultHtml
        {
            get;
            set;
        }

        /// <summary>
        /// Cookie container attached to every request sent through <see cref="Request"/> and
        /// <see cref="Request_GetBitmap"/>. A request needs a container in order to collect the
        /// cookies returned by the server; one is created on first use when this is null.
        /// </summary>
        public CookieContainer CookieContainer
        {
            get;
            set;
        }

        /// <summary>
        /// Cookie header string for the last request URI, as produced by
        /// <see cref="System.Net.CookieContainer.GetCookieHeader"/>.
        /// </summary>
        public string CookiesString
        {
            get;
            set;
        }
        #endregion

        #region Methods
        /// <summary>
        /// Sends <paramref name="request"/> as POST or GET using the shared cookie container and
        /// stores the decoded response body in <see cref="ResultHtml"/>.
        /// </summary>
        /// <param name="request">Request object already carrying URL and headers.</param>
        /// <param name="isPost">True to send a POST, false to send a GET.</param>
        /// <param name="postdata">Body text to send when <paramref name="isPost"/> is true; null is sent as an empty body.</param>
        /// <param name="encodingName">Encoding name used for the POST body and for decoding the response; "UTF8" selects <see cref="Encoding.UTF8"/>.</param>
        /// <param name="IsGZipString">True when the response body must be gunzipped before decoding.</param>
        /// <param name="AllowAutoRedirect">Whether the request follows redirects automatically.</param>
        /// <exception cref="Exception">
        /// Thrown with the prefix "HttpWebPost Error:" when <paramref name="request"/> is null or
        /// when any step fails; for failures the original exception is kept as InnerException.
        /// </exception>
        public void Request(HttpWebRequest request, bool isPost, string postdata = "", string encodingName = "UTF8", bool IsGZipString = false, bool AllowAutoRedirect = false)
        {
            // Checked outside the try so the message is not prefixed twice by the catch below.
            EnsureRequest(request);
            try
            {
                request.Method = isPost ? "POST" : "GET";
                request.KeepAlive = true;
                request.AllowAutoRedirect = AllowAutoRedirect;
                AttachSharedCookies(request);

                Encoding encoding = ResolveEncoding(encodingName);
                if (isPost)
                {
                    // ContentLength is deliberately left to the caller / framework here.
                    WritePostData(request, postdata, encoding, false);
                }

                // `using` releases the response on every path, including failures while reading.
                using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
                {
                    CaptureCookies(request, response);
                    ResultHtml = ReadBody(response, encoding, IsGZipString);
                    request.Abort();
                }
            }
            catch (Exception ex)
            {
                throw new Exception("HttpWebPost Error:" + ex.Message, ex);
            }
        }

        /// <summary>
        /// Unused placeholder: ignores <paramref name="response"/> and always returns an empty string.
        /// </summary>
        private string GetGZipStreamString(WebResponse response)
        {
            string result = "";

            return result;
        }

        /// <summary>
        /// Sends a standalone GET request (no cookie handling) and returns the body decoded as gb2312.
        /// </summary>
        /// <param name="url">Absolute URL to fetch.</param>
        /// <returns>The response body as text.</returns>
        public string Request_Alone(string url)
        {
            HttpWebRequest request = (HttpWebRequest)WebRequest.Create(url);
            request.Credentials = CredentialCache.DefaultCredentials;
            request.ContentType = "application/x-www-form-urlencoded";
            request.AllowAutoRedirect = false;

            // Dispose the response (and its stream/reader) instead of leaking the connection.
            using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
            {
                return ReadBody(response, Encoding.GetEncoding("gb2312"), false);
            }
        }

        /// <summary>
        /// Sends a standalone GET request (no cookie handling) with browser-like Accept and
        /// User-Agent headers and returns the body decoded as UTF-8.
        /// </summary>
        /// <param name="URL">Absolute URL to fetch.</param>
        /// <returns>The response body as text.</returns>
        public string Request_Alone_Keywords(string URL)
        {
            HttpWebRequest request = (HttpWebRequest)WebRequest.Create(URL);
            request.Credentials = CredentialCache.DefaultCredentials;
            request.Accept = "text/html, application/xhtml+xml, */*";
            request.UserAgent = "Mozilla/5.0 (Windows NT 6.1; Trident/7.0; rv:11.0)";
            request.AllowAutoRedirect = false;

            // Dispose the response (and its stream/reader) instead of leaking the connection.
            using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
            {
                return ReadBody(response, Encoding.UTF8, false);
            }
        }

        /// <summary>
        /// Sends <paramref name="request"/> as POST or GET with a fresh, private cookie container
        /// (the shared <see cref="CookieContainer"/> is neither read nor updated) and returns the body.
        /// Redirects are never followed.
        /// </summary>
        /// <param name="request">Request object already carrying URL and headers.</param>
        /// <param name="isPost">True to send a POST, false to send a GET.</param>
        /// <param name="postdata">Body text to send when <paramref name="isPost"/> is true; null is sent as an empty body.</param>
        /// <param name="encodingName">Encoding name used for the POST body and for decoding the response; "UTF8" selects <see cref="Encoding.UTF8"/>.</param>
        /// <param name="IsGZipString">True when the response body must be gunzipped before decoding.</param>
        /// <returns>The response body as text.</returns>
        /// <exception cref="Exception">
        /// Thrown with the prefix "HttpWebPost Error:" when <paramref name="request"/> is null or
        /// when any step fails; for failures the original exception is kept as InnerException.
        /// </exception>
        public string Request_Alone_Post(HttpWebRequest request, bool isPost, string postdata = "", string encodingName = "UTF8", bool IsGZipString = false)
        {
            EnsureRequest(request);
            try
            {
                request.Method = isPost ? "POST" : "GET";
                request.KeepAlive = true;
                request.AllowAutoRedirect = false;
                request.CookieContainer = new CookieContainer();

                Encoding encoding = ResolveEncoding(encodingName);
                if (isPost)
                {
                    // This entry point sets ContentLength explicitly, unlike Request().
                    WritePostData(request, postdata, encoding, true);
                }

                using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
                {
                    string body = ReadBody(response, encoding, IsGZipString);
                    request.Abort();
                    return body;
                }
            }
            catch (Exception ex)
            {
                throw new Exception("HttpWebPost Error:" + ex.Message, ex);
            }
        }

        /// <summary>
        /// Sends <paramref name="request"/> as a GET using the shared cookie container and decodes
        /// the response as an image (typically a captcha picture).
        /// </summary>
        /// <param name="request">Request object already carrying URL and headers.</param>
        /// <param name="IsGZipString">True when the response body must be gunzipped before decoding.</param>
        /// <param name="IsAllowAutoRedirect">Whether the request follows redirects automatically.</param>
        /// <returns>The decoded bitmap; the caller owns it and should dispose it.</returns>
        /// <exception cref="Exception">
        /// Thrown with the prefix "HttpWebPost Error:" when <paramref name="request"/> is null or
        /// when any step fails; for failures the original exception is kept as InnerException.
        /// </exception>
        public Bitmap Request_GetBitmap(HttpWebRequest request, bool IsGZipString = false, bool IsAllowAutoRedirect = false)
        {
            EnsureRequest(request);
            try
            {
                request.Method = "GET";
                request.KeepAlive = true;
                request.AllowAutoRedirect = IsAllowAutoRedirect;
                AttachSharedCookies(request);

                using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
                {
                    CaptureCookies(request, response);
                    Bitmap bitmap = ReadBitmap(response, IsGZipString);
                    request.Abort();
                    return bitmap;
                }
            }
            catch (Exception ex)
            {
                throw new Exception("HttpWebPost Error:" + ex.Message, ex);
            }
        }

        /// <summary>Rejects a null request with the same exception type and message prefix callers already handle.</summary>
        private static void EnsureRequest(HttpWebRequest request)
        {
            if (request == null)
            {
                throw new Exception("HttpWebPost Error:request = NULL");
            }
        }

        /// <summary>Attaches the shared cookie container to the request, creating it on first use.</summary>
        private void AttachSharedCookies(HttpWebRequest request)
        {
            if (CookieContainer == null)
            {
                CookieContainer = new CookieContainer();
            }
            request.CookieContainer = CookieContainer;
        }

        /// <summary>Maps the literal "UTF8" to UTF-8 and resolves any other name through Encoding.GetEncoding.</summary>
        private static Encoding ResolveEncoding(string encodingName)
        {
            if (encodingName != "UTF8")
            {
                return Encoding.GetEncoding(encodingName);
            }
            return Encoding.UTF8;
        }

        /// <summary>Encodes the POST text and writes it to the request stream, closing the stream on every path.</summary>
        private static void WritePostData(HttpWebRequest request, string postdata, Encoding encoding, bool setContentLength)
        {
            byte[] payload = encoding.GetBytes(postdata ?? string.Empty);
            if (setContentLength)
            {
                request.ContentLength = payload.Length;
            }
            using (Stream body = request.GetRequestStream())
            {
                body.Write(payload, 0, payload.Length);
            }
        }

        /// <summary>Copies the cookies for the request URI onto the response and into the public cookie properties.</summary>
        private void CaptureCookies(HttpWebRequest request, HttpWebResponse response)
        {
            Uri target = request.RequestUri;
            response.Cookies = request.CookieContainer.GetCookies(target);
            CookiesString = request.CookieContainer.GetCookieHeader(target);
            CookieContainer = request.CookieContainer;
        }

        /// <summary>Reads the whole response body as text, gunzipping first when requested.</summary>
        private static string ReadBody(HttpWebResponse response, Encoding encoding, bool isGZip)
        {
            using (Stream raw = response.GetResponseStream())
            {
                if (isGZip)
                {
                    using (GZipStream unzipped = new GZipStream(raw, CompressionMode.Decompress))
                    using (StreamReader zipReader = new StreamReader(unzipped, encoding))
                    {
                        return zipReader.ReadToEnd();
                    }
                }

                using (StreamReader plainReader = new StreamReader(raw, encoding))
                {
                    return plainReader.ReadToEnd();
                }
            }
        }

        /// <summary>
        /// Buffers the response body in memory and decodes it as a bitmap. The buffer is left
        /// undisposed on purpose: a Bitmap created from a stream needs that stream to stay open
        /// for the bitmap's lifetime, and the network stream is closed when this method returns.
        /// </summary>
        private static Bitmap ReadBitmap(HttpWebResponse response, bool isGZip)
        {
            MemoryStream buffer = new MemoryStream();
            using (Stream raw = response.GetResponseStream())
            {
                if (isGZip)
                {
                    using (GZipStream unzipped = new GZipStream(raw, CompressionMode.Decompress))
                    {
                        unzipped.CopyTo(buffer);
                    }
                }
                else
                {
                    raw.CopyTo(buffer);
                }
            }
            buffer.Position = 0;
            return new Bitmap(buffer);
        }
        #endregion

    }
}
必须:连接对象 HttpWebRequest // 设置:是否长连接、是否跟随跳转 // 实例化:连接对象Cookies池 模拟登录或模拟提交 // HTTP Header
必须:接收对象HttpWebResponse // 设置:接收对象Cookies request对象获得的cookies 或者自写 // 读取数据:response.GetResponseStream()