对于一个合格的爬虫程序来说,cookie和proxy是必须解决的问题,相信很多朋友都遇到过类似问题。
wininet.dll中包含很多Win32下与网络有关的函数,涉及Internet、FTP、cookie、proxy等。比如百度知道和新浪微博的登录信息可以保存N天,但如果你在登录后把系统时间改为2天后,登录信息就失效了;使用InternetSetCookie可以自己设置cookie的过期日期。首先在IE中登录,登录时选择信息保存2周,然后运行如下代码;运行之后你可以把日期调整到2012年看效果:
测试结果:应用以下代码不必担心cookie过期的问题,广大虫友们让你的爬虫强大起来吧!
using System;
using System.Collections.Generic;
using System.ComponentModel;
using System.Data;
using System.Drawing;
using System.Linq;
using System.Text;
using System.Windows.Forms;
using System.Runtime.InteropServices;
using System.Text.RegularExpressions;
using Common;
using mshtml;
namespace spider
{
/// <summary>
/// Demo form that navigates the form's <c>webBrowser</c> control to a site,
/// reads the cookies of the loaded page, and writes each cookie back through
/// WinINet with a far-future expiry date so the stored login does not expire.
/// </summary>
public partial class WininetTest : Form
{
    // Win32 error reported by InternetGetCookie when the supplied buffer is too small.
    private const int ErrorInsufficientBuffer = 122;

    /// <summary>
    /// WinINet <c>InternetGetCookie</c>: retrieves the cookie data stored for a URL.
    /// </summary>
    /// <param name="url">URL whose cookies are requested.</param>
    /// <param name="name">Not used by this form; <c>null</c> is passed.</param>
    /// <param name="data">Buffer that receives the cookie data.</param>
    /// <param name="dataSize">Size of <paramref name="data"/> on input; updated by the call.</param>
    /// <returns><c>true</c> on success; otherwise <c>false</c> (see <see cref="Marshal.GetLastWin32Error"/>).</returns>
    [DllImport("wininet.dll", CharSet = CharSet.Auto, SetLastError = true)]
    public static extern bool InternetGetCookie(string url, string name, StringBuilder data, ref int dataSize);

    /// <summary>
    /// WinINet <c>InternetSetCookie</c>: creates a cookie for the given URL.
    /// </summary>
    /// <param name="lpszUrlName">URL the cookie is set for.</param>
    /// <param name="lbszCookieName">Name of the cookie.</param>
    /// <param name="lpszCookieData">Cookie value, optionally followed by attributes such as <c>;expires=...</c>.</param>
    /// <returns><c>true</c> on success; otherwise <c>false</c>.</returns>
    [DllImport("wininet.dll", CharSet = CharSet.Auto, SetLastError = true)]
    public static extern bool InternetSetCookie(string lpszUrlName, string lbszCookieName, string lpszCookieData);

    /// <summary>
    /// Creates the form and its designer-generated controls.
    /// </summary>
    public WininetTest()
    {
        InitializeComponent();
    }

    /// <summary>
    /// Navigates the browser control to the target site when the form loads.
    /// </summary>
    private void WininetTest_Load(object sender, EventArgs e)
    {
        // Alternative targets tried by the author:
        //   http://wenwen.soso.com/
        //   http://weibo.com/
        string url = "http://zhidao.baidu.com/";
        this.webBrowser.Navigate(url);
    }

    /// <summary>
    /// Shows the current page's cookie string in <c>txtCookie</c>.
    /// </summary>
    private void btnGetCookie_Click(object sender, EventArgs e)
    {
        this.txtCookie.Text = GetCookie();
    }

    /// <summary>
    /// Deletes the persisted cookie files whose name contains "zhidao", then
    /// re-creates every cookie of the current page with an expiry date in 2099
    /// and appends the rewritten cookies to <c>txtNewCookie</c>.
    /// </summary>
    private void btnSetCookie_Click(object sender, EventArgs e)
    {
        // Remove the old cookie files.
        string cookieFolder = Environment.GetFolderPath(Environment.SpecialFolder.Cookies);
        if (!string.IsNullOrEmpty(cookieFolder) && System.IO.Directory.Exists(cookieFolder))
        {
            foreach (string fileName in System.IO.Directory.GetFiles(cookieFolder))
            {
                // Match on the file name only, so a folder name can never trigger a delete.
                string shortName = System.IO.Path.GetFileName(fileName);
                if (shortName.IndexOf("zhidao", StringComparison.OrdinalIgnoreCase) < 0)
                {
                    continue;
                }

                try
                {
                    // Delete the file that was actually found, not a literal relative path.
                    System.IO.File.Delete(fileName);
                }
                catch (System.IO.IOException ex)
                {
                    // Best effort: the file may be in use; keep going.
                    System.Diagnostics.Debug.WriteLine("Could not delete cookie file " + fileName + ": " + ex.Message);
                }
                catch (UnauthorizedAccessException ex)
                {
                    System.Diagnostics.Debug.WriteLine("Could not delete cookie file " + fileName + ": " + ex.Message);
                }
            }
        }

        // Create the new, long-lived cookies.
        string currentCookies = GetCookie();
        if (string.IsNullOrEmpty(currentCookies) || this.webBrowser.Url == null)
        {
            return;
        }

        string pageUrl = this.webBrowser.Url.ToString();
        StringBuilder rewritten = new StringBuilder();
        foreach (string pair in currentCookies.Split(';'))
        {
            // Split on the FIRST '=' only: cookie values may themselves contain '='.
            int separator = pair.IndexOf('=');
            if (separator < 0)
            {
                continue;
            }

            // Pairs are separated by "; ", so the name carries a leading space.
            string name = pair.Substring(0, separator).Trim();
            if (name.Length == 0)
            {
                continue;
            }

            string value = pair.Substring(separator + 1) + ";expires=Sun,22-Feb-2099 00:00:00 GMT";
            InternetSetCookie(pageUrl, name, value);
            rewritten.Append(name).Append('=').Append(value).Append(';');
        }

        this.txtNewCookie.Text += rewritten.ToString();
    }

    /// <summary>
    /// Returns the cookie string of the document currently loaded in the browser
    /// control. WinINet is queried first as a probe for whether cookie data exists
    /// for the current URL; the text returned comes from the document itself.
    /// </summary>
    /// <returns>
    /// The document's cookie string, or <c>null</c> when no page is loaded or
    /// WinINet reports no cookie data for the URL.
    /// </returns>
    public string GetCookie()
    {
        if (this.webBrowser.Url == null || this.webBrowser.Document == null)
        {
            return null;
        }

        // Probe WinINet for existing cookie data.
        StringBuilder cookie = new StringBuilder(2048);
        int datasize = cookie.Capacity;
        bool found = InternetGetCookie(this.webBrowser.Url.ToString(), null, cookie, ref datasize);

        // A too-small probe buffer still means cookie data exists for the URL.
        if (!found && Marshal.GetLastWin32Error() == ErrorInsufficientBuffer)
        {
            found = true;
        }

        return found ? this.webBrowser.Document.Cookie : null;
    }

    /// <summary>
    /// Reads the rewritten cookie text from <c>txtNewCookie</c>; nothing is
    /// persisted yet (placeholder for save logic).
    /// </summary>
    private void btnSave_Click(object sender, EventArgs e)
    {
        string cookie = this.txtNewCookie.Text;
    }
}
}
以下是proxy的关键代码:
/// <summary>
/// Applies a proxy server through the WinINet <c>InternetSetOption</c> API.
/// </summary>
public class ProxyHelper
{
    /// <summary>
    /// WinINet <c>InternetSetOption</c>: sets an Internet option.
    /// </summary>
    /// <param name="hInternet">Handle to set the option on; this class passes <see cref="IntPtr.Zero"/>.</param>
    /// <param name="dwOption">Option identifier.</param>
    /// <param name="lpBuffer">Pointer to the option data.</param>
    /// <param name="lpdwBufferLength">Size of the option data.</param>
    /// <returns><c>true</c> on success; otherwise <c>false</c>.</returns>
    [DllImport("wininet.dll", SetLastError = true)]
    private static extern bool InternetSetOption(IntPtr hInternet, int dwOption, IntPtr lpBuffer, int lpdwBufferLength);

    /// <summary>
    /// Sets <paramref name="strProxy"/> as the proxy, with "local" as the bypass
    /// entry, by passing a filled proxy-info structure to WinINet.
    /// A failure is written to the debug output; no exception is thrown.
    /// </summary>
    /// <param name="strProxy">Proxy to use — presumably "host:port"; the format is not validated here.</param>
    public void RefreshIESettings(string strProxy)
    {
        const int INTERNET_OPTION_PROXY = 38;
        const int INTERNET_OPEN_TYPE_PROXY = 3;

        Struct_INTERNET_PROXY_INFO struct_IPI;
        struct_IPI.dwAccessType = INTERNET_OPEN_TYPE_PROXY;
        struct_IPI.proxy = IntPtr.Zero;
        struct_IPI.proxyBypass = IntPtr.Zero;
        IntPtr intptrStruct = IntPtr.Zero;

        try
        {
            // Fill in the structure with unmanaged ANSI copies of the strings.
            struct_IPI.proxy = Marshal.StringToHGlobalAnsi(strProxy);
            struct_IPI.proxyBypass = Marshal.StringToHGlobalAnsi("local");

            // Allocate unmanaged memory and copy the structure into it.
            int structSize = Marshal.SizeOf(struct_IPI);
            intptrStruct = Marshal.AllocCoTaskMem(structSize);

            // fDeleteOld must be false: the freshly allocated block holds no valid old data.
            Marshal.StructureToPtr(struct_IPI, intptrStruct, false);

            if (!InternetSetOption(IntPtr.Zero, INTERNET_OPTION_PROXY, intptrStruct, structSize))
            {
                // Read the error right after the call, before anything else can overwrite it.
                int win32Error = Marshal.GetLastWin32Error();
                System.Diagnostics.Debug.WriteLine("InternetSetOption(INTERNET_OPTION_PROXY) failed, Win32 error " + win32Error);
            }
        }
        finally
        {
            // Release every unmanaged allocation, whether or not the call succeeded.
            if (intptrStruct != IntPtr.Zero)
            {
                Marshal.FreeCoTaskMem(intptrStruct);
            }
            if (struct_IPI.proxy != IntPtr.Zero)
            {
                Marshal.FreeHGlobal(struct_IPI.proxy);
            }
            if (struct_IPI.proxyBypass != IntPtr.Zero)
            {
                Marshal.FreeHGlobal(struct_IPI.proxyBypass);
            }
        }
    }
}
由于soso问问的cookie在服务端有独立的运行模式,目前还没有找到合适的解决方案。