• 爬虫程序cookie和proxy的解决方法


    对于一个合格的爬虫程序来说,cookie和proxy是必须解决的问题,相信很多朋友都遇到过类似问题。

    wininet.dll中包含很多win32下和网络有关的函数,包括internet、ftp、cookie、proxy等。比如百度知道和新浪微博的登录信息可以保存N天,但如果你在登录后把系统时间改为2天后,登录信息就失效了;使用InternetSetCookie可以自己设置cookie的过期日期。首先在IE中登录,登录时选择信息保存2周,然后运行如下代码;运行之后你可以把日期调整到2012年看效果:

    测试结果:应用以下代码不必担心cookie过期的问题,广大虫友们让你的爬虫强大起来吧!

    using System;
    using System.Collections.Generic;
    using System.ComponentModel;
    using System.Data;
    using System.Drawing;
    using System.Linq;
    using System.Text;
    using System.Windows.Forms;
    using System.Runtime.InteropServices;
    using System.Text.RegularExpressions;
    using Common;
    using mshtml;

    namespace spider
    {
        public partial class WininetTest : Form
        {

            /// <summary>
            /// 获取cookie        

            /// </summary>
            [DllImport("wininet.dll", CharSet = CharSet.Auto, SetLastError = true)]
            public static extern bool InternetGetCookie(string url, string name, StringBuilder data, ref int dataSize);
            /// <summary>  
            /// 设置cookie  
            /// </summary>  
            [DllImport("wininet.dll", CharSet = CharSet.Auto, SetLastError = true)]
            public static extern bool InternetSetCookie(string lpszUrlName, string lbszCookieName, string lpszCookieData);


            public WininetTest()
            {
                InitializeComponent();
            }

            private void WininetTest_Load(object sender, EventArgs e)
            {
               //string url = "http://wenwen.soso.com/";
               string url = "http://zhidao.baidu.com/";
              // string url = "http://weibo.com/";

         
                this.webBrowser.Navigate(url);


            }

            private void btnGetCookie_Click(object sender, EventArgs e)
            {
                this.txtCookie.Text = GetCookie();

            }

            private void btnSetCookie_Click(object sender, EventArgs e)
            {
                ////删除旧的
                foreach (string fileName in System.IO.Directory.GetFiles(System.Environment.GetFolderPath(Environment.SpecialFolder.Cookies)))
                {

                   if (fileName.ToLower().IndexOf("zhidao") > 0)
                    {

                        System.IO.File.Delete("zhidao");

                   }

                  //  if (fileName.ToLower().IndexOf("soso") > 0)
                   // {

                    //    System.IO.File.Delete("soso");

                  //  }

                }

                //生成新的  

                foreach (string c in GetCookie().Split(';'))
                {

                    string[] item = c.Split('=');
                    if (item.Length == 2)
                    {
                        string name = item[0];
                        string value = item[1] + ";expires=Sun,22-Feb-2099 00:00:00 GMT";
                        InternetSetCookie(webBrowser.Url.ToString(), name, value);
                        this.txtNewCookie.Text += name + "=" + value + ";";
                    }
                   
                }
               
            }

            public string GetCookie()
            {
                //获取旧的  

                StringBuilder cookie = new StringBuilder(new String(' ', 2048));
              
                int datasize = cookie.Length;

                bool b = InternetGetCookie(webBrowser.Url.ToString(), null, cookie, ref datasize);
                if (b)
                {
                    return webBrowser.Document.Cookie;
                }
                return null;
            }

            private void btnSave_Click(object sender, EventArgs e)
            {
                string cookie = this.txtNewCookie.Text;
            }

          }
    }

    以下是proxy的关键代码(结构体 Struct_INTERNET_PROXY_INFO 的定义未在此列出)。

     public class ProxyHelper
        {
            [DllImport("wininet.dll", SetLastError = true)]
            private static extern bool InternetSetOption(IntPtr hInternet, int dwOption, IntPtr lpBuffer, int lpdwBufferLength);

            public void RefreshIESettings(string strProxy)
            {
                const int INTERNET_OPTION_PROXY = 38;
                const int INTERNET_OPEN_TYPE_PROXY = 3;

                Struct_INTERNET_PROXY_INFO struct_IPI;

                // Filling in structure
                struct_IPI.dwAccessType = INTERNET_OPEN_TYPE_PROXY;
                struct_IPI.proxy = Marshal.StringToHGlobalAnsi(strProxy);
                struct_IPI.proxyBypass = Marshal.StringToHGlobalAnsi("local");

                // Allocating memory
                IntPtr intptrStruct = Marshal.AllocCoTaskMem(Marshal.SizeOf(struct_IPI));

                // Converting structure to IntPtr
                Marshal.StructureToPtr(struct_IPI, intptrStruct, true);

                bool iReturn = InternetSetOption(IntPtr.Zero, INTERNET_OPTION_PROXY, intptrStruct, Marshal.SizeOf(struct_IPI));
            }

        }

    由于soso问问的cookie在服务端有独立的运行模式,目前还没有找到合适的解决方案。

  • 相关阅读:
    PTA 程序设计题(数据结构第一章)
    (考研)计算机组成原理之计算机系统概论
    C语言复习
    vs2019 scanf 解决 C4996问题
    数据结构之链表
    数据结构之表、栈、队列
    数据结构之算法分析
    数据结构泛型之初接触
    数据结构之递归
    学习参考
  • 原文地址:https://www.cnblogs.com/nevergiveupblog/p/2088139.html
Copyright © 2020-2023  润新知