• 爬虫软件开发要用到的代码


    1.时间戳转为C#格式时间

            /// <summary>
            /// Converts a Unix timestamp expressed in milliseconds into a local <see cref="DateTime"/>.
            /// </summary>
            /// <param name="timeStamp">
            /// Decimal string holding the milliseconds elapsed since 1970-01-01T00:00:00Z.
            /// A null or empty string is treated as 0 (the epoch itself).
            /// </param>
            /// <returns>The corresponding instant in the machine's local time zone (Kind = Local).</returns>
            /// <exception cref="FormatException"><paramref name="timeStamp"/> is not a valid integer.</exception>
            /// <exception cref="OverflowException">The value does not fit into a 64-bit tick count.</exception>
            private DateTime GetTime(string timeStamp)
            {
                // Appending four zeros multiplies by 10,000, i.e. milliseconds -> 100 ns ticks.
                long elapsedTicks = long.Parse(timeStamp + "0000");

                // Do the arithmetic in UTC and convert once at the end. Adding the span to an
                // already-localized epoch would reuse the January 1970 offset and be wrong by the
                // DST delta for dates that fall under a different offset.
                DateTime epochUtc = new DateTime(1970, 1, 1, 0, 0, 0, DateTimeKind.Utc);
                return epochUtc.AddTicks(elapsedTicks).ToLocalTime();
            }

    2.获取验证码

     /// <summary>
     /// Downloads a captcha image and records the cookie that came back with it
     /// in <c>cookieCheckCode</c>.
     /// </summary>
     /// <param name="url">
     /// Address of the captcha. When null, empty or whitespace, the Baihe default
     /// captcha URL is used with the current tick count appended as a cache buster.
     /// </param>
     /// <returns>The image decoded from the response bytes.</returns>
     public Image GetImg(string url)
     {
         string target = url;
         if (string.IsNullOrWhiteSpace(target))
         {
             target = string.Format("http://ms.baihe.com/checkcode/defaultImageService?0.{0}", DateTime.Now.Ticks);
         }

         // Start from a clean captcha cookie on every request.
         cookieCheckCode = "";

         var request = new HttpItem();
         request.URL = target;
         request.Method = "get";
         request.Expect100Continue = false; // original note: use this when going through a proxy
         request.Cookie = _cookie;
         request.Timeout = 100000;
         request.ReadWriteTimeout = 30000;
         request.UserAgent = "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:28.0) Gecko/20100101 Firefox/28.0";
         request.ContentType = "text/html";
         request.ResultType = ResultType.Byte;

         var response = http.GetHtml(request);

         string received = response.Cookie;
         if (received != null)
         {
             if (received.StartsWith("PHPSESSID"))
             {
                 // Keep only what follows the first comma (the whole string when there is none).
                 cookieCheckCode += received.Substring(received.IndexOf(',') + 1);
             }
             else
             {
                 cookieCheckCode += received;
             }
         }

         return byteArrayToImage(response.ResultByte);
     }
    
    
            /// <summary>
            /// Decodes encoded image bytes into an <see cref="Image"/> that stays usable
            /// after this method returns.
            /// </summary>
            /// <param name="Bytes">Encoded image data (for example PNG, JPEG or GIF bytes).</param>
            /// <returns>
            /// A new <see cref="Bitmap"/> containing a copy of the decoded picture. Because it is a
            /// copy, only the first frame is kept and the original raw format is not retained.
            /// </returns>
            /// <exception cref="ArgumentNullException"><paramref name="Bytes"/> is null.</exception>
            /// <exception cref="ArgumentException">The bytes are not a valid image.</exception>
            private Image byteArrayToImage(byte[] Bytes)
            {
                using (var ms = new MemoryStream(Bytes))
                using (var decoded = Image.FromStream(ms, true))
                {
                    // An image created by FromStream needs its stream to stay open for as long as
                    // the image lives. Returning it from inside the using block handed callers an
                    // image whose backing stream was already disposed, which surfaces later as
                    // "A generic error occurred in GDI+". Copying into a fresh Bitmap detaches the
                    // result from the stream so both temporaries can be disposed safely.
                    return new Bitmap(decoded);
                }
            }

    3.正则表达式使用类

     // Extract (uid, nickname) pairs from anchors shaped like
     //   ...&uid=12345" class="yellow">nickname</a>
     // \d+ captures the numeric uid (the backslash had been lost, leaving a literal "d+").
     // .+? is lazy so that several anchors on one line yield one match each instead of
     // a single match running to the last </a>.
     Regex re = new Regex(@"&uid=(\d+)"" class=""yellow"">(.+?)</a>", RegexOptions.None);
     MatchCollection mc = re.Matches(resulthtml.Html);

     var list = new List<User>();
     foreach (Match match in mc)
     {
         // Group 1 = uid digits, group 2 = anchor text.
         list.Add(new User
         {
             uid = match.Groups[1].Value,
             nickname = match.Groups[2].Value
         });
     }

    4.获取时间

            /// <summary>
            /// Returns the current time as whole milliseconds elapsed since
            /// 1970-01-01T00:00:00Z, the same quantity JavaScript's <c>Date.getTime()</c> reports.
            /// </summary>
            /// <returns>Milliseconds since the Unix epoch, truncated toward zero.</returns>
            public long GetTimeLikeJS()
            {
                var epochUtc = new DateTime(1970, 1, 1, 0, 0, 0, DateTimeKind.Utc);

                // Read the UTC clock directly rather than taking local time and converting it
                // back; this avoids a needless round trip through the local time zone.
                TimeSpan sinceEpoch = DateTime.UtcNow - epochUtc;
                return (long)sinceEpoch.TotalMilliseconds;
            }

    5.另一线程操作主线程的控件

               // Marshal the update to the thread that owns the control; WinForms controls
               // must not be touched directly from a worker thread.
               this.Invoke(new Action(() =>
               {
                   this.textBox1.Text = "登录成功";
               }));

     .net 2.0里的匿名委托

     // Same idea using only C# 2.0 syntax: an anonymous method wrapped in an
     // EventHandler delegate (no lambda, no Action type required).
     this.Invoke(new EventHandler(delegate
     {
         button.Text = i.ToString();
     }));

    另:BackgroundWorker 组件用来执行诸如数据库事务、文件下载等耗时的异步操作

    6.Json对象转换

     // The payload is a JSON array; its first element carries "total" and "list".
     JArray array = JArray.Parse(json);
     JToken token = array[0];
     JToken total = token["total"];

     // Turn each entry of "list" into a typed UserInfo by re-serializing the entry.
     foreach (JToken entry in token["list"].Children())
     {
         string entryJson = entry.ToString();
         UserInfo user = JsonConvert.DeserializeObject<UserInfo>(entryJson);
         int age = user.age;
     }

    7.赶集网登录

    HttpHelper http = new HttpHelper();

    // Escape the credentials so characters such as '&', '#', '+' or spaces cannot
    // corrupt the query string.
    // NOTE(review): GetTime() with no arguments is not defined in the visible snippets;
    // presumably a JS-style millisecond timestamp helper (e.g. GetTimeLikeJS) — confirm.
    string loginUrl = string.Format(
        "https://passport.ganji.com/login.php?callback=jQuery{0}&username={1}&password={2}",
        GetTime(),
        Uri.EscapeDataString(textBox2.Text),
        Uri.EscapeDataString(textBox3.Text));

    HttpItem item = new HttpItem()
    {
        URL = loginUrl,
        Referer = "https://passport.ganji.com/login.php?next=/", // referring page
    };
    // Original author's note: this header is the key part, the Ganji server only
    // accepts AJAX requests.
    item.Header.Add("x-requested-with", "XMLHttpRequest");
    HttpResult result = http.GetHtml(item);

    // Strip the path attribute and encode commas so the value can be sent back as a
    // Cookie header. A missing cookie becomes an empty string instead of throwing.
    string cookie = (result.Cookie ?? string.Empty).Replace("path=/;", "").Replace(",", "%2c");

    // After logging in, fetch the "my posts" page to verify the session works.
    item = new HttpItem()
    {
        URL = "http://www.ganji.com/vip/my_post_list.php",
        Method = "get",
        Cookie = cookie,
    };
    result = http.GetHtml(item);
    string html = result.Html;

    // The escape sequence had been expanded into a raw line break inside the literal,
    // which is not valid in a regular C# string.
    textBox1.Text = html + "\r\n";

    8.C# Unicode编码/解码

    http://www.cnblogs.com/Rolends/archive/2011/09/22/2185276.html

    // Use this when a JSON POST body arrives garbled or the server cannot read it:
    // send the payload as raw UTF-8 bytes (PostdataByte) rather than as a string (Postdata).
    HttpItem item = new HttpItem();
    item.URL = url;                       // required, e.g. "http://159.142.15.196:8089/api/Users/Post_ErpUsers"
    item.Method = "post";
    item.IsToLower = false;               // do not lower-case the returned HTML
    item.Cookie = "";
    item.Referer = "";
    item.Timeout = 100000;                // connection timeout
    item.ReadWriteTimeout = 30000;        // timeout for writing the POST data
    item.UserAgent = "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)";
    item.ContentType = "application/json";
    item.Allowautoredirect = false;       // do not follow redirects automatically
    item.PostdataByte = Encoding.UTF8.GetBytes(json);
    item.PostDataType = PostDataType.Byte;

    9.C#解析Html组件

    组件名称:HtmlAgilityPack

    xpath获取元素:"//*[@class="dc-intro"]/ul/li"  ,查找class="dc-intro"下ul下的li

                // Load the detail page and read its fields with XPath queries.
                // The inner double quotes of each XPath are escaped; unescaped they end the
                // C# string literal early.
                // NOTE(review): an XPath beginning with "//" is absolute, so these queries search
                // the whole document rather than only the "form2" subtree — confirm this is intended.
                HtmlWeb web = new HtmlWeb();
                var doc = web.Load(string.Format("{0}/soft/{1}.html", baseUrl, id));
                var node = doc.GetElementbyId("form2");
                var titleNode = node.SelectSingleNode("//*[@class=\"rr-title dc-title clearfix\"]/h1");
                string title = titleNode.InnerText;

                // Query the intro list once and index into it, instead of re-running the
                // same XPath for every field.
                var introItems = node.SelectNodes("//*[@class=\"dc-intro\"]/ul/li");
                string kfyy = introItems[0].LastChild.InnerText; // development language
                string slsj = introItems[1].LastChild.InnerText; // date listed
                string sqxy = introItems[2].LastChild.InnerText; // license
                string czxt = introItems[3].LastChild.InnerText; // supported operating systems
                string rjpjHtml = introItems[4].InnerHtml;       // rating markup

                // The rating level is the number of "xx01.png" occurrences in the rating markup.
                var level = Regex.Matches(rjpjHtml, "xx01.png").Count;

                string content = node.SelectSingleNode("//*[@class=\"markdown-body entry-content\"]").InnerHtml;

                // First two links inside the dc-info element.
                var infoLinks = node.SelectNodes("//*[@class=\"dc-info\"]/a");
                string rjsy = infoLinks[0].Attributes["href"].Value;
                string rjxz = infoLinks[1].Attributes["href"].Value;

     10.上传文件

            /// <summary>
            /// Uploads <c>c:\test.txt</c> as a hand-built multipart/form-data POST, using a cookie
            /// string read from disk.
            /// </summary>
            /// <remarks>
            /// The body has three form parts (<c>uid</c>, <c>hash</c>, <c>Filedata</c>) followed by the
            /// closing boundary. Every line break in the envelope must be CRLF.
            /// </remarks>
            private static void UploadFile()
            {
                const string CrLf = "\r\n";

                HttpHelper Http = new HttpHelper();

                // NOTE(review): the directory separators of this path were lost in the original
                // listing; they are restored here on a best-guess basis — confirm the real location.
                string cookieStr = File.ReadAllText(@"E:\下载\UploadImageFile\UploadImageFile\bs_cookie.txt");

                #region Variables
                byte[] UploadBuffers = null;
                string BoundStr = "----WebKitFormBoundarynp7wXmB7Ntr1BcsX"; // taken from a captured request
                StringBuilder UploadBuf = new StringBuilder();
                #endregion

                #region Head: form fields and the file part header
                // Part 1: uid = 1
                UploadBuf.Append("--" + BoundStr + CrLf);
                UploadBuf.Append(@"Content-Disposition: form-data; name=""uid""" + CrLf + CrLf + "1" + CrLf);

                // Part 2: hash
                UploadBuf.Append("--" + BoundStr + CrLf + @"Content-Disposition: form-data; name=""hash""" + CrLf + CrLf);
                UploadBuf.Append("dd865aaa4760a2715e5c5660754f7a7f");

                // Part 3: file header; the blank line ends the part headers and the raw bytes follow.
                UploadBuf.Append(CrLf + "--" + BoundStr + CrLf);
                UploadBuf.Append(@"Content-Disposition: form-data; name=""Filedata""; filename=""test.txt""" + CrLf);
                UploadBuf.Append("Content-Type: text/plain" + CrLf + CrLf);
                byte[] HeadBytes = Encoding.ASCII.GetBytes(UploadBuf.ToString());
                #endregion

                #region File payload
                byte[] PicBytes = File.ReadAllBytes(@"c:\test.txt");
                #endregion

                #region Tail: closing boundary
                UploadBuf.Clear();
                UploadBuf.Append(CrLf + "--" + BoundStr + "--" + CrLf);
                byte[] TailBytes = Encoding.ASCII.GetBytes(UploadBuf.ToString());
                #endregion

                #region Assemble head + payload + tail
                UploadBuffers = ComposeArrays(HeadBytes, PicBytes);
                UploadBuffers = ComposeArrays(UploadBuffers, TailBytes);
                #endregion

                var item = new HttpItem()
                  {
                      URL = "http://www.xxx.com/misc.php?mod=swfupload&operation=upload&simple=1",
                      Method = "POST",
                      // The boundary announced here omits the leading "--" used inside the body.
                      ContentType = "multipart/form-data; boundary=" + BoundStr,
                      PostDataType = PostDataType.Byte,
                      PostEncoding = Encoding.UTF8,
                      PostdataByte = UploadBuffers,
                      ResultType = ResultType.String,
                      Cookie = cookieStr
                  };
                var result = Http.GetHtml(item);

                // Server response body; kept in a local so it can be inspected while debugging.
                var a = result.Html;
            }
    
            /// <summary>
            /// Concatenates two byte arrays into a newly allocated array.
            /// </summary>
            /// <param name="Array1">Bytes placed first.</param>
            /// <param name="Array2">Bytes placed immediately after <paramref name="Array1"/>.</param>
            /// <returns>A new array holding the bytes of both inputs, in order.</returns>
            public static byte[] ComposeArrays(byte[] Array1, byte[] Array2)
            {
                int headLength = Array1.Length;
                int tailLength = Array2.Length;

                var joined = new byte[headLength + tailLength];
                Buffer.BlockCopy(Array1, 0, joined, 0, headLength);
                Buffer.BlockCopy(Array2, 0, joined, headLength, tailLength);
                return joined;
            }
    

      

  • 相关阅读:
    Python——协程
    Linux——raid介绍
    Linux——网关介绍
    Linux——inode节点介绍
    算法:排序加二分查找
    Mysql主从复制作用和工作原理详解
    Selenium中错误:selenium.common.exceptions.ElementClickInterceptedException: Message: element click inte
    redis.exceptions.DataError: Invalid input of type: 'dict'. Convert to a byte, string or number first
    Python之requests错误Expecting value: line 1 column 1 (char 0)
    简述cookies和session的区别
  • 原文地址:https://www.cnblogs.com/bqh10086/p/5022207.html
Copyright © 2020-2023  润新知