• 正则抓取网页所有href和src


    先抓取目标页面的 HTML,再用正则表达式匹配其中所有的 href 和 src 属性。

    // User-Agent header value sent by getVerificationCode (a desktop Firefox 29 string).
    string UserAgent = "Mozilla/5.0 (Windows NT 5.2; rv:29.0) Gecko/20100101 Firefox/29.0";
        // Content-Type header of the most recent successful download; assigned in getVerificationCode.
        string ContentType = "";
    
        // Base address of the upstream site; Application_BeginRequest resolves each request path against it.
        Uri strReqUrl = new Uri("http://m.lhrb.ufstone.net/");
        /// <summary>
        /// Handles an incoming request by downloading the same path from
        /// <c>strReqUrl</c> and writing the downloaded page back to the client.
        /// </summary>
        /// <param name="sender">Event source (not used).</param>
        /// <param name="e">Event data (not used).</param>
        protected void Application_BeginRequest(object sender, EventArgs e)
        {
            // Resolve the requested path and query against the upstream base address.
            Uri target = new Uri(strReqUrl, Request.RawUrl);
            byte[] body = getVerificationCode(target);

            // getVerificationCode returns null when the download fails. Decoding null
            // would throw ArgumentNullException, so report the upstream failure instead.
            if (body == null)
            {
                Response.StatusCode = 502; // Bad Gateway
                Response.End();
                return;
            }

            // NOTE(review): the page is always decoded as gb2312, regardless of the
            // charset the upstream server declares — confirm this matches the target site.
            StringBuilder strHtml = new StringBuilder(Encoding.GetEncoding("gb2312").GetString(body));
            GetHtmlUrl(ref strHtml);
            Response.Write(strHtml.ToString());
            Response.End();
        }
        /// <summary>
        /// Downloads the raw bytes at <paramref name="url"/> using browser-like request
        /// headers and stores the response's Content-Type header in <c>ContentType</c>.
        /// </summary>
        /// <param name="url">Absolute address to download.</param>
        /// <returns>
        /// The response body, or <c>null</c> when <paramref name="url"/> is null or
        /// relative, or when the download fails.
        /// </returns>
        public byte[] getVerificationCode(Uri url)
        {
            // Keep the "null on failure" contract for addresses that cannot be requested
            // (AbsoluteUri throws on a relative Uri).
            if (url == null || !url.IsAbsoluteUri)
            {
                return null;
            }

            // WebClient is IDisposable; the using block releases it on every exit path.
            using (WebClient client = new WebClient())
            {
                client.Headers.Add("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8");
                client.Headers.Add("Accept-Language", "    zh-cn,zh;q=0.8,en-us;q=0.5,en;q=0.3");
                client.Headers.Add("User-Agent", this.UserAgent);
                client.Credentials = CredentialCache.DefaultCredentials;
                try
                {
                    byte[] pageData = client.DownloadData(url.AbsoluteUri);
                    ContentType = client.ResponseHeaders["Content-Type"];
                    return pageData;
                }
                catch (WebException)
                {
                    // Network or protocol failure: callers treat null as "nothing downloaded".
                    return null;
                }
            }
        }
    // "View Code" (blog code-folding label; not part of the program)
        /// <summary>
        /// Scans the HTML text for <c>src</c> and <c>href</c> attributes and collects
        /// every full match (attribute name, <c>=</c>, and value), one per line.
        /// </summary>
        /// <param name="strHtml">HTML to scan; it is read but not modified.</param>
        void GetHtmlUrl(ref StringBuilder strHtml)
        {
            if (strHtml == null)
            {
                return;
            }

            // Verbatim string: backslashes reach the regex engine unchanged and each ""
            // stands for one literal double quote. The value is either a double-quoted
            // string or a run of non-whitespace characters.
            // NOTE(review): the explicit group number 1 coincides with the first unnamed
            // group (src|href) — confirm which capture is wanted before using Groups[1].
            string reg = @"(src|href)\s*=\s*(?:""(?<1>[^""]*)""|(?<1>\S+))";
            Regex r = new Regex(reg, RegexOptions.None);

            // NOTE(review): the collected list is not returned or written anywhere yet;
            // an earlier, disabled draft rewrote each matched URL relative to strReqUrl.
            StringBuilder sb = new StringBuilder();
            for (Match match = r.Match(strHtml.ToString()); match.Success; match = match.NextMatch())
            {
                sb.Append(match.Value).Append("\n"); // e.g. href="/index.html"
            }
        }
  • 相关阅读:
    shell 实现word count
    jvm 参数调优
    Java注解处理器(转)
    JVM -XX: 参数介绍(转)
    如何在Linux下重命名多个文件
    Kafka学习之broker配置(0.8.1版)(转)
    linux 历史命令用法(转)
    hive-site.xml 参数设置
    Hadoop-2.2.0 + Hbase-0.96.2 + Hive-0.13.1(转)
    正则表达式通过Unicode属性匹配
  • 原文地址:https://www.cnblogs.com/xuxiaoshuan/p/3817662.html
Copyright © 2020-2023  润新知