• 用正则表达式从网页里面提取视频地址


    //用正则表达式从网页里面提取视频地址

    //获得一个页面地址,拿到页面html,然后正则表达式去匹配视频地址

    //详细说明请看下面代码中的注释。


     /// <summary>
     /// Fetches a video page and extracts the embedded player (.swf) address
     /// from its HTML with a site-specific regular expression.
     /// </summary>
     /// <remarks>
     /// This class calls <c>GetSite(string)</c> and
     /// <c>GetHtmlWithoutCompress(string)</c>; their definitions are not part
     /// of this listing and must be supplied alongside it.
     /// </remarks>
     public class WebVideo
     {
         /// <summary>
         /// Timeout applied to the page request, in milliseconds.
         /// </summary>
         private const int RequestTimeoutMilliseconds = 9000;

         /// <summary>
         /// Matches the Youku player address.
         /// </summary>
         private static readonly Regex YouKuPlayerPattern =
             new Regex(@"http://player\.youku\.com/player\.php/sid/[\w]{13}/v\.swf");

         /// <summary>
         /// Matches the Ku6 player address.
         /// </summary>
         private static readonly Regex Ku6PlayerPattern =
             new Regex(@"http://player\.ku6\.com/refer/[\w]{16}/v\.swf");

         /// <summary>
         /// Matches the Tudou player address.
         /// </summary>
         private static readonly Regex TuDouPlayerPattern =
             new Regex(@"http://js\.tudouui\.com/bin/player_online/[\w]+\.swf");

         /// <summary>
         /// Address of the video page (Youku, Ku6, Tudou, ...).
         /// </summary>
         private string _pageUrl;

         /// <summary>
         /// Whether the page is fetched with compression enabled.
         /// </summary>
         private bool _isCompressed;

         /// <summary>
         /// Site the page belongs to; selects the regular expression to use.
         /// </summary>
         private VideoSite _site;


         /// <summary>
         /// Creates an instance without a page address. The fields keep their
         /// default values (null address, no compression, first enum member).
         /// </summary>
         public WebVideo ()
         {
         }


         /// <summary>
         /// Creates an instance for the given video page.
         /// </summary>
         /// <param name="pageUrl">Address of the video page; surrounding whitespace is trimmed.</param>
         /// <param name="isCompressed">Whether to request the page with compression enabled.</param>
         /// <exception cref="ArgumentNullException"><paramref name="pageUrl"/> is null.</exception>
         public WebVideo ( string pageUrl, bool isCompressed )
         {
             if (pageUrl == null)
             {
                 // Fail with a descriptive exception instead of a NullReferenceException from Trim().
                 throw new ArgumentNullException("pageUrl");
             }

             this._pageUrl = pageUrl.Trim();
             this._isCompressed = isCompressed;
             this._site = this.GetSite(_pageUrl);
         }


         /// <summary>
         /// Downloads the HTML source of a page, asking the server for gzip or
         /// deflate compression and decompressing the answer when needed.
         /// </summary>
         /// <param name="Url">Address of the page to download.</param>
         /// <returns>
         /// The page text, or an empty string when the request fails. Failures
         /// are written to the console rather than thrown.
         /// </returns>
         public string GetWebContent ( string Url )
         {
             string strResult = "";
             try
             {
                 HttpWebRequest request = (HttpWebRequest)WebRequest.Create(Url);
                 request.Accept = "*/*";
                 request.Headers.Set("Pragma", "no-cache");
                 request.Timeout = RequestTimeoutMilliseconds;
                 request.UserAgent = "TaoCaiSpider1.0 Kevin-Gu's spider";
                 request.Headers.Add("Accept-Encoding", "gzip,deflate");

                 // Dispose the response so the underlying connection is released
                 // even when decoding or reading fails.
                 using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
                 {
                     // Invariant lower-casing: the header value is a protocol token,
                     // not text in the user's language.
                     string compressMode = (response.ContentEncoding ?? string.Empty).ToLowerInvariant();
                     Console.WriteLine(compressMode);

                     Encoding encode = ResolveEncoding(response.CharacterSet);

                     using (Stream decompressedStream = OpenDecodedStream(response, compressMode))
                     using (StreamReader streamReader = new StreamReader(decompressedStream, encode))
                     {
                         strResult = streamReader.ReadToEnd();
                     }
                 }
             }
             catch (Exception ex)
             {
                 // Deliberate best effort: report the problem and return an empty page.
                 Console.WriteLine("error occored:" + ex.Message);
             }
             return strResult;
         }

         /// <summary>
         /// Wraps the response stream in the decompressor matching the
         /// Content-Encoding value, or returns it unchanged for any other value.
         /// </summary>
         private static Stream OpenDecodedStream ( HttpWebResponse response, string compressMode )
         {
             Stream raw = response.GetResponseStream();

             if (compressMode == "gzip")
             {
                 return new GZipStream(raw, CompressionMode.Decompress);
             }

             if (compressMode == "deflate")
             {
                 return new DeflateStream(raw, CompressionMode.Decompress);
             }

             // The original author noted that apparently only Youku served compressed pages.
             return raw;
         }

         /// <summary>
         /// Maps the charset announced by the server to an <see cref="Encoding"/>,
         /// falling back to UTF-8 when the name is missing or not recognised.
         /// </summary>
         private static Encoding ResolveEncoding ( string charset )
         {
             if (!string.IsNullOrEmpty(charset))
             {
                 try
                 {
                     return Encoding.GetEncoding(charset.Trim().Trim('"', '\''));
                 }
                 catch (ArgumentException)
                 {
                     // Unknown charset name: use the fallback below.
                 }
                 catch (NotSupportedException)
                 {
                     // Code page not available on this platform: use the fallback below.
                 }
             }

             return Encoding.UTF8;
         }


         /// <summary>
         /// Extracts the player address from page HTML using the regular
         /// expression that belongs to this instance's site.
         /// </summary>
         /// <param name="htmlContent">HTML source of the video page.</param>
         /// <returns>
         /// The first matching address, or an empty string when the HTML is null
         /// or empty, the site has no pattern, or nothing matches.
         /// </returns>
         public string GetVideoFileUrl (string htmlContent)
         {
             if (string.IsNullOrEmpty(htmlContent))
             {
                 return string.Empty;
             }

             Regex pattern = GetPatternFor(_site);
             if (pattern == null)
             {
                 return string.Empty;
             }

             // Run the expression once and inspect the result instead of
             // calling IsMatch and then Match on the same input.
             Match match = pattern.Match(htmlContent);
             return match.Success ? match.Value : string.Empty;
         }

         /// <summary>
         /// Returns the player-address pattern for a site, or null when the site
         /// has none.
         /// </summary>
         private static Regex GetPatternFor ( VideoSite site )
         {
             switch (site)
             {
                 case VideoSite.YouKu:
                     return YouKuPlayerPattern;
                 case VideoSite.Ku6:
                     return Ku6PlayerPattern;
                 case VideoSite.TuDou:
                     return TuDouPlayerPattern;
                 default:
                     return null;
             }
         }


         /// <summary>
         /// Downloads the configured video page and extracts the player address
         /// from it.
         /// </summary>
         /// <returns>The player address, or an empty string when none is found.</returns>
         public string GetVideoUrl ()
         {
             // Only the download step differs between the two modes.
             string html = _isCompressed
                 ? this.GetWebContent(_pageUrl)
                 : this.GetHtmlWithoutCompress(_pageUrl);

             return this.GetVideoFileUrl(html);
         }

     }//end class
    260
    261
    /// <summary>
    /// Video sites distinguished when choosing a player-address pattern.
    /// </summary>
    public enum VideoSite
    {
        /// <summary>Youku.</summary>
        YouKu = 0,

        /// <summary>Ku6.</summary>
        Ku6 = 1,

        /// <summary>Tudou.</summary>
        TuDou = 2
    }
    
    
    
    
    
    
  • 相关阅读:
    MYSQL数据库导入数据时出现乱码的解决办法
    Java Web(一) Servlet详解!!
    hibernate(九) 二级缓存和事务级别详讲
    MySQL(五) MySQL中的索引详讲
    LinkedHashMap源码详解
    hibernate(八) Hibernate检索策略(类级别,关联级别,批量检索)详解
    hibernate(七) hibernate中查询方式详解
    MySQL(四) 数据表的插入、更新、删除数据
    MySQL(三) 数据库表的查询操作【重要】
    MySQL(二) 数据库数据类型详解
  • 原文地址:https://www.cnblogs.com/scottgu/p/2230707.html
Copyright © 2020-2023  润新知