• F#百度新闻


    #if INTERACTIVE
    #r @"C:\Users\v-shuzhu\Desktop\HtmlAgilityPack.dll"
    #endif
    open System
    open System.Diagnostics
    open System.Net
    open System.Xml
    open System.IO
    open HtmlAgilityPack
       
    /// Downloads the page at `newUrl` and parses it into an HtmlAgilityPack
    /// `HtmlDocument`, blocking the calling thread until parsing has finished
    /// (the async workflow is run with `Async.RunSynchronously`).
    ///
    /// The response body is decoded as gb2312; the `true` passed to `Load`
    /// presumably lets a byte-order mark override that encoding (confirm
    /// against the HtmlAgilityPack documentation).
    /// Exceptions raised by the request or by parsing propagate to the caller.
    let asyncGrapUrl(newUrl : string) =
        async{
            let httpRequest = HttpWebRequest.Create(newUrl) :?> HttpWebRequest
            // `use!` / `use` dispose the response and its stream when the workflow
            // finishes, so the underlying connection is released instead of leaked.
            use! httpRespon = Async.AwaitTask(httpRequest.GetResponseAsync())
            use responStream = httpRespon.GetResponseStream()

            let xml = new HtmlDocument()
            // Load is called before the stream is disposed; the returned document
            // is assumed not to need the stream afterwards.
            xml.Load(responStream,Text.Encoding.GetEncoding("gb2312"),true)
            xml.OptionOutputAsXml <- true

            return xml
        } |> Async.RunSynchronously
         
    
    //let url = @"http://feed.cnblogs.com/blog/sitehome/rss"
    // Page whose headlines are scraped.
    let url = @"http://www.news.baidu.com"

    // Fetch and parse the page (blocks until the download has completed).
    let html = asyncGrapUrl(url)
    let htmlnode = html.DocumentNode

    // GetElementbyId returns null when the page has no element with this id.
    let content = html.GetElementbyId("container")

    // SelectNodes returns null (not an empty collection) when nothing matches,
    // and cannot be called at all if the container element is missing; in both
    // cases `dls` is null rather than the script crashing here.
    let dls : HtmlNodeCollection =
        match content with
        | null -> null
        | container -> container.SelectNodes("/html[1]/body[1]/div[3]/div[3]/div/div/dl")

    // All direct children of every matched <dl>; empty when nothing was matched.
    let dds =
        match dls with
        | null -> Seq.empty<HtmlNode>
        | nodes -> nodes |> Seq.collect (fun dl -> dl.ChildNodes)
         
    // Text of all headlines, built from the nodes two levels below each entry of
    // `dds`: blank fragments are dropped, every newline plus the spaces that
    // follow it is normalised to the platform newline, the fragments are
    // concatenated, and leading blank lines are stripped from the result.
    let yyc =
        dds
        |> Seq.collect (fun dd -> dd.ChildNodes)
        |> Seq.collect (fun ul -> ul.ChildNodes)
        |> Seq.map (fun li -> li.InnerText)
        |> Seq.filter (fun txt -> not (txt.Trim() = ""))
        |> Seq.map (fun str ->
                        System.Text.RegularExpressions.Regex.Replace(str,"\n[ ]*", System.Environment.NewLine))
        // String.concat joins in one pass; folding with (+) re-copies the
        // accumulated string for every fragment.
        |> String.concat ""
        |> fun str -> System.Text.RegularExpressions.Regex.Replace(str,@"^[ \t\n]*\n", "")
    
    // Output path: D:\ followed by the URL with every '.', '/' and ':' replaced
    // by '0', plus a ".txt" extension.
    let fileNameTxt = @"D:\" + url.Replace('.','0').Replace('/','0').Replace(':','0') + ".txt"
    // FileMode.Create truncates an existing file, so a shorter result never
    // leaves stale bytes from a previous run at the end of the file.
    let fileInfo = new System.IO.FileStream(fileNameTxt,FileMode.Create,FileAccess.ReadWrite)
    let writer = new System.IO.StreamWriter(fileInfo,Text.Encoding.UTF8)
    try
        writer.WriteLine(yyc)
    finally
        // Disposing the writer flushes its buffer and closes the underlying
        // stream; without this the text may never reach the disk.
        writer.Dispose()

    目前还不完善,先保存一下。
    现在已经可以把百度新闻页面中的新闻标题抓取出来,要抓取这些标题对应的链接也很简单;下一步将实现选择某个标题,并抓取该标题对应的具体内容。

  • 相关阅读:
    javaweb-番外篇-Commons-FileUpload组件上传文件
    javaweb-3-在Eclipse中引入Tomcat
    javaweb-2-Tomcat初步学习与使用
    javaweb-1-B/S初论
    jdk安装与配置
    程序、计算机程序、java初论
    RPC原理及RPC实例分析
    java堆排序(大根堆)
    数据结构——堆(Heap)大根堆、小根堆
    Spring事务传播机制和数据库隔离级别
  • 原文地址:https://www.cnblogs.com/FsharpZack/p/2846465.html
Copyright © 2020-2023  润新知