• [GO] Crawling Pengfu jokes concurrently


    package main
    
    import (
        "fmt"
        "strconv"
        "net/http"
        "regexp"
        "strings"
        "os"
    )
    
    //URL pattern:
    //page 1: https://www.pengfu.com/xiaohua_1.html
    //page 2: https://www.pengfu.com/xiaohua_2.html
    //page 3: https://www.pengfu.com/xiaohua_3.html
    
    //Looking at each listing page's source, the title marker <h1 class="dp-b"><a href=" appears 10 times; the actual search pattern is <h1 class="dp-b"><a href="joke URL"
    //After following a joke's URL, the title sits in an <h1>, but the page uses the h1 tag in two places, so the filter below may match up to two results; only the first one is kept
    //The content starts at <div class="content-txt pt10"> and ends at <a id="prev" href="https://www.pengfu.com/content_1850587_1.html"></a>
    
    func HttpGet(url string) (result string, err error) {
        resp, err1 := http.Get(url)
        if err1 != nil {
            err = err1
            return
        }
        defer resp.Body.Close() //close the body when done
    
        //read the response body in 4 KB chunks
        buf := make([]byte, 4*1024)
        for {
            n, _ := resp.Body.Read(buf)
            if n == 0 {
                break
            }
            result += string(buf[:n]) //append what was just read
        }
        return
    }
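    
    //Note: the manual read loop above works, but repeated string concatenation copies the
    //whole result on every iteration. A sketch of an alternative (HttpGetBuilder is a
    //hypothetical name, not part of the original post) that collects the body into a
    //strings.Builder instead:
    func HttpGetBuilder(url string) (string, error) {
        resp, err := http.Get(url)
        if err != nil {
            return "", err
        }
        defer resp.Body.Close()
    
        var sb strings.Builder
        buf := make([]byte, 4*1024)
        for {
            n, readErr := resp.Body.Read(buf)
            if n > 0 {
                sb.Write(buf[:n]) //append only the bytes actually read
            }
            if readErr != nil { //io.EOF or a real error ends the loop
                break
            }
        }
        return sb.String(), nil
    }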
    
    func SpiderOneJoy(url string) (title, content string, err error) {
        //fetch the page
        result, err1 := HttpGet(url)
        if err1 != nil {
            fmt.Println("SpiderOneJoy HttpGet err = ", err1)
            err = err1
            return
        }
        //extract the key information: first the title
        re1 := regexp.MustCompile(`<h1>(.*?)</h1>`)
        if re1 == nil {
            err = fmt.Errorf("%s", "SpiderOneJoy regexp.MustCompile err ")
            return
        }
        //extract the title; the match count is 1 because only the first match is wanted
        tmpTitle := re1.FindAllStringSubmatch(result, 1)
        for _, data := range tmpTitle {
            title = data[1]
            title = strings.Replace(title, "\r", "", -1)
            title = strings.Replace(title, "\n", "", -1)
            title = strings.Replace(title, " ", "", -1)
            title = strings.Replace(title, "\t", "", -1)
            break
        }
        //extract the key information: the content
        re2 := regexp.MustCompile(`<div class="content-txt pt10">(?s:(.*?))<a id="prev" href="`)
        if re2 == nil {
            err = fmt.Errorf("%s", "SpiderOneJoy regexp.MustCompile err ")
            return
        }
        //extract the content; only the first match is kept
        tmpContent := re2.FindAllStringSubmatch(result, -1)
        for _, data := range tmpContent {
            content = data[1]
            content = strings.Replace(content, "\r", "", -1)
            content = strings.Replace(content, "\n", "", -1)
            content = strings.Replace(content, " ", "", -1)
            content = strings.Replace(content, "\t", "", -1)
            content = strings.Replace(content, "<br>", "", -1)
            content = strings.Replace(content, "<br/>", "", -1) //clumsy; there must be a better way (see the helper sketch after this function)
            break
        }
        return
    }
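    
    //The chain of strings.Replace calls above is the "better way" question raised in the
    //comment: one option (a sketch; joyCleaner and cleanJoyText are hypothetical names, not
    //part of the original post) is a single strings.Replacer that strips all of the unwanted
    //characters and tags in one pass:
    var joyCleaner = strings.NewReplacer(
        "\r", "", "\n", "", "\t", "", " ", "",
        "<br>", "", "<br/>", "",
    )
    
    func cleanJoyText(s string) string {
        return joyCleaner.Replace(s)
    }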
    
    func StoreJoyToFile(i int, fileTitle, fileContent []string) {
        //create the output file
        f, err := os.Create(strconv.Itoa(i) + ".txt")
        if err != nil {
            fmt.Println("os.Create err = ", err)
            return
        }
    
        defer f.Close()
    
        //write the contents
        n := len(fileTitle)
        for i := 0; i < n; i++ {
            //write the title
            f.WriteString(fileTitle[i] + "\n")
            //write the content
            f.WriteString(fileContent[i] + "\n")
            f.WriteString("--------------------------------------\n")
        }
    }
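    
    //StoreJoyToFile silently drops the errors returned by WriteString. A sketch of an
    //alternative (StoreJoyToFileV2 is a hypothetical name, not part of the original post;
    //os.WriteFile requires Go 1.16+) that builds the whole file in memory and writes it in
    //one call, returning any error to the caller:
    func StoreJoyToFileV2(i int, titles, contents []string) error {
        var sb strings.Builder
        for k := range titles {
            sb.WriteString(titles[k] + "\n")   //title
            sb.WriteString(contents[k] + "\n") //content
            sb.WriteString("--------------------------------------\n")
        }
        return os.WriteFile(strconv.Itoa(i)+".txt", []byte(sb.String()), 0644)
    }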
    
    func SpiderPage(i int, page chan<- int) {
        url := "https://www.pengfu.com/xiaohua_" + strconv.Itoa(i) + ".html"
        fmt.Printf("crawling %s\n", url)
    
        //fetch the listing page
        result, err := HttpGet(url)
        if err != nil {
            fmt.Println("HttpGet err = ", err)
            return
        }
        //fmt.Println("r = ", result)
        //the marker <h1 class="dp-b"><a href=" appears 10 times; the actual pattern is <h1 class="dp-b"><a href="joke URL"; only the URL part is needed here
        re := regexp.MustCompile(`<h1 class="dp-b"><a href="(.*?)"`)
        if re == nil {
            fmt.Println("regexp.MustCompile err ")
            return
        }
        //run the expression and pull out the key information
        joyUrls := re.FindAllStringSubmatch(result, -1)
    
        fileTitle := make([]string, 0)
        fileContent := make([]string, 0)
    
        //fmt.Println("url = ", joyUrls)
        //take the URL from each match: index 0 is the full match (not needed here), index 1 is the captured URL
        for _, data := range joyUrls{
            fmt.Println("data = ", data[1])
            //crawl each individual joke
            title, content, err := SpiderOneJoy(data[1]) //returns the title, the content, and a possible error: three return values
            if err != nil {
                fmt.Println("SpiderOneJoy err ", err)
                continue
            }
            //fmt.Printf("title = #%v#\n", title) //printing this way makes any leftover whitespace visible
            //fmt.Printf("content = #%v#\n", content)
            fileTitle = append(fileTitle, title) //collect all titles in one slice
            fileContent = append(fileContent, content) //collect all contents in one slice
        }
        //write everything to a file
        StoreJoyToFile(i, fileTitle, fileContent)
        page <- i //report which page has finished
    }
    
    func DoWork(start, end int) {
        fmt.Printf("about to crawl pages %d through %d\n", start, end)
        page := make(chan int)
        for i := start; i <= end; i++ {
            //spawn a goroutine to crawl each listing page
            go SpiderPage(i, page)
        }
        for i := start; i <= end; i++ {
            fmt.Printf("page %d finished crawling\n", <-page)
        }
    }
    
    func main() {
        var start, end int
        fmt.Printf("enter the start page (>= 1): ")
        fmt.Scan(&start)
        fmt.Printf("enter the end page: ")
        fmt.Scan(&end)
        DoWork(start, end)
    }
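
DoWork uses an unbuffered channel purely as a join point: each SpiderPage goroutine sends its page number when it finishes, and the second loop blocks until every page has reported in. The same synchronization is often written with sync.WaitGroup; the sketch below is a self-contained, hypothetical variant (DoWorkWG and the spider callback are illustrative names, not part of the original post):

    package main
    
    import (
        "fmt"
        "sync"
    )
    
    //DoWorkWG waits for all page goroutines with a WaitGroup instead of a channel
    func DoWorkWG(start, end int, spider func(int)) {
        var wg sync.WaitGroup
        for i := start; i <= end; i++ {
            wg.Add(1)
            go func(page int) {
                defer wg.Done() //mark this page as finished when the goroutine returns
                spider(page)
            }(i)
        }
        wg.Wait() //block until every page has called Done
        fmt.Printf("pages %d through %d finished\n", start, end)
    }
    
    func main() {
        //stand-in for SpiderPage, just to make the example runnable
        DoWorkWG(1, 3, func(page int) { fmt.Println("crawling page", page) })
    }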
  • Original article: https://www.cnblogs.com/baylorqu/p/9707620.html