• 幽灵蛛(pholcus)规则(二)



    本文主要涉及页面解析,即goquery的使用方法;goquery的用法较多,我觉得值得另外专门写一篇文档。
    package pholcus_lib
    
    
    // 基础包
    import (
        "github.com/henrylee2cn/pholcus/common/goquery"                          //DOM解析
        "github.com/henrylee2cn/pholcus/app/downloader/request" //必需
        . "github.com/henrylee2cn/pholcus/app/spider"           //必需
        // . "github.com/henrylee2cn/pholcus/app/spider/common" //选用
        // "github.com/henrylee2cn/pholcus/logs"
        // net包
         "net/http" //设置http.Header
        // "net/url"
        // 编码包
        // "encoding/xml"
        //"encoding/json"
        // 字符串处理包
        //"regexp"
        // "strconv"
            "strings"
        // 其他包
         "fmt"
        // "math"
        // "time"
    )
    
    
    // init registers the FileTest1 spider during package initialization by
    // calling its Register method.
    func init() {
        FileTest1.Register()
    }
    
    
    var FileTest1 = &Spider{
        Name:        "meizitu图片下载",//url:http://www.meizitu.com/
        Description: "meizitu图片下载",
        // Pausetime: 300,
        // Keyin:   KEYIN,
        // Limit:        LIMIT,
        EnableCookie: false,
        RuleTree: &RuleTree{
            Root: func(ctx *Context) {
                ctx.AddQueue(&request.Request{
                    Url:          "http://www.meizitu.com/",
                    Rule:         "meizitu",
                    ConnTimeout:  -1,
                    DownloaderID: 0, //图片等多媒体文件必须使用0(surfer surf go原生下载器)
                })
                
            },
    
    
            Trunk: map[string]*Rule{
                "meizitu":{
                    ParseFunc: func(ctx *Context) {
                        query := ctx.GetDom()
                        query.Find("#picture p a").Each(func(i int, s *goquery.Selection) { //其中picture是id,
                        //p是tagname a也是tagname,这里就找到了a,而且是好多个,所以用了each进行循环
                            //s.Find('p>a').
                            fmt.Println("打印一下")
                            fmt.Println(s.Html())//s对象的内部HTML代码
                            fmt.Println("-----------")
                            url1,_:=s.Attr("href")//s的href属性值
                            fmt.Println(url1)
                            fmt.Println("??????????????")
                            //t:=s.Find('a').Eq(0)
                            //fmt.Println("A的html",t.Html())
                            if href, ok := s.Attr("href"); ok {
                                    ctx.AddQueue(&request.Request{
                                        Url:    href,
                                        Header: http.Header{"Content-Type": []string{"text/html; charset=gbk"}},
                                        Rule:   "图片URL",
                                    })
                                }
                            })
                    },
                },
    
    
                "图片URL": {
                    ParseFunc: func(ctx *Context) {
                        query := ctx.GetDom()
                        query.Find("#picture p img").Each(func(i int, s *goquery.Selection) {//同上
                            //s.Find('p>a').
                            fmt.Println("二级页面开始!打印一下????????????????????????????????????")
                            fmt.Println(s.Html())
                            fmt.Println("-----------")
                            url1,_:=s.Attr("src")
                            fmt.Println(url1)
                            fmt.Println("??????????????")
                            fmt.Println("二级页面结束!")
                            //t:=s.Find('a').Eq(0)
                            //fmt.Println("A的html",t.Html())
                            if href, ok := s.Attr("src"); ok {
                                    ctx.AddQueue(&request.Request{
                                        Url:    href,
                                        Header: http.Header{"Content-Type": []string{"text/html; charset=gbk"}},
                                        Rule:   "图片下载",
                                    })
                                }
                            
                            })
                    },
                },
                "图片下载": {
                    ParseFunc: func(ctx *Context) {
                        fmt.Println("图片链接URL:",ctx.GetUrl())
                        picurl:=ctx.GetUrl()//获取链接值,用于为图片命名,这里可能还会涉及新建文件夹,所以后面还需要想想怎么传参数?????
                        picname:=strings.Replace(picurl,"http://mm.howkuai.com/wp-content/uploads/","",-1)//替换值
                        picname=strings.Replace(picname,"/","-",-1)//替换值
                        ctx.FileOutput(picname) // 等价于ctx.AddFile("baidu")
                    },
                },
                
            },
        },
    }
    
    
  • 相关阅读:
    安装nginx后启动提示缺少libjemalloc.so.2
    页面刷新后保持滚动条的位置
    mysql的tinyint字段返回布尔值 true / false的问题
    MySql处理死锁的解决方案
    apidoc使用记录
    微信公众号开发图片上传案例
    [ Error 分析] Comparison method violates its general contract!
    [intellij]create gradle project
    [重构]读书笔记
    [设计模式]迭代子模式 Iterator
  • 原文地址:https://www.cnblogs.com/SSSR/p/6344816.html
Copyright © 2020-2023  润新知