• (golang)HTTP基本认证机制及使用gocolly登录爬取


    内网有个网页用了HTTP基本认证机制,想用gocolly爬取,不知道怎么登录,只好研究HTTP基本认证机制

    参考这里:https://www.jb51.net/article/89070.htm  

    下面开始参考作者dotcoo了:-)

    看了《HTTP权威指南》第12章"HTTP基本认证机制"(本站下载地址:https://www.jb51.net/books/93254.html),感觉讲得蛮详细的,写了一个小小例子测试。

    请求响应过程:

    ==>
    GET /hello HTTP/1.1
    Host: 127.0.0.1:12345
    <==
    HTTP/1.1 401 Unauthorized
    WWW-Authenticate: Basic realm="Dotcoo User Login"
    ==>
    GET /hello HTTP/1.1
    Host: 127.0.0.1:12345
    Authorization: Basic YWRtaW46YWRtaW5wd2Q=
    <==
    HTTP/1.1 200 OK
    Content-Type: text/plain; charset=utf-8

    golang HTTP基本认证机制的实现代码

    package main
    import (
        "fmt"
        "io"
        "net/http"
        "log"
        "encoding/base64"
        "strings"
    )
    // HelloServer is a demo handler guarded by HTTP Basic authentication.
    //
    // A request without an Authorization header receives a 401 challenge. A
    // request whose header cannot be parsed as "Basic <base64(user:password)>"
    // is rejected with 401 as well, instead of an empty 200 response. Any
    // well-formed credentials are accepted: they are only printed to stdout
    // (acceptable for a demo, never do this with real passwords) before
    // "hello, world!" is written to the client.
    func HelloServer(w http.ResponseWriter, req *http.Request) {
        // deny repeats the challenge and answers 401 with a short message.
        deny := func() {
            w.Header().Set("WWW-Authenticate", `Basic realm="Dotcoo User Login"`)
            w.WriteHeader(http.StatusUnauthorized)
            io.WriteString(w, "Unauthorized!\n")
        }

        auth := req.Header.Get("Authorization")
        if auth == "" {
            // First round trip: no credentials yet, so ask for them.
            w.Header().Set("WWW-Authenticate", `Basic realm="Dotcoo User Login"`)
            w.WriteHeader(http.StatusUnauthorized)
            return
        }
        fmt.Println(auth)

        // The header has the form "<scheme> <credentials>".
        auths := strings.SplitN(auth, " ", 2)
        if len(auths) != 2 {
            fmt.Println("error")
            deny()
            return
        }
        authMethod, authB64 := auths[0], auths[1]

        // Scheme names are compared without regard to case.
        if !strings.EqualFold(authMethod, "Basic") {
            fmt.Println("error")
            deny()
            return
        }

        authstr, err := base64.StdEncoding.DecodeString(authB64)
        if err != nil {
            fmt.Println(err)
            deny()
            return
        }
        fmt.Println(string(authstr))

        // Split on the first colon only: the password itself may contain ':'.
        userPwd := strings.SplitN(string(authstr), ":", 2)
        if len(userPwd) != 2 {
            fmt.Println("error")
            deny()
            return
        }
        username, password := userPwd[0], userPwd[1]
        fmt.Println("Username:", username)
        fmt.Println("Password:", password)
        fmt.Println()

        io.WriteString(w, "hello, world!\n")
    }
    // main registers HelloServer under /hello and serves HTTP on port 8000,
    // terminating the process if the listener fails.
    func main() {
        http.HandleFunc("/hello", HelloServer)
        if err := http.ListenAndServe(":8000", nil); err != nil {
            log.Fatal("ListenAndServe: ", err)
        }
    }

    试验了上面的例子后,基本明白了HTTP基本认证的过程。但是怎么用gocolly访问呢?

    参考:https://stackoverflow.com/questions/50576248/using-colly-framework-i-cant-login-to-the-evernote-account

    但是答复者Matías Insaurralde提供的模拟浏览器访问的例子编译不通过,不明白其中的hptsKey的意思。代码放在下面供参考(可跳过):

    package evernote
    
    import (
        "bytes"
        "errors"
        "fmt"
        "io/ioutil"
        "net/http"
        "net/http/cookiejar"
        "net/url"
        "regexp"
        "strings"
    )
    
    // evernoteLoginURL is the endpoint used for every step of the login flow.
    const evernoteLoginURL = "https://www.evernote.com/Login.action"
    
    var (
        // evernoteJSParamsExpr matches JavaScript assignments of the form
        //   document.getElementById("<id>").value = "<value>"
        // capturing the element id (group 1) and the assigned value (group 2).
        // The parentheses and dots are escaped so they match literally, and
        // [^"]* keeps each capture inside its own pair of quotes.
        evernoteJSParamsExpr = regexp.MustCompile(`document\.getElementById\("([^"]*)"\)\.value = "([^"]*)"`)
        // evernoteRedirectExpr captures the target of the "Redirecting to" link.
        evernoteRedirectExpr = regexp.MustCompile(`Redirecting to <a href="([^"]*)">`)

        errNoMatches   = errors.New("No matches")
        errRedirectURL = errors.New("Redirect URL not found")
    )
    
    // EvernoteClient bundles the credentials, the HTTP client and the
    // per-login state needed to talk to the website.
    type EvernoteClient struct {
        // Account credentials supplied by the caller.
        Username, Password string

        // httpClient performs every request of the login flow.
        httpClient *http.Client

        // hpts and hptsh are filled in by extractJSParams and sent back as
        // form fields during login.
        hpts, hptsh string
    }
    
    // NewEvernoteClient builds a client for the given credentials. Its HTTP
    // client copies the transport, redirect policy and timeout of the
    // net/http defaults and adds a fresh cookie jar, mimicking a browser.
    func NewEvernoteClient(username, password string) *EvernoteClient {
        jar, _ := cookiejar.New(nil)

        return &EvernoteClient{
            Username: username,
            Password: password,
            httpClient: &http.Client{
                Transport:     http.DefaultTransport,
                CheckRedirect: http.DefaultClient.CheckRedirect,
                Jar:           jar,
                Timeout:       http.DefaultClient.Timeout,
            },
        }
    }
    
    // extractJSParams scans body with evernoteJSParamsExpr and stores the
    // values assigned to the "hpts" and "hptsh" elements on the client.
    //
    // It returns errNoMatches when the page contains no such assignment or
    // when a match is missing one of its capture groups.
    //
    // NOTE(review): the original compared against hptsKey/hptshKey, which are
    // not defined anywhere in this listing; "hpts" and "hptsh" are assumed
    // from the form field names used by Login. Confirm against the real page.
    func (e *EvernoteClient) extractJSParams(body []byte) error {
        matches := evernoteJSParamsExpr.FindAllSubmatch(body, -1)
        if len(matches) == 0 {
            return errNoMatches
        }
        for _, submatches := range matches {
            if len(submatches) < 3 {
                // Report the malformed match instead of silently succeeding.
                return errNoMatches
            }
            switch string(submatches[1]) {
            case "hpts":
                e.hpts = string(submatches[2])
            case "hptsh":
                e.hptsh = string(submatches[2])
            }
        }
        return nil
    }
    
    // readResponseBody reads the whole response body and always closes it, so
    // the underlying connection is released.
    func readResponseBody(res *http.Response) ([]byte, error) {
        if res.Body == nil {
            return nil, errors.New("No response body")
        }
        defer res.Body.Close()
        return ioutil.ReadAll(res.Body)
    }

    // postLoginForm POSTs values to the login URL as an XMLHttpRequest-style
    // form submission with the given Accept header and returns the response body.
    func (e *EvernoteClient) postLoginForm(values url.Values, accept string) ([]byte, error) {
        req, err := http.NewRequest(http.MethodPost, evernoteLoginURL, bytes.NewBufferString(values.Encode()))
        if err != nil {
            return nil, err
        }
        req.Header.Set("Accept", accept)
        req.Header.Set("Content-Type", "application/x-www-form-urlencoded; charset=UTF-8")
        req.Header.Set("x-requested-with", "XMLHttpRequest")
        req.Header.Set("referer", evernoteLoginURL)

        res, err := e.httpClient.Do(req)
        if err != nil {
            return nil, err
        }
        return readResponseBody(res)
    }

    // Login performs the three-step login flow:
    //
    //  1. GET the login page and extract the "hpts"/"hptsh" parameters.
    //  2. POST the username with "evaluateUsername" set and require the reply
    //     to contain "usePasswordAuth":true.
    //  3. POST username and password and look for the redirect URL in the reply.
    //
    // On success the redirect URL is printed and nil is returned. Every
    // response body is closed once it has been read.
    func (e *EvernoteClient) Login() error {
        // First step: fetch the login page as a browser visitor would do.
        res, err := e.httpClient.Get(evernoteLoginURL)
        if err != nil {
            return err
        }
        body, err := readResponseBody(res)
        if err != nil {
            return err
        }
        if err = e.extractJSParams(body); err != nil {
            return err
        }

        // Second step: send only the username, flagged with "evaluateUsername".
        values := url.Values{}
        values.Set("username", e.Username)
        values.Set("evaluateUsername", "")
        values.Set("analyticsLoginOrigin", "login_action")
        values.Set("clipperFlow", "false")
        values.Set("showSwitchService", "true")
        values.Set("hpts", e.hpts)
        values.Set("hptsh", e.hptsh)

        body, err = e.postLoginForm(values, "application/json")
        if err != nil {
            return err
        }
        if !strings.Contains(string(body), `"usePasswordAuth":true`) {
            return errors.New("Password auth not enabled")
        }

        // Third step: final request, with the password appended to the form data.
        values.Del("evaluateUsername")
        values.Set("password", e.Password)
        values.Set("login", "Sign in")

        body, err = e.postLoginForm(values, "text/html")
        if err != nil {
            return err
        }

        // The reply must contain the redirect link.
        matches := evernoteRedirectExpr.FindAllStringSubmatch(string(body), -1)
        if len(matches) == 0 || len(matches[0]) < 2 {
            return errRedirectURL
        }
        fmt.Println("Login is ok, redirect URL:", matches[0][1])
        return nil
    }
    After you successfully get the redirect URL, you should be able to send authenticated requests as long as you keep using the same HTTP client that performed the login. The cookie jar plays a very important role here, because it carries the cookies set during the login process.
    
    To call this code use:
    
    // main logs in with placeholder credentials and panics if the login fails.
    func main() {
        client := NewEvernoteClient("user@company", "password")
        if err := client.Login(); err != nil {
            panic(err)
        }
    }

    只好自己写,经反复试验,发现对于本文开头自己写的server,只需以下代码即可通过验证,输出了hello,world!(将访问方式改为POST也一样。)

    package main
    
    import (
        "fmt"
    
        "io/ioutil"
        "net/http"
    )
    
    // Login sends a GET request with HTTP Basic credentials to the local demo
    // server and prints the response headers, the Www-Authenticate value and
    // the body. Any failure is printed and ends the function early, so a
    // failed request no longer leads to a nil-pointer panic.
    func Login() {
        // Client with default settings.
        client := &http.Client{}
        // URL to visit.
        url := "http://localhost:8000/hello"

        req, err := http.NewRequest(http.MethodGet, url, nil)
        if err != nil {
            fmt.Println("create request:", err)
            return
        }
        // The key line: the demo server accepts any user name and password.
        req.SetBasicAuth("aa", "bb")
        // Print the method actually used instead of a hard-coded "POST".
        fmt.Println(req.Method + "访问")

        res, err := client.Do(req)
        if err != nil {
            fmt.Println("send request:", err)
            return
        }
        defer res.Body.Close()

        fmt.Println("header:")
        fmt.Println(res.Header)
        fmt.Println("realm:")
        fmt.Println(res.Header.Get("Www-Authenticate"))
        fmt.Println("body:")
        body, err := ioutil.ReadAll(res.Body)
        if err != nil {
            fmt.Println("read body:", err)
            return
        }
        fmt.Println(string(body))
    }
    
    // main runs the basic-auth demo request once.
    func main() {
        Login()
    }

    查看SetBasicAuth的定义为(liteide中在光标位置按Ctrl+shift+J):

    // SetBasicAuth sets the request's Authorization header to "Basic "
    // followed by the encoded username and password.
    func (r *Request) SetBasicAuth(username, password string) {
        credentials := basicAuth(username, password)
        r.Header.Set("Authorization", "Basic "+credentials)
    }

    而basicAuth的定义为

    // basicAuth returns the standard base64 encoding of "username:password".
    func basicAuth(username, password string) string {
        credentials := []byte(username + ":" + password)
        return base64.StdEncoding.EncodeToString(credentials)
    }

    那么,用gocolly访问的代码如下:

    package main
    
    import (
        "encoding/base64"
        "fmt"
        "net/http"
    
        "github.com/gocolly/colly"
    )
    
    // basicAuth joins username and password with a colon and returns the
    // result in standard base64 encoding.
    func basicAuth(username, password string) string {
        return base64.StdEncoding.EncodeToString([]byte(username + ":" + password))
    }
    // main fetches the demo page through colly, sending the HTTP Basic
    // credentials in an explicit Authorization header, and prints the response
    // body. A failed request is reported instead of being silently ignored.
    func main() {
        c := colly.NewCollector()

        // Build the same header that http.Request.SetBasicAuth would set.
        h := http.Header{}
        h.Set("Authorization", "Basic "+basicAuth("aaaa", "bbbb"))

        c.OnResponse(func(r *colly.Response) {
            fmt.Println(string(r.Body))
        })

        if err := c.Request("GET", "http://localhost:8000/hello", nil, nil, h); err != nil {
            fmt.Println("request failed:", err)
        }
    }

    注:对于其他网站,也许要用Fiddler抓包,设置相应的header和cookie才行。

  • 相关阅读:
    Mahout推荐算法ItemBased
    ALSA安装编程指南
    windbg更改cmd的token提升其特权
    至尊问题
    什么是“Bash”破绽?
    hdu 1548 A strange lift
    C 循环链表
    C++ 链表
    C_数据结构_链表的链式实现
    C _数据结构 _线性表的顺序存储
  • 原文地址:https://www.cnblogs.com/pu369/p/10408898.html
Copyright © 2020-2023  润新知