• 纯golang爬虫实战(四)-POST登录半成品以及利用fiddler抓包


    补充说明:按照本文方式,之所以能成功获取登录后的网页,实际上是由于在代码中使用了浏览器成功登录后的cookie,此时服务器上SessionID有效。而一旦从浏览器中注销登录,本代码也就无法访问登录后的网页。

    因此,正确方式请参考下一篇文章:https://www.cnblogs.com/pu369/p/12307162.html

    前几篇文章摸索了如何用golang模拟登录;对简单的情况,甚至可以用GET方式代替POST方式登录。

    但现在想抓取公司OA系统的网页,就遇到一些困难。

    难点:

    1、登录页POST提交后,会有http 302跳转。后来发现似乎不是问题,因为用http.Client.Do会自动处理跳转。

    2、网站使用了FrameSet,还有大量jquery动态生成加载内容,这个处理起来比较困难,链接也是JS生成,只能针对具体问题具体分析。

    3、用chrome控制台抓到的header提交不成功,于是想到fiddler抓包,正好电脑上曾经下载过一个汉化版。

    过程(主要是解决了POST登录问题):

    1、参考https://blog.csdn.net/qq_24373725/article/details/80584810  用fiddler抓包(我下载的汉化版本一启动就自动抓所有包,感觉很方便,就是不知道有无后门),在过滤器中设置服务器网址:192.168.132.80;在 规则-自动断点处-勾选 在请求之前。

    2、在IE中从登录页输入用户名、密码后提交登录。就会在fiddler中抓到表单对应的提交网址:/login/VerifyLogin.jsp,接着在断点处中断了。点击相应的: 运行到结束,或:中断响应(对于frameset中的一些页面可以中断响应,框架网页的一部分会显示空白),经过几次鼠标点击操作,一串页面先后执行完毕。

    3、最重要的还是form表单对应的/login/VerifyLogin.jsp页(其他网页其实没什么用),查看抓包到的原始数据为(我将用户名和密码改了:-)

    POST http://192.168.132.80/login/VerifyLogin.jsp HTTP/1.1
    Accept: text/html, application/xhtml+xml, */*
    Referer: http://192.168.132.80/wui/theme/ecology7/page/login.jsp?templateId=6&logintype=1&gopage=&languageid=7&message=16
    Accept-Language: zh-CN
    User-Agent: Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko
    Content-Type: application/x-www-form-urlencoded
    Accept-Encoding: gzip, deflate
    Host: 192.168.132.80
    Content-Length: 284
    Connection: Keep-Alive
    Pragma: no-cache
    Cookie: logincookiecheck=1581132245967+C1D3FCB434C8223BE9C4CE5AD9497183; JSESSIONID=abc67CXfxpBtu9aM2VR-w; testBanCookie=test; loginfileweaver=%2Fwui%2Ftheme%2Fecology7%2Fpage%2Flogin.jsp%3FtemplateId%3D6%26logintype%3D1%26gopage%3D; loginidweaver=114; languageidweaver=7
    
    loginfile=%2Fwui%2Ftheme%2Fecology7%2Fpage%2Flogin.jsp%3FtemplateId%3D6%26logintype%3D1%26gopage%3D&logintype=1&fontName=%CE%A2%EF%BF%BD%EF%BF%BD%EF%BF%BD%C5%BA%EF%BF%BD&message=16&gopage=&formmethod=post&rnd=&serial=&username=&isie=true&loginid=admin&userpassword=1234&submit=

    4、试着将上述数据写入go代码中,直接上代码:

    //参考:https://blog.csdn.net/kenkao/article/details/88844212
    //http.Get  http.Post http.PostForm http.Client.Do四种请求方式
    //https://blog.csdn.net/qq_24373725/article/details/80584810
    package main
    
    import (
        "fmt"
        "io/ioutil"
        "net/http"
        "net/http/cookiejar"
    
        //    "net/url"
        "strings"
        "time"
    
        "golang.org/x/text/encoding/simplifiedchinese"
    )
    
    // MySpider bundles the entry URL of the target site with the HTTP client
    // that is used for every request of one crawl.
    type MySpider struct {
        // indexUrl is the first page requested by login (main sets it to the
        // login page of the OA system).
        indexUrl string
        // cleint (sic) is the shared HTTP client; run assigns it a client that
        // carries a cookie jar before login is called.
        cleint   *http.Client
    }
    
    // login performs the POST login that was captured with Fiddler and then
    // requests a page that is only reachable after logging in.
    //
    // Steps: GET the login page (indexUrl), POST the captured form body to
    // /login/VerifyLogin.jsp with the captured headers, then GET the protected
    // page. Both response bodies are decoded from GB18030 and printed to stdout.
    //
    // NOTE: the request replays a Cookie header copied from a browser session,
    // so it only succeeds while that JSESSIONID is still valid on the server.
    //
    // The returned string is always empty. The error is the first failure
    // encountered (request, transport or body read), or nil.
    func (this MySpider) login() (string, error) {
        // Visit the login page first so the cookie jar can pick up any cookies
        // the server sets.
        resp, err := this.cleint.Get(this.indexUrl)
        if err != nil {
            return "", fmt.Errorf("fetching login page %q: %w", this.indexUrl, err)
        }
        // Best-effort drain so the connection can be reused; the content of the
        // login page itself is not needed, hence the ignored result.
        _, _ = ioutil.ReadAll(resp.Body)
        resp.Body.Close()
        // NOTE(review): this is 300 microseconds, kept as in the original; a
        // pause in milliseconds was possibly intended — confirm.
        time.Sleep(300 * time.Microsecond)

        // Form body copied verbatim from the captured raw request.
        formBody := "loginfile=%2Fwui%2Ftheme%2Fecology7%2Fpage%2Flogin.jsp%3FtemplateId%3D6%26logintype%3D1%26gopage%3D&logintype=1&fontName=%CE%A2%EF%BF%BD%EF%BF%BD%EF%BF%BD%C5%BA%EF%BF%BD&message=16&gopage=&formmethod=post&rnd=&serial=&username=&isie=true&loginid=admin&userpassword=1234&submit="
        req, err := http.NewRequest("POST", "http://192.168.132.80/login/VerifyLogin.jsp", strings.NewReader(formBody))
        if err != nil {
            return "", fmt.Errorf("building login request: %w", err)
        }
        req.Header.Set("Accept", "text/html, application/xhtml+xml, */*")
        req.Header.Set("Referer", "http://192.168.132.80/wui/theme/ecology7/page/login.jsp?templateId=6&logintype=1&gopage=&languageid=7&message=16")
        req.Header.Set("Accept-Language", "zh-CN")
        req.Header.Set("User-Agent", "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.79 Safari/537.36")
        req.Header.Set("Content-Type", "application/x-www-form-urlencoded")
        // Accept-Encoding is deliberately NOT set: when the caller sets it,
        // net/http stops decompressing gzip responses transparently and the
        // GB18030 decoder would be fed compressed bytes.
        // Host and Content-Length are not set either: net/http derives them from
        // the request URL and the body length and ignores these header entries.
        req.Header.Set("Connection", "keep-alive")
        req.Header.Set("Pragma", "no-cache")
        req.Header.Set("Cookie", "logincookiecheck=1581132245967+C1D3FCB434C8223BE9C4CE5AD9497183; JSESSIONID=abc67CXfxpBtu9aM2VR-w; testBanCookie=test; loginfileweaver=%2Fwui%2Ftheme%2Fecology7%2Fpage%2Flogin.jsp%3FtemplateId%3D6%26logintype%3D1%26gopage%3D; loginidweaver=114; languageidweaver=7")

        // Debug output: show the Referer that will be sent.
        fmt.Println(req.Header.Get("Referer"))

        resp, err = this.cleint.Do(req)
        if err != nil {
            return "", fmt.Errorf("posting login form: %w", err)
        }
        page, err := readGB18030(resp)
        if err != nil {
            return "", fmt.Errorf("reading login response: %w", err)
        }
        fmt.Println(page)

        // Request a page that can only be viewed after a successful login.
        resp, err = this.cleint.Get("http://192.168.132.80/CRM/data/CustomerBrowser.jsp?splitflag=")
        if err != nil {
            return "", fmt.Errorf("fetching protected page: %w", err)
        }
        page, err = readGB18030(resp)
        if err != nil {
            return "", fmt.Errorf("reading protected page: %w", err)
        }
        fmt.Println(page)

        return "", nil
    }

    // readGB18030 reads resp.Body to the end, converts it from GB18030 to UTF-8
    // and closes the body.
    func readGB18030(resp *http.Response) (string, error) {
        defer resp.Body.Close()
        decoded, err := ioutil.ReadAll(simplifiedchinese.GB18030.NewDecoder().Reader(resp.Body))
        if err != nil {
            return "", err
        }
        return string(decoded), nil
    }
    
    // run creates an HTTP client with an in-memory cookie jar, so cookies set by
    // the server are sent on later requests, and performs the POST login with it.
    //
    // The receiver is a value, so the client is stored on this call's copy of
    // the spider only. run panics if the cookie jar cannot be created and always
    // returns the empty string.
    func (this MySpider) run() string {
        jar, err := cookiejar.New(nil)
        if err != nil {
            panic(err)
        }
        // One reusable client for all requests. The timeout keeps an unreachable
        // server from blocking the program forever.
        this.cleint = &http.Client{Jar: jar, Timeout: 30 * time.Second}

        // Log in with a POST request and report a failure instead of dropping it.
        if _, err := this.login(); err != nil {
            fmt.Println("login failed:", err)
        }
        return ""
    }
    
    // main builds a spider pointed at the login page of the OA system and runs it.
    func main() {
        // Entry address of the crawl (the original note here mentioned
        // http://192.168.133.16:8080 instead).
        ms := &MySpider{
            indexUrl: "http://192.168.132.80/wui/theme/ecology7/page/login.jsp",
        }
        ms.run()
    }

    让人感到高兴的是,代码中:

    1.http.NewRequest 所需参数,也就是表单数据,应该叫body部分吧(与http头 有一个空行间隔开),可以直接从抓包到的 原始 数据 中一次复制出来。见代码中的post_arg变量值。

    2.http header(请求头部分)也可以直观地从抓包到的 原始 数据 中看到。即代码中req.Header.Set设置的那些http头内容。

    当然,遗留的问题还较难解决,如:获取jquery异步加载,打开javascript链接对应的网址、找frameset中的元素。解决办法应该还是要用headless无头浏览器处理。

    也可用chrome控制台获取http头和formdata,还可以方便查看XHR异步加载请求。参考:https://www.cnblogs.com/LXP-Never/p/11374795.html(这位园主关于爬虫的几篇文章不错)

    但有的请求chrome console截获不到,还得用fiddler.

  • 相关阅读:
    Goal driven performance optimization
    Using SHOW PROCESSLIST and mysqladmin debug Output in Conjunction with SHOW INNODB STATUS
    Concurrent inserts on MyISAM and the binary log
    A better SHOW TABLE STATUS
    show table status
    A Flock Of Tasty Sources On How To Start Learning High Scalability
    PostgreSQL Hardware Performance Tuning
    Choosing proper innodb_log_file_size
    ffmpeg 常用命令
    opencv 知识点笔记
  • 原文地址:https://www.cnblogs.com/pu369/p/12283458.html
Copyright © 2020-2023  润新知