• 11. Scrapy Login


    Request

Partial source code of Request:

    # partial source code
    class Request(object_ref):
    
        def __init__(self, url, callback=None, method='GET', headers=None, body=None, 
                     cookies=None, meta=None, encoding='utf-8', priority=0,
                     dont_filter=False, errback=None):
    
            self._encoding = encoding  # this one has to be set first
            self.method = str(method).upper()
            self._set_url(url)
            self._set_body(body)
            assert isinstance(priority, int), "Request priority not an integer: %r" % priority
            self.priority = priority
    
            assert callback or not errback, "Cannot use errback without a callback"
            self.callback = callback
            self.errback = errback
    
            self.cookies = cookies or {}
            self.headers = Headers(headers or {}, encoding=encoding)
            self.dont_filter = dont_filter
    
            self._meta = dict(meta) if meta else None
    
        @property
        def meta(self):
            if self._meta is None:
                self._meta = {}
            return self._meta

     The most commonly used parameters:

    url: the URL to request and to process in the next step

    callback: specifies which function will handle the Response returned by this request.

    method: usually does not need to be set; defaults to GET. It can be set to "GET", "POST", "PUT", etc., and the string must be uppercase.

    headers: headers sent with the request. Usually not needed. Typical content looks like this:
            # anyone who has written a crawler before will recognize these
            Host: media.readthedocs.org
            User-Agent: Mozilla/5.0 (Windows NT 6.2; WOW64; rv:33.0) Gecko/20100101 Firefox/33.0
            Accept: text/css,*/*;q=0.1
            Accept-Language: zh-cn,zh;q=0.8,en-us;q=0.5,en;q=0.3
            Accept-Encoding: gzip, deflate
            Referer: http://scrapy-chs.readthedocs.org/zh_CN/0.24/
            Cookie: _ga=GA1.2.1612165614.1415584110;
            Connection: keep-alive
            If-Modified-Since: Mon, 25 Aug 2014 21:59:35 GMT
            Cache-Control: max-age=0

    meta: frequently used; a dict for passing data between different requests.
    
            request_with_cookies = Request(
                url="http://www.example.com",
                cookies={'currency': 'USD', 'country': 'UY'},
                meta={'dont_merge_cookies': True}
            )
    
    encoding: the default 'utf-8' is fine.

    dont_filter: marks the request as not to be filtered by the scheduler's duplicate filter. Use it when you want to send the same request more than once and ignore the dupefilter. Defaults to False.

    errback: specifies the error-handling function.
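    A minimal sketch tying these parameters together (ExampleSpider, the URLs, and the parse_detail/on_error callbacks are made up for illustration, not part of the original post):

    import scrapy

    class ExampleSpider(scrapy.Spider):
        name = "example"
        start_urls = ["http://www.example.com/list"]

        def parse(self, response):
            # pass data to the next callback through meta (a plain dict)
            yield scrapy.Request(
                url="http://www.example.com/detail",  # hypothetical URL
                callback=self.parse_detail,           # handles the returned Response
                method="GET",                         # default; must be uppercase
                headers={"Referer": response.url},
                meta={"page": 1},                     # available on the next Response
                dont_filter=False,                    # let the dupefilter do its job
                errback=self.on_error,
            )

        def parse_detail(self, response):
            page = response.meta["page"]  # read the value passed along in meta
            self.logger.info("page %s -> %s", page, response.url)

        def on_error(self, failure):
            self.logger.error("request failed: %r", failure)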

    Response

    # partial source code
    class Response(object_ref):
        def __init__(self, url, status=200, headers=None, body='', flags=None, request=None):
            self.headers = Headers(headers or {})
            self.status = int(status)
            self._set_body(body)
            self._set_url(url)
            self.request = request
            self.flags = [] if flags is None else list(flags)
    
        @property
        def meta(self):
            try:
                return self.request.meta
            except AttributeError:
                raise AttributeError("Response.meta not available, this response " 
                    "is not tied to any request")

    Most of the parameters are similar to the ones above:

    status: the HTTP status code of the response
    _set_body(body): sets the response body
    _set_url(url): sets the response URL
    self.request = request: the Request object that produced this response
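
    A minimal sketch of reading these attributes inside a callback (StatusSpider and the URL are placeholders; the attributes shown are all part of Scrapy's public Response API):

    import scrapy

    class StatusSpider(scrapy.Spider):
        name = "status_demo"
        start_urls = ["http://www.example.com/"]

        def parse(self, response):
            print(response.status)      # e.g. 200
            print(response.url)         # the URL this response came from
            print(response.headers)     # the response headers object
            print(response.body[:100])  # raw body, as bytes
            print(response.meta)        # meta carried over from response.request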

    Sending POST Requests

    • You can send a POST request with the yield scrapy.FormRequest(url, formdata, callback) method.

    • If you want the program to send a POST request right at the start, override the Spider's start_requests(self) method and stop using the URLs in start_urls.

    import scrapy


    class mySpider(scrapy.Spider):
        name = "myspider"  # every spider needs a unique name
        # start_urls = ["http://www.example.com/"]

        def start_requests(self):
            url = 'http://www.renren.com/PLogin.do'

            # FormRequest is Scrapy's way of sending POST requests
            yield scrapy.FormRequest(
                url = url,
                formdata = {"email" : "mr_mao_hacker@163.com", "password" : "axxxxxxxe"},
                callback = self.parse_page
            )

        def parse_page(self, response):
            # do something with the logged-in response
            pass

    Simulating Login

    Use the FormRequest.from_response() method to simulate a user login.

    Websites usually pre-populate certain form fields (such as session-related data or the authentication tokens on a login page) through <input type="hidden"> elements.

    When scraping pages with Scrapy, if you want to pre-fill or override form fields such as the username and password, you can do so with the FormRequest.from_response() method.
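
    A minimal sketch of the call (the credentials and the after_login callback are placeholders; formname and formxpath are optional arguments for picking the right form when a page contains several):

    def parse(self, response):
        yield scrapy.FormRequest.from_response(
            response,                           # the page that contains the login <form>
            formdata={"email": "user@example.com", "password": "secret"},
            # formname="loginForm",             # optional: select a form by its name attribute
            # formxpath="//form[@id='login']",  # optional: select a form by XPath
            callback=self.after_login
        )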

    renren.py

    # -*- coding: utf-8 -*-
    import scrapy
    
    
    # Log in using cookies. When nothing else works, you can simulate login this way;
    # it is a bit more tedious, but the success rate is essentially 100%.
    class RenrenSpider(scrapy.Spider):
        name = "renren"
        allowed_domains = ["renren.com"]
        start_urls = (
            # 'http://www.renren.com/xxxxx',
            'http://www.renren.com/11111',
            # 'http://www.renren.com/xx',
        )
    
        # cookie values captured after logging in with the account
        cookies = {
            "anonymid": "ixrna3fysufnwv",
            "_r01_": "1",
            "ap": "327550029",
            "JSESSIONID": "abciwg61A_RvtaRS3GjOv",
            "depovince": "GW",
            "springskin": "set",
            "jebe_key": "f6fb270b-d06d-42e6-8b53-e67c3156aa7e%7Cc13c37f53bca9e1e7132d4b58ce00fa3%7C1484060607478%7C1%7C1486198628950",
            "jebe_key": "f6fb270b-d06d-42e6-8b53-e67c3156aa7e%7Cc13c37f53bca9e1e7132d4b58ce00fa3%7C1484060607478%7C1%7C1486198619601",
            "ver": "7.0",
            "XNESSESSIONID": "e703b11f8809",
            "jebecookies": "98c7c881-779f-4da8-a57c-7464175cd469|||||",
            "ick_login": "4b4a254a-9f25-4d4a-b686-a41fda73e173",
            "_de": "BF09EE3A28DED52E6B65F6A4705D973F1383380866D39FF5",
            "p": "ea5541736f993365a23d04c0946c10e29",
            "first_login_flag": "1",
            "ln_uact": "mr_mao_hacker@163.com",
            "ln_hurl": "http://hdn.xnimg.cn/photos/hdn521/20140529/1055/h_main_9A3Z_e0c300019f6a195a.jpg",
            "t": "691808127750a83d33704a565d8340ae9",
            "societyguester": "691808127750a83d33704a565d8340ae9",
            "id": "327550029",
            "xnsid": "f42b25cf",
            "loginfrom": "syshome"
        }
    
        # called when the spider starts issuing requests
        def start_requests(self):
            for url in self.start_urls:
                # yield scrapy.Request(url, callback = self.parse)
                # url = "http://www.renren.com/410043129/profile" is a page only reachable after login
                # no form data to post here, so a plain Request carrying the captured cookies is enough
                yield scrapy.Request(url, cookies=self.cookies, callback=self.parse_page)
    
        def parse_page(self, response):
            print("===========" + response.url)
            with open("deng.html", "wb") as filename:
                filename.write(response.body)
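
    The cookie values above are simply copied out of the browser's developer tools (Network panel) after logging in manually; they expire, so treat them as placeholders. Assuming a Scrapy project is already set up, the spider runs with:

        scrapy crawl renren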

    renren1.py

    # -*- coding: utf-8 -*-
    import scrapy
    
    
    class Reren1Spider(scrapy.Spider):
        name = "reren1"
        allowed_domains = ["renren.com"]
    
        def start_requests(self):
            url = 'http://www.renren.com/PLogin.do'
            yield scrapy.FormRequest(
                url=url,
                formdata={"email": "mr_mao_hacker@163.com", "password": "alarmchime"},
                callback=self.parse_page)
    
        def parse_page(self, response):
            # after logging in, write the returned page to a file
            print(response.body)
            with open("mao2.html", "wb") as filename:
                filename.write(response.body)

    renren2.py

    # -*- coding: utf-8 -*-
    import scrapy
    
    # The canonical way to simulate a login:
    # first send a GET request for the login page and extract the parameters the login
    # requires, e.g. zhihu's _xsrf; then POST them to the server together with the
    # account and password to complete the login
    
    class Renren2Spider(scrapy.Spider):
        name = 'renren2'
        allowed_domains = ['renren.com']
        start_urls = ["http://www.renren.com/PLogin.do",]
    
        def parse(self, response):
            # fill the credentials into the form found on the login page and submit it
            yield scrapy.FormRequest.from_response(
                response,
                formdata={"email" : "mr_mao_hacker@163.com", "password" : "alarmchime"},  # plus a token such as "_xsrf" on sites that require one
                callback=self.parse_page
            )
    
    
        def parse_page(self, response):
            # after a successful login, follow a link to a friend's profile page
            url = "http://www.renren.com/422167102/profile"
            yield scrapy.Request(url, callback=self.parse_newpage)
    
    
        def parse_newpage(self, response):
            with open("xiao.html", "wb") as filename:
                filename.write(response.body)
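
    If you would rather build the POST request by hand instead of using from_response, the hidden token mentioned in the comments above can be extracted from the login page first. A sketch, assuming the token is a hidden input named _xsrf (the field names and credentials are placeholders, not renren's or zhihu's actual markup):

    def parse(self, response):
        # pull the anti-CSRF token out of the login page (the name is an assumption)
        xsrf = response.xpath('//input[@name="_xsrf"]/@value').extract_first()
        yield scrapy.FormRequest(
            url=response.url,
            formdata={"email": "user@example.com", "password": "secret", "_xsrf": xsrf},
            callback=self.parse_page
        )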