• 15. Scrapy simulated login examples


    1. Example 1: logging in to Renren

    a. Create the project

    scrapy startproject renren_login

    Change into the project directory, then generate the spider:

    scrapy genspider renren "renren.com"

    renren.py

    # -*- coding: utf-8 -*-
    import scrapy
    
    
    class RenrenSpider(scrapy.Spider):
        name = 'renren'
        allowed_domains = ['renren.com']
        start_urls = ['http://renren.com/']
    
        def start_requests(self):
            # Override start_requests so the first request is the login POST
            # instead of a plain GET of start_urls.
            url = "http://www.renren.com/PLogin.do"
            data = {"email": "xxxxxxxx@126.com", "password": "xxxxxxx"}
            # FormRequest submits the credentials as a POST form.
            request = scrapy.FormRequest(url, formdata=data, callback=self.parse_page)
            yield request
    
        def parse_page(self, response):
            # Scrapy keeps the session cookies from the login, so this profile
            # page is fetched as the logged-in user.
            request = scrapy.Request(url='http://www.renren.com/326282648/profile', callback=self.parse_profile)
            yield request
    
        def parse_profile(self, response):
            # Save the profile page to disk to verify the login worked.
            with open("wenliang.html", "w", encoding="utf-8") as fp:
                fp.write(response.text)

    Create start.py in the project directory:

    from scrapy import cmdline
    cmdline.execute(["scrapy","crawl","renren"])
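
    An alternative worth knowing: instead of POSTing to PLogin.do by hand, scrapy.FormRequest.from_response can read the login form on a fetched page and pre-fill its hidden fields. A minimal sketch, assuming the Renren home page serves a login form; the spider name and landing URL here are illustrative, not part of the tutorial project:

    import scrapy
    
    
    class RenrenFormSpider(scrapy.Spider):
        name = 'renren_form'  # illustrative name, not the tutorial's spider
        allowed_domains = ['renren.com']
        start_urls = ['http://www.renren.com/']
    
        def parse(self, response):
            # from_response copies the form's hidden inputs automatically,
            # so only the visible credentials need to be filled in.
            yield scrapy.FormRequest.from_response(
                response,
                formdata={"email": "xxxxxxxx@126.com", "password": "xxxxxxx"},
                callback=self.after_login,
            )
    
        def after_login(self, response):
            self.logger.info("Logged in, landed on %s", response.url)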

    2. Example 2: logging in to Douban

    a. Entering the captcha by hand

    Create the project

    scrapy startproject douban_login

    Change into the project directory, then generate the spider:

    scrapy genspider douban "douban.com"

    settings.py (the only changes to the generated file are ROBOTSTXT_OBEY = False and the DEFAULT_REQUEST_HEADERS block with a browser User-Agent):

    # -*- coding: utf-8 -*-
    
    # Scrapy settings for douban_login project
    #
    # For simplicity, this file contains only settings considered important or
    # commonly used. You can find more settings consulting the documentation:
    #
    #     https://doc.scrapy.org/en/latest/topics/settings.html
    #     https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
    #     https://doc.scrapy.org/en/latest/topics/spider-middleware.html
    
    BOT_NAME = 'douban_login'
    
    SPIDER_MODULES = ['douban_login.spiders']
    NEWSPIDER_MODULE = 'douban_login.spiders'
    
    
    # Crawl responsibly by identifying yourself (and your website) on the user-agent
    #USER_AGENT = 'douban_login (+http://www.yourdomain.com)'
    
    # Obey robots.txt rules
    ROBOTSTXT_OBEY = False
    
    # Configure maximum concurrent requests performed by Scrapy (default: 16)
    #CONCURRENT_REQUESTS = 32
    
    # Configure a delay for requests for the same website (default: 0)
    # See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
    # See also autothrottle settings and docs
    #DOWNLOAD_DELAY = 3
    # The download delay setting will honor only one of:
    #CONCURRENT_REQUESTS_PER_DOMAIN = 16
    #CONCURRENT_REQUESTS_PER_IP = 16
    
    # Disable cookies (enabled by default)
    #COOKIES_ENABLED = False
    
    # Disable Telnet Console (enabled by default)
    #TELNETCONSOLE_ENABLED = False
    
    # Override the default request headers:
    DEFAULT_REQUEST_HEADERS = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Language': 'en',
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.140 Safari/537.36',
    }
    
    # Enable or disable spider middlewares
    # See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
    #SPIDER_MIDDLEWARES = {
    #    'douban_login.middlewares.DoubanLoginSpiderMiddleware': 543,
    #}
    
    # Enable or disable downloader middlewares
    # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
    #DOWNLOADER_MIDDLEWARES = {
    #    'douban_login.middlewares.DoubanLoginDownloaderMiddleware': 543,
    #}
    
    # Enable or disable extensions
    # See https://doc.scrapy.org/en/latest/topics/extensions.html
    #EXTENSIONS = {
    #    'scrapy.extensions.telnet.TelnetConsole': None,
    #}
    
    # Configure item pipelines
    # See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
    #ITEM_PIPELINES = {
    #    'douban_login.pipelines.DoubanLoginPipeline': 300,
    #}
    
    # Enable and configure the AutoThrottle extension (disabled by default)
    # See https://doc.scrapy.org/en/latest/topics/autothrottle.html
    #AUTOTHROTTLE_ENABLED = True
    # The initial download delay
    #AUTOTHROTTLE_START_DELAY = 5
    # The maximum download delay to be set in case of high latencies
    #AUTOTHROTTLE_MAX_DELAY = 60
    # The average number of requests Scrapy should be sending in parallel to
    # each remote server
    #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
    # Enable showing throttling stats for every response received:
    #AUTOTHROTTLE_DEBUG = False
    
    # Enable and configure HTTP caching (disabled by default)
    # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
    #HTTPCACHE_ENABLED = True
    #HTTPCACHE_EXPIRATION_SECS = 0
    #HTTPCACHE_DIR = 'httpcache'
    #HTTPCACHE_IGNORE_HTTP_CODES = []
    #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'

    douban.py

    # -*- coding: utf-8 -*-
    import scrapy
    from urllib import request
    from PIL import Image
    
    
    class DoubanSpider(scrapy.Spider):
        name = 'douban'
        allowed_domains = ['douban.com']
        start_urls = ['https://www.douban.com/login']
        login_url = "https://www.douban.com/login"
        profile_url = "https://www.douban.com/people/184480369/"
        editsignature_url = "https://www.douban.com/j/people/184480369/edit_signature"
    
        def parse(self, response):
            # Form fields the Douban login endpoint expected at the time.
            formdata = {
                "source": "None",
                "redir": "https://www.douban.com/",
                "form_email": "xxxxxx@qq.com",
                "form_password": "xxxxxx!",
                "remember": "on",
                "login": "登录"
            }
    
            captcha_url = response.css("img#captcha_image::attr(src)").get()
    
            if captcha_url:
                # A captcha is shown: solve it by hand and attach both the
                # solution and the matching captcha-id to the form.
                captcha = self.recognize_captcha(captcha_url)
                formdata["captcha-solution"] = captcha
                captcha_id = response.xpath("//input[@name='captcha-id']/@value").get()
                formdata["captcha-id"] = captcha_id
            # Submit the login form whether or not a captcha was present.
            yield scrapy.FormRequest(url=self.login_url, formdata=formdata, callback=self.parse_after_login)
    
        def parse_after_login(self, response):
            # A redirect back to the home page means the login succeeded.
            if response.url == "https://www.douban.com/":
                print("Login succeeded")
                yield scrapy.Request(self.profile_url, callback=self.parse_profile)
            else:
                print("Login failed")
    
        def parse_profile(self, response):
            print(response.url)
            if response.url == self.profile_url:
                print("Reached the profile page")
                # "ck" is a hidden CSRF token required by the signature-edit
                # endpoint.
                ck = response.xpath("//input[@name='ck']/@value").get()
                formdata = {
                    "ck": ck,
                    "signature": "丈夫处世兮立功名"
                }
                yield scrapy.FormRequest(self.editsignature_url, formdata=formdata, callback=self.parse_edit_result)
            else:
                print("Did not reach the profile page")
    
        def parse_edit_result(self, response):
            # The edit endpoint answers with JSON; print it to confirm.
            print(response.text)
    
        def recognize_captcha(self, image_url):
            # Download and display the captcha image, then ask the user to
            # type what they see.
            request.urlretrieve(image_url, "captcha.png")
            image = Image.open("captcha.png")
            image.show()
            captcha = input("Enter the captcha: ")
            return captcha

    Create start.py in the douban_login directory:

    from scrapy import cmdline
    
    cmdline.execute("scrapy crawl douban".split())

    Run start.py to launch the spider.
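
    The chained requests above stay logged in because Scrapy's cookies middleware (enabled by default) carries the session cookies from request to request. If you ever need several independent sessions in one spider, the same middleware supports separate cookie jars through the "cookiejar" meta key. A minimal sketch, assuming the same Douban form fields as above; the spider name, accounts, and follow-up URL are placeholders:

    import scrapy
    
    
    class MultiSessionSpider(scrapy.Spider):
        name = 'multi_session'  # illustrative, not part of the tutorial project
    
        def start_requests(self):
            accounts = [
                {"form_email": "a@example.com", "form_password": "xxx"},
                {"form_email": "b@example.com", "form_password": "xxx"},
            ]
            for i, account in enumerate(accounts):
                # Each jar id keeps its own independent session cookies.
                yield scrapy.FormRequest(
                    "https://www.douban.com/login",
                    formdata=account,
                    meta={'cookiejar': i},
                    callback=self.after_login,
                )
    
        def after_login(self, response):
            # Carry the same jar forward so this request stays in its session.
            yield scrapy.Request(
                "https://www.douban.com/mine/",
                meta={'cookiejar': response.meta['cookiejar']},
                callback=self.parse_mine,
            )
    
        def parse_mine(self, response):
            self.logger.info("Fetched %s", response.url)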

    b. Recognizing the captcha automatically

    from urllib import request
    from base64 import b64encode
    import requests
    
    # Download one captcha image from Douban.
    captcha_url = "https://www.douban.com/misc/captcha?id=TCEAV2F8SbBgKbXZ5JAI2G6L:en&size=s"
    request.urlretrieve(captcha_url, "captcha.png")
    
    # Endpoint of the captcha-recognition service (placeholder).
    recognize_url = "http://xxxxxx"
    formdata = {}
    with open("captcha.png", "rb") as fp:
        data = fp.read()
        # The service expects the image as base64, so encode (not decode) it.
        pic = b64encode(data)
        formdata['pic'] = pic
    
    appcode = 'xxxxxxxxxxxxxxx'
    headers = {
        "Content-Type": "application/x-www-form-urlencoded; charset=UTF-8",
        # Note the space after APPCODE: the header is 'APPCODE <appcode>'.
        'Authorization': 'APPCODE ' + appcode
    }
    response = requests.post(recognize_url, data=formdata, headers=headers)
    print(response.text)
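
    To plug this into the spider from part a, the download-and-recognize steps can be wrapped into one function that replaces the manual recognize_captcha. A minimal sketch: the endpoint URL, the 'pic' field, and the JSON shape of the reply are assumptions that depend on the recognition service you buy, not a real API:

    from base64 import b64encode
    from urllib import request
    
    import requests
    
    
    def recognize_captcha_remote(image_url, recognize_url, appcode):
        # Download the captcha image locally first.
        request.urlretrieve(image_url, "captcha.png")
        with open("captcha.png", "rb") as fp:
            pic = b64encode(fp.read())
        headers = {
            "Content-Type": "application/x-www-form-urlencoded; charset=UTF-8",
            "Authorization": "APPCODE " + appcode,
        }
        response = requests.post(recognize_url, data={"pic": pic}, headers=headers)
        # Assumption: the service answers with JSON holding the solved text
        # under a "captcha" key; adjust to the real response format.
        return response.json().get("captcha")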

    c. Another automatic recognition example (Selenium + a coding platform)

    from selenium import webdriver
    import time
    import requests
    from lxml import etree
    import base64
    
    # Drive the browser through the login page.
    driver = webdriver.Chrome()
    url = 'https://accounts.douban.com/login?alias=&redir=https%3A%2F%2Fwww.douban.com%2F&source=index_nav&error=1001'
    
    driver.get(url)
    time.sleep(1)
    driver.find_element_by_id('email').send_keys('xxxxxxxxxxx')
    time.sleep(1)
    driver.find_element_by_id('password').send_keys('xxxxxxxxxx')
    time.sleep(1)
    
    # Pull the captcha details out of the rendered page.
    html_str = driver.page_source
    html_ele = etree.HTML(html_str)
    # URL of the captcha image
    image_url = html_ele.xpath('//img[@id="captcha_image"]/@src')[0]
    # Fetch the image itself
    response = requests.get(image_url)
    
    # Base64-encode the image for the coding platform
    #  https://market.aliyun.com/products/57124001/cmapi028447.html?spm=5176.2020520132.101.5.2HEXEG#sku=yuncode2244700000
    b64_str = base64.b64encode(response.content)
    v_type = 'cn'
    # Form data POSTed to the coding platform
    form = {
        'v_pic': b64_str,
        'v_type': v_type,
    }
    
    # Authentication header (appcode redacted)
    headers = {
        'Authorization': 'APPCODE xxxxxxxxxxxxxxx',
    }
    # Ask the coding platform to recognize the captcha
    dmpt_url = 'http://yzmplus.market.alicloudapi.com/fzyzm'
    response = requests.post(dmpt_url, form, headers=headers)
    print(response.text)
    # captcha_value holds the recognized captcha text
    captcha_value = response.json()['v_code']
    
    print(image_url)
    print(captcha_value)
    # captcha_value = input('Enter the captcha: ')
    
    driver.find_element_by_id('captcha_field').send_keys(captcha_value)
    time.sleep(1)
    driver.find_element_by_class_name('btn-submit').click()
    time.sleep(1)
    # Collect all cookies from the logged-in browser session.
    cookies = driver.get_cookies()
    cookie_list = []
    
    # For each cookie dict, take name and value and join them as "name=value".
    for cookie_dict in cookies:
        cookie_str = cookie_dict['name'] + '=' + cookie_dict['value']
        cookie_list.append(cookie_str)
    
    # Join all cookies into one Cookie header value.
    header_cookie = '; '.join(cookie_list)
    
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36',
        'Cookie': header_cookie,
    }
    another_url = 'https://www.douban.com/accounts/'
    response = requests.get(another_url, headers=headers)
    
    # Save the page fetched with the borrowed cookies for inspection.
    with open('cc.html', 'wb') as f:
        f.write(response.content)
    
    
    # with open('douban.html', 'wb') as f:
    #     f.write(driver.page_source.encode('utf-8'))
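
    A small simplification worth noting: requests can take the Selenium cookies directly as a dict through its cookies= parameter, which avoids hand-building the Cookie header. A minimal sketch; the function name is illustrative:

    import requests
    
    
    def fetch_with_selenium_cookies(url, driver_cookies, user_agent):
        # Reshape Selenium's list-of-dicts into the {name: value} mapping
        # that requests accepts via cookies=.
        cookies = {c['name']: c['value'] for c in driver_cookies}
        headers = {'User-Agent': user_agent}
        return requests.get(url, headers=headers, cookies=cookies)
    
    # usage, continuing from the code above:
    # response = fetch_with_selenium_cookies(another_url, driver.get_cookies(), headers['User-Agent'])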