• 15. Scrapy simulated login examples


    1. Example 1: logging in to Renren

    a. Create the project

    scrapy startproject renren_login

    Change into the project directory, then generate the spider:

    scrapy genspider renren "renren.com"

    renren.py

    # -*- coding: utf-8 -*-
    import scrapy
    
    
    class RenrenSpider(scrapy.Spider):
        name = 'renren'
        allowed_domains = ['renren.com']
        start_urls = ['http://renren.com/']
    
        def start_requests(self):
            # Override start_requests so the first request is the login POST
            # instead of a plain GET of start_urls.
            url = "http://www.renren.com/PLogin.do"
            data = {"email": "xxxxxxxx@126.com", "password": "xxxxxxx"}
            # FormRequest submits the credentials as a POST form.
            request = scrapy.FormRequest(url, formdata=data, callback=self.parse_page)
            yield request
    
        def parse_page(self, response):
            # Scrapy keeps the session cookies from the login, so this profile
            # page is fetched as the logged-in user.
            request = scrapy.Request(url='http://www.renren.com/326282648/profile', callback=self.parse_profile)
            yield request
    
        def parse_profile(self, response):
            # Save the profile page to disk to verify the login worked.
            with open("wenliang.html", "w", encoding="utf-8") as fp:
                fp.write(response.text)

    Create start.py in the project directory:

    from scrapy import cmdline
    cmdline.execute(["scrapy","crawl","renren"])
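
    An alternative worth knowing: instead of POSTing to PLogin.do by hand, scrapy.FormRequest.from_response can read the login form on a fetched page and pre-fill its hidden fields. A minimal sketch, assuming the Renren home page serves a login form; the spider name and landing URL here are illustrative, not part of the tutorial project:

    import scrapy
    
    
    class RenrenFormSpider(scrapy.Spider):
        name = 'renren_form'  # illustrative name, not the tutorial's spider
        allowed_domains = ['renren.com']
        start_urls = ['http://www.renren.com/']
    
        def parse(self, response):
            # from_response copies the form's hidden inputs automatically,
            # so only the visible credentials need to be filled in.
            yield scrapy.FormRequest.from_response(
                response,
                formdata={"email": "xxxxxxxx@126.com", "password": "xxxxxxx"},
                callback=self.after_login,
            )
    
        def after_login(self, response):
            self.logger.info("Logged in, landed on %s", response.url)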

    2. Example 2: logging in to Douban

    a. Entering the captcha by hand

    Create the project

    scrapy startproject douban_login

    Change into the project directory, then generate the spider:

    scrapy genspider douban "douban.com"

    settings.py (the only changes to the generated file are ROBOTSTXT_OBEY = False and the DEFAULT_REQUEST_HEADERS block with a browser User-Agent):

    # -*- coding: utf-8 -*-
    
    # Scrapy settings for douban_login project
    #
    # For simplicity, this file contains only settings considered important or
    # commonly used. You can find more settings consulting the documentation:
    #
    #     https://doc.scrapy.org/en/latest/topics/settings.html
    #     https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
    #     https://doc.scrapy.org/en/latest/topics/spider-middleware.html
    
    BOT_NAME = 'douban_login'
    
    SPIDER_MODULES = ['douban_login.spiders']
    NEWSPIDER_MODULE = 'douban_login.spiders'
    
    
    # Crawl responsibly by identifying yourself (and your website) on the user-agent
    #USER_AGENT = 'douban_login (+http://www.yourdomain.com)'
    
    # Obey robots.txt rules
    ROBOTSTXT_OBEY = False
    
    # Configure maximum concurrent requests performed by Scrapy (default: 16)
    #CONCURRENT_REQUESTS = 32
    
    # Configure a delay for requests for the same website (default: 0)
    # See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
    # See also autothrottle settings and docs
    #DOWNLOAD_DELAY = 3
    # The download delay setting will honor only one of:
    #CONCURRENT_REQUESTS_PER_DOMAIN = 16
    #CONCURRENT_REQUESTS_PER_IP = 16
    
    # Disable cookies (enabled by default)
    #COOKIES_ENABLED = False
    
    # Disable Telnet Console (enabled by default)
    #TELNETCONSOLE_ENABLED = False
    
    # Override the default request headers:
    DEFAULT_REQUEST_HEADERS = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Language': 'en',
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.140 Safari/537.36',
    }
    
    # Enable or disable spider middlewares
    # See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
    #SPIDER_MIDDLEWARES = {
    #    'douban_login.middlewares.DoubanLoginSpiderMiddleware': 543,
    #}
    
    # Enable or disable downloader middlewares
    # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
    #DOWNLOADER_MIDDLEWARES = {
    #    'douban_login.middlewares.DoubanLoginDownloaderMiddleware': 543,
    #}
    
    # Enable or disable extensions
    # See https://doc.scrapy.org/en/latest/topics/extensions.html
    #EXTENSIONS = {
    #    'scrapy.extensions.telnet.TelnetConsole': None,
    #}
    
    # Configure item pipelines
    # See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
    #ITEM_PIPELINES = {
    #    'douban_login.pipelines.DoubanLoginPipeline': 300,
    #}
    
    # Enable and configure the AutoThrottle extension (disabled by default)
    # See https://doc.scrapy.org/en/latest/topics/autothrottle.html
    #AUTOTHROTTLE_ENABLED = True
    # The initial download delay
    #AUTOTHROTTLE_START_DELAY = 5
    # The maximum download delay to be set in case of high latencies
    #AUTOTHROTTLE_MAX_DELAY = 60
    # The average number of requests Scrapy should be sending in parallel to
    # each remote server
    #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
    # Enable showing throttling stats for every response received:
    #AUTOTHROTTLE_DEBUG = False
    
    # Enable and configure HTTP caching (disabled by default)
    # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
    #HTTPCACHE_ENABLED = True
    #HTTPCACHE_EXPIRATION_SECS = 0
    #HTTPCACHE_DIR = 'httpcache'
    #HTTPCACHE_IGNORE_HTTP_CODES = []
    #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'

    douban.py

    # -*- coding: utf-8 -*-
    import scrapy
    from urllib import request
    from PIL import Image
    
    
    class DoubanSpider(scrapy.Spider):
        name = 'douban'
        allowed_domains = ['douban.com']
        start_urls = ['https://www.douban.com/login']
        login_url = "https://www.douban.com/login"
        profile_url = "https://www.douban.com/people/184480369/"
        editsignature_url = "https://www.douban.com/j/people/184480369/edit_signature"
    
        def parse(self, response):
            # Form fields the Douban login endpoint expected at the time.
            formdata = {
                "source": "None",
                "redir": "https://www.douban.com/",
                "form_email": "xxxxxx@qq.com",
                "form_password": "xxxxxx!",
                "remember": "on",
                "login": "登录"
            }
    
            captcha_url = response.css("img#captcha_image::attr(src)").get()
    
            if captcha_url:
                # A captcha is shown: solve it by hand and attach both the
                # solution and the matching captcha-id to the form.
                captcha = self.recognize_captcha(captcha_url)
                formdata["captcha-solution"] = captcha
                captcha_id = response.xpath("//input[@name='captcha-id']/@value").get()
                formdata["captcha-id"] = captcha_id
            # Submit the login form whether or not a captcha was present.
            yield scrapy.FormRequest(url=self.login_url, formdata=formdata, callback=self.parse_after_login)
    
        def parse_after_login(self, response):
            # A redirect back to the home page means the login succeeded.
            if response.url == "https://www.douban.com/":
                print("Login succeeded")
                yield scrapy.Request(self.profile_url, callback=self.parse_profile)
            else:
                print("Login failed")
    
        def parse_profile(self, response):
            print(response.url)
            if response.url == self.profile_url:
                print("Reached the profile page")
                # "ck" is a hidden CSRF token required by the signature-edit
                # endpoint.
                ck = response.xpath("//input[@name='ck']/@value").get()
                formdata = {
                    "ck": ck,
                    "signature": "丈夫处世兮立功名"
                }
                yield scrapy.FormRequest(self.editsignature_url, formdata=formdata, callback=self.parse_edit_result)
            else:
                print("Did not reach the profile page")
    
        def parse_edit_result(self, response):
            # The edit endpoint answers with JSON; print it to confirm.
            print(response.text)
    
        def recognize_captcha(self, image_url):
            # Download and display the captcha image, then ask the user to
            # type what they see.
            request.urlretrieve(image_url, "captcha.png")
            image = Image.open("captcha.png")
            image.show()
            captcha = input("Enter the captcha: ")
            return captcha

    Create start.py in the douban_login directory:

    from scrapy import cmdline
    
    cmdline.execute("scrapy crawl douban".split())

    Run start.py to launch the spider.
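
    The chained requests above stay logged in because Scrapy's cookies middleware (enabled by default) carries the session cookies from request to request. If you ever need several independent sessions in one spider, the same middleware supports separate cookie jars through the "cookiejar" meta key. A minimal sketch, assuming the same Douban form fields as above; the spider name, accounts, and follow-up URL are placeholders:

    import scrapy
    
    
    class MultiSessionSpider(scrapy.Spider):
        name = 'multi_session'  # illustrative, not part of the tutorial project
    
        def start_requests(self):
            accounts = [
                {"form_email": "a@example.com", "form_password": "xxx"},
                {"form_email": "b@example.com", "form_password": "xxx"},
            ]
            for i, account in enumerate(accounts):
                # Each jar id keeps its own independent session cookies.
                yield scrapy.FormRequest(
                    "https://www.douban.com/login",
                    formdata=account,
                    meta={'cookiejar': i},
                    callback=self.after_login,
                )
    
        def after_login(self, response):
            # Carry the same jar forward so this request stays in its session.
            yield scrapy.Request(
                "https://www.douban.com/mine/",
                meta={'cookiejar': response.meta['cookiejar']},
                callback=self.parse_mine,
            )
    
        def parse_mine(self, response):
            self.logger.info("Fetched %s", response.url)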

    b. Recognizing the captcha automatically

    from urllib import request
    from base64 import b64encode
    import requests
    
    # Download one captcha image from Douban.
    captcha_url = "https://www.douban.com/misc/captcha?id=TCEAV2F8SbBgKbXZ5JAI2G6L:en&size=s"
    request.urlretrieve(captcha_url, "captcha.png")
    
    # Endpoint of the captcha-recognition service (placeholder).
    recognize_url = "http://xxxxxx"
    formdata = {}
    with open("captcha.png", "rb") as fp:
        data = fp.read()
        # The service expects the image as base64, so encode (not decode) it.
        pic = b64encode(data)
        formdata['pic'] = pic
    
    appcode = 'xxxxxxxxxxxxxxx'
    headers = {
        "Content-Type": "application/x-www-form-urlencoded; charset=UTF-8",
        # Note the space after APPCODE: the header is 'APPCODE <appcode>'.
        'Authorization': 'APPCODE ' + appcode
    }
    response = requests.post(recognize_url, data=formdata, headers=headers)
    print(response.text)
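
    To plug this into the spider from part a, the download-and-recognize steps can be wrapped into one function that replaces the manual recognize_captcha. A minimal sketch: the endpoint URL, the 'pic' field, and the JSON shape of the reply are assumptions that depend on the recognition service you buy, not a real API:

    from base64 import b64encode
    from urllib import request
    
    import requests
    
    
    def recognize_captcha_remote(image_url, recognize_url, appcode):
        # Download the captcha image locally first.
        request.urlretrieve(image_url, "captcha.png")
        with open("captcha.png", "rb") as fp:
            pic = b64encode(fp.read())
        headers = {
            "Content-Type": "application/x-www-form-urlencoded; charset=UTF-8",
            "Authorization": "APPCODE " + appcode,
        }
        response = requests.post(recognize_url, data={"pic": pic}, headers=headers)
        # Assumption: the service answers with JSON holding the solved text
        # under a "captcha" key; adjust to the real response format.
        return response.json().get("captcha")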

    c. Another automatic recognition example (Selenium + a coding platform)

    from selenium import webdriver
    import time
    import requests
    from lxml import etree
    import base64
    
    # Drive the browser through the login page.
    driver = webdriver.Chrome()
    url = 'https://accounts.douban.com/login?alias=&redir=https%3A%2F%2Fwww.douban.com%2F&source=index_nav&error=1001'
    
    driver.get(url)
    time.sleep(1)
    driver.find_element_by_id('email').send_keys('xxxxxxxxxxx')
    time.sleep(1)
    driver.find_element_by_id('password').send_keys('xxxxxxxxxx')
    time.sleep(1)
    
    # Pull the captcha details out of the rendered page.
    html_str = driver.page_source
    html_ele = etree.HTML(html_str)
    # URL of the captcha image
    image_url = html_ele.xpath('//img[@id="captcha_image"]/@src')[0]
    # Fetch the image itself
    response = requests.get(image_url)
    
    # Base64-encode the image for the coding platform
    #  https://market.aliyun.com/products/57124001/cmapi028447.html?spm=5176.2020520132.101.5.2HEXEG#sku=yuncode2244700000
    b64_str = base64.b64encode(response.content)
    v_type = 'cn'
    # Form data POSTed to the coding platform
    form = {
        'v_pic': b64_str,
        'v_type': v_type,
    }
    
    # Authentication header (appcode redacted)
    headers = {
        'Authorization': 'APPCODE xxxxxxxxxxxxxxx',
    }
    # Ask the coding platform to recognize the captcha
    dmpt_url = 'http://yzmplus.market.alicloudapi.com/fzyzm'
    response = requests.post(dmpt_url, form, headers=headers)
    print(response.text)
    # captcha_value holds the recognized captcha text
    captcha_value = response.json()['v_code']
    
    print(image_url)
    print(captcha_value)
    # captcha_value = input('Enter the captcha: ')
    
    driver.find_element_by_id('captcha_field').send_keys(captcha_value)
    time.sleep(1)
    driver.find_element_by_class_name('btn-submit').click()
    time.sleep(1)
    # Collect all cookies from the logged-in browser session.
    cookies = driver.get_cookies()
    cookie_list = []
    
    # For each cookie dict, take name and value and join them as "name=value".
    for cookie_dict in cookies:
        cookie_str = cookie_dict['name'] + '=' + cookie_dict['value']
        cookie_list.append(cookie_str)
    
    # Join all cookies into one Cookie header value.
    header_cookie = '; '.join(cookie_list)
    
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36',
        'Cookie': header_cookie,
    }
    another_url = 'https://www.douban.com/accounts/'
    response = requests.get(another_url, headers=headers)
    
    # Save the page fetched with the borrowed cookies for inspection.
    with open('cc.html', 'wb') as f:
        f.write(response.content)
    
    
    # with open('douban.html', 'wb') as f:
    #     f.write(driver.page_source.encode('utf-8'))
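
    A small simplification worth noting: requests can take the Selenium cookies directly as a dict through its cookies= parameter, which avoids hand-building the Cookie header. A minimal sketch; the function name is illustrative:

    import requests
    
    
    def fetch_with_selenium_cookies(url, driver_cookies, user_agent):
        # Reshape Selenium's list-of-dicts into the {name: value} mapping
        # that requests accepts via cookies=.
        cookies = {c['name']: c['value'] for c in driver_cookies}
        headers = {'User-Agent': user_agent}
        return requests.get(url, headers=headers, cookies=cookies)
    
    # usage, continuing from the code above:
    # response = fetch_with_selenium_cookies(another_url, driver.get_cookies(), headers['User-Agent'])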