• Captcha handling in crawlers, image lazy loading, Selenium and PhantomJS, the requests module's session, and thread pools


    I. Captcha handling

    1. Workflow for handling captchas with the YunDaMa (云打码) coding platform:

    1. Fetch the page data that carries the captcha
    2. Parse the captcha out of the page data and download the captcha image to the local disk
    3. Submit the captcha image to a third-party platform for recognition and get back the text value shown on the image
        - YunDaMa platform:
            - 1. Register on the official site (a normal-user account and a developer account)
            - 2. Log in with the developer account:
                - 1. Download the sample code (Development Docs -> Call Examples & Latest DLL -> PythonHTTP example download)
                - 2. Create a piece of software: My Software -> Add New Software
            - 3. Modify the code in the sample source file so that it recognizes the value shown in the captcha image

    Example:

    import http.client, mimetypes, urllib, json, time, requests
    
    ######################################################################
    
    class YDMHttp:
    
        apiurl = 'http://api.yundama.com/api.php'
        username = ''
        password = ''
        appid = ''
        appkey = ''
    
        def __init__(self, username, password, appid, appkey):
            self.username = username  
            self.password = password
            self.appid = str(appid)
            self.appkey = appkey
    
        def request(self, fields, files=[]):
            response = self.post_url(self.apiurl, fields, files)
            response = json.loads(response)
            return response
        
        def balance(self):
            data = {'method': 'balance', 'username': self.username, 'password': self.password, 'appid': self.appid, 'appkey': self.appkey}
            response = self.request(data)
            if (response):
                if (response['ret'] and response['ret'] < 0):
                    return response['ret']
                else:
                    return response['balance']
            else:
                return -9001
        
        def login(self):
            data = {'method': 'login', 'username': self.username, 'password': self.password, 'appid': self.appid, 'appkey': self.appkey}
            response = self.request(data)
            if (response):
                if (response['ret'] and response['ret'] < 0):
                    return response['ret']
                else:
                    return response['uid']
            else:
                return -9001
    
        def upload(self, filename, codetype, timeout):
            data = {'method': 'upload', 'username': self.username, 'password': self.password, 'appid': self.appid, 'appkey': self.appkey, 'codetype': str(codetype), 'timeout': str(timeout)}
            file = {'file': filename}
            response = self.request(data, file)
            if (response):
                if (response['ret'] and response['ret'] < 0):
                    return response['ret']
                else:
                    return response['cid']
            else:
                return -9001
    
        def result(self, cid):
            data = {'method': 'result', 'username': self.username, 'password': self.password, 'appid': self.appid, 'appkey': self.appkey, 'cid': str(cid)}
            response = self.request(data)
            return response and response['text'] or ''
    
        def decode(self, filename, codetype, timeout):
            cid = self.upload(filename, codetype, timeout)
            if (cid > 0):
                for i in range(0, timeout):
                    result = self.result(cid)
                    if (result != ''):
                        return cid, result
                    else:
                        time.sleep(1)
                return -3003, ''
            else:
                return cid, ''
    
        def report(self, cid):
            data = {'method': 'report', 'username': self.username, 'password': self.password, 'appid': self.appid, 'appkey': self.appkey, 'cid': str(cid), 'flag': '0'}
            response = self.request(data)
            if (response):
                return response['ret']
            else:
                return -9001
    
        def post_url(self, url, fields, files=[]):
            for key in files:
                files[key] = open(files[key], 'rb');
            res = requests.post(url, files=files, data=fields)
            return res.text
    def getCodeDate(userName,pwd,codePath,codeType):
        # Username (normal-user account)
        username    = userName
    
        # Password
        password    = pwd                            
    
        # Software ID, a required parameter for the developer revenue share. Obtain it from [My Software] in the developer console.
        appid       = 6003                                    
    
        # Software key, a required parameter for the developer revenue share. Obtain it from [My Software] in the developer console.
        appkey      = '1f4b564483ae5c907a1d34f8e2f2776c'    
    
        # Image file
        filename    = codePath                       
    
        # Captcha type. E.g. 1004 means 4 alphanumeric characters; different types are priced differently. Fill this in accurately or recognition accuracy suffers. All types are listed at http://www.yundama.com/price.html
        codetype    = codeType
    
        # Timeout, in seconds
        timeout     = 2                                   
        result = None
        # Sanity check
        if (username == 'username'):
            print('Please set the parameters properly before testing')
        else:
            # Initialize
            yundama = YDMHttp(username, password, appid, appkey)
    
            # Log in to YunDaMa
            uid = yundama.login()
            #print('uid: %s' % uid)
    
            # Query the account balance
            balance = yundama.balance()
            #print('balance: %s' % balance)
    
            # Start recognition: image path, captcha type ID, timeout (seconds); returns cid and the recognized result
            cid, result = yundama.decode(filename, codetype, timeout)
            #print('cid: %s, result: %s' % (cid, result))
        return result
    # Simulated login to Renren (人人网)
    import requests
    import urllib.request
    from lxml import etree
    # Create a session object
    session = requests.Session()
    # Download the captcha image
    headers = {
        'User-Agent':'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.119 Safari/537.36'
    }
    url = 'http://www.renren.com/'
    page_text = requests.get(url=url,headers=headers).text
    
    tree = etree.HTML(page_text)
    code_img_url = tree.xpath('//*[@id="verifyPic_login"]/@src')[0]
    urllib.request.urlretrieve(url=code_img_url,filename='code.jpg')
    
    # Recognize the value shown in the captcha image
    code_data = getCodeDate('bobo328410948','bobo328410948','./code.jpg',2004)
    
    # Simulated login
    login_url = 'http://www.renren.com/ajaxLogin/login?1=1&uniqueTimestamp=201914927558'
    data = {
        "email":"www.zhangbowudi@qq.com",
        "icode":code_data,
        "origURL":"http://www.renren.com/home",
        "domain":"renren.com",
        "key_id":"1",
        "captcha_type":"web_login",
        "password":"4f0350f09aeffeef86307747218b214b0960bdf35e30811c0d611fe39db96ec1",
        "rkey":"9e75e8dc3457b14c55a74627fa64fb43",
        "f":"http%3A%2F%2Fwww.renren.com%2F289676607",
    }
    # The cookie produced by this request is automatically stored in the session object
    session.post(url=login_url,data=data,headers=headers)
    
    url = 'http://www.renren.com/289676607/profile'
    page_text = session.get(url=url,headers=headers).text
    
    with open('renren.html','w',encoding='utf-8') as fp:
        fp.write(page_text)

      Simulated login to gushiwen.org

    import requests
    import urllib
    from lxml import etree
    headers = {
        'User-Agent':'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.119 Safari/537.36'
    }
    
    s = requests.Session()
    login_url = 'https://so.gushiwen.org/user/login.aspx?from=http://so.gushiwen.org/user/collect.aspx'
    page_text = requests.get(url=login_url,headers=headers).text
    tree = etree.HTML(page_text)
    img_src = 'https://so.gushiwen.org'+tree.xpath('//*[@id="imgCode"]/@src')[0]
    img_data = s.get(url=img_src,headers=headers).content
    with open('./img.jpg','wb') as fp:
        fp.write(img_data)
    img_text = getCodeDate('bobo328410948','bobo328410948','./img.jpg',1004)
    
    # Simulated login
    url = 'https://so.gushiwen.org/user/login.aspx?from=http%3a%2f%2fso.gushiwen.org%2fuser%2fcollect.aspx'
    data = {
        "__VIEWSTATE":"9AsGvh3Je/0pfxId7DYRUi258ayuEG4rrQ1Z3abBgLoDSOeAUatOZOrAIxudqiOauXpR9Zq+dmKJ28+AGjXYHaCZJTTtGgrEemBWI1ed7oS7kpB7Rm/4yma/+9Q=",
        "__VIEWSTATEGENERATOR":"C93BE1AE",
        "from":"http://so.gushiwen.org/user/collect.aspx",
        "email":"www.zhangbowudi@qq.com",
        "pwd":"bobo328410948",
        "code":img_text,
        "denglu":"登录",
    }
    page_text = s.post(url=url,headers=headers,data=data).text
    with open('./gushiwen.html','w',encoding='utf-8') as fp:
        fp.write(page_text)
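
    Note that the __VIEWSTATE and __VIEWSTATEGENERATOR values above were copied from a single packet capture and will eventually go stale. A minimal hedged sketch of pulling them out of the login page with the same session before posting (assuming the page uses the standard ASP.NET hidden-input ids, which is not guaranteed):

    # Hedged sketch: read the ASP.NET hidden fields from the login page instead of hardcoding them
    login_page = s.get(url=login_url,headers=headers).text
    login_tree = etree.HTML(login_page)
    data['__VIEWSTATE'] = login_tree.xpath('//input[@id="__VIEWSTATE"]/@value')[0]
    data['__VIEWSTATEGENERATOR'] = login_tree.xpath('//input[@id="__VIEWSTATEGENERATOR"]/@value')[0]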

    II. Image lazy loading, Selenium, PhantomJS

    1. The concept of image lazy loading:

      Image lazy loading is a web-page optimization technique. An image is a network resource like any other static asset: every image requested consumes bandwidth, and loading all of a page's images at once greatly increases the time to first render. To solve this, the front end and back end cooperate so that an image is loaded only when it appears inside the browser's current viewport; this technique of reducing the number of image requests on the first screen is called "image lazy loading".

    2. How the lazy-loading technique is implemented:

      In the page source, the img tag first stores the real image link in a "pseudo-attribute" (commonly src2, original, and so on) rather than putting it directly into the src attribute. When the image scrolls into the visible area of the page, the pseudo-attribute is dynamically swapped into src and the image is loaded.

    #!/usr/bin/env python
    # -*- coding:utf-8 -*-
    import requests
    from lxml import etree
    
    if __name__ == "__main__":
         url = 'http://sc.chinaz.com/tupian/gudianmeinvtupian.html'
         headers = {
             'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36',
         }
         # Fetch the page text data
         response = requests.get(url=url,headers=headers)
         response.encoding = 'utf-8'
         page_text = response.text
         # Parse the page data (get the image links on the page)
         # Create an etree object
         tree = etree.HTML(page_text)
         div_list = tree.xpath('//div[@id="container"]/div')
         # Parse out the image URL and the image name
         for div in div_list:
             image_url = div.xpath('.//img/@src2') # src2 pseudo-attribute
             image_name = div.xpath('.//img/@alt')
             print(image_url) # print the image link
             print(image_name) # print the image name
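
    Continuing the example above, the links collected from the src2 pseudo-attribute can be downloaded directly. A minimal hedged sketch, assuming the src2 values are absolute image URLs (otherwise they would need to be joined against the site root first):

    # Hedged sketch: download each lazily-loaded image found above
    import os
    os.makedirs('./imgs', exist_ok=True)
    for div in div_list:
        src_list = div.xpath('.//img/@src2')   # src2 pseudo-attribute
        alt_list = div.xpath('.//img/@alt')
        if not src_list:
            continue                           # some divs carry no lazily-loaded image
        img_data = requests.get(url=src_list[0], headers=headers).content
        with open('./imgs/' + alt_list[0] + '.jpg', 'wb') as fp:
            fp.write(img_data)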

    3. Selenium

      Selenium is a third-party Python library; the interface it exposes can operate a browser and have the browser carry out actions automatically.

      Environment setup

    Install selenium: pip install selenium
    
    Get the driver program for a particular browser (Chrome is used as the example)
    
      Chrome driver download address: http://chromedriver.storage.googleapis.com/index.html
    
      The downloaded driver must match the browser version; the version mapping table at http://blog.csdn.net/huilan_same/article/details/51896672 can be used to pick the right one.

    Example:

    from selenium import webdriver
    from time import sleep
    
    bro = webdriver.Chrome(executable_path=r"E:\python学习\数据+爬虫\第三天\chromedriver_win32\chromedriver.exe")
    bro.get(url='https://www.baidu.com/')
    sleep(2)
    text_input = bro.find_element_by_id("kw")
    
    text_input.send_keys('人民币')
    sleep(2)
    
    bro.find_element_by_id('su').click()
    
    sleep(3)
    
    print(bro.page_source)
    bro.quit()

    Code walkthrough:

    # Import the package
    from selenium import webdriver
    # Create a browser object; the browser is operated through this object
    browser = webdriver.Chrome('driver path')
    # Use the browser to issue the specified request
    browser.get(url)
    
    # Use the methods below to locate the desired elements and operate on them (a short sketch of the list-returning variants follows after this list)
        find_element_by_id            find a node by id
        find_elements_by_name         find by name
        find_elements_by_xpath        find by XPath
        find_elements_by_tag_name     find by tag name
        find_elements_by_class_name   find by class name
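
    The plural find_elements_* variants return a list of matching elements rather than a single node. A minimal hedged sketch (the driver path is a placeholder):

    # Hedged sketch: collect every <a> node on a page and read its text and href
    from selenium import webdriver
    bro = webdriver.Chrome(executable_path='path/to/chromedriver')   # placeholder path
    bro.get('https://www.baidu.com/')
    for a in bro.find_elements_by_tag_name('a')[:10]:
        print(a.text, a.get_attribute('href'))   # element text and attribute access
    bro.quit()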

      Fetch more movie detail data from Douban Movies

    url = 'https://movie.douban.com/typerank?type_name=%E6%83%8A%E6%82%9A&type=19&interval_id=100:90&action='
    bro = webdriver.Chrome(executable_path=r'C:\Users\Administrator\Desktop\爬虫+数据\day_03_爬虫\chromedriver.exe')
    bro.get(url)
    sleep(3)
    bro.execute_script('window.scrollTo(0,document.body.scrollHeight)')
    sleep(3)
    bro.execute_script('window.scrollTo(0,document.body.scrollHeight)')
    sleep(3)
    bro.execute_script('window.scrollTo(0,document.body.scrollHeight)')
    sleep(2)
    page_text = bro.page_source
    
    with open('./douban.html','w',encoding='utf-8') as fp:
        fp.write(page_text)
    
    sleep(1)
    bro.quit()
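
    The fixed sleep() calls above are only a crude way of waiting for the lazily rendered rows. Selenium also supports explicit waits via WebDriverWait; a minimal hedged sketch (the element id used in the locator is hypothetical):

    # Hedged sketch: wait up to 10 seconds for an element to appear instead of sleeping blindly
    from selenium.webdriver.common.by import By
    from selenium.webdriver.support.ui import WebDriverWait
    from selenium.webdriver.support import expected_conditions as EC
    
    WebDriverWait(bro, 10).until(
        EC.presence_of_element_located((By.ID, 'content'))   # hypothetical locator
    )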

    4. PhantomJS

      PhantomJS is a headless browser; its automation flow is identical to driving Chrome above. Because it has no window, PhantomJS offers a screenshot feature so the automated steps can still be inspected, implemented with the save_screenshot function.
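
    As a minimal sketch of that screenshot feature (the phantomjs.exe path below is a placeholder):

    # Hedged sketch: PhantomJS has no window, so save_screenshot is how the page state is inspected
    from selenium import webdriver
    bro = webdriver.PhantomJS(executable_path='path/to/phantomjs.exe')   # placeholder path
    bro.get('https://www.baidu.com/')
    bro.save_screenshot('./baidu.png')   # writes a PNG of the rendered page
    bro.quit()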

    # PhantomJS
    # Fetch more movie detail data from Douban Movies
    url = 'https://movie.douban.com/typerank?type_name=%E6%83%8A%E6%82%9A&type=19&interval_id=100:90&action='
    bro = webdriver.PhantomJS(executable_path=r'C:\Users\Administrator\Desktop\爬虫+数据\day_03_爬虫\phantomjs-2.1.1-windows\bin\phantomjs.exe')
    bro.get(url)
    sleep(3)
    bro.execute_script('window.scrollTo(0,document.body.scrollHeight)')
    sleep(3)
    bro.execute_script('window.scrollTo(0,document.body.scrollHeight)')
    sleep(3)
    bro.execute_script('window.scrollTo(0,document.body.scrollHeight)')
    sleep(2)
    page_text = bro.page_source
    
    with open('./douban.html','w',encoding='utf-8') as fp:
        fp.write(page_text)
    
    sleep(1)
    bro.quit()

    5. Headless Chrome

    # Headless Chrome
    from selenium.webdriver.chrome.options import Options
    chrome_options = Options()
    chrome_options.add_argument('--headless')
    chrome_options.add_argument('--disable-gpu')
    
    # Fetch more movie detail data from Douban Movies
    url = 'https://movie.douban.com/typerank?type_name=%E6%83%8A%E6%82%9A&type=19&interval_id=100:90&action='
    bro = webdriver.Chrome(executable_path=r'C:\Users\Administrator\Desktop\爬虫+数据\day_03_爬虫\chromedriver.exe',chrome_options=chrome_options)
    bro.get(url)
    sleep(3)
    bro.execute_script('window.scrollTo(0,document.body.scrollHeight)')
    sleep(3)
    bro.execute_script('window.scrollTo(0,document.body.scrollHeight)')
    sleep(3)
    bro.execute_script('window.scrollTo(0,document.body.scrollHeight)')
    sleep(2)
    page_text = bro.page_source
    
    with open('./douban.html','w',encoding='utf-8') as fp:
        fp.write(page_text)
    print(page_text)
    sleep(1)
    bro.quit()
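
    In more recent Selenium releases the chrome_options keyword has been superseded; a hedged sketch of the newer spelling (assuming Selenium 3.8+ where the options keyword is accepted, and keeping executable_path only for parity with the example above):

    # Hedged sketch: newer Selenium versions pass the options object via options=
    from selenium import webdriver
    chrome_options = webdriver.ChromeOptions()
    chrome_options.add_argument('--headless')
    chrome_options.add_argument('--disable-gpu')
    bro = webdriver.Chrome(executable_path='path/to/chromedriver.exe', options=chrome_options)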

    III. Cookies with the requests module, and thread pools

    1. The concept of a cookie:

      When a user first visits a domain through a browser, the web server sends the client data used to maintain state between the web server and the client; that data is the cookie.

    2. What cookies are for:

      In the browser we exchange data with servers all the time, for example when logging in to a mailbox or to some other page, and we often tick options such as "remember me for 30 days" or "log in automatically". Cookies are set by the HTTP server and stored in the browser. HTTP itself is a stateless protocol: once a data exchange finishes, the connection between server and client is closed and every new exchange needs a new connection, so it is the cookie that carries the login state from one request to the next.

    3. Approach:

      Use the crawler to capture the login request sent to Renren once, and obtain the cookie data produced by that request.

      When requesting the URL of the personal profile page, the request must carry the cookie from step 1; only with that cookie can the server identify which user is making the request and respond with that user's profile page data.

    Example:

    #!/usr/bin/env python
    # -*- coding:utf-8 -*-
    import requests
    if __name__ == "__main__":
    
        # URL of the login request (captured with a packet-capture tool)
        post_url = 'http://www.renren.com/ajaxLogin/login?1=1&uniqueTimestamp=201873958471'
        # Create a session object; it automatically stores and carries the cookies produced by its requests
        session = requests.session()
        # Spoof the User-Agent
        headers={
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36',
        }
        formdata = {
            'email': '17701256561',
            'icode': '',
            'origURL': 'http://www.renren.com/home',
            'domain': 'renren.com',
            'key_id': '1',
            'captcha_type': 'web_login',
            'password': '7b456e6c3eb6615b2e122a2942ef3845da1f91e3de075179079a3b84952508e4',
            'rkey': '44fd96c219c593f3c9612360c80310a3',
            'f': 'https%3A%2F%2Fwww.baidu.com%2Flink%3Furl%3Dm7m_NSUp5Ri_ZrK5eNIpn_dMs48UAcvT-N_kmysWgYW%26wd%3D%26eqid%3Dba95daf5000065ce000000035b120219',
        }
        # Send the request through the session so the session stores the cookie produced by this request
        session.post(url=post_url,data=formdata,headers=headers)
    
        get_url = 'http://www.renren.com/960481378/profile'
        # Send another request with the same session; this request now carries the cookie
        response = session.get(url=get_url,headers=headers)
        # Set the encoding of the response content
        response.encoding = 'utf-8'
        # Write the response content to a file
        with open('./renren.html','w',encoding='utf-8') as fp:
            fp.write(response.text)
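
    To confirm what the session actually stored, the cookie jar can be inspected directly; a minimal sketch:

    # Hedged sketch: the session keeps cookies in a RequestsCookieJar
    print(session.cookies.get_dict())   # name -> value mapping of the stored cookies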

    4. Data crawling based on the multiprocessing.dummy thread pool

    Example:

      Crawl data from Pear Video (梨视频)

    Plain (single-threaded) crawl:

    import requests
    import random
    from lxml import etree
    import re
    from fake_useragent import UserAgent
    # Install the fake-useragent library: pip install fake-useragent
    url = 'http://www.pearvideo.com/category_1'
    # Generate a random UA. If it raises an error, the following parameters can be added:
    #ua = UserAgent(verify_ssl=False,use_cache_server=False).random
    # Disable the server cache:
    #ua = UserAgent(use_cache_server=False)
    # Do not cache data:
    #ua = UserAgent(cache=False)
    # Ignore SSL verification:
    #ua = UserAgent(verify_ssl=False)
    
    ua = UserAgent().random
    headers = {
        'User-Agent':ua
    }
    # Fetch the listing page
    page_text = requests.get(url=url,headers=headers).text
    # Parse the video detail-page links out of the listing page data
    tree = etree.HTML(page_text)
    li_list = tree.xpath('//div[@id="listvideoList"]/ul/li')
    detail_urls = []
    for li in li_list:
        detail_url = 'http://www.pearvideo.com/'+li.xpath('./div/a/@href')[0]
        title = li.xpath('.//div[@class="vervideo-title"]/text()')[0]
        detail_urls.append(detail_url)
    for url in detail_urls:
        page_text = requests.get(url=url,headers=headers).text
        video_url = re.findall('srcUrl="(.*?)"',page_text,re.S)[0]
        
        data = requests.get(url=video_url,headers=headers).content
        fileName = str(random.randint(1,10000))+'.mp4' # randomly generate a video file name
        with open(fileName,'wb') as fp:
            fp.write(data)
            print(fileName+' is over')

    Thread-pool-based crawl:

    import requests
    import re
    from lxml import etree
    from multiprocessing.dummy import Pool
    import random
    
    
    # Instantiate a thread pool object
    pool = Pool(5)
    url = 'https://www.pearvideo.com/category_1'
    headers = {
        'User-Agent':'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.119 Safari/537.36'
    }
    page_text = requests.get(url=url,headers=headers).text
    tree = etree.HTML(page_text)
    li_list = tree.xpath('//div[@id="listvideoList"]/ul/li')
    
    video_url_list = []
    for li in li_list:
        detail_url = 'https://www.pearvideo.com/'+li.xpath('./div/a/@href')[0]
        detail_page = requests.get(url=detail_url,headers=headers).text
        video_url = re.findall('srcUrl="(.*?)",vdoUrl',detail_page,re.S)[0]
        video_url_list.append(video_url)
        
    # Define the worker functions before handing them to the thread pool
    def getVideoData(url):
        return requests.get(url=url,headers=headers).content
    
    def saveVideo(data):
        fileName = str(random.randint(0,5000))+'.mp4'
        with open(fileName,'wb') as fp:
            fp.write(data)
    
    # Download all videos concurrently, then write each one to disk
    video_data_list = pool.map(getVideoData,video_url_list)
    pool.map(saveVideo,video_data_list)
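
    pool.map only returns once every worker has finished, and the two-stage version above keeps every downloaded video in memory at the same time. A minimal hedged sketch of a single worker that downloads and writes one video per task, plus the usual close/join shutdown:

    # Hedged sketch: download and save each video inside one worker, then shut the pool down
    def downloadAndSave(url):
        data = requests.get(url=url,headers=headers).content
        fileName = str(random.randint(0,5000))+'.mp4'
        with open(fileName,'wb') as fp:
            fp.write(data)
    
    pool.map(downloadAndSave,video_url_list)
    pool.close()   # no further tasks will be submitted
    pool.join()    # wait for all worker threads to finish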