• 路飞学城—Python爬虫实战密训班 第二章


     

    路飞学城—Python爬虫实战密训班 第二章

     

    一、Selenium基础

      Selenium是一个第三方模块,可以完全模拟用户在浏览器上操作(相当于在浏览器上点点点)。

      1.安装

        - pip install selenium

      2.优缺点

        - 无需查看和确定请求头请求体等数据细节,直接模拟人点击浏览器的行为

        - 效率不高

      3.依赖驱动:

           - Firefox
            https://github.com/mozilla/geckodriver/releases
           - Chrome
            http://chromedriver.storage.googleapis.com/index.html

      4.与selenium相关的基本操作

    from selenium import webdriver
    from selenium.webdriver.common.by import By

    # Configure the driver.
    # The driver binary must be downloaded manually and placed in a known
    # directory, otherwise starting the browser fails.
    # NOTE(review): the driver path is passed positionally, which the newest
    # Selenium releases no longer accept (they expect a Service object) --
    # confirm against the installed Selenium version.
    option = webdriver.ChromeOptions()
    driver = webdriver.Chrome('/Users/wupeiqi/drivers/chromedriver', options=option)

    # Poll for up to 3 seconds when locating elements, because the login
    # dialog is looked up right after the click that opens it.
    driver.implicitly_wait(3)

    # 1. Open the target page in the browser.
    driver.get("https://dig.chouti.com/all/hot/recent/1")

    # 2. Locate the login link.
    #    find_element(By.XPATH, ...) replaces find_element_by_xpath, which
    #    was removed from Selenium 4.
    btn_login = driver.find_element(By.XPATH, '//*[@id="login-link-a"]')
    # 3. Click it.
    btn_login.click()

    # 4. Locate the phone-number input.
    input_user = driver.find_element(By.XPATH, '//*[@id="mobile"]')
    # 5. Locate the password input.
    input_pwd = driver.find_element(By.XPATH, '//*[@id="mbpwd"]')

    # 6. Type the user name.
    input_user.send_keys('13121758648')
    # 7. Type the password.
    input_pwd.send_keys('woshiniba')

    # 8. Click the submit button of the login dialog.
    input_submit = driver.find_element(
        By.XPATH,
        '//*[@id="footer-band"]/div[5]/div/div/div[1]/div[2]/div[4]/div[2]/div/span[1]')
    input_submit.click()

    print(driver.get_cookies())

    # # 9. Follow a link (clicking through JavaScript instead of news.click())
    # news = driver.find_element(By.XPATH, '//*[@id="newsContent20646261"]/div[1]/a[1]')
    # # news.click()
    # driver.execute_script("arguments[0].click();", news)

    # 10. Manage the browser
    # driver.close()
    

      

      

    二、破解滑动验证码

      WuSir为我们带来了精彩的讲解:从__main__的主函数调用开始,先讲了图片的截取和距离的测算,接下来分析了怎样模拟人类行为的滑动过程——通过对速度和加速度的控制来实现,并且会故意在匹配之后制造小幅振动,最后点击确定就可以破解该验证码。重点是像素的选择和速度的调节,感谢!

    from selenium import webdriver
    from selenium.webdriver import ActionChains
    from selenium.webdriver.common.by import By
    from selenium.webdriver.common.keys import Keys
    from selenium.webdriver.support import expected_conditions as EC
    from selenium.webdriver.support.wait import WebDriverWait
    import os
    import shutil
    from PIL import Image
    import time
    
    
    def get_snap(driver):
        """Capture the current browser page and return it as a PIL image.

        The screenshot is written to ``full_snap.png`` in the working
        directory and then opened from that file.
        """
        screenshot_path = 'full_snap.png'
        driver.save_screenshot(screenshot_path)
        return Image.open(screenshot_path)
    
    
    def get_image(driver, scale=2):
        """Crop the captcha picture out of a full-page screenshot.

        The element with class ``geetest_canvas_img`` is located, a screenshot
        of the page is taken with ``get_snap`` and the element's rectangle is
        cut out of it. The crop is also written to ``code.png``.

        :param driver: Selenium WebDriver showing the captcha.
        :param scale: factor applied to the element's page coordinates to get
            screenshot pixel coordinates. The default of 2 keeps the previous
            hard-coded behaviour (NOTE(review): presumably chosen for a HiDPI
            display -- pass 1 if the screenshot is not scaled).
        :return: the cropped PIL image.
        """
        # find_element(By.CLASS_NAME, ...) replaces find_element_by_class_name,
        # which was removed from Selenium 4.
        img = driver.find_element(By.CLASS_NAME, 'geetest_canvas_img')
        # NOTE(review): presumably gives the captcha time to finish rendering
        # before the screenshot is taken.
        time.sleep(2)
        location = img.location
        size = img.size

        left = location['x']
        top = location['y']
        right = left + size['width']
        bottom = top + size['height']

        page_snap_obj = get_snap(driver)

        image_obj = page_snap_obj.crop(
            (left * scale, top * scale, right * scale, bottom * scale))
        with open('code.png', 'wb') as f:
            image_obj.save(f, format='png')
        return image_obj
    
    
    def get_distance(image1, image2, threshold=70):
        """Return the horizontal offset between the first two differing regions.

        Both images are compared pixel by pixel over the area of ``image1``.
        A column counts as "different" when any pixel in it has a red, green
        or blue channel that differs by ``threshold`` or more. The differing
        columns form contiguous runs; the result is the distance from the
        first column of the first run to the first column of the second run.

        :param image1: reference image; must expose ``size`` and ``load()``
            like a PIL image.
        :param image2: image to compare, at least as large as ``image1``.
        :param threshold: minimum per-channel difference that marks a pixel
            as different.
        :return: the offset in image pixels, or 0 when the images do not
            differ or only a single contiguous run of differing columns
            exists (previously these cases raised ``IndexError`` or returned
            a negative number).
        """
        # Load the pixel accessors once instead of once per pixel.
        pixels1 = image1.load()
        pixels2 = image2.load()
        width, height = image1.size

        # Columns are visited left to right, so the list is already sorted
        # and free of duplicates.
        diff_columns = [
            x for x in range(width)
            if any(
                abs(c1 - c2) >= threshold
                for y in range(height)
                for c1, c2 in zip(pixels1[x, y][:3], pixels2[x, y][:3])
            )
        ]
        if not diff_columns:
            return 0

        first = diff_columns[0]
        for offset, column in enumerate(diff_columns):
            # The first column that breaks the consecutive sequence starts
            # the second run.
            if column != first + offset:
                distance = column - first
                print(column, first, distance)
                return distance
        return 0
    
    
    def get_tracks(distance):
        """Plan a sequence of horizontal mouse offsets covering ``distance``.

        The movement first overshoots the target by a random 0-5 pixels while
        accelerating for the first three fifths of the way and decelerating
        afterwards, then slides back by the overshoot.

        Every offset is derived from the rounded cumulative position rather
        than rounded on its own, so rounding errors do not accumulate: for an
        integer ``distance`` the forward offsets sum to the distance plus the
        overshoot, the backward offsets sum to minus the overshoot, and all
        offsets together sum exactly to ``distance``.

        :param distance: number of whole pixels to move; a negative value
            plans the mirrored (leftward) movement.
        :return: dict with the integer offset lists ``'forward_tracks'`` and
            ``'back_tracks'``, to be replayed in that order.
        """
        import random

        direction = -1 if distance < 0 else 1
        exceed_distance = random.randint(0, 5)
        # Slide a little past the target first, then move back afterwards.
        target = abs(distance) + exceed_distance
        t = 0.2  # duration of one simulated step

        v = 0
        current = 0
        emitted = 0  # whole pixels already handed out as offsets
        mid = target * 3 / 5
        forward_tracks = []
        while current < target:
            a = random.randint(1, 3)
            if current >= mid:
                a = -a  # decelerate on the last part of the way
            s = v * t + 0.5 * a * (t ** 2)
            v = v + a * t
            # Never run past the planned end point.
            current = min(current + s, target)
            step = round(current) - emitted
            emitted += step
            forward_tracks.append(direction * step)

        # Slide back by the overshoot to land on the exact position.
        v = 0
        current = 0
        emitted = 0
        back_tracks = []
        while abs(current) < exceed_distance:
            a = random.randint(1, 3)
            s = -v * t - 0.5 * a * (t ** 2)
            v = v + a * t
            current = max(current + s, -exceed_distance)
            step = round(current) - emitted
            emitted += step
            back_tracks.append(direction * step)
        # Reversed so that the backward slide starts fast and ends slowly.
        return {'forward_tracks': forward_tracks, 'back_tracks': list(reversed(back_tracks))}
    
    
    def crack(driver):
        """Solve the slider captcha shown in ``driver``.

        Two captcha pictures are captured (before and after clicking the
        slider), compared with ``get_distance``, and the slider button is
        dragged along the offsets produced by ``get_tracks``.

        Elements are located with ``find_element(By...)`` because the
        ``find_element_by_*`` helpers were removed from Selenium 4.

        :param driver: Selenium WebDriver on a page showing the captcha.
        """
        # 1. Click the captcha button so the picture without a gap is shown.
        button = driver.find_element(
            By.XPATH, '//*[@id="embed-captcha"]/div/div[2]/div[1]/div[3]')
        button.click()

        # 2. Capture the picture without the gap.
        image1 = get_image(driver)

        # 3. Click the slider button so the picture with the gap is shown.
        button = driver.find_element(By.CLASS_NAME, 'geetest_slider_button')
        button.click()

        # 4. Capture the picture with the gap.
        image2 = get_image(driver)

        # 5. Compare the pixels of both pictures to find the displacement.
        distance = get_distance(image1, image2)
        print(distance)

        # 6. Turn the displacement into a human-looking movement plan.
        #    NOTE(review): the division by 2 presumably undoes the 2x scale
        #    applied when cropping the screenshot -- confirm.
        tracks = get_tracks(int(distance / 2))

        # 7. Replay the plan: press the slider, move forward, then back.
        button = driver.find_element(By.CLASS_NAME, 'geetest_slider_button')
        ActionChains(driver).click_and_hold(button).perform()

        # Forward movement, intentionally overshooting the target.
        for track in tracks['forward_tracks']:
            ActionChains(driver).move_by_offset(xoffset=track, yoffset=0).perform()

        # Short pause, as if noticing the overshoot, then slide back.
        time.sleep(0.5)
        for back_track in tracks['back_tracks']:
            ActionChains(driver).move_by_offset(xoffset=back_track, yoffset=0).perform()

        # Small jitter around the final position; according to the original
        # author this noticeably raises the success rate.
        ActionChains(driver).move_by_offset(xoffset=3, yoffset=0).perform()
        ActionChains(driver).move_by_offset(xoffset=-3, yoffset=0).perform()

        # Hold for a moment before releasing the mouse button.
        time.sleep(0.5)
        ActionChains(driver).release().perform()
    
    
    def login_luffy(username, password):
        """Log in to luffycity.com, solving the slider captcha on the way.

        A Chrome browser is started, the credentials are typed into the login
        form and ``crack`` handles the captcha. The browser is always shut
        down afterwards, even when a step fails, so that no browser or driver
        process is left behind.

        :param username: account name typed into the first input field.
        :param password: password typed into the second input field.
        """
        driver = webdriver.Chrome('/Users/wupeiqi/drivers/chromedriver')
        try:
            driver.set_window_size(960, 800)

            # 1. Open the login page and type the credentials.
            driver.implicitly_wait(3)
            driver.get('https://www.luffycity.com/login')

            # find_element(By.XPATH, ...) replaces find_element_by_xpath,
            # which was removed from Selenium 4.
            input_username = driver.find_element(
                By.XPATH, '//*[@id="router-view"]/div/div/div[2]/div[2]/input[1]')
            input_pwd = driver.find_element(
                By.XPATH, '//*[@id="router-view"]/div/div/div[2]/div[2]/input[2]')

            input_username.send_keys(username)
            input_pwd.send_keys(password)

            # 2. Solve the slider captcha.
            crack(driver)

            # Wait a while so the result of the login can be observed.
            time.sleep(10)
        finally:
            # quit() closes every window and ends the driver process.
            driver.quit()
    
    
    if __name__ == '__main__':
        # Script entry point: run the login flow with the demo credentials.
        credentials = {'username': 'wupeiqi', 'password': '123123123'}
        login_luffy(**credentials)
    

      

    三:总结

                  前半段的直播由咸湿的,哦不对、是亲切的Alex老师为我们分享了职场方面的一些东西,尤其是咸湿的,哦不对、是亲切的Alex老师用他曾经的亲身经历来讲述这些经验和思想,听完后大家讨论得都很热烈,挺受启发的。

        通过学习selenium模块可以发现,对于部分很复杂的爬虫,用selenium做起来还是比较方便的。但如果使用selenium模块,爬虫程序可以说基本毫无性能可言,一般的解决方案是让selenium与其它模块配合使用、相互弥补。最后,WuSir通过selenium和PIL模块配合使用,破解了极验的滑动验证码。但此方式有个大问题:只能处理简单的图片,对于复杂的图片命中率不高,面对更加复杂的验证码只能通过打码平台来解决了。

  • 相关阅读:
    JS中json对象克隆
    jhipster中图片路径打包问题(webpack)
    arcgis for javascript api 4.x 中,使用本地非 4326坐标系绘制功能实现
    spring核心之IOC
    spring基于XML的声明式事务控制
    hibernate之事务处理
    hibernate之一级缓存
    hibernate之一对多,多对一
    hibernate之HQL,Criteria与SQL
    spring的基于注解的IOC配置
  • 原文地址:https://www.cnblogs.com/sunday294/p/9284953.html
Copyright © 2020-2023  润新知