• 爬虫基础二


    响应状态码

    常见的错误代码及错误原因
    状态码说明详情
    100 继续 请求者应当继续提出请求。服务器已收到请求的一部分,正在等待其余部分
    101 切换协议 请求者已要求服务器切换协议,服务器已确认并准备切换
    200 成功 服务器已成功处理了请求
    201 已创建 请求成功并且服务器创建了新的资源
    202 已接受 服务器已接受请求,但尚未处理
    203 非授权信息 服务器已成功处理了请求,但返回的信息可能来自另一个源
    204 无内容 服务器成功处理了请求,但没有返回任何内容
    205 重置内容 服务器成功处理了请求,内容被重置
    206 部分内容 服务器成功处理了部分请求
    300 多种选择 针对请求,服务器可执行多种操作
    301 永久移动 请求的网页已永久移动到新位置,即永久重定向
    302 临时移动 请求的网页暂时跳转到其他页面,即暂时重定向
    303 查看其他位置 如果原来的请求是POST,重定向目标文档应该通过GET提取
    304 未修改 此次请求返回的网页未修改,继续使用上次的资源
    305 使用代理 请求者应该使用代理访问该网页
    307 临时重定向 请求的资源临时从其他位置响应
    400 错误请求 服务器无法解析该请求
    401 未授权 请求没有进行身份验证或验证未通过
    403 禁止访问 服务器拒绝此请求
    404 未找到 服务器找不到请求的网页
    405 方法禁用 服务器禁用了请求中指定的方法
    406 不接受 无法使用请求的内容特性响应请求的网页
    407 需要代理授权 请求者需要使用代理授权
    408 请求超时 服务器等候请求时发生超时
    409 冲突 服务器在完成请求时发生冲突
    410 已删除 请求的资源已永久删除
    411 需要有效长度 服务器不接受不含有效长度标头字段的请求
    412 未满足前提条件 服务器未满足请求者在请求中设置的其中一个前提条件
    413 请求实体过大 请求实体过大,超出服务器的处理能力
    414 请求URL过长 请求网址过长,服务器无法处理
    415 不支持类型 请求格式不被请求页面支持
    416 请求范围不符 页面无法提供请求的范围
    417 未满足期望值 服务器未满足期望请求标头字段的要求
    500 服务器内部错误 服务器遇到错误,无法完成请求
    501 未实现 服务器不具备完成请求的功能
    502 错误网关 服务器作为网关或代理,从上游服务器收到无效响应
    503 服务不可用 服务器目前无法使用
    504 网关超时 服务器作为网关或代理,但是没有及时从上游服务器收到响应
    505 HTTP版本不支持 服务器不支持请求中所用的HTTP协议版本

    Requests的高级用法

    文件上传

    import requests
    
    # Upload favicon.ico as a multipart form field named 'file'.
    # The `with` block guarantees the file handle is closed once the request
    # has been sent, even if the POST raises.
    with open('favicon.ico', 'rb') as icon:
        files = {'file': icon}
        r = requests.post('http://httpbin.org/post', files=files)
    print(r.text)
    View Code

    Cookies

    import requests
    
    # Fetch the page, dump the cookie jar, then list each cookie as name=value.
    r = requests.get(url='https://www.baidu.com')
    jar = r.cookies
    print(jar)
    for pair in jar.items():
        print('='.join(pair))
    View Code

    会话维持

    import requests
    
    # A Session keeps cookies between requests: the first call sets a cookie,
    # the second call reads it back. The `with` block closes the session
    # when the two requests are done.
    with requests.Session() as s:
        s.get('http://httpbin.org/cookies/set/number/123456789')
        r = s.get('http://httpbin.org/cookies')
    print(r.text)
    View Code

    SSL证书验证

    import requests
    
    # Request the site over HTTPS and print only the HTTP status code.
    response = requests.get(url='https://www.12306.cn')
    status = response.status_code
    print(status)
    View Code

    代理设置

    import requests
    
    # Route both http and https traffic through the same SOCKS5 proxy URL
    # (placeholder user/password/host/port).
    proxies = {
        scheme: 'socks5://user:password@host:port'
        for scheme in ('http', 'https')
    }
    requests.get(url='https://www.taobao.com', proxies=proxies)
    View Code

    身份验证

    import requests
    
    from requests.auth import HTTPBasicAuth
    # Attach HTTP Basic credentials to the request and report the status code.
    credentials = HTTPBasicAuth('username', 'password')
    r = requests.get('http://localhost:5000', auth=credentials)
    print(r.status_code)
    View Code

    爬取实例

    动态渲染页面爬取

                
    #会自动弹出谷歌浏览器,先跳转到百度,然后在搜索框中输入python,接着跳转到搜索结果页
    from selenium import webdriver
    from selenium.webdriver.common.by import By
    from selenium.webdriver.common.keys import Keys
    from selenium.webdriver.support import expected_conditions as EC
    from selenium.webdriver.support.wait import WebDriverWait
    
    browser = webdriver.Chrome()
    try:
        browser.get('https://www.baidu.com')
        # find_element(By.ID, ...) replaces find_element_by_id, which was
        # removed in Selenium 4. The variable is not called `input` so the
        # builtin of that name is not shadowed.
        search_box = browser.find_element(By.ID, 'kw')
        search_box.send_keys('Python')
        search_box.send_keys(Keys.ENTER)
        # Block (up to 10 s) until the result container is in the DOM.
        wait = WebDriverWait(browser, 10)
        wait.until(EC.presence_of_all_elements_located((By.ID, 'content_left')))
        print(browser.current_url)
        print(browser.get_cookies())
        print(browser.page_source)
    finally:
        # quit() ends the whole driver session; close() only closes the window.
        browser.quit()
                
            

    验证码识别

    • 图形验证码
                
    import tesserocr
    from PIL import Image
    
    # Run OCR on the captcha image; the `with` block closes the image file
    # as soon as the text has been extracted.
    with Image.open('code.jpg') as image:
        result = tesserocr.image_to_text(image)
    print(result)
                
            
    • 极验滑动验证码识别
    import time
    from io import BytesIO
    from PIL import Image
    from selenium import webdriver
    from selenium.webdriver import ActionChains
    from selenium.webdriver.common.by import By
    from selenium.webdriver.support.ui import WebDriverWait
    from selenium.webdriver.support import expected_conditions as EC
    
    # Account e-mail that CrackGeetest stores and types into the login form.
    EMAIL = 'cqc@cuiqingcai.com'
    # Account password typed into the login form; left blank here.
    PASSWORD = ''
    # Pixels subtracted from the detected gap offset before the drag track is built.
    BORDER = 6
    # Not referenced by the visible code; get_gap() hard-codes the same value 60
    # as its starting column.
    INIT_LEFT = 60
    
    
    class CrackGeetest():
        """Drive Chrome through the Geetest slide-captcha login page.

        The flow is: open the login page, trigger the captcha, screenshot it
        with and without the gap, locate the gap by pixel comparison, then drag
        the slider along an accelerate-then-decelerate track.
        """

        def __init__(self):
            """Start Chrome and remember the login URL and credentials."""
            self.url = 'https://account.geetest.com/login'
            self.browser = webdriver.Chrome()
            self.wait = WebDriverWait(self.browser, 20)
            self.email = EMAIL
            self.password = PASSWORD

        def __del__(self):
            """Shut the browser down when the object is collected."""
            # __init__ may have failed before self.browser was assigned, so
            # look it up defensively instead of raising inside __del__.
            browser = getattr(self, 'browser', None)
            if browser is not None:
                # quit() ends the driver session; close() only closes a window.
                browser.quit()

        def get_geetest_button(self):
            """
            Wait for the initial "click to verify" button.
            :return: the clickable button element
            """
            return self.wait.until(EC.element_to_be_clickable((By.CLASS_NAME, 'geetest_radar_tip')))

        def get_position(self):
            """
            Locate the captcha image on the page.
            :return: tuple (top, bottom, left, right) in page coordinates
            """
            img = self.wait.until(EC.presence_of_element_located((By.CLASS_NAME, 'geetest_canvas_img')))
            # Fixed pause before reading geometry -- presumably lets the
            # captcha finish rendering; confirm before shortening.
            time.sleep(2)
            location = img.location
            size = img.size
            top = location['y']
            left = location['x']
            bottom = top + size['height']
            right = left + size['width']
            return (top, bottom, left, right)

        def get_screenshot(self):
            """
            Take a screenshot of the current browser viewport.
            :return: the screenshot as a PIL image
            """
            png_bytes = self.browser.get_screenshot_as_png()
            return Image.open(BytesIO(png_bytes))

        def get_slider(self):
            """
            Wait for the slider handle.
            :return: the clickable slider element
            """
            return self.wait.until(EC.element_to_be_clickable((By.CLASS_NAME, 'geetest_slider_button')))

        def get_geetest_image(self, name='captcha.png'):
            """
            Crop the captcha out of a fresh screenshot and save it to disk.
            :param name: file name the cropped image is written to
            :return: the cropped captcha as a PIL image
            """
            top, bottom, left, right = self.get_position()
            print('验证码位置', top, bottom, left, right)
            screenshot = self.get_screenshot()
            captcha = screenshot.crop((left, top, right, bottom))
            captcha.save(name)
            return captcha

        def open(self):
            """
            Load the login page and fill in e-mail and password.
            :return: None
            """
            self.browser.get(self.url)
            email = self.wait.until(EC.presence_of_element_located((By.ID, 'email')))
            password = self.wait.until(EC.presence_of_element_located((By.ID, 'password')))
            email.send_keys(self.email)
            password.send_keys(self.password)

        def get_gap(self, image1, image2):
            """
            Find the x offset of the gap by comparing the two captcha images.
            :param image1: captcha without the gap
            :param image2: captcha with the gap
            :return: x of the first differing column at or after column 60,
                     or 60 when no pixel differs
            """
            # Scanning starts at column 60 so the slider piece itself, which
            # sits at the left edge, is not mistaken for the gap.
            left = 60
            for i in range(left, image1.size[0]):
                for j in range(image1.size[1]):
                    if not self.is_pixel_equal(image1, image2, i, j):
                        return i
            return left

        def is_pixel_equal(self, image1, image2, x, y):
            """
            Tell whether the pixel at (x, y) is approximately the same in both images.
            :param image1: first image
            :param image2: second image
            :param x: column
            :param y: row
            :return: True when each of the first three channels differs by less than 60
            """
            pixel1 = image1.load()[x, y]
            pixel2 = image2.load()[x, y]
            threshold = 60
            return all(abs(pixel1[channel] - pixel2[channel]) < threshold for channel in range(3))

        def get_track(self, distance):
            """
            Build a drag track that accelerates, then decelerates, over *distance*.

            Each entry is an integer pixel step. The steps are derived from the
            rounded running position rather than rounded one by one, so they
            always add up to round(distance) instead of drifting.
            :param distance: total offset to cover, in pixels
            :return: list of integer x steps (empty when distance <= 0)
            """
            track = []
            # Exact (float) position reached so far.
            current = 0
            # Integer pixels already handed out in `track`.
            emitted = 0
            # Accelerate for the first 4/5 of the way, then brake.
            mid = distance * 4 / 5
            # Simulated time per step.
            t = 0.2
            # Current velocity.
            v = 0

            while current < distance:
                a = 2 if current < mid else -3
                v0 = v
                # v = v0 + a*t ; x = v0*t + 1/2*a*t^2
                v = v0 + a * t
                move = v0 * t + 0.5 * a * t * t
                if move <= 0:
                    # Braked to a stop short of the target: finish in one step
                    # rather than looping forever.
                    move = distance - current
                # Never travel past the target.
                current = min(current + move, distance)
                step = round(current) - emitted
                emitted += step
                track.append(step)
            return track

        def move_to_gap(self, slider, track):
            """
            Drag the slider along the given track and release it.
            :param slider: slider element
            :param track: list of x steps as produced by get_track()
            :return: None
            """
            ActionChains(self.browser).click_and_hold(slider).perform()
            for x in track:
                ActionChains(self.browser).move_by_offset(xoffset=x, yoffset=0).perform()
            # Short hold before letting go of the slider.
            time.sleep(0.5)
            ActionChains(self.browser).release().perform()

        def login(self):
            """
            Click the login button and wait.

            Note: the success message is printed unconditionally; the result
            of the login itself is not checked.
            :return: None
            """
            submit = self.wait.until(EC.element_to_be_clickable((By.CLASS_NAME, 'login-btn')))
            submit.click()
            time.sleep(10)
            print('登录成功')

        def crack(self, max_retries=3):
            """
            Run the whole flow: open page, solve the captcha, log in.

            WebDriverWait.until() raises TimeoutException instead of returning
            a falsy value, so a failed verification is detected by catching
            that exception and starting over.
            :param max_retries: extra attempts after the first failure
            :return: None
            :raises TimeoutException: when the last attempt is not verified in time
            """
            # Imported locally because this snippet's import block does not
            # bring TimeoutException into scope.
            from selenium.common.exceptions import TimeoutException

            attempts = max(1, max_retries + 1)
            for attempt in range(1, attempts + 1):
                # Fill in e-mail and password.
                self.open()
                # Click the verify button to bring up the captcha.
                button = self.get_geetest_button()
                button.click()
                # Captcha before the gap is shown.
                image1 = self.get_geetest_image('captcha1.png')
                # Clicking the slider reveals the gap.
                slider = self.get_slider()
                slider.click()
                # Captcha with the gap.
                image2 = self.get_geetest_image('captcha2.png')
                gap = self.get_gap(image1, image2)
                print('缺口位置', gap)
                # Compensate for the border offset.
                gap -= BORDER
                track = self.get_track(gap)
                print('滑动轨迹', track)
                self.move_to_gap(slider, track)

                try:
                    success = self.wait.until(
                        EC.text_to_be_present_in_element((By.CLASS_NAME, 'geetest_success_radar_tip_content'), '验证成功'))
                except TimeoutException:
                    if attempt == attempts:
                        raise
                    # Verification failed: retry from the start.
                    continue
                print(success)
                self.login()
                return
    
    
    # Script entry point: construct the cracker (its __init__ starts Chrome)
    # and run the full captcha-and-login flow.
    if __name__ == '__main__':
        crack = CrackGeetest()
        crack.crack()
    View Code
    • 点击验证码的识别
    import time
    from io import BytesIO
    from PIL import Image
    from selenium import webdriver
    from selenium.webdriver import ActionChains
    from selenium.webdriver.common.by import By
    from selenium.webdriver.support.ui import WebDriverWait
    from selenium.webdriver.support import expected_conditions as EC
    from chaojiying import Chaojiying
    
    # Login credentials that CrackTouClick types into the login form.
    EMAIL = 'cqc@cuiqingcai.com'
    PASSWORD = ''
    
    # Arguments handed to the Chaojiying client: account, password and soft id
    # go to its constructor; the kind code is passed along with each image.
    CHAOJIYING_USERNAME = 'Germey'
    CHAOJIYING_PASSWORD = ''
    CHAOJIYING_SOFT_ID = 893590
    CHAOJIYING_KIND = 9102
    
    
    class CrackTouClick():
        """Drive Chrome through the TouClick click-captcha login page.

        The captcha image is cropped from a screenshot, sent to the Chaojiying
        client for recognition, and the returned coordinates are clicked.
        """

        def __init__(self):
            """Start Chrome, remember the credentials and build the Chaojiying client."""
            self.url = 'http://admin.touclick.com/login.html'
            self.browser = webdriver.Chrome()
            self.wait = WebDriverWait(self.browser, 20)
            self.email = EMAIL
            self.password = PASSWORD
            self.chaojiying = Chaojiying(CHAOJIYING_USERNAME, CHAOJIYING_PASSWORD, CHAOJIYING_SOFT_ID)

        def __del__(self):
            """Shut the browser down when the object is collected."""
            # __init__ may have failed before self.browser was assigned, so
            # look it up defensively instead of raising inside __del__.
            browser = getattr(self, 'browser', None)
            if browser is not None:
                # quit() ends the driver session; close() only closes a window.
                browser.quit()

        def open(self):
            """
            Load the login page and fill in e-mail and password.
            :return: None
            """
            self.browser.get(self.url)
            email = self.wait.until(EC.presence_of_element_located((By.ID, 'email')))
            password = self.wait.until(EC.presence_of_element_located((By.ID, 'password')))
            email.send_keys(self.email)
            password.send_keys(self.password)

        def get_touclick_button(self):
            """
            Wait for the initial verify button.
            :return: the clickable button element
            """
            return self.wait.until(EC.element_to_be_clickable((By.CLASS_NAME, 'touclick-hod-wrap')))

        def get_touclick_element(self):
            """
            Wait for the captcha container.
            :return: the captcha element
            """
            return self.wait.until(EC.presence_of_element_located((By.CLASS_NAME, 'touclick-pub-content')))

        def get_position(self):
            """
            Locate the captcha on the page.
            :return: tuple (top, bottom, left, right) in page coordinates
            """
            element = self.get_touclick_element()
            # Fixed pause before reading geometry -- presumably lets the
            # captcha finish rendering; confirm before shortening.
            time.sleep(2)
            location = element.location
            size = element.size
            top = location['y']
            left = location['x']
            bottom = top + size['height']
            right = left + size['width']
            return (top, bottom, left, right)

        def get_screenshot(self):
            """
            Take a screenshot of the current browser viewport.
            :return: the screenshot as a PIL image
            """
            png_bytes = self.browser.get_screenshot_as_png()
            return Image.open(BytesIO(png_bytes))

        def get_touclick_image(self, name='captcha.png'):
            """
            Crop the captcha out of a fresh screenshot and save it to disk.
            :param name: file name the cropped image is written to
            :return: the cropped captcha as a PIL image
            """
            top, bottom, left, right = self.get_position()
            print('验证码位置', top, bottom, left, right)
            screenshot = self.get_screenshot()
            captcha = screenshot.crop((left, top, right, bottom))
            captcha.save(name)
            return captcha

        def get_points(self, captcha_result):
            """
            Parse the recognition result into click coordinates.

            The 'pic_str' value is expected to look like 'x1,y1|x2,y2|...'.
            A missing or empty 'pic_str' yields an empty list instead of
            raising, so the caller can treat it as a failed recognition.
            :param captcha_result: mapping returned by the recognition service
            :return: list of [x, y] integer pairs
            """
            pic_str = captcha_result.get('pic_str') or ''
            return [
                [int(number) for number in group.split(',')]
                for group in pic_str.split('|')
                if group.strip()
            ]

        def touch_click_words(self, locations):
            """
            Click each recognised position inside the captcha.
            :param locations: list of [x, y] offsets
            :return: None
            """
            for location in locations:
                print(location)
                # NOTE(review): the origin of these offsets depends on the
                # Selenium version (top-left corner in older releases, element
                # centre in newer ones) -- confirm against the installed version.
                ActionChains(self.browser).move_to_element_with_offset(
                    self.get_touclick_element(), location[0], location[1]).click().perform()
                time.sleep(1)

        def touch_click_verify(self):
            """
            Click the captcha's submit button.
            :return: None
            """
            button = self.wait.until(EC.element_to_be_clickable((By.CLASS_NAME, 'touclick-pub-submit')))
            button.click()

        def login(self):
            """
            Click the login button and wait.

            Note: the success message is printed unconditionally; the result
            of the login itself is not checked.
            :return: None
            """
            submit = self.wait.until(EC.element_to_be_clickable((By.ID, '_submit')))
            submit.click()
            time.sleep(10)
            print('登录成功')

        def crack(self, max_retries=3):
            """
            Run the whole flow: open page, solve the captcha, log in.

            WebDriverWait.until() raises TimeoutException instead of returning
            a falsy value, so a failed verification is detected by catching
            that exception and starting over.
            :param max_retries: extra attempts after the first failure
            :return: None
            :raises TimeoutException: when the last attempt is not verified in time
            """
            # Imported locally because this snippet's import block does not
            # bring TimeoutException into scope.
            from selenium.common.exceptions import TimeoutException

            attempts = max(1, max_retries + 1)
            for attempt in range(1, attempts + 1):
                self.open()
                # Click the verify button to bring up the captcha.
                button = self.get_touclick_button()
                button.click()
                # Grab the captcha and serialise it as PNG bytes.
                image = self.get_touclick_image()
                bytes_array = BytesIO()
                image.save(bytes_array, format='PNG')
                # Send it to the recognition service.
                result = self.chaojiying.post_pic(bytes_array.getvalue(), CHAOJIYING_KIND)
                print(result)
                locations = self.get_points(result)
                self.touch_click_words(locations)
                self.touch_click_verify()

                try:
                    success = self.wait.until(
                        EC.text_to_be_present_in_element((By.CLASS_NAME, 'touclick-hod-note'), '验证成功'))
                except TimeoutException:
                    if attempt == attempts:
                        raise
                    # Verification failed: retry from the start.
                    continue
                print(success)
                self.login()
                return
    
    
    # Script entry point: construct the cracker (its __init__ starts Chrome)
    # and run the full captcha-and-login flow.
    if __name__ == '__main__':
        crack = CrackTouClick()
        crack.crack()
    View Code
    • 微博宫格验证码的识别
    import os
    import time
    from io import BytesIO
    from PIL import Image
    from selenium import webdriver
    from selenium.common.exceptions import TimeoutException
    from selenium.webdriver import ActionChains
    from selenium.webdriver.common.by import By
    from selenium.webdriver.support.ui import WebDriverWait
    from selenium.webdriver.support import expected_conditions as EC
    from os import listdir
    
    # Credentials are read from the environment so that a real phone number
    # and password are never stored in the source. Both default to an empty
    # string when the variable is not set.
    USERNAME = os.environ.get('WEIBO_USERNAME', '')
    PASSWORD = os.environ.get('WEIBO_PASSWORD', '')

    # Folder holding the template images; each file is named after the drag
    # order it represents.
    TEMPLATES_FOLDER = 'templates/'
    
    
    class CrackWeiboSlide():
        """Drive Chrome through the Weibo mobile login and its grid pattern captcha.

        The captcha is cropped from a screenshot and compared pixel by pixel
        with the images in TEMPLATES_FOLDER; the matching file name gives the
        order in which the circles are dragged.
        """

        def __init__(self):
            """Start Chrome and remember the login URL and credentials."""
            self.url = 'https://passport.weibo.cn/signin/login?entry=mweibo&r=https://m.weibo.cn/'
            self.browser = webdriver.Chrome()
            self.wait = WebDriverWait(self.browser, 20)
            self.username = USERNAME
            self.password = PASSWORD

        def __del__(self):
            """Shut the browser down when the object is collected."""
            # __init__ may have failed before self.browser was assigned, so
            # look it up defensively instead of raising inside __del__.
            browser = getattr(self, 'browser', None)
            if browser is not None:
                # quit() ends the driver session; close() only closes a window.
                browser.quit()

        def open(self):
            """
            Load the login page, fill in user name and password, and submit.
            :return: None
            """
            self.browser.get(self.url)
            username = self.wait.until(EC.presence_of_element_located((By.ID, 'loginName')))
            password = self.wait.until(EC.presence_of_element_located((By.ID, 'loginPassword')))
            submit = self.wait.until(EC.element_to_be_clickable((By.ID, 'loginAction')))
            username.send_keys(self.username)
            password.send_keys(self.password)
            submit.click()

        def get_position(self, max_retries=3):
            """
            Locate the captcha on the page, re-submitting the login if it did not appear.

            The element is looked up again after every re-login, so it is
            always bound before its geometry is read.
            :param max_retries: extra login attempts when no captcha shows up
            :return: tuple (top, bottom, left, right) in page coordinates
            :raises TimeoutException: when the captcha never appears
            """
            attempts = max(1, max_retries + 1)
            for remaining in range(attempts - 1, -1, -1):
                try:
                    img = self.wait.until(EC.presence_of_element_located((By.CLASS_NAME, 'patt-shadow')))
                except TimeoutException:
                    print('未出现验证码')
                    if not remaining:
                        raise
                    self.open()
                else:
                    break
            # Fixed pause before reading geometry -- presumably lets the
            # captcha finish rendering; confirm before shortening.
            time.sleep(2)
            location = img.location
            size = img.size
            top = location['y']
            left = location['x']
            bottom = top + size['height']
            right = left + size['width']
            return (top, bottom, left, right)

        def get_screenshot(self):
            """
            Take a screenshot of the current browser viewport.
            :return: the screenshot as a PIL image
            """
            png_bytes = self.browser.get_screenshot_as_png()
            return Image.open(BytesIO(png_bytes))

        def get_image(self, name='captcha.png'):
            """
            Crop the captcha out of a fresh screenshot and save it to disk.
            :param name: file name the cropped image is written to
            :return: the cropped captcha as a PIL image
            """
            top, bottom, left, right = self.get_position()
            print('验证码位置', top, bottom, left, right)
            screenshot = self.get_screenshot()
            captcha = screenshot.crop((left, top, right, bottom))
            captcha.save(name)
            return captcha

        def is_pixel_equal(self, image1, image2, x, y):
            """
            Tell whether the pixel at (x, y) is approximately the same in both images.
            :param image1: first image
            :param image2: second image
            :param x: column
            :param y: row
            :return: True when each of the first three channels differs by less than 20
            """
            pixel1 = image1.load()[x, y]
            pixel2 = image2.load()[x, y]
            threshold = 20
            return all(abs(pixel1[channel] - pixel2[channel]) < threshold for channel in range(3))

        def same_image(self, image, template):
            """
            Decide whether the captcha matches a template.
            :param image: captcha to recognise
            :param template: template image
            :return: True when more than 99% of the pixels are approximately
                     equal; False otherwise, including when the two images
                     differ in size or the captcha has no pixels
            """
            # Images of different size cannot be compared pixel by pixel.
            if (image.width, image.height) != (template.width, template.height):
                return False
            total = image.width * image.height
            if total == 0:
                return False
            # Similarity threshold.
            threshold = 0.99
            count = sum(
                1
                for x in range(image.width)
                for y in range(image.height)
                if self.is_pixel_equal(image, template, x, y)
            )
            if count / total > threshold:
                print('成功匹配')
                return True
            return False

        def detect_image(self, image):
            """
            Find the template matching the captcha.
            :param image: captcha image
            :return: drag order as a list of ints taken from the digits of the
                     matching file name, or None when no template matches
            """
            for template_name in listdir(TEMPLATES_FOLDER):
                print('正在匹配', template_name)
                # The `with` block closes the template file after comparison.
                with Image.open(TEMPLATES_FOLDER + template_name) as template:
                    matched = self.same_image(image, template)
                if matched:
                    numbers = [int(number) for number in template_name.split('.')[0]]
                    print('拖动顺序', numbers)
                    return numbers
            return None

        @staticmethod
        def _split_offset(total, steps):
            """Split *total* pixels into *steps* integer moves that sum to round(total)."""
            marks = [round(total * i / steps) for i in range(steps + 1)]
            return [after - before for before, after in zip(marks, marks[1:])]

        def move(self, numbers):
            """
            Drag through the circles in the given order, then release.
            :param numbers: 1-based circle indices in drag order
            :return: None
            """
            # find_elements(By.CSS_SELECTOR, ...) replaces the removed
            # find_elements_by_css_selector helper.
            circles = self.browser.find_elements(By.CSS_SELECTOR, '.patt-wrap .patt-circ')
            # Number of small moves per segment.
            times = 30
            last = len(numbers) - 1
            for index, number in enumerate(numbers):
                circle = circles[number - 1]
                if index == 0:
                    # Press down on the centre of the first circle.
                    ActionChains(self.browser).move_to_element(circle).click_and_hold().perform()
                else:
                    previous = circles[numbers[index - 1] - 1]
                    dx = circle.location['x'] - previous.location['x']
                    dy = circle.location['y'] - previous.location['y']
                    # Integer steps that add up exactly to (dx, dy); fractional
                    # offsets would otherwise lose pixels on every move.
                    steps = zip(self._split_offset(dx, times), self._split_offset(dy, times))
                    for step_x, step_y in steps:
                        ActionChains(self.browser).move_by_offset(step_x, step_y).perform()
                        time.sleep(1 / times)
                if index == last:
                    # Let go of the mouse after the final circle.
                    ActionChains(self.browser).release().perform()

        def crack(self):
            """
            Run the whole flow: log in, recognise the captcha, drag the pattern.
            :return: None
            """
            self.open()
            # Grab the captcha image.
            image = self.get_image('captcha.png')
            numbers = self.detect_image(image)
            if numbers is None:
                # No template matched, so there is no drag order to replay.
                print('未匹配到模板')
                return
            self.move(numbers)
            time.sleep(10)
            print('识别结束')
    
    
    # Script entry point: construct the cracker (its __init__ starts Chrome)
    # and run the login-and-captcha flow.
    if __name__ == '__main__':
        crack = CrackWeiboSlide()
        crack.crack()
    View Code
    pyspider框架:链接 scrapy框架:链接
  • 相关阅读:
    《大话数据结构》第1章 数据结构绪论 1.2 你数据结构怎么学的?
    伍迷七八月新浪微博集锦
    《大话数据结构》第9章 排序 9.7 堆排序(下)
    《大话数据结构》第3章 线性表 3.8.2 单链表的删除
    《大话数据结构》第9章 排序 9.5 直接插入排序
    《大话数据结构》第9章 排序 9.8 归并排序(上)
    《大话数据结构》第2章 算法基础 2.9 算法的时间复杂度
    《大话数据结构》第1章 数据结构绪论 1.1 开场白
    《大话数据结构》第9章 排序 9.1 开场白
    [AWS] Assign a public IP address to an EC2 instance after launched
  • 原文地址:https://www.cnblogs.com/caozhenghua/p/12006902.html
Copyright © 2020-2023  润新知