1.图形验证码:
中国知网:http://my.cnki.net/elibRegister/CommonRegister.aspx
import tesserocr
from PIL import Image

# Gray levels strictly below this cutoff map to 0; all others map to 1.
threshold = 180

# Open the captcha image and convert it to 8-bit grayscale ('L' mode).
image = Image.open('code2.jpg').convert('L')

# Lookup table with one entry per possible gray level (0..255).
table = [0 if level < threshold else 1 for level in range(256)]

# Apply the table and emit a 1-bit image for the OCR step.
image = image.point(table, '1')

# Recognize the text in the binarized image and print it.
result = tesserocr.image_to_text(image)
print(result)
2. 极验滑动验证码的识别
https://www.geetest.com/Sensebot
对于应用了极验验证码的网站,如果我们直接模拟表单提交,加密参数的构造是个问题,需要分析其加密和校验逻辑,相对烦琐。所以我们采用直接模拟浏览器动作的方式来完成验证。
可以使用 Selenium 完全模拟人的行为来完成验证,这种方式的成本比直接分析加密算法低很多。
https://account.geetest.com/login
(1)模拟点击验证按钮。
(2)识别滑动缺口的位置。
(3)模拟拖动滑块 。
import time
from io import BytesIO

from PIL import Image
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver import ActionChains
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

EMAIL = 'zcs@163.com'
PASSWORD = '123'
# Number of pixels subtracted from the detected gap offset before dragging.
BORDER = 6


class CrackGeetest:
    """Automate the Geetest slider captcha on the Geetest login page with Chrome."""

    def __init__(self):
        """Start a Chrome session and store the target URL and credentials."""
        self.url = 'https://account.geetest.com/login'
        self.browser = webdriver.Chrome()
        # Every explicit wait in this class times out after 20 seconds.
        self.wait = WebDriverWait(self.browser, 20)
        self.email = EMAIL
        self.password = PASSWORD

    def __del__(self):
        """Shut down the browser session when the object is collected."""
        # __init__ may have raised before self.browser was assigned.
        browser = getattr(self, 'browser', None)
        if browser is not None:
            # quit() ends the whole driver session; close() would only close
            # the current window.
            browser.quit()

    def get_geetest_button(self):
        """
        Wait for the initial verification button to become clickable.
        :return: the button element
        """
        return self.wait.until(
            EC.element_to_be_clickable((By.CLASS_NAME, 'geetest_radar_tip')))

    def get_position(self):
        """
        Locate the captcha image element on the page.
        :return: tuple (top, bottom, left, right) built from the element's
            location and size
        """
        img = self.wait.until(
            EC.presence_of_element_located((By.CLASS_NAME, 'geetest_canvas_img')))
        # Fixed pause before reading the element geometry.
        time.sleep(2)
        location = img.location
        size = img.size
        top = location['y']
        bottom = location['y'] + size['height']
        left = location['x']
        right = location['x'] + size['width']
        return (top, bottom, left, right)

    def get_screenshot(self):
        """
        Take a screenshot of the current page.
        :return: the screenshot as a PIL image
        """
        png_bytes = self.browser.get_screenshot_as_png()
        return Image.open(BytesIO(png_bytes))

    def get_slider(self):
        """
        Wait for the slider handle to become clickable.
        :return: the slider element
        """
        return self.wait.until(
            EC.element_to_be_clickable((By.CLASS_NAME, 'geetest_slider_button')))

    def get_geetest_image(self, name='captcha.png'):
        """
        Crop the captcha out of a page screenshot and save it to disk.
        :param name: file name the cropped image is saved under
        :return: the cropped PIL image
        """
        top, bottom, left, right = self.get_position()
        print('验证码位置', top, bottom, left, right)
        screenshot = self.get_screenshot()
        # crop() takes its box as (left, upper, right, lower).
        captcha = screenshot.crop((left, top, right, bottom))
        captcha.save(name)
        return captcha

    def open(self):
        """
        Load the login page and type the e-mail and password.
        :return: None
        """
        self.browser.get(self.url)
        email = self.wait.until(EC.presence_of_element_located((By.ID, 'email')))
        password = self.wait.until(EC.presence_of_element_located((By.ID, 'password')))
        email.send_keys(self.email)
        password.send_keys(self.password)

    def get_gap(self, image1, image2):
        """
        Find the x offset of the first column where the two images differ.
        :param image1: image without the gap
        :param image2: image with the gap
        :return: x of the first differing pixel, scanning columns from 60;
            60 when no pixel differs
        """
        left = 60
        # Load pixel access once per image instead of once per pixel.
        pixels1 = image1.load()
        pixels2 = image2.load()
        width, height = image1.size
        for x in range(left, width):
            for y in range(height):
                if not self._pixels_similar(pixels1[x, y], pixels2[x, y]):
                    return x
        return left

    @staticmethod
    def _pixels_similar(pixel1, pixel2, threshold=60):
        """Return True when the first three channels each differ by less than threshold."""
        return all(abs(pixel1[c] - pixel2[c]) < threshold for c in range(3))

    def is_pixel_equal(self, image1, image2, x, y):
        """
        Tell whether two images have (nearly) the same pixel at one position.
        :param image1: first image
        :param image2: second image
        :param x: x position
        :param y: y position
        :return: True when the first three channels each differ by less than 60
        """
        return self._pixels_similar(image1.load()[x, y], image2.load()[x, y])

    def get_track(self, distance):
        """
        Build a list of per-step x offsets that add up to the distance.

        The motion accelerates for the first 4/5 of the distance and
        decelerates afterwards. Steps are integer deltas of the clamped
        cumulative position, so rounding does not accumulate and the last
        step does not overshoot: the steps sum to round(distance).
        :param distance: total offset to travel
        :return: list of integer step offsets (empty when distance <= 0)
        """
        track = []
        # Exact (float) displacement simulated so far.
        current = 0
        # Whole pixels already handed out through track.
        emitted = 0
        # Displacement at which acceleration switches to deceleration.
        mid = distance * 4 / 5
        # Time slice of one simulation step.
        t = 0.2
        # Velocity at the start of the next step.
        v = 0

        while current < distance:
            a = 2 if current < mid else -3
            v0 = v
            # v = v0 + a * t
            v = v0 + a * t
            # x = v0 * t + 1/2 * a * t^2
            current += v0 * t + 0.5 * a * t * t
            # Clamp to the target and emit only the newly covered pixels.
            target = round(min(current, distance))
            track.append(target - emitted)
            emitted = target
        return track

    def move_to_gap(self, slider, track):
        """
        Drag the slider along the given track and release it.
        :param slider: slider element
        :param track: list of x offsets, one per move
        :return: None
        """
        ActionChains(self.browser).click_and_hold(slider).perform()
        for x in track:
            ActionChains(self.browser).move_by_offset(xoffset=x, yoffset=0).perform()
        # Short pause before letting go of the slider.
        time.sleep(0.5)
        ActionChains(self.browser).release().perform()

    def login(self):
        """
        Click the login button, wait, and print the success message.
        :return: None
        """
        submit = self.wait.until(EC.element_to_be_clickable((By.CLASS_NAME, 'login-btn')))
        submit.click()
        time.sleep(10)
        print('登录成功')

    def crack(self, max_retries=3):
        """
        Run the whole flow: fill the form, solve the slider captcha, log in.

        WebDriverWait.until raises TimeoutException instead of returning a
        falsy value, so a failed verification is detected by catching that
        exception; the flow is then restarted from the page load.
        :param max_retries: extra attempts allowed after the first failure
        :return: None
        :raises TimeoutException: when the success text never appears within
            the allowed attempts
        """
        attempts_left = max_retries
        while True:
            # Enter the e-mail and password.
            self.open()
            # Click the verification button.
            button = self.get_geetest_button()
            button.click()
            # Capture the captcha image before the gap is shown.
            image1 = self.get_geetest_image('captcha1.png')
            # Click the slider to bring up the gap.
            slider = self.get_slider()
            slider.click()
            # Capture the captcha image with the gap.
            image2 = self.get_geetest_image('captcha2.png')
            # Locate the gap.
            gap = self.get_gap(image1, image2)
            print('缺口位置', gap)
            # Subtract the border offset.
            gap -= BORDER
            # Build the movement track.
            track = self.get_track(gap)
            print('滑动轨迹', track)
            # Drag the slider.
            self.move_to_gap(slider, track)

            try:
                success = self.wait.until(
                    EC.text_to_be_present_in_element(
                        (By.CLASS_NAME, 'geetest_success_radar_tip_content'), '验证成功'))
            except TimeoutException:
                if attempts_left <= 0:
                    raise
                attempts_left -= 1
                continue

            print(success)
            self.login()
            return


if __name__ == '__main__':
    crack = CrackGeetest()
    crack.crack()
但是,当我们截取图片的时候,网站将图片分割为不同的图片随机组合,我们就无法使用这一方法。
3.点触验证码的识别
点触的网址挂了。
4. 微博宫格识别