《python3网络爬虫开发实战》--验证码的识别

1.图形验证码：

中国知网：http://my.cnki.net/elibRegister/CommonRegister.aspx

 1 import tesserocr
 2 from PIL import Image
 3 
 4 image = Image.open('code2.jpg')
 5 image = image.convert('L')
 6 threshold = 180
 7 table = []
 8 for i in range(256):
 9     if i < threshold:
10         table.append(0)
11     else:
12         table.append(1)
13 
14 image = image.point(table, '1')
15 #image = image.convert('1')
16 #image.show()
17 
18 result = tesserocr.image_to_text(image)
19 print(result)

2. 极验滑动验证码的识别

https://www.geetest.com/Sensebot

对于应用了极验验证码的网站如果我们直接模拟表单提交，加密参数的构造是个问题，需要分析其加密和校验逻辑，相对烦琐。所以我们采用直接模拟浏览器动作的方式来完成验证。

可以使用 Selenium 完全模拟人的操作行为来完成验证，这种方式的成本相比直接去分析加密算法要低很多。

https://account.geetest.com/login

(1)模拟点击验证按钮。

(2)识别滑动缺口的位置。

(3)模拟拖动滑块。

import time
from io import BytesIO

from PIL import Image
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver import ActionChains
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
  9 
 10 EMAIL = 'zcs@163.com'
 11 PASSWORD = '123'
 12 BORDER = 6
 13 #INIT_LEFT = 60
 14 
 15 
 16 class CrackGeetest():
 17     def __init__(self):
 18         self.url = 'https://account.geetest.com/login'
 19         self.browser = webdriver.Chrome()
 20         self.wait = WebDriverWait(self.browser, 20)
 21         self.email = EMAIL
 22         self.password = PASSWORD
 23 
 24     def __del__(self):
 25         self.browser.close()
 26 
 27     def get_geetest_button(self):
 28         """
 29         获取初始验证按钮
 30         :return:
 31         """
 32         button = self.wait.until(EC.element_to_be_clickable((By.CLASS_NAME, 'geetest_radar_tip')))
 33         return button
 34 
 35     def get_position(self):
 36         """
 37         获取验证码位置
 38         :return: 验证码位置元组
 39         """
 40         img = self.wait.until(EC.presence_of_element_located((By.CLASS_NAME, 'geetest_canvas_img')))
 41         time.sleep(2)
 42         location = img.location
 43         size = img.size
 44         top, bottom, left, right = location['y'], location['y'] + size['height'], location['x'], location['x'] + size[
 45             'width']
 46         return (top, bottom, left, right)
 47 
 48     def get_screenshot(self):
 49         """
 50         获取网页截图
 51         :return: 截图对象
 52         """
 53         screenshot = self.browser.get_screenshot_as_png()
 54         screenshot = Image.open(BytesIO(screenshot))
 55         return screenshot
 56 
 57     def get_slider(self):
 58         """
 59         获取滑块
 60         :return: 滑块对象
 61         """
 62         slider = self.wait.until(EC.element_to_be_clickable((By.CLASS_NAME, 'geetest_slider_button')))
 63         return slider
 64 
 65     def get_geetest_image(self, name='captcha.png'):
 66         """
 67         获取验证码图片
 68         :return: 图片对象
 69         """
 70         top, bottom, left, right = self.get_position()
 71         print('验证码位置', top, bottom, left, right)
 72         screenshot = self.get_screenshot()
 73         # crop将图片裁剪
 74         captcha = screenshot.crop((left, top, right, bottom))
 75         captcha.save(name)
 76         return captcha
 77 
 78     def open(self):
 79         """
 80         打开网页输入用户名密码
 81         :return: None
 82         """
 83         self.browser.get(self.url)
 84         email = self.wait.until(EC.presence_of_element_located((By.ID, 'email')))
 85         password = self.wait.until(EC.presence_of_element_located((By.ID, 'password')))
 86         email.send_keys(self.email)
 87         password.send_keys(self.password)
 88 
 89     def get_gap(self, image1, image2):
 90         """
 91         获取缺口偏移量
 92         :param image1: 不带缺口图片
 93         :param image2: 带缺口图片
 94         :return:
 95         """
 96         left = 60
 97         for i in range(left, image1.size[0]):
 98             for j in range(image1.size[1]):
 99                 if not self.is_pixel_equal(image1, image2, i, j):
100                     left = i
101                     return left
102         return left
103 
104     def is_pixel_equal(self, image1, image2, x, y):
105         """
106         判断两个像素是否相同
107         :param image1: 图片1
108         :param image2: 图片2
109         :param x: 位置x
110         :param y: 位置y
111         :return: 像素是否相同
112         """
113         # 取两个图片的像素点
114         pixel1 = image1.load()[x, y]
115         pixel2 = image2.load()[x, y]
116         threshold = 60
117         if abs(pixel1[0] - pixel2[0]) < threshold and abs(pixel1[1] - pixel2[1]) < threshold and abs(
118                 pixel1[2] - pixel2[2]) < threshold:
119             return True
120         else:
121             return False
122 
123     def get_track(self, distance):
124         """
125         根据偏移量获取移动轨迹
126         :param distance: 偏移量
127         :return: 移动轨迹
128         """
129         # 移动轨迹
130         track = []
131         # 当前位移
132         current = 0
133         # 减速阈值
134         mid = distance * 4 / 5
135         # 计算间隔
136         t = 0.2
137         # 初速度
138         v = 0
139 
140         while current < distance:
141             if current < mid:
142                 # 加速度为正2
143                 a = 2
144             else:
145                 # 加速度为负3
146                 a = -3
147             # 初速度v0
148             v0 = v
149             # 当前速度v = v0 + at
150             v = v0 + a * t
151             # 移动距离x = v0t + 1/2 * a * t^2
152             move = v0 * t + 1 / 2 * a * t * t
153             # 当前位移
154             current += move
155             # 加入轨迹
156             track.append(round(move))
157         return track
158 
159     def move_to_gap(self, slider, track):
160         """
161         拖动滑块到缺口处
162         :param slider: 滑块
163         :param track: 轨迹
164         :return:
165         """
166         ActionChains(self.browser).click_and_hold(slider).perform()
167         for x in track:
168             ActionChains(self.browser).move_by_offset(xoffset=x, yoffset=0).perform()
169         time.sleep(0.5)
170         ActionChains(self.browser).release().perform()
171 
172     def login(self):
173         """
174         登录
175         :return: None
176         """
177         submit = self.wait.until(EC.element_to_be_clickable((By.CLASS_NAME, 'login-btn')))
178         submit.click()
179         time.sleep(10)
180         print('登录成功')
181 
182     def crack(self):
183         # 输入用户名密码
184         self.open()
185         # 点击验证按钮
186         button = self.get_geetest_button()
187         button.click()
188         # 获取验证码图片
189         image1 = self.get_geetest_image('captcha1.png')
190         # 点按呼出缺口
191         slider = self.get_slider()
192         slider.click()
193         # 获取带缺口的验证码图片
194         image2 = self.get_geetest_image('captcha2.png')
195         # 获取缺口位置
196         gap = self.get_gap(image1, image2)
197         print('缺口位置', gap)
198         # 减去缺口位移
199         gap -= BORDER
200         # 获取移动轨迹
201         track = self.get_track(gap)
202         print('滑动轨迹', track)
203         # 拖动滑块
204         self.move_to_gap(slider, track)
205 
206         success = self.wait.until(
207             EC.text_to_be_present_in_element((By.CLASS_NAME, 'geetest_success_radar_tip_content'), '验证成功'))
208         print(success)
209 
210         # 失败后重试
211         if not success:
212             self.crack()
213         else:
214             self.login()
215 
216 
# Script entry point: build the cracker (this launches Chrome) and run the
# complete captcha-solving and login flow once.
if __name__ == '__main__':
    cracker = CrackGeetest()
    cracker.crack()

但是，当我们截取图片的时候，网站将图片分割为不同的图片随机组合，我们就无法使用这一方法。

3.点触验证码的识别

点触的网址挂了，暂时无法演示。

4. 微博宫格识别

相关阅读:
101. Symmetric Tree（js）
100. Same Tree（js）
99. Recover Binary Search Tree（js）
98. Validate Binary Search Tree（js）
97. Interleaving String（js）
96. Unique Binary Search Trees（js）
95. Unique Binary Search Trees II（js）
94. Binary Tree Inorder Traversal（js）
93. Restore IP Addresses（js）
92. Reverse Linked List II（js）
原文地址：https://www.cnblogs.com/chengchengaqin/p/9655270.html