模拟登录微博相对来说并不难。验证码是常规的5个随机数字、字母的组合,识别起来也比较容易。主要用到Selenium中的许多知识,如定位标签、输入信息、点击等。如果对Selenium的使用还不熟悉,请先移步《Python爬虫 | Selenium详解》,相信读完之后再来看本篇一定可以看懂。
破解微博登录的思路:
(1)使用webdriver打开微博网页;
(2)输入用户名和密码,点击登录;
(3)对第二步的结果进行判断:
- 情况一:用户名或者密码错误
- 情况二:登录成功
- 情况三:出现验证码图片,需识别
- 情况四:其他错误
(4)本例中增加了登录成功后获得cookies的情况
import requests
from requests import RequestException
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException, TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

from chaojiying import Chaojiying

# Chaojiying account name, password and software ID (used to recognise captchas).
# NOTE(review): these values were blank in the original listing -- fill in your own.
CHAOJIYING_USERNAME = ''
CHAOJIYING_PASSWORD = ''
CHAOJIYING_SOFT_ID = ''
# Captcha kind code handed to Chaojiying.post_pic.
CHAOJIYING_KIND = 1006


class LoginWeibo():
    """Drive a Chrome browser through the weibo.com login form.

    The flow is: open the page, type the credentials, click the login button,
    then classify the outcome (wrong credentials, success, or captcha needed).
    """

    def __init__(self, username, password,
                 driver_path='D:downloadpythonRelatedchromedriver.exe'):
        """
        Create the browser session and the captcha client.

        :param username: Weibo account name typed into the login form
        :param password: Weibo password typed into the login form
        :param driver_path: location of the chromedriver executable.
            NOTE(review): the default looks like a Windows path whose
            backslashes were lost -- confirm and adjust for your machine.
        """
        self.url = 'https://www.weibo.com'
        self.browser = webdriver.Chrome(executable_path=driver_path)
        self.wait = WebDriverWait(self.browser, 20)
        self.username = username
        self.password = password
        self.chaojiying = Chaojiying(CHAOJIYING_USERNAME, CHAOJIYING_PASSWORD, CHAOJIYING_SOFT_ID)

    def close(self):
        """
        Shut down the browser started by this instance.

        :return: None
        """
        self.browser.quit()

    def open(self):
        """
        Open the login page and type the user name and password.

        :return: None
        """
        self.browser.get(self.url)
        username = self.wait.until(EC.presence_of_element_located((By.ID, 'loginname')))
        password = self.wait.until(EC.presence_of_element_located((By.NAME, 'password')))
        username.send_keys(self.username)
        password.send_keys(self.password)

    def get_click_button(self):
        """
        Wait for the login button to become clickable.

        :return: the button element
        """
        # Target markup on the page:
        # <a href="javascript:void(0)" class="W_btn_a btn_32px " action-type="btn_submit"
        #    node-type="submitBtn" tabindex="6"><span node-type="submitStates">登录</span></a>
        button = self.wait.until(EC.element_to_be_clickable((By.CLASS_NAME, 'W_btn_a')))
        return button

    def login_successfully(self):
        """
        Check whether the login succeeded.

        :return: True if the mail icon shows up within 5 seconds, else False
        """
        # The mail icon is only rendered for a logged-in user:
        # <em class="W_ficon ficon_mail S_ficon">I</em>
        try:
            return bool(
                WebDriverWait(self.browser, 5).until(
                    EC.presence_of_element_located((By.CSS_SELECTOR, '.ficon_mail')))
            )
        except TimeoutException:
            return False

    def get_click_image(self, name='captcha.png'):
        """
        Download the captcha image shown on the login form.

        :param name: file name the image bytes are also written to
        :return: the image bytes, or None if the captcha element or the
            download is unavailable
        """
        # Target markup on the page:
        # <img width="95" height="34" action-type="btn_change_verifycode"
        #      node-type="verifycode_image" src="https://login.sina.com.cn/cgi/pin.php?...">
        try:
            element = self.wait.until(
                EC.presence_of_element_located(
                    (By.XPATH, '//img[@action-type="btn_change_verifycode"]')))
        except (TimeoutException, NoSuchElementException):
            # WebDriverWait signals a missing element with TimeoutException.
            print('未找到验证码图片')
            return None

        image_url = element.get_attribute('src')
        response = get_html(image_url)
        if response is None:
            # get_html returns None on a request error or a non-200 status.
            print('验证码图片下载失败')
            return None

        image = response.content
        with open(name, 'wb') as f:
            f.write(image)
        return image

    def password_error(self):
        """
        Check whether the page reports a wrong user name or password.

        :return: True if the error pop-up shows that message within 5 seconds,
            else False
        """
        try:
            element = WebDriverWait(self.browser, 5).until(
                EC.presence_of_element_located(
                    (By.XPATH, '//div[@class="W_layer W_layer_pop"]/div/p/span[2]')))
        except TimeoutException:
            return False
        message = element.text
        print(message)
        return message == '用户名或密码错误。'

    def get_cookies(self):
        """
        Read the cookies of the current browser session.

        :return: the value of browser.get_cookies()
        """
        cookies = self.browser.get_cookies()
        print(cookies)
        return cookies

    def _login_once(self):
        """
        Run a single login attempt.

        :return: a result dict when the attempt is conclusive, or None when
            the attempt failed at the captcha stage and may be retried
        """
        # 1. Open the page and type the user name and password.
        self.open()
        # 2. Click the login button.
        self.get_click_button().click()

        if self.password_error():
            print('用户名或密码错误')
            return {
                'status': 2,
                'content': '用户名或密码错误'
            }

        if self.login_successfully():
            print('登录成功')
            return {
                'status': 1,
                'content': self.get_cookies()
            }

        # Neither an error nor success: a captcha is sometimes required.
        image = self.get_click_image()
        if image is None:
            return None

        # Recognise the captcha.
        result = self.chaojiying.post_pic(image, CHAOJIYING_KIND)
        print(result)

        # Type the recognised text into the captcha field:
        # <input type="text" class="W_input " maxlength="6" name="verifycode"
        #        node-type="verifycode" tabindex="3">
        verifycode = self.wait.until(EC.presence_of_element_located((By.NAME, 'verifycode')))
        verifycode.send_keys(result['pic_str'])

        # Submit again.
        self.get_click_button().click()

        if self.login_successfully():
            print('登录成功')
            return {
                'status': 1,
                'content': self.get_cookies()
            }

        # Report the wrong recognition back to Chaojiying.
        self.chaojiying.report_error(result['pic_id'])
        return None

    def login(self, max_retries=3):
        """
        Log in, retrying when the captcha stage fails.

        :param max_retries: maximum number of full login attempts
        :return: dict with 'status' and 'content':
            status 1 -- success, content is the session cookies;
            status 2 -- wrong user name or password;
            status 3 -- every attempt failed
        """
        for _ in range(max_retries):
            outcome = self._login_once()
            if outcome is not None:
                return outcome
        return {
            'status': 3,
            'content': '登录失败'
        }
def get_html(url, timeout=10):
    """
    Fetch a URL with a browser-like User-Agent header.

    :param url: address to request
    :param timeout: seconds passed to requests.get as its timeout, so a
        stalled server cannot block the caller forever
    :return: the response object when the status code is 200; None on any
        other status code or when requests raises RequestException
    """
    # Send a User-Agent header so the request looks like it comes from a browser.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36'
    }
    try:
        response = requests.get(url, headers=headers, timeout=timeout)
    except RequestException:
        return None

    if response.status_code != 200:
        return None
    response.encoding = response.apparent_encoding
    return response


if __name__ == '__main__':
    result = LoginWeibo('username', 'password').login()
本篇博文仅供学习、交流爬虫相关知识之用,请勿过度使用;如有任何纠纷,与本人无关。(瑟瑟发抖)