• Python 爬取豆瓣登录验证码


    先写一个爬取图片的方法

    # -*- coding: utf-8 -*-
    from urllib.request import Request
    from urllib.request import urlopen
    
    # Request headers that imitate a browser; getImg() attaches them to every request it builds.
    headers = {'User-Agent':'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6'}
    
    # Sample image URL. getImg() receives its URL as a parameter, so this
    # module-level value is not read by the function itself.
    url = 'https://ss0.bdstatic.com/94oJfD_bAAcT8t7mm9GUKT-xh_/timg?image&quality=100&size=b4000_4000&sec=1510537362&di=3f1f93bb6bf35c7724e3b5c435528187&src=http://www.zhlzw.com/UploadFiles/Article_UploadFiles/201204/20120412123921838.jpg'
    
    def getImg(url,imgName):
        """Download the resource at ``url`` and save it as ``./imgs/<imgName>.jpg``.

        The request is sent with the module-level ``headers`` and a 5 second
        timeout. If the download fails with a network-level error, the message
        '链接失败' is printed and nothing is written.

        Args:
            url: Address of the image to fetch.
            imgName: File name (without extension) used under ``./imgs``.

        Returns:
            None.
        """
        # Local import keeps this fix independent of the file-level imports.
        import os

        req_timeout = 5
        req = Request(url=url, headers=headers)
        try:
            # The context manager closes the connection even if read() fails.
            with urlopen(req, None, req_timeout) as resp:
                pic = resp.read()
        except OSError:
            # URLError/HTTPError, socket timeouts and ConnectionError are all
            # OSError subclasses. The original clause named
            # Request.exceptions.ConnectionError, which does not exist on
            # urllib's Request and raised AttributeError instead of handling
            # the failure.
            print(u'链接失败')
            return

        # Make sure the target directory exists before writing into it.
        os.makedirs('./imgs', exist_ok=True)
        imgPath = './imgs/%s.jpg'%(imgName)
        with open(imgPath, 'wb') as fp:
            fp.write(pic)
    ##再写一个爬取豆瓣登录页面的代码,并调用上述所写的方法
    import http.cookiejar
    import urllib.request
    
    from lxml import etree
    
    from spiderImg import getImg
    
    # Browser-like HTTP headers; passed to makeMyOpener(), which installs them
    # as the opener's default headers.
    head = {
        'Connection': 'Keep-Alive',
        'Accept': 'text/html, application/xhtml+xml, */*',
        'Accept-Language': 'en-US,en;q=0.8,zh-Hans-CN;q=0.5,zh-Hans;q=0.3',
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; WOW64; Trident/7.0; rv:11.0) like Gecko'
    }
    def makeMyOpener(head):
        """Create a URL opener that keeps cookies and sends the given headers.

        Args:
            head: Mapping of header name to header value.

        Returns:
            An opener built with an ``HTTPCookieProcessor`` backed by a fresh
            ``CookieJar``, whose ``addheaders`` is the list of
            ``(name, value)`` pairs taken from ``head`` in iteration order.
        """
        cookie_handler = urllib.request.HTTPCookieProcessor(http.cookiejar.CookieJar())
        opener = urllib.request.build_opener(cookie_handler)
        opener.addheaders = [(name, value) for name, value in head.items()]
        return opener
    
    # urllib.parse is needed for the form encoding below. The original reached
    # it only as a side effect of `import urllib.request`; import it explicitly.
    import urllib.parse

    # Fetch the login page through the cookie-aware opener so that cookies set
    # here are sent again with the login POST further down.
    oper = makeMyOpener(head)
    with oper.open('https://accounts.douban.com/login', timeout = 1000) as uop:
        data = uop.read()
    html = data.decode()

    # Extract the captcha image URL(s) with lxml and download each one.
    # Every match is saved under the same name, so the last one wins.
    selector = etree.HTML(html)
    links = selector.xpath('//img[@id="captcha_image"]/@src')
    for link in links:
        print(link)
        getImg(link,'captcs')

    # Extract the captcha id from the hidden form field; stays 0 when the page
    # contains no such field, and keeps the last value if there are several.
    captcha_ids = selector.xpath('//input[@name="captcha-id"]/@value')
    captcha_id = 0
    for cid in captcha_ids:
        captcha_id = cid
        print(captcha_id)


    # The captcha is solved by the user after looking at the downloaded image.
    captcha=input("请输入验证码:")
    print(captcha)

    # Submit the login form together with the captcha solution and id.
    url = 'https://accounts.douban.com/login'
    datas = {'source':'index_nav','redir':'https://www.douban.com/','form_email':'你的账号','form_password':'你的密码','captcha-solution':captcha,'captcha-id':captcha_id}
    data_encoded = urllib.parse.urlencode(datas).encode(encoding='utf-8')
    with oper.open(url, data_encoded) as response:
        content = response.read()
    html = content.decode()

    # Save the page returned by the login request for inspection.
    spath = './douban.html'
    with open(spath,"w",encoding='utf-8') as f:
        f.write(html)
    

      


      
  • 相关阅读:
    textArea打印时,内容不显示
    自定义Metadata验证属性
    C# 扩展类与分布类
    JSON基础 JS操作JSON总结
    如何查看别人公众号的粉丝量
    Powerdesigner逆向工程从sql server数据库生成pdm
    springMVC中前台ajax传json数据后台controller接受对象为null
    Mybatis报错: Invalid bound statement (not found)
    Mysql批量插入数据性能问题
    java中String编码转换 UTF-8转GBK
  • 原文地址:https://www.cnblogs.com/yongxinboy/p/7841053.html
Copyright © 2020-2023  润新知