• Python crawlers: the requests module, part 2


    Python web crawlers: the requests module
    • Handling cookies with Session
    • Setting a request proxy IP with the proxies parameter
    • Thread-pool-based data crawling
    1. Getting a captcha recognized

    Steps:

    1. Register at Yundama (云打码): http://www.yundama.com/about.html

    2. Log in (there is a developer login and a regular-user login)

    3. After logging in as a developer, create a new software entry

    4. Click "Developer Center"

    5. Click into the PythonHTTP download section

    6. Download whichever version you need

    Captcha recognition code:

    import http.client, mimetypes, urllib, json, time, requests
    
    ######################################################################
    
    class YDMHttp:
    
        apiurl = 'http://api.yundama.com/api.php'
        username = ''
        password = ''
        appid = ''
        appkey = ''
    
        def __init__(self, username, password, appid, appkey):
            self.username = username  
            self.password = password
            self.appid = str(appid)
            self.appkey = appkey
    
        def request(self, fields, files={}):
            response = self.post_url(self.apiurl, fields, files)
            response = json.loads(response)
            return response
        
        def balance(self):
            data = {'method': 'balance', 'username': self.username, 'password': self.password, 'appid': self.appid, 'appkey': self.appkey}
            response = self.request(data)
            if (response):
                if (response['ret'] and response['ret'] < 0):
                    return response['ret']
                else:
                    return response['balance']
            else:
                return -9001
        
        def login(self):
            data = {'method': 'login', 'username': self.username, 'password': self.password, 'appid': self.appid, 'appkey': self.appkey}
            response = self.request(data)
            if (response):
                if (response['ret'] and response['ret'] < 0):
                    return response['ret']
                else:
                    return response['uid']
            else:
                return -9001
    
        # upload a captcha image for recognition; returns the captcha id (cid)
        def upload(self, filename, codetype, timeout):
            data = {'method': 'upload', 'username': self.username, 'password': self.password, 'appid': self.appid, 'appkey': self.appkey, 'codetype': str(codetype), 'timeout': str(timeout)}
            file = {'file': filename}
            response = self.request(data, file)
            if (response):
                if (response['ret'] and response['ret'] < 0):
                    return response['ret']
                else:
                    return response['cid']
            else:
                return -9001
    
        def result(self, cid):
            data = {'method': 'result', 'username': self.username, 'password': self.password, 'appid': self.appid, 'appkey': self.appkey, 'cid': str(cid)}
            response = self.request(data)
            # return the recognized text, or '' if no result is ready yet
            return response.get('text', '') if response else ''
    
        def decode(self, filename, codetype, timeout):
            # upload the image, then poll once per second until a result
            # comes back or the timeout expires
            cid = self.upload(filename, codetype, timeout)
            if (cid > 0):
                for i in range(0, timeout):
                    result = self.result(cid)
                    if (result != ''):
                        return cid, result
                    else:
                        time.sleep(1)
                return -3003, ''
            else:
                return cid, ''
    
        def report(self, cid):
            data = {'method': 'report', 'username': self.username, 'password': self.password, 'appid': self.appid, 'appkey': self.appkey, 'cid': str(cid), 'flag': '0'}
            response = self.request(data)
            if (response):
                return response['ret']
            else:
                return -9001
    
        def post_url(self, url, fields, files={}):
            # open each file in binary mode so requests can upload it
            for key in files:
                files[key] = open(files[key], 'rb')
            res = requests.post(url, files=files, data=fields)
            return res.text
    
    ######################################################################
    
    # Yundama username (regular-user account)
    username    = 'molihua'

    # Yundama password
    password    = 'MLH19960208'

    # Software ID, a required parameter for the developer revenue share. Get it from "My Software" in the developer console!
    appid       = 7025

    # Software key, a required parameter for the developer revenue share. Get it from "My Software" in the developer console!
    appkey      = '2d96c723a682c882faa73257e98440d7'

    # Image file
    filename    = 'getimage.jpg'

    # Captcha type. E.g. 1004 means 4 alphanumeric characters; different types are billed differently. Fill this in accurately or it will hurt the recognition rate. All types are listed at http://www.yundama.com/price.html
    codetype    = 1004

    # Timeout in seconds, user-defined
    timeout     = 10
    
    # Sanity check
    if (username == 'username'):
        print('Please set the parameters above before testing')
    else:
        # initialize the client
        yundama = YDMHttp(username, password, appid, appkey)

        # log in to Yundama
        uid = yundama.login()
        print('uid: %s' % uid)

        # check the account balance
        balance = yundama.balance()
        print('balance: %s' % balance)

        # start recognition: image path, captcha type ID, timeout (seconds)
        cid, result = yundama.decode(filename, codetype, timeout)
        print('cid: %s, result: %s' % (cid, result))
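
    The renren login example below calls a helper named getCodeDate that the original post never defines. Here is a minimal sketch of what it presumably looks like, wrapping the YDMHttp class above; the parameter order (username, password, image path, captcha type) is an assumption inferred from the call site, and it reuses the appid/appkey developer credentials from the demo:

    def getCodeDate(username, password, img_path, codetype, timeout=30):
        # hypothetical wrapper around YDMHttp: upload the captcha image and
        # return the recognized text, or '' if recognition failed
        appid = 7025                                   # software ID from the demo above
        appkey = '2d96c723a682c882faa73257e98440d7'    # software key from the demo above
        yundama = YDMHttp(username, password, appid, appkey)
        cid, result = yundama.decode(img_path, codetype, timeout)
        return result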
    
    Requirement: log in to renren.com, then fetch the profile page
    import requests
    import urllib.request
    from lxml import etree

    # create a session object so cookies persist across requests
    session = requests.Session()
    url = 'http://www.renren.com'
    headers = {
        'User-Agent':'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:65.0) Gecko/20100101 Firefox/65.0'
    }
    renren_text = requests.get(url=url,headers=headers).text
    tree = etree.HTML(renren_text)
    code_img_url = tree.xpath('//*[@id="verifyPic_login"]/@src')[0]
    # download the captcha image to a local file
    urllib.request.urlretrieve(url=code_img_url,filename='code.jpg')
    
    # recognize the characters in the captcha image

    code_data = getCodeDate('15204558261','MLH19960208','./code.jpg',2004)
    print(code_data)
    
    login_url = 'http://www.renren.com/ajaxLogin/login?1=1&uniqueTimestamp=2019142013687'
    
    # form data captured with a packet-capture tool
    data = {
        'email':'15204558261',
        'icode': code_data,
        'origURL':'http://www.renren.com/home',
        'domain':'renren.com',
        'key_id':'1',
        'captcha_type':'web_login',
        'password':'7bf638cc5b01b15b9416bf17fb98a1eda46da861c139b563a4c670fb21884336',
        'rkey':'cf9180c5afba43cb1f089b953e67b567',
        'f':'http%3A%2F%2Fwww.renren.com%2F296856777%2Fprofile'
    }
    # the cookies produced by this login request are stored in the session object automatically
    
    session.post(url=login_url,data=data,headers=headers)
    url='http://www.renren.com/296856777/profile'
    page_text = session.get(url=url,headers=headers).text
    
    with open('renren.html','w',encoding='utf-8') as fp:
        fp.write(page_text)
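
    Because the login POST went through the session object, you can verify that the login cookies were actually captured before fetching the profile page. A quick sanity check (what it prints depends on renren's actual cookie names):

    # inspect the cookies the session stored after the login POST
    for cookie in session.cookies:
        print(cookie.name, cookie.value)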
    
    
    Thread-pool-based data crawling with multiprocessing.dummy
    import requests
    import random
    from lxml import etree
    import re
    from fake_useragent import UserAgent
    # install the fake-useragent library first: pip install fake-useragent
    # import the thread pool module
    from multiprocessing.dummy import Pool
    # instantiate a thread pool object
    pool = Pool()
    url = 'http://www.pearvideo.com/category_1'
    # generate a random UA
    ua = UserAgent().random
    headers = {
        'User-Agent':ua
    }
    # fetch the front-page HTML
    page_text = requests.get(url=url,headers=headers).text
    # parse the detail-page links for each video out of the front page
    tree = etree.HTML(page_text)
    li_list = tree.xpath('//div[@id="listvideoList"]/ul/li')

    detail_urls = []  # stores the second-level page urls
    for li in li_list:
        detail_url = 'http://www.pearvideo.com/'+li.xpath('./div/a/@href')[0]
        title = li.xpath('.//div[@class="vervideo-title"]/text()')[0]
        detail_urls.append(detail_url)

    video_urls = []  # stores the video urls
    for url in detail_urls:
        page_text = requests.get(url=url,headers=headers).text
        video_url = re.findall('srcUrl="(.*?)"',page_text,re.S)[0]
        video_urls.append(video_url)

    # save must be defined before it is handed to pool.map below
    def save(data):
        fileName = str(random.randint(1,10000))+'.mp4'
        with open(fileName,'wb') as fp:
            fp.write(data)
            print(fileName+' saved')

    # download the video data concurrently with the thread pool
    func_request = lambda link:requests.get(url=link,headers=headers).content
    video_data_list = pool.map(func_request,video_urls)
    # save the video data concurrently with the thread pool
    pool.map(save,video_data_list)

    pool.close()
    pool.join()
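
    multiprocessing.dummy exposes the same API as multiprocessing but backs Pool with threads instead of processes, which is what you want for I/O-bound work like downloads. Pool() defaults to one worker per CPU core; for network-bound crawling you may want to size it explicitly. A minimal sketch (the worker count of 4 is an arbitrary choice, not a recommendation from the original post):

    import requests
    from multiprocessing.dummy import Pool

    # four worker threads; tune for your bandwidth and the target site's tolerance
    pool = Pool(4)
    sizes = pool.map(lambda u: len(requests.get(u).text), ['http://www.baidu.com'])
    pool.close()
    pool.join()
    print(sizes)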
    
    Proxies in the requests module
    What is a proxy?

    A proxy is a third party that handles business on your behalf, like an agent, a reseller, or a purchasing service.

    So why do crawlers need proxies?

    Because some sites take anti-crawling measures. For example, a site may count how many requests one IP makes within a time window; if the rate is too high to be a normal user, it bans that IP. Using proxy IPs lets us keep crawling the data we need: even if one IP gets banned for a while, we can switch to another proxy IP and carry on.

    Proxy classification:

    Forward proxy: proxies on behalf of the client to fetch data

    Reverse proxy: proxies on behalf of the server to provide data

    Sites with free proxy IPs:

    http://www.goubanjia.com/

    快代理 (kuaidaili)

    import requests
    import random
    if __name__ == "__main__":
        # UA strings for different browsers
        header_list = [
            # Maxthon
            {"user-agent": "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Maxthon 2.0)"},
            # Firefox
            {"user-agent": "Mozilla/5.0 (Windows NT 6.1; rv:2.0.1) Gecko/20100101 Firefox/4.0.1"},
            # Chrome
            {
                "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11"}
        ]
        # a pool of proxy IPs
        proxy_list = [
            {"http": "112.115.57.20:3128"},
            {'http': '121.41.171.223:3128'}
        ]
        # pick a random UA and proxy IP
        header = random.choice(header_list)
        proxy = random.choice(proxy_list)

        url = 'http://www.baidu.com/s?ie=UTF-8&wd=ip'
        # the proxies parameter routes the request through the proxy
        response = requests.get(url=url,headers=header,proxies=proxy)
        response.encoding = 'utf-8'

        with open('daili.html', 'wb') as fp:
            fp.write(response.content)
        # switch back to our own IP (empty proxy)
        requests.get(url, proxies={"http": ""})
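
    Note that the keys of the proxies dict are URL schemes: requests only routes a request through the proxy whose key matches the scheme of the target URL, so an http-only entry does nothing for https:// pages. A sketch covering both schemes (the proxy address is a placeholder copied from the list above, not a known-live proxy):

    import requests

    # route both http and https traffic through one proxy endpoint
    proxies = {
        'http': 'http://112.115.57.20:3128',    # placeholder address, likely dead by now
        'https': 'http://112.115.57.20:3128',   # placeholder address, likely dead by now
    }
    response = requests.get('http://www.baidu.com', proxies=proxies, timeout=5)
    print(response.status_code)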
    