• Web scraping: parameters of requests.get


    URL and request header parameters: url and headers

    res = requests.get(url, headers=headers)  sends a request to the website and returns a response object

    Parameters

    • url : the URL address to scrape
    • headers : request headers
    • timeout : timeout in seconds; an exception is raised if it is exceeded (see the sketch below)
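
    A minimal sketch (the URL and the 3-second limit are illustrative) of how an exceeded timeout surfaces as an exception that can be caught:

    import requests
    
    url = 'http://www.baidu.com/'
    headers = {'User-Agent': 'Mozilla/5.0'}
    
    try:
        # raises requests.exceptions.Timeout if no response arrives within 3 seconds
        res = requests.get(url, headers=headers, timeout=3)
        print(res.status_code)
    except requests.exceptions.Timeout:
        print('request timed out')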

    Response object (res) attributes

    • encoding : response character encoding, e.g. res.encoding = 'utf-8'
    • text : page source as a string
    • content : page source as a byte string (bytes)
    • status_code : HTTP response code
    • url : the URL the data was actually fetched from
    import requests
    
    url = 'http://www.baidu.com/'    # scrape the Baidu homepage
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/14.0.835.163 Safari/535.1'}
    
    res = requests.get(url, headers=headers)
    print(res.encoding)     # check the site's encoding: ISO-8859-1
    
    # the text attribute returns the response content (page source) as a string
    res = requests.get(url,headers=headers)
    res.encoding = 'utf-8'
    html = res.text
    
    # the content attribute returns the response content (page source) as bytes
    res = requests.get(url,headers=headers)
    html = res.content.decode('utf-8')
    
    print(res.status_code)      # check the response code: 200
    print(res.url)              # check the URL actually visited: https://www.baidu.com/

    Saving unstructured data

    Compressed files (zip), image files, and the like can all be saved as unstructured data

    with open('xxx.jpg','wb') as f:
        f.write(res.content)
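
    For large files such as zip archives, res.content holds the whole body in memory at once; a hedged alternative (not from the original notes; the URL and chunk size are illustrative) streams the download in chunks:

    import requests
    
    url = 'http://example.com/big_file.zip'
    headers = {'User-Agent': 'Mozilla/5.0'}
    
    # stream=True defers downloading the body until iter_content() is consumed
    res = requests.get(url, headers=headers, stream=True)
    with open('big_file.zip', 'wb') as f:
        for chunk in res.iter_content(chunk_size=8192):
            if chunk:
                f.write(chunk)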

    Example: save an image of Zhao Liying locally

    import requests
    
    url = 'http://dingyue.nosdn.127.net/lL1JH2YdpAWrzEhfp8BrJ8lTHa1602AEX9E7qpTpH5NzW1535203788506compressflag.jpg'
    headers = {'User-Agent': 'Mozilla/5.0'}
    
    html = requests.get(url, headers=headers).content
    
    # save the image locally
    with open('赵丽颖.jpg', 'wb') as f:
        f.write(html)

    Scraping images from Baidu Tieba

    Goal: scrape all images from a given tieba (forum)

    Approach

    1. Get the tieba homepage URL and the next-page URL, and work out the URL pattern across pages
    2. Get the URLs of all posts on one page: [post link 1, post link 2, ...]
    3. Request each post link and extract the image URLs
    4. Request each image URL and write the bytes to a local file in wb mode

    Tieba URL pattern: http://tieba.baidu.com/f?kw=??&pn=50
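
    A small sketch of how the page number maps to pn and how the tieba name is URL-encoded; the same logic appears in main() of the spider further below:

    from urllib import parse
    
    base_url = 'http://tieba.baidu.com/f?kw={}&pn={}'
    name = '赵丽颖吧'                    # illustrative tieba name
    kw = parse.quote(name)              # URL-encode the tieba name
    
    for page in range(1, 4):            # pages 1-3
        pn = (page - 1) * 50            # each page holds 50 posts
        print(base_url.format(kw, pn))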

    XPath expressions

    1. Post link XPath. The class attribute is used here because elements of the same kind share the same class for styling

    //div[@class="t_con cleafix"]/div/div/div/a/@href

    2. Image link XPath

    //div[@class="d_post_content j_d_post_content  clearfix"]/img[@class="BDE_Image"]/@src

    3. Video link XPath

    //div[@class="video_src_wrapper"]/embed/@data-video

    # Note: for video links the front end rewrites the response content, so you need to look at the raw page source (copy the HTML into an online formatter to read it) rather than the DevTools DOM

    Baidu Tieba's anti-scraping measure for video (the response content is rewritten)

    The raw page source is:

    <div class="video_src_wrapper">
       <embed data-video="http://tb-video.bdstatic.com/tieba-smallvideo-transcode-cae/2754153_8fcd225842344de9901c1489e49defbe_0_cae.mp4"

    The code located via F12 debugging is:

    <div class="video_src_wrapper">
        <div class="video_src_wrap_main">
            <video src="http://tb-video.bdstatic.com/tie-cae/f2358e8_0_cae.mp4"></video>
        </div>
    </div>

    If you write the XPath from the position located with F12, nothing will be extracted, because requests fetches the raw page source; always go by the page source.
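
    A quick way to check this (a sketch; the post URL is a placeholder) is to fetch a post page with requests and run both XPath variants against the raw source: the data-video expression matches, while the one copied from the DevTools DOM returns nothing:

    import requests
    from lxml import etree
    
    post_url = 'http://tieba.baidu.com/p/123456789'      # placeholder post URL
    headers = {'User-Agent': 'Mozilla/5.0'}
    
    html = requests.get(post_url, headers=headers).content.decode('utf-8', 'ignore')
    parse_html = etree.HTML(html)
    
    # XPath written against the raw page source (matches)
    print(parse_html.xpath('//div[@class="video_src_wrapper"]/embed/@data-video'))
    # XPath written against the DevTools DOM (returns an empty list)
    print(parse_html.xpath('//div[@class="video_src_wrap_main"]/video/@src'))

    The full spider below uses the raw-source XPath expressions throughout.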

    import requests
    from lxml import etree
    import random
    import time
    from urllib import parse
    
    
    class BaiduImageSpider(object):
        def __init__(self):
            self.url = 'http://tieba.baidu.com/f?kw={}&pn={}'
            self.ua_list = [
                'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/14.0.835.163 Safari/535.1',
                'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:6.0) Gecko/20100101 Firefox/6.0',
                'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; InfoPath.3)', ]
    
        # fetch the HTML source of a page
        def get_html(self, url):
            headers = {'User-Agent': random.choice(self.ua_list)}
            html = requests.get(url=url, headers=headers).content.decode('utf-8', 'ignore')
            return html
    
        # parse HTML with an XPath expression
        def xpath_func(self, html, xpath_bds):
            parse_html = etree.HTML(html)
            r_list = parse_html.xpath(xpath_bds)
            return r_list
    
        # scrape images
        def parse_html(self, one_url):
            html = self.get_html(one_url)
            xpath_bds = '//div[@class="t_con cleafix"]/div/div/div/a/@href'
            r_list = self.xpath_func(html, xpath_bds)  # extracted post links: ['/p/32323','','']
            for r in r_list:
                t_url = 'http://tieba.baidu.com' + r  # build the full post URL
                self.get_image(t_url)  # save all images in the post locally
                time.sleep(random.uniform(0, 2))  # after finishing one post, sleep 0-2 seconds
    
        # given one post URL, save all its images locally
        def get_image(self, t_url):
            html = self.get_html(t_url)
    
            # use XPath's union operator | : image links + video links
            xpath_bds = '//div[@class="d_post_content j_d_post_content  clearfix"]/img[@class="BDE_Image"]/@src | //div[@class="video_src_wrapper"]/embed/@data-video'
            img_list = self.xpath_func(html, xpath_bds)  # ['http://xxx.jpg','']
            print(img_list)
            for img in img_list:
                html_bytes = requests.get(url=img, headers={'User-Agent': random.choice(self.ua_list)}).content
                self.save_img(html_bytes, img)
    
        # save an image to disk
        def save_img(self, html_bytes, img):
            filename = img[-10:]
            with open(filename, 'wb') as f:
                f.write(html_bytes)
                print('%s downloaded successfully' % filename)
    
        # entry point
        def main(self):
            name = input('Enter the tieba name: ')
            begin = int(input('Enter the start page: '))
            end = int(input('Enter the end page: '))
            # URL-encode the tieba name
            kw = parse.quote(name)
            for page in range(begin, end + 1):
                pn = (page - 1) * 50
                url = self.url.format(kw, pn)
                # call the main scraping routine
                self.parse_html(url)
    
    
    if __name__ == '__main__':
        spider = BaiduImageSpider()
        spider.main()

    Query string parameter: params

    res = requests.get(url,params=params,headers=headers)

    url is the base URL without query parameters; this method automatically URL-encodes the params dict and appends it to url

    Parameter type: dict; its key-value pairs become the query string

    import requests
    
    baseurl = 'http://tieba.baidu.com/f?'
    params = {
        'kw': '赵丽颖吧',
        'pn': '50'}
    headers = {'User-Agent': 'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; InfoPath.3)'}
    # params is encoded automatically, joined onto the url, and the request is sent
    res = requests.get(baseurl, headers=headers, params=params)
    res.encoding = 'utf-8'
    print(res.text)
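
    To see the automatic encoding, res.url can be compared with a URL built by hand using urllib.parse.urlencode; barring redirects the two should match (a sketch under that assumption):

    import requests
    from urllib import parse
    
    baseurl = 'http://tieba.baidu.com/f?'
    params = {'kw': '赵丽颖吧', 'pn': '50'}
    headers = {'User-Agent': 'Mozilla/5.0'}
    
    res = requests.get(baseurl, headers=headers, params=params)
    print(res.url)                              # URL requests actually used
    print(baseurl + parse.urlencode(params))    # manually encoded equivalent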

    Web client authentication parameter: auth

    res = requests.get(url, headers=headers, auth=('username','password'))

    For sites that require username/password authentication from the web client, pass auth = ('username', 'password')
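
    The tuple passed to auth is shorthand for requests' HTTPBasicAuth; the two calls below are equivalent (the URL and credentials are placeholders):

    import requests
    from requests.auth import HTTPBasicAuth
    
    url = 'http://example.com/protected/'       # placeholder URL
    headers = {'User-Agent': 'Mozilla/5.0'}
    
    res1 = requests.get(url, headers=headers, auth=('username', 'password'))
    res2 = requests.get(url, headers=headers, auth=HTTPBasicAuth('username', 'password'))
    print(res1.status_code, res2.status_code)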

    Tarena course notes

    import requests
    from lxml import etree
    import random
    import os
    
    
    class CodeSpider(object):
        def __init__(self):
            self.url = 'http://code.tarena.com.cn/AIDCode/aid1904/14-redis/'
            self.auth = ('tarenacode', 'code_2013')
            self.ua_list = [
                'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/14.0.835.163 Safari/535.1',
                'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:6.0) Gecko/20100101 Firefox/6.0',
                'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; InfoPath.3)', ]
    
        def parse_html(self):
            # fetch the response content
            html = requests.get(url=self.url, headers={'User-Agent': random.choice(self.ua_list)}, auth=self.auth)
            html = html.content.decode('utf-8', 'ignore')
    
            parse_html = etree.HTML(html)   # parse
            r_list = parse_html.xpath('//a/@href')      # r_list: ['../','day01','day02','redis_day01.zip']
            for r in r_list:
                if r.endswith('.zip') or r.endswith('.rar'):
                    self.save_files(r)
    
        def save_files(self, r):
            directory = '/home/tarena/AID/redis/'
            if not os.path.exists(directory):
                os.makedirs(directory)
    
            # build the full URL and save the zip file to the target directory
            url = self.url + r
            # filename: /home/tarena/AID/redis/xxx.zip
            filename = directory + r
            html = requests.get(url=url, headers={'User-Agent': random.choice(self.ua_list)}, auth=self.auth).content
    
            with open(filename, 'wb') as f:
                f.write(html)
                print('%s downloaded successfully' % r)
    
    
    if __name__ == '__main__':
        spider = CodeSpider()
        spider.parse_html()

    SSL certificate verification parameter: verify

    response = requests.get(url=url,params=params,headers=headers,verify=False)

    The verify parameter is for HTTPS sites whose certificates were not issued by a recognized certificate authority; such sites typically raise an SSLError, in which case consider using this parameter

    verify: True (default) verifies the certificate; False (commonly used) skips certificate verification
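
    A minimal sketch (the URL is illustrative); with verify=False urllib3 emits an InsecureRequestWarning, which can be silenced explicitly:

    import requests
    import urllib3
    
    # silence the InsecureRequestWarning that verify=False triggers
    urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
    
    url = 'https://self-signed.example.com/'    # illustrative site with an untrusted certificate
    headers = {'User-Agent': 'Mozilla/5.0'}
    
    res = requests.get(url, headers=headers, verify=False)
    print(res.status_code)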

    Proxy parameter: proxies

    Definition: an IP address that connects to the network on behalf of your real IP, hiding it and helping you avoid bans

    Regular proxies

    Sites that provide proxy IPs: Xici Proxy (西刺代理), Kuaidaili (快代理), Quanwang Proxy (全网代理), Daili Jingling (代理精灵), ...

    Syntax

    proxies = {'protocol': 'protocol://IP:port'}
    
    # http and https use the same proxy
    proxies = {
      'http':'http://59.172.27.6:38380',
      'https':'https://59.172.27.6:38380'
    }

    Use a free regular proxy IP to access the test site: http://httpbin.org/get

    import requests
    
    url = 'http://httpbin.org/get'
    headers = {'User-Agent': 'Mozilla/5.0'}
    # define the proxy; look up a free proxy IP on one of the proxy sites
    proxies = {
        'http': 'http://309435365:szayclhp@43.226.164.156:16818',
        'https': 'https://309435365:szayclhp@43.226.164.156:16818'}
    html = requests.get(url, proxies=proxies, headers=headers, timeout=5).text
    print(html)
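
    The JSON returned by http://httpbin.org/get contains an origin field with the IP the server saw, which is a quick way to confirm the proxy is actually used (a sketch reusing the proxy from the example above; free proxies like this expire quickly):

    import requests
    
    url = 'http://httpbin.org/get'
    headers = {'User-Agent': 'Mozilla/5.0'}
    proxies = {
        'http': 'http://309435365:szayclhp@43.226.164.156:16818',
        'https': 'https://309435365:szayclhp@43.226.164.156:16818'}
    
    res = requests.get(url, proxies=proxies, headers=headers, timeout=5)
    # 'origin' should show the proxy server's IP rather than your own
    print(res.json()['origin'])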

    IP pool

    Scrape IPs from Xici Proxy, test whether each one works, and build your own proxy IP pool that you keep updated for scraping sites

    import requests
    from lxml import etree
    import time
    import random
    from fake_useragent import UserAgent
    
    
    class GetProxyIP(object):
        def __init__(self):
            self.url = 'https://www.xicidaili.com/nn/{}'    # {} is filled with the page number
            self.proxies = {
                'http': 'http://163.204.247.219:9999',
                'https': 'http://163.204.247.219:9999'}
    
        # generate a random User-Agent
        def get_random_ua(self):
            ua = UserAgent()        # create a UserAgent object
            useragent = ua.random
            return useragent
    
        # fetch proxy IPs from the Xici Proxy site
        def get_ip_file(self, url):
            headers = {'User-Agent': self.get_random_ua()}
            # request the Xici domestic high-anonymity proxy page and find all tr nodes
            html = requests.get(url=url, proxies=self.proxies, headers=headers, timeout=5).content.decode('utf-8', 'ignore')
            parse_html = etree.HTML(html)
            # base XPath matching the node for each proxy IP
            tr_list = parse_html.xpath('//tr')
            for tr in tr_list[1:]:
                ip = tr.xpath('./td[2]/text()')[0]
                port = tr.xpath('./td[3]/text()')[0]
                # test whether ip:port is usable
                self.test_proxy_ip(ip, port)
    
        # test whether a scraped proxy IP is usable
        def test_proxy_ip(self, ip, port):
            proxies = {
                'http': 'http://{}:{}'.format(ip, port),
                'https': 'https://{}:{}'.format(ip, port), }
            test_url = 'http://www.baidu.com/'
            try:
                res = requests.get(url=test_url, proxies=proxies, timeout=8)
                if res.status_code == 200:
                    print(ip, ":", port, 'Success')
                    with open('proxies.txt', 'a') as f:
                        f.write(ip + ':' + port + '\n')
            except Exception as e:
                print(ip, port, 'Failed')
    
        # entry point
        def main(self):
            for i in range(1, 1001):
                url = self.url.format(i)
                self.get_ip_file(url)
                time.sleep(random.randint(5, 10))
    
    
    if __name__ == '__main__':
        spider = GetProxyIP()
        spider.main()

    Taking an IP from the IP pool

    Write a scraper that picks a random proxy IP from the file

    import random
    import requests
    
    
    class BaiduSpider(object):
        def __init__(self):
            self.url = 'http://www.baidu.com/'
            self.headers = {'User-Agent': 'Mozilla/5.0'}
            self.retry_count = 1        # retry counter
    
        def get_proxies(self):
            with open('proxies.txt', 'r') as f:
                result = f.readlines()  # read all lines into a list
            proxy_ip = random.choice(result)[:-1]       # pick one proxy at random and strip the trailing newline
            L = proxy_ip.split(':')
            proxy_ip = {
                'http': 'http://{}:{}'.format(L[0], L[1]),
                'https': 'https://{}:{}'.format(L[0], L[1])
            }
            return proxy_ip
    
        def get_html(self):
            proxies = self.get_proxies()
            if self.retry_count <= 3:
                try:
                    html = requests.get(url=self.url, proxies=proxies, headers=self.headers, timeout=5).text
                    print(html)
                except Exception as e:
                    print('Retry')
                    self.retry_count += 1
                    self.get_html()
    
    
    if __name__ == '__main__':
        spider = BaiduSpider()
        spider.get_html()
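
    The retry above is recursive; as an alternative (a sketch, not part of the original notes), get_html can be written as a loop with the same three-attempt limit, which makes the retry_count attribute unnecessary:

        # drop-in replacement for get_html above: same limit of 3 attempts, no recursion
        def get_html(self):
            for _ in range(3):                  # try up to 3 different proxies
                proxies = self.get_proxies()
                try:
                    html = requests.get(url=self.url, proxies=proxies,
                                        headers=self.headers, timeout=5).text
                    print(html)
                    return                      # success: stop retrying
                except Exception:
                    print('Retry')
            print('All proxies failed')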

    Paid proxy API

    Write an interface that fetches proxies from a paid open-proxy API

    # interface for fetching open proxies
    import requests
    from fake_useragent import UserAgent
    
    ua = UserAgent()  # create a UserAgent object
    useragent = ua.random
    headers = {'User-Agent': useragent}
    
    
    def ip_test(ip):
        url = 'http://www.baidu.com/'
        ip_port = ip.split(':')
        proxies = {
            'http': 'http://{}:{}'.format(ip_port[0], ip_port[1]),
            'https': 'https://{}:{}'.format(ip_port[0], ip_port[1]),
        }
        res = requests.get(url=url, headers=headers, proxies=proxies, timeout=5)
        if res.status_code == 200:
            return True
        else:
            return False
    
    
    # extract proxy IPs
    def get_ip_list():
        # Kuaidaili: https://www.kuaidaili.com/doc/product/dps/
        api_url = 'http://dev.kdlapi.com/api/getproxy/?orderid=946562662041898&num=100&protocol=1&method=2&an_an=1&an_ha=1&sep=2'
        html = requests.get(api_url).content.decode('utf-8', 'ignore')
        ip_port_list = html.split('\n')
    
        for ip in ip_port_list:
            if not ip.strip():              # skip blank lines left over from the split
                continue
            with open('proxy_ip.txt', 'a') as f:
                if ip_test(ip):
                    f.write(ip + '\n')
    
    
    if __name__ == '__main__':
        get_ip_list()

    Private proxies

    1. Syntax

    proxies = {
        'protocol': 'protocol://username:password@IP:port'
    }
    proxies = {
        'http': 'http://username:password@IP:port',
        'https': 'https://username:password@IP:port'
    }
    proxies = {
        'http': 'http://309435365:szayclhp@106.75.71.140:16816',
        'https':'https://309435365:szayclhp@106.75.71.140:16816',
    } 

    The username and password are given to you together with the API_URL; they are not your own account name and password.

    # interface for fetching private proxy IPs
    import requests
    from fake_useragent import UserAgent
    
    ua = UserAgent()  # create a UserAgent object
    useragent = ua.random
    headers = {'User-Agent': useragent}
    
    
    def ip_test(ip):
        url = 'https://blog.csdn.net/qq_34218078/article/details/90901602/'
        ip_port = ip.split(':')
        proxies = {
            'http': 'http://1786088386:b95djiha@{}:{}'.format(ip_port[0], ip_port[1]),
            'https': 'http://1786088386:b95djiha@{}:{}'.format(ip_port[0], ip_port[1]),
        }
    
        res = requests.get(url=url, headers=headers, proxies=proxies, timeout=5)
        if res.status_code == 200:
            print("OK")
            return True
        else:
            print(res.status_code)
            print("错误")
            return False
    
    
    # extract proxy IPs
    def get_ip_list():
        # Kuaidaili: https://www.kuaidaili.com/doc/product/dps/
        api_url = 'http://dps.kdlapi.com/api/getdps/?orderid=986603271748760&num=1000&signature=z4a5b2rpt062iejd6h7wvox16si0f7ct&pt=1&sep=2'
        html = requests.get(api_url).content.decode('utf-8', 'ignore')
        ip_port_list = html.split('\n')
    
        for ip in ip_port_list:
            if not ip.strip():              # skip blank lines left over from the split
                continue
            with open('proxy_ip.txt', 'a') as f:
                if ip_test(ip):
                    f.write(ip + '\n')
    
    
    if __name__ == '__main__':
        get_ip_list()

     
