• What is requests in web scraping


    requests is a Python HTTP client library; a basic GET request exposes the response attributes below.

    import requests

    response = requests.get(url='http://www.baidu.com')
    print(response.text)         # response body as text
    print(response.headers)      # response headers
    print(response.status_code)  # response status code
    print(response.encoding)     # response encoding
    print(response.cookies)      # cookies returned by the server
    

    GET request with parameters

    data = {
        'name': 'abc',
    }
    
    response = requests.get(url='http://www.baidu.com',params=data)
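
    The params dict is URL-encoded into the query string; a quick way to confirm this is to print response.url. A minimal sketch, using http://httpbin.org/get only because it echoes the request back:

    import requests

    data = {'name': 'abc'}
    response = requests.get(url='http://httpbin.org/get', params=data)
    print(response.url)     # http://httpbin.org/get?name=abc
    print(response.json())  # httpbin echoes the query parameters back under "args"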
    

    Parsing JSON

    import requests
    
    # the URL must return a JSON body; baidu.com returns HTML, so httpbin.org/get is used here
    response = requests.get(url='http://httpbin.org/get')
    print(response.json())
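
    response.json() simply deserializes the body; for a valid JSON response it has roughly the same effect as json.loads(response.text), but it raises an error when the body is not JSON. A small check:

    import json
    import requests

    response = requests.get(url='http://httpbin.org/get')
    print(response.json() == json.loads(response.text))  # True for a valid JSON body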
    

    Getting binary data

    import requests
    
    response = requests.get(url='http://www.baidu.com')
    print(response.content)  # raw bytes of the response body
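
    response.content holds the raw bytes, which is what gets written to disk when downloading images or other binary files. A minimal sketch; the URL and file name are only placeholders:

    import requests

    response = requests.get(url='http://www.baidu.com/favicon.ico')  # placeholder binary resource
    with open('favicon.ico', 'wb') as f:   # write in binary mode, no encoding
        f.write(response.content)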
    

    Advanced operations

    File upload

    import requests

    files = {
        'file': open('XXX', 'rb')   # 'XXX' stands for the path of the file to upload
    }
    response = requests.post(url='http://www.baidu.com', files=files)
    print(response.content)
    
    Session persistence (simulating login)

    import requests
    
    s = requests.Session()
    s.get('http://httpbin.org/cookies/set/number/123456789')
    response = s.get('http://httpbin.org/cookies')
    print(response.text)
    
    {
      "cookies": {
        "number": "123456789"
      }
    }
    

    Certificate verification

    import requests
    import urllib3
    
    url = 'https://www.biqudu.com/43_43821/2520338.html'
    urllib3.disable_warnings()   # suppress the InsecureRequestWarning raised when verification is disabled
    response = requests.get(url=url, verify=False)   # skip SSL certificate verification
    print(response.text)
    

    Proxy authentication

    import requests

    url = 'https://www.biqudu.com/43_43821/2520338.html'
    proxies = {
        'http': 'http://127.0.0.2',
        'https': 'http://user:pwd@127.0.0.2',   # proxy that requires a username and password
    }

    response = requests.get(url=url, proxies=proxies)
    print(response.text)
    

    Request timeout handling

    import requests
    from requests.exceptions import ReadTimeout   # import the timeout exception class

    url = 'https://www.taobao.com'
    try:
        response = requests.get(url=url, timeout=0.1)   # limit how long the request may take (seconds)
        print(response.status_code)
    except ReadTimeout:
        print('Request timed out')
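
    requests also accepts a (connect, read) tuple for timeout, and catching requests.exceptions.Timeout covers both the connect and the read case; a small variation:

    import requests
    from requests.exceptions import Timeout   # parent class of ConnectTimeout and ReadTimeout

    url = 'https://www.taobao.com'
    try:
        response = requests.get(url=url, timeout=(3.05, 10))  # 3.05 s to connect, 10 s to read
        print(response.status_code)
    except Timeout:
        print('Request timed out')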
    

    Authentication

    # some sites ask for username/password authentication as soon as they are opened
    
    import requests
    from requests.auth import HTTPBasicAuth
    
    url = 'https://www.taobao.com'
    
    response = requests.get(url=url, auth=('user', 'pwd'))   # shorthand for auth=HTTPBasicAuth('user', 'pwd')
    print(response.status_code)
    

    1. Biquge novels (entry-level scraping of text content)

    Scrape Biquge novels: the overall ranking list on the rankings page

    1. Request the initial URL and get the page source
    2. Parse the page source to get the text content
    3. Write all chapter names of each novel into a txt file
    
    from lxml import etree
    import requests
    
    url = 'http://www.biqiuge.com/paihangbang'
    
    response = requests.get(url)
    response.encoding = response.apparent_encoding   # use the detected encoding to avoid garbled text
    
    
    html = etree.HTML(response.text)
    info = html.xpath("//div[@class='block bd'][1]/ul[@class='tli']/li/a")   # links in the overall ranking list
    for i in info:
        title = i.xpath("./text()")[0]    # novel title
        urls = i.xpath("./@href")[0]      # relative link to the novel's chapter-list page
        urls1 = 'http://www.biqiuge.com' + urls
    
        with open(title + '.txt', 'w+', encoding='utf-8') as f:
            response1 = requests.get(url=urls1)
            response1.encoding = response1.apparent_encoding
            html1 = etree.HTML(response1.text)
            chapters = html1.xpath("//div[@class='listmain']/dl/dd/a/text()")[6:]
            for chapter in chapters:
                f.write(chapter.strip() + '\n')
            print(title + '------written successfully')
    
    ------------------------------------------------------
    Check whether the path exists and create it automatically:
    import os

    # title, title1, title2 and contents come from the surrounding scraping loop
    if not os.path.exists(title):
        os.mkdir(title)

    path = os.path.join(title, title1)

    if not os.path.exists(path):
        os.mkdir(path)

    with open(os.path.join(path, title2 + '.txt'), 'w+', encoding='utf-8') as f:
        for con in contents:
            f.write(con.strip() + '\n')
        print(title + '---' + title1 + '---' + title2 + '---written successfully')
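
    os.makedirs can replace the two exists/mkdir checks above: it creates every missing directory level in one call and, with exist_ok=True, does nothing when the path already exists. A minimal sketch with placeholder names:

    import os

    path = os.path.join('novel', 'category', 'book')   # placeholder directory names
    os.makedirs(path, exist_ok=True)                    # creates intermediate directories, no error if present

    with open(os.path.join(path, 'chapter.txt'), 'w+', encoding='utf-8') as f:
        f.write('...')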
    

    2. Cui Qingcai's blog (scraping with forged request headers)

    from lxml import etree
    
    import requests
    n = 0
    with open('cuijincai.txt', 'w+', encoding='utf-8') as f:
        for i in range(1, 10):
            # the site renders pages dynamically; these paginated URLs can be found under XHR in the F12/Network panel
            url = 'https://cuiqingcai.com/category/technique/python/page/' + str(i)
            # some sites have anti-scraping checks, so fill in the request headers
            headers = {
                'Referer': 'https://cuiqingcai.com/category/technique/python',
                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.146 Safari/537.36'
            }
            response = requests.get(url=url, headers=headers)
            html = etree.HTML(response.text)
            all_div = html.xpath("//article[@class='excerpt']")
    
            for div in all_div:
                title = div.xpath("./header/h2/a/text()")[0]   # article title under the current node
                author = div.xpath("./p[@class='auth-span']/span[@class='muted'][1]/a/text()")[0]
                time = div.xpath("./p[@class='auth-span']/span[@class='muted'][2]/text()")[0]
                liulanshu = div.xpath("./p[@class='auth-span']/span[@class='muted'][3]/text()")[0]    # view count
                pinlun = div.xpath("./p[@class='auth-span']/span[@class='muted'][4]/a/text()")[0]     # comment count
                like = div.xpath("./p[@class='auth-span']/span[@class='muted'][5]/a[@id='Addlike']/span[@class='count']/text()")[0] + ' likes'
                n += 1
                f.write("第{}条	{}	{}	{}	{}	{}	{}
    ".format(n,title,author,time,liulanshu,pinlun,like))
    
    User Agent (UA) is a special header string that lets the server identify the client's operating system and version, CPU type, browser and version, rendering engine, browser language, browser plugins, and so on.
    
    HTTP Referer is part of the request header. When a browser sends a request to a web server, it usually carries the Referer to tell the server which page the link came from, and the server can use that information in its processing.
    
    https://www.liaoxuefeng.com has anti-scraping measures in place; it can be scraped by setting the request headers as shown above.
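
    A minimal sketch to see exactly which headers the server receives, using http://httpbin.org/headers only because it echoes the request headers back (the Referer value is illustrative):

    import requests

    headers = {
        'Referer': 'https://cuiqingcai.com/category/technique/python',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.146 Safari/537.36'
    }
    response = requests.get(url='http://httpbin.org/headers', headers=headers)
    print(response.json()['headers'])   # the User-Agent and Referer sent above appear here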
  • Original article: https://www.cnblogs.com/gaosai/p/9825067.html