• 爬虫 -requests


    requests模块

    - 1.什么是requests模块
        - Python中一个基于网络请求的第三方模块,用于模拟浏览器发起请求。
    - 2.为什么要使用requests模块
        - 1.自动处理url编码 
        - 2.自动处理post请求的参数
        - 3.简化cookie和代理的操作:
            cookie操作:
            - 创建一个cookiejar对象
            - 创建一个handler对象
            - 创建一个opener对象
            
            代理操作:
            - 创建handler对象,代理ip和端口封装到该对象
            - 创建opener对象
    - 3.requests如何被使用
        - 安装:pip install  requests
        - 使用流程:
            - 1.指定url
            - 2.使用requests模块发起请求
            - 3.获取响应数据
            - 4.进行持久化存储
    - 4.通过5个基于requests模块的爬虫项目对该模块进行系统学习和巩固
        - get请求
        - post请求
        - ajax的get
        - ajax的post
        - 综合
        - cookie
        - 代理
    

    1 基于requests模块发起get请求

    - 需求:爬取搜狗首页的页面数据
    
    import requests

    # Step 1: the page to fetch.
    url = 'https://www.sogou.com/'

    # Steps 2-3: send the GET request and read the response body as a str
    # through the response object's `.text` attribute.
    page_data = requests.get(url).text

    print(page_data)

    # Step 4: persist the page to disk, encoded as UTF-8.
    with open('./sougou.html', mode='w', encoding='utf-8') as html_file:
        html_file.write(page_data)
    

    response对象中其他重要的属性

    # Other useful attributes of the response object
    import requests

    home_url = 'https://www.sogou.com/'

    # requests.get returns the response object inspected below.
    response = requests.get(home_url)

    # Uncomment any of the following lines to inspect that attribute.
    # Response body as raw bytes:
    # print(response.content)
    # HTTP status code of the response:
    # print(response.status_code)
    # Response headers:
    # print(response.headers)
    # URL of the request:
    # print(response.url)
    
    https://www.sogou.com/
    
    • requests模块如何处理携带参数的get请求
      方式1:
      • 需求:指定一个词条,获取搜狗搜索结果所对应的页面数据
    import requests

    # The search term and the encoding are written directly into the query string.
    search_url = 'https://www.sogou.com/web?query=周杰伦&ie=utf-8'

    page_text = requests.get(search_url).text

    # Save the result page to disk as UTF-8.
    with open('./zhou.html', mode='w', encoding='utf-8') as out_file:
        out_file.write(page_text)
    
    # Pass the query parameters as a dict and send custom request headers
    import requests

    # Custom request headers: a browser User-Agent string
    # (presumably so the request looks like it comes from a browser).
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36',
    }
    # Query parameters; requests appends them to the URL.
    params = dict(query='周杰伦', ie='utf-8')
    url = 'https://www.sogou.com/web'

    response = requests.get(url, params=params, headers=headers)

    # Bare expression: the value is only echoed by an interactive
    # interpreter or notebook cell; in a plain script this line prints nothing.
    response.status_code
    
    
    200
    

    2 基于requests模块发起的post请求

    - 需求:登录豆瓣网,获取登录成功后的页面数据
    
    import requests

    # Custom request headers: a browser User-Agent string.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36',
    }

    # 1. URL the login form is posted to.
    login_url = 'https://accounts.douban.com/login'

    # Form fields sent in the POST body.
    # NOTE(review): the account name and password are hard-coded here;
    # replace them with your own and keep real credentials out of shared code.
    login_form = {
        "source": "movie",
        "redir": "https://movie.douban.com/",
        "form_email": "15027900535",
        "form_password": "bobo@15027900535",
        "login": "登录",
    }

    # 2. Send the POST request.
    response = requests.post(login_url, data=login_form, headers=headers)

    # 3-4. Read the returned page as text and save it to disk as UTF-8.
    with open('./douban.html', mode='w', encoding='utf-8') as out_file:
        out_file.write(response.text)
    
    基于ajax的get请求

    - 需求:抓取豆瓣电影上电影详情的数据

    import requests

    # Custom request headers: a browser User-Agent string.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36',
    }

    # Parameters carried by the ajax GET request.
    query = {
        'type': '5',
        'interval_id': '100:90',
        'action': '',
        'start': '200',
        'limit': '20',
    }

    endpoint = 'https://movie.douban.com/j/chart/top_list?'
    response = requests.get(endpoint, params=query, headers=headers)

    # Uncomment to show the response body:
    # print(response.text)
    
    基于ajax的post请求
    - 需求:爬取肯德基城市餐厅位置数据
    
    import requests

    # Custom request headers: a browser User-Agent string.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36',
    }

    # 1. URL that the ajax POST is sent to.
    post_url = 'http://www.kfc.com.cn/kfccda/ashx/GetStoreList.ashx?op=keyword'

    # Form fields sent in the POST body.
    form = {
        "cname": "",
        "pid": "",
        "keyword": "上海",
        "pageIndex": "1",
        "pageSize": "10",
    }

    # 2. Send the ajax POST request.
    response = requests.post(post_url, data=form, headers=headers)

    # Bare expression: the body text is only echoed by an interactive
    # interpreter or notebook cell; in a plain script this line prints nothing.
    response.text
    
    '{"Table":[{"rowcount":28}],"Table1":[{"rownum":1,"storeName":"开发区上海路","addressDetail":"开发区上海路80号乐天玛特超一楼","pro":"Wi-Fi,礼品卡,生日餐会","provinceName":"江苏省","cityName":"南通市"},{"rownum":2,"storeName":"太仓新区东亭路","addressDetail":"太仓市娄江路与县府街交叉口西南侧(太仓新区上海路北、东亭路东地块)万达广场","pro":"Wi-Fi,店内参观,礼品卡,生日餐会","provinceName":"江苏省","cityName":"苏州市"},{"rownum":3,"storeName":"动力南广场(汇金奥特莱斯B1层)","addressDetail":"石龙路750-3号上海南站地下商场南馆","pro":"精选店,礼品卡","provinceName":"上海市","cityName":"上海市"},{"rownum":4,"storeName":"谷阳","addressDetail":"松汇中路568号上海鹿都商业广场","pro":"Wi-Fi,店内参观,礼品卡,生日餐会,溯源","provinceName":"上海市","cityName":"上海市"},{"rownum":5,"storeName":"南广场","addressDetail":"秣陵路303号上海火车站西南出口一侧","pro":"Wi-Fi,礼品卡","provinceName":"上海市","cityName":"上海市"},{"rownum":6,"storeName":"北广场","addressDetail":"秣陵路303号上海站北广场北立面1F大厅入口西侧位置","pro":"Wi-Fi","provinceName":"上海市","cityName":"上海市"},{"rownum":7,"storeName":"南站精选(火车站站内)","addressDetail":"沪闵路9001号上海南站候车大厅","pro":"精选店,礼品卡","provinceName":"上海市","cityName":"上海市"},{"rownum":8,"storeName":"上海滩","addressDetail":"佘山林湖路888号-1上海滩区5号","pro":"精选店,礼品卡","provinceName":"上海市","cityName":"上海市"},{"rownum":9,"storeName":"动力南精选(3号线2号出口)","addressDetail":"沪闵路9001号上海南站(1F三角地A-1)一层","pro":"精选店,礼品卡","provinceName":"上海市","cityName":"上海市"},{"rownum":10,"storeName":"上海南路","addressDetail":"上海南路3号699生活空间3号楼","pro":"Wi-Fi,店内参观,礼品卡,生日餐会","provinceName":"江西省","cityName":"南昌市"}]}'
    

    3 综合项目实战

    - 需求:爬取搜狗知乎某一个词条对应一定范围页码表示的页面数据
    
    import os

    import requests

    # Fetch a user-chosen range of result pages for a search term and save
    # each page as pages/<word><page>.html.

    # Create the output folder. exist_ok=True makes this a no-op when the
    # folder already exists, without a separate (race-prone) existence check.
    output_dir = './pages'
    os.makedirs(output_dir, exist_ok=True)

    word = input('enter a word:')

    # Page range to fetch; both ends are inclusive.
    start_pageNum = int(input('enter a start pageNum:'))
    end_pageNum = int(input('enter a end pageNum:'))

    # Custom request headers: a browser User-Agent string.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36',
    }

    # 1. One generic URL; the term and page number are supplied as parameters.
    url = 'https://zhihu.sogou.com/zhihu'
    for page in range(start_pageNum, end_pageNum + 1):
        param = {
            'query': word,
            'page': page,
            'ie': 'utf-8',
        }
        response = requests.get(url=url, params=param, headers=headers)

        # Page data for the requested page number.
        page_text = response.text

        # Persist the page inside the output folder created above.
        fileName = word + str(page) + '.html'
        filePath = os.path.join(output_dir, fileName)
        with open(filePath, 'w', encoding='utf-8') as fp:
            fp.write(page_text)
        # Report success only after the file has been closed.
        print('第%d页数据写入成功' % page)
        
        
    
    enter a word:python
    enter a start pageNum:3
    enter a end pageNum:5
    第3页数据写入成功
    第4页数据写入成功
    第5页数据写入成功
    

    4 cookie:

    - cookie:
        基于用户的用户数据
        - 需求:爬取张三用户的豆瓣网的个人主页页面数据
    - cookie作用:服务器端使用cookie来记录客户端的状态信息。
    实现流程:
        1.执行登录操作(获取cookie)
        2.在发起个人主页请求时,需要将cookie携带到该请求中
        注意:session对象:发送请求(会将cookie对象进行自动存储)
    - 代理:
    
    # Problem: this request does not return the personal home page data.
    # Cause: the crawler does not strictly follow the request flow a browser uses.
    import requests

    # Custom request headers: a browser User-Agent string.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36',
    }

    # 1. URL of the personal home page.
    profile_url = 'https://www.douban.com/people/185687620/'

    # 2. Send the request directly, with no prior login.
    response = requests.get(profile_url, headers=headers)

    # 3-4. Read the page as text and save it to disk as UTF-8.
    with open('./douban.html', mode='w', encoding='utf-8') as out_file:
        out_file.write(response.text)
    
    import requests

    # Log in and fetch the personal home page through one Session, so the
    # cookies set by the login response are sent with the follow-up request.

    # 1. Login request data.
    # NOTE(review): the account name and password are hard-coded here;
    # replace them with your own and keep real credentials out of shared code.
    login_url = 'https://accounts.douban.com/login'
    data = {
        "source": "None",
        "redir": "https://www.douban.com/people/185687620/",
        "form_email": "15027900535",
        "form_password": "bobo@15027900535",
        "login": "登录",
    }
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36',
    }

    # requests.Session() replaces the deprecated requests.session() factory;
    # using it as a context manager closes the session when the block exits.
    with requests.Session() as session:
        # Send the login POST through the session; the session stores the
        # cookies from the response.
        login_response = session.post(url=login_url, data=data, headers=headers)

        # 2. Request the personal home page with the same session (and its
        # stored cookies), then read the response page data.
        url = 'https://www.douban.com/people/185687620/'
        response = session.get(url=url, headers=headers)
        page_text = response.text

    with open('./douban110.html', 'w', encoding='utf-8') as fp:
        fp.write(page_text)
    

    5 代理操作:

    - 1.代理:由第三方代替本体执行相关的事务。生活中的例子:代购、微商、中介
    - 2.为什么要使用代理?
        - 反爬操作。
        - 反反爬手段
    - 3.分类:
        - 正向代理:代替客户端获取数据
        - 反向代理:代理服务器端提供数据
    - 4.免费代理ip的网站提供商:
        - www.goubanjia.com
        - 快代理
        - 西祠代理
    - 5.代码:
    
    import requests

    url = 'http://www.baidu.com/s?ie=utf-8&wd=ip'

    # Custom request headers: a browser User-Agent string.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36',
    }

    # Map the target URL's scheme to the proxy to use for it. The proxy URL
    # carries an explicit scheme, as the requests documentation requires.
    proxy = {
        'http': 'http://77.73.69.120:3128',
    }
    # Send the request through the proxy instead of connecting directly.
    response = requests.get(url=url, proxies=proxy, headers=headers)

    # Save the returned page to disk as UTF-8.
    with open('./daili.html', 'w', encoding='utf-8') as fp:
        fp.write(response.text)
        
        
    
    作者:华王 博客:https://www.cnblogs.com/huahuawang/
  • 相关阅读:
    树莓派安装parrot linux记录
    Arch linux(UEFI+GPT)安装及后续优化教程
    VS部分安全函数用法
    C语言博客作业06--结构体&文件
    C语言博客作业05--指针
    C语言博客作业04--数组
    C语言博客作业03--函数
    C语言博客作业02--循环结构
    DS博客作业08--课程总结
    DS博客作业07--查找
  • 原文地址:https://www.cnblogs.com/huahuawang/p/14888490.html
Copyright © 2020-2023  润新知