• Python web scraping (2)


    Python modules for web scraping

    • urllib (built into the standard library)
    • requests (a third-party package)

    Definition

    The requests module is a third-party Python library for sending network requests. It is powerful, simple and convenient to use, and very efficient.
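
    To see why it is considered simpler than urllib, compare fetching one page with each (a quick illustration, not from the original post):

    # with urllib (standard library)
    from urllib import request
    page = request.urlopen('https://www.sogou.com/').read().decode('utf-8')

    # with requests: one call, decoding handled for you
    import requests
    page = requests.get('https://www.sogou.com/').text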

    How to use it

    1. Specify the URL
      • UA spoofing (send a browser User-Agent header)
      • Handle the request parameters
    2. Send the request
    3. Get the response data
    4. Persist the data

    Installation

    pip install requests
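
    To confirm the install worked, you can import the package and print its version (a quick sanity check; the exact version string will differ on your machine):

    import requests
    print(requests.__version__)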
    

    Hands-on examples

    • Task: scrape the page data of the Sogou homepage
    #!/usr/bin/env python
    # -*- coding:utf-8 -*-
    # Task: scrape the page data of the Sogou homepage
    import requests
    if __name__ == "__main__":
        # step 1: specify the URL
        url = 'https://www.sogou.com/'
        # step 2: send the request; get() returns a Response object
        response = requests.get(url=url)
        # step 3: get the response data; .text is the response body as a string
        page_text = response.text
        print(page_text)
        # step 4: persist the data
        with open('./sogou.html', 'w', encoding='utf-8') as fp:
            fp.write(page_text)
        print('Scraping finished!!!')
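
    Two checks worth adding in practice (a sketch, not part of the original example): raise on HTTP errors before using the body, and re-detect the encoding if the Chinese text in .text comes out garbled.

    response = requests.get(url=url)
    response.raise_for_status()                     # error out on 4xx/5xx responses
    response.encoding = response.apparent_encoding  # re-detect encoding from the body
    page_text = response.text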
    
    • Task: reverse-engineer Baidu Translate (its sug suggestion endpoint)
    #!/usr/bin/env python
    # -*- coding:utf-8 -*-
    import requests
    import json
    if __name__ == "__main__":
        # 1. specify the URL
        post_url = 'https://fanyi.baidu.com/sug'
        # 2. UA spoofing
        headers = {
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.121 Safari/537.36'
        }
        # 3. handle the POST parameters (same idea as for GET)
        word = input('enter a word:')
        data = {
            'kw': word
        }
        # 4. send the request
        response = requests.post(url=post_url, data=data, headers=headers)
        # 5. get the response data: json() returns a Python object
        #    (only call json() if the response is confirmed to be JSON)
        dic_obj = response.json()

        # persist the data
        fileName = word + '.json'
        with open(fileName, 'w', encoding='utf-8') as fp:
            json.dump(dic_obj, fp=fp, ensure_ascii=False)

        print('over!!!')
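
    The comment in step 5 warns that json() only makes sense on a JSON response. One way to guard against surprises (my addition, not in the original) is to check the Content-Type header first:

    content_type = response.headers.get('Content-Type', '')
    if 'application/json' in content_type:
        dic_obj = response.json()
    else:
        raise ValueError('expected a JSON response, got ' + repr(content_type))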
    
    • Task: scrape movie data from the Douban movie chart ranking API
    #!/usr/bin/env python
    # -*- coding:utf-8 -*-
    import requests
    import json
    if __name__ == "__main__":
        url = 'https://movie.douban.com/j/chart/top_list'
        param = {
            'type': '24',
            'interval_id': '100:90',
            'action': '',
            'start': '0',   # index of the first movie to fetch
            'limit': '20',  # how many movies to fetch per request
        }
        headers = {
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.121 Safari/537.36'
        }
        response = requests.get(url=url, params=param, headers=headers)

        list_data = response.json()

        with open('./douban.json', 'w', encoding='utf-8') as fp:
            json.dump(list_data, fp=fp, ensure_ascii=False)
        print('over!!!')
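
    Since start and limit page through the chart, a loop can collect several pages (a sketch, assuming the endpoint keeps accepting these parameters):

    all_movies = []
    for start in range(0, 100, 20):  # five pages of 20 movies each
        param['start'] = str(start)
        all_movies.extend(requests.get(url=url, params=param, headers=headers).json())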
    
    • Task: scrape restaurant data for a given location from the KFC store locator at http://www.kfc.com.cn/kfccda/index.aspx
    import requests
    import json

    # note: op=keyword rides on the query string; the search terms go in the POST form data
    url = "http://www.kfc.com.cn/kfccda/ashx/GetStoreList.ashx?op=keyword"

    data = {
        "cname": "",
        "pid": "",
        "keyword": "深圳",
        "pageIndex": 1,
        "pageSize": 10
    }

    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.87 Safari/537.36"
    }

    response = requests.post(url=url, data=data, headers=headers)
    content = response.json()
    with open('KFC.json', 'w', encoding='utf-8') as f:
        json.dump(content, f, ensure_ascii=False, indent=2)
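
    The results are paginated via pageIndex and pageSize, so fetching more than the first ten stores means looping over pages. A simple sketch that grabs a fixed number of pages (the page count here is arbitrary):

    pages = []
    for page_index in range(1, 6):  # first five pages
        data['pageIndex'] = page_index
        pages.append(requests.post(url=url, data=data, headers=headers).json())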
    
    • Task: scrape the cosmetics production licence (化妆品生产许可证) data published by the National Medical Products Administration (国家药品监督管理总局)
    #!/usr/bin/env python
    # -*- coding:utf-8 -*-

    import requests
    import json
    if __name__ == "__main__":
        headers = {
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.121 Safari/537.36'
        }
        id_list = []        # company IDs
        all_data_list = []  # detail records for every company
        # fetch the IDs of different companies in batches (the list endpoint is paginated)
        url = 'http://125.35.6.84:81/xk/itownet/portalAction.do?method=getXkzsList'
        # build the request parameters page by page
        for page in range(1, 6):
            page = str(page)
            data = {
                'on': 'true',
                'page': page,
                'pageSize': '15',
                'productName': '',
                'conditionType': '1',
                'applyname': '',
                'applysn': '',
            }
            json_ids = requests.post(url=url, headers=headers, data=data).json()
            for dic in json_ids['list']:
                id_list.append(dic['ID'])

        # fetch the detail data for each company
        post_url = 'http://125.35.6.84:81/xk/itownet/portalAction.do?method=getXkzsById'
        for company_id in id_list:
            data = {
                'id': company_id
            }
            detail_json = requests.post(url=post_url, headers=headers, data=data).json()
            all_data_list.append(detail_json)

        # persist all_data_list
        with open('./allData.json', 'w', encoding='utf-8') as fp:
            json.dump(all_data_list, fp=fp, ensure_ascii=False)
        print('over!!!')
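
    This script fires dozens of POST requests in quick succession. A gentler variant (my sketch, not from the original) reuses one connection through requests.Session and pauses between detail requests:

    import time

    session = requests.Session()
    session.headers.update(headers)
    for company_id in id_list:
        all_data_list.append(session.post(url=post_url, data={'id': company_id}).json())
        time.sleep(0.5)  # be polite: wait half a second between requests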
    