• 爬虫之requests模块


    安装requests模块

    pip install requests

    requests模块的常用方法, 属性, 参数

    import requests
    
    # Demonstrates the most commonly used attributes of a requests Response.
    ret = requests.get(url='https://www.baidu.com')
    # Signature of the underlying function: get(url, params=None, **kwargs)
    ret.encoding = 'utf-8'  # encoding used when decoding the body into ret.text
    print(ret.content)  # response body, as bytes
    print(ret.text)  # response body, as str
    print(ret.url)  # URL of the response
    print(ret.headers, type(ret.headers))  # response headers; a requests.structures.CaseInsensitiveDict, used much like a dict
    # ret.json() raises when the body is not JSON (this page is HTML), so it
    # is only called when the response declares a JSON Content-Type.
    if 'json' in ret.headers.get('Content-Type', ''):
        print(ret.json())
    
    
    params = {  # query-string parameters carried in the URL of a GET request
        "keyword": "O98K",
    }
    header = {  # request headers
        "name": "SATH"
    }
    # The keyword argument is `headers`; `header=` is not accepted and raises
    # TypeError (unexpected keyword argument).
    ret = requests.get(url='http://www.baidu.com', params=params, headers=header)
    data = {  # form parameters carried by a POST request (passed as `data=`)
        "name": "sath"
    }

    爬虫案例一: 爬取搜狗指定词条搜索后的页面数据

    import requests
    
    # Inspecting Sogou's search requests shows that the keyword is submitted
    # to https://www.sogou.com/web with a GET request, in the `query` parameter.
    url = "https://www.sogou.com/web"
    params = {"query": "apple"}

    # Fetch the result page and save its decoded text as UTF-8.
    response = requests.get(url, params)
    with open('./sogou.html', mode='w', encoding='utf-8') as page_file:
        page_file.write(response.text)

    爬虫案例二: 爬取豆瓣电影分类排行榜中的电影详情数据

    import requests
    from multiprocessing import Pool
    import time
    
    url = 'https://movie.douban.com/j/new_search_subjects'
    header = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3423.2 Safari/537.36"
    }
    movie_title_list = []


    def get_movie(start):
        """Fetch one page of titles from the Douban search endpoint.

        Each title is printed and appended to ``movie_title_list``.

        Args:
            start: Value sent as the ``start`` query parameter (result offset).

        Returns:
            The list of titles found on this page; an empty list when the
            response does not carry the expected JSON Content-Type.
        """
        params = {
            "sort": "U",
            "tags": "",
            "start": start,
            "genres": "喜剧",
        }
        ret = requests.get(url=url, params=params, headers=header)
        if ret.headers['Content-Type'] != "application/json; charset=utf-8":
            return []
        titles = [movie["title"] for movie in ret.json()["data"]]
        # Appending only changes the list of the process that runs this call;
        # a Pool worker's copy is never sent back to the parent, which is why
        # the titles are also returned.
        movie_title_list.extend(titles)
        for title in titles:
            print(title)
        return titles
    
    
    if __name__ == '__main__':
        # Fan the page requests out over a pool of 20 worker processes.
        p = Pool(20)
        start = time.time()
        pending = [p.apply_async(get_movie, args=(n,)) for n in range(0, 10000, 20)]
        p.close()
        p.join()
        print(time.time() - start)
        # Original author's note: about 14s, acceptable.

        # AsyncResult.get() is the only way to see what a worker returned or
        # raised; without it failures pass silently and nothing reaches the
        # parent process.
        for result in pending:
            try:
                titles = result.get()
            except Exception as exc:  # top-level boundary: report and move on
                print("request failed:", repr(exc))
                continue
            if titles:  # workers that return nothing contribute no titles
                movie_title_list.extend(titles)

    爬虫案例三: 爬取肯德基餐厅查询中指定地点的餐厅数据  

    import requests
    import json
    
    # POST the restaurant search form; `params` places op=keyword in the
    # query string while `data` is sent as the form body.
    url = "http://www.kfc.com.cn/kfccda/ashx/GetStoreList.ashx"
    query = {"op": "keyword"}
    header = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3423.2 Safari/537.36"
    }
    data = {"cname": "", "pid": "", "keyword": "邯郸", "pageIndex": "1", "pageSize": "10"}

    response = requests.post(url, data=data, params=query, headers=header)
    # Decode the response text as JSON and show the parsed object and its type.
    stores = json.loads(response.text)
    print(stores, type(stores))

    爬虫案例四: 药监局信息爬取

    import requests
    from multiprocessing import Pool
    
    # Walk list pages 20..249 of the getXkzsList endpoint and gather the 'ID'
    # of every entry in each JSON response.
    # NOTE(review): this loop runs at import time; with the 'spawn' start
    # method each Pool worker re-imports the module and would repeat these
    # requests - confirm, and consider moving it under the __main__ guard.
    header = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3423.2 Safari/537.36"
    }
    url = "http://125.35.6.84:81/xk/itownet/portalAction.do?method=getXkzsList"
    ids = []
    for page in range(20, 250):
        form = {
            "on": "true",
            "page": page,
            "pageSize": "15",
            "productName": "",
            "conditionType": "1",
            "applyname": "",
            "applysn": "",
        }
        response = requests.post(url, data=form, headers=header)
        # Pages that do not answer with the expected JSON type are skipped.
        if response.headers['Content-Type'] != "application/json;charset=UTF-8":
            continue
        ids.extend(record['ID'] for record in response.json()["list"])
    # From here on `url` names the per-ID endpoint used by func().
    url = 'http://125.35.6.84:81/xk/itownet/portalAction.do?method=getXkzsById'
    
    
    def func(k):
        """POST ``k`` as the ``id`` form field to ``url`` and print "businessPerson".

        Nothing is printed when the response Content-Type is not exactly
        "application/json;charset=UTF-8".
        """
        r = requests.post(url, data={"id": k}, headers=header)
        if r.headers['Content-Type'] != "application/json;charset=UTF-8":
            return
        detail = r.json()
        print(detail["businessPerson"])
    
    
    if __name__ == '__main__':
        # Look up every collected ID in parallel with 14 worker processes.
        p = Pool(14)
        for k in ids:
            # The second positional parameter of apply_async is the args
            # sequence: a bare k would be unpacked as func(*k) and fail in
            # the worker without any visible error, so wrap it in a tuple.
            p.apply_async(func, args=(k,))
        p.close()
        p.join()
  • 相关阅读:
    hibernate悲观锁和乐观锁 Mr
    windows窗体调整
    我是一只草泥马
    草泥马2号
    用友二次开发 用友控件 Js宿主脚本 调用用友T6 登录 参照 控件示例
    KRBTabControl 不能不能输入
    藕のC#语法と
    ぇ份の测试ょ
    初学者编程入门:C语言指针使用方法
    开始写点东西了。。。。
  • 原文地址:https://www.cnblogs.com/594504110python/p/10066128.html
Copyright © 2020-2023  润新知