• 爬取豌豆荚


    '''
    爬取豌豆荚APP数据
    url:https://www.wandoujia.com/category/6001
    data:
    名称、详情页、下载人数、APP大小
    app_name
    detail_url
    download_num
    app_size
    <a href="" title="" class="">(.*?)</a>.*?<span class="">(.*?)</span>.*?<span title="">(.*?)</span><div class="comment"> (.*?)</div>
    '''
    import requests
    import re


    # Step 1: send the request.
    def get_page(url, timeout=10):
        """Download ``url`` with an HTTP GET and return the response.

        Args:
            url: Address of the page to fetch.
            timeout: Seconds to wait for the server before giving up.
                Without a timeout ``requests`` waits indefinitely, so a
                stalled server would hang the whole script.

        Returns:
            The response object produced by ``requests.get``; callers read
            the page source from its ``text`` attribute.

        Raises:
            requests.exceptions.Timeout: If the server does not answer
                within ``timeout`` seconds.
        """
        return requests.get(url, timeout=timeout)


    def parse_index(html):
        """Extract the app entries from a category page's HTML source.

        Args:
            html: Page source as a string.

        Returns:
            A list of ``(detail_url, app_name, download_num, app_size)``
            tuples of strings, one per match, in document order. The list is
            empty when nothing matches.

        Note:
            The pattern only matches entries whose install count is followed
            by the literal text "万人安装" and whose size text ends in "MB".
            ``download_num`` is the number in front of "万人安装";
            ``app_size`` is the ``title`` attribute of the size ``<span>``.
        """
        # Adjacent literals are joined at compile time into one pattern.
        app_pattern = re.compile(
            '<h2 class="app-title-h2"><a href="(.*?)" title="(.*?)" class="name">.*?</a>'
            '.*?<span class="install-count">(.*?)万人安装</span>'
            ' <span class="dot">・</span>'
            ' <span title="(.*?)">.*?MB</span>',
            re.S,  # let "." cross newlines: the pieces of one entry span several lines
        )
        return app_pattern.findall(html)

    # Save the data.
    def save_data(movie):
        """Print one app record and append it to ``wandoujia.text``.

        Args:
            movie: A 4-item sequence ordered as
                ``(detail_url, app_name, download_num, app_size)``.

        Raises:
            ValueError: If ``movie`` does not unpack into exactly four items.
        """
        detail_url, app_name, download_num, app_size = movie

        # Built with explicit "\n" escapes so the emitted text does not depend
        # on how this source file is indented.
        data = (
            '\n'
            f'    游戏名称:{app_name}\n'
            f'    详情页url:{detail_url}\n'
            f'    下载人数:{download_num}万人\n'
            f'    APP大小:{app_size}\n'
            '\n'
            '\n'
            '    '
        )
        print(data)

        # Append mode: records from successive calls accumulate in the file.
        with open('wandoujia.text', 'a', encoding='utf-8') as f:
            f.write(data)

    # print("写入成功!")

    if __name__ == '__main__':
        # Script entry point: fetch the category page, parse it, then save
        # every extracted record.
        url = 'https://www.wandoujia.com/category/6001'
        print(url)

        index_res = get_page(url)

        movie_list = parse_index(index_res.text)

        for movie in movie_list:
            save_data(movie)


  • 相关阅读:
    使用Docker在本地搭建Hadoop分布式集群
    微博推荐 第三个map 源码
    对象
    http无状态(stateless)
    理解http的无连接
    http响应报文之首部行
    http响应报文之状态行
    http响应报文
    http请求报文之首部行
    http请求之请求数据
  • 原文地址:https://www.cnblogs.com/2328322824chx/p/11129387.html
Copyright © 2020-2023  润新知