• 爬取豆瓣电影,把电影名称和详情url保存到json中


    # -*-coding:utf-8-*-
    import requests
    import json

    class Douban(object):
        """Fetch the Douban "movie_showing" collection and save titles and URLs.

        The pipeline is ``get_data`` -> ``parse_data`` -> ``save_data``;
        ``run`` chains the three steps.
        """

        def __init__(self):
            # Mobile API endpoint; start/count in the query string select the
            # page of results that is requested.
            self.url = "https://m.douban.com/rexxar/api/v2/subject_collection/movie_showing/items?&start=0&count=100"
            # A browser User-Agent is sent with every request.
            self.headers = {
                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.79 Safari/537.36'
            }

        def get_data(self):
            """Request ``self.url`` and return the response body as text.

            Returns:
                The response body decoded as UTF-8.

            Raises:
                requests.RequestException: on connection failure, timeout, or
                    a 4xx/5xx HTTP status.
            """
            # Without a timeout requests may block forever on a stalled server.
            response = requests.get(self.url, headers=self.headers, timeout=10)
            # Fail here rather than trying to JSON-parse an error page later.
            response.raise_for_status()
            return response.content.decode('UTF-8')

        def parse_data(self, data):
            """Extract the title and URL of every movie from the JSON text.

            Args:
                data: JSON document (str) holding a list of movie dicts under
                    the ``subject_collection_items`` key.

            Returns:
                A list of ``{'title': ..., 'url': ...}`` dicts, one per movie,
                in the order they appear in ``data``.

            Raises:
                ValueError: if ``data`` is not valid JSON.
                KeyError: if an expected key is missing.
            """
            # str -> dict
            dict_data = json.loads(data)
            # A list whose elements are dicts, one per movie.
            movie_list = dict_data['subject_collection_items']
            data_list = []
            for movie_info in movie_list:
                entry = {
                    'title': movie_info['title'],
                    'url': movie_info['url'],
                }
                data_list.append(entry)
                # Echo each title so progress is visible on the console.
                print(entry['title'])
            return data_list

        def save_data(self, data_list, path='douban_movie.json'):
            """Write ``data_list`` to ``path`` as a single JSON array.

            Args:
                data_list: list of JSON-serialisable dicts, as returned by
                    ``parse_data``.
                path: destination file; defaults to ``douban_movie.json`` in
                    the current working directory.
            """
            # Explicit UTF-8: with ensure_ascii=False the non-ASCII titles are
            # written verbatim, so the platform default encoding must not be
            # relied on.
            with open(path, 'w', encoding='utf-8') as f:
                # One json.dump of the whole list yields a valid document;
                # writing the objects one by one separated by ', ' does not.
                json.dump(data_list, f, ensure_ascii=False, indent=2)

        def run(self):
            """Fetch, parse and save the movie list."""
            # Send the request.
            data = self.get_data()
            # Parse the response.
            data_list = self.parse_data(data)
            # Persist the result.
            self.save_data(data_list)
    # print(data)


    # Script entry point: run the whole fetch -> parse -> save pipeline once.
    if __name__ == '__main__':
        Douban().run()
  • 相关阅读:
    chapter 12_1 数据文件
    chapter11_3 字符串缓冲
    chapter11_2 Lua链表与队列
    chapter11_1 Lua数组、列表
    chapter9_4 非抢占式的多线程
    Java设计模式
    java内存回收机制
    javaIO流概述
    java集合概述
    java多线程
  • 原文地址:https://www.cnblogs.com/jianxiaoguo/p/7635656.html
Copyright © 2020-2023  润新知