• 项目练习:电影列表爬虫


     1 # -*- coding:utf-8 -*-
     2 # Author:Sure Feng
     3 
     4 import requests
     5 import json
     6 
     7 
     8 class DoubanFileSpider(object):
     9     def __init__(self):
    10         self.start_tempt_url = [
    11             {
    12             "url_tempt": "https://movie.douban.com/j/search_subjects?type=movie&tag=%E5%8D%8E%E8%AF%AD&sort=recommend&page_limit=20&page_start={}",
    13             "country": "cn"
    14             }, {
    15             "url_tempt": "https://movie.douban.com/j/search_subjects?type=movie&tag=%E6%97%A5%E6%9C%AC&sort=recommend&page_limit=20&page_start={}",
    16             "country": "janpan"
    17             }, {
    18             "url_tempt": "https://movie.douban.com/j/search_subjects?type=movie&tag=%E6%AC%A7%E7%BE%8E&sort=recommend&page_limit=20&page_start={}",
    19             "country": "usa"
    20             }
    21         ]
    22         self.headers = {
    23             "User-Agent": "Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Mobile Safari/537.36"}
    24 
    25     def parse_url(self, url):
    26         """发送请求,获取响应"""
    27         respond = requests.get(url, headers=self.headers)
    28         return respond.content.decode()
    29 
    30     def save_data(self, list_str, country):
    31         """保存数据"""
    32         with open("douban.txt", "a", encoding="utf-8") as f:
    33             for info_str in list_str:
    34                 # print(info_str)
    35                 info_str["country"] = country
    36                 f.write(json.dumps(info_str, ensure_ascii=False))
    37                 f.write("
    ")
    38 
    39     def get_content(self, json_str):
    40         """提取数据"""
    41         dict_ret = json.loads(json_str)
    42         content_list = dict_ret["subjects"]
    43         return content_list
    44 
    45     def run(self):
    46         """实现主要逻辑"""
    47         for url_tempt in self.start_tempt_url:
    48             num = 0
    49             country = url_tempt["country"]
    50             while True:
    51                 # start_url
    52                 start_url = url_tempt["url_tempt"].format(num)
    53                 print(start_url)
    54                 # 发送请求,获取响应
    55                 json_str = self.parse_url(start_url)
    56                 # 提取数据
    57                 content_list = self.get_content(json_str)
    58                 # 保存
    59                 self.save_data(content_list, country)
    60                 # 构造下一页的URL地址,重复步骤
    61                 if len(content_list) < 20:
    62                     break
    63                 num += 20
    64 
    65 
    66 if __name__ == "__main__":
    67     douban_spider = DoubanFileSpider()
    68     douban_spider.run()
  • 相关阅读:
    express 的安全中间件 helmet 简介
    一个设置过期时间的方案
    vscode 中的 vue 格式化
    linux 中部署不同版本 node.js 并同时使用的方案
    webpack 多页面模式配置
    1.assert
    我是一个线程(转)
    Android FragmentTransactionExtended:使Fragment以多种样式动画切换
    Android ORM应用开发框架KJFrameForAndroid使用详解
    Android Studio插件推荐(PreIOC,GsonFormat)
  • 原文地址:https://www.cnblogs.com/sure-feng/p/10052871.html
Copyright © 2020-2023  润新知