• 使用pyquery爬取豆瓣电影top250,存储在mongodb


    直接上代码

     1 from pyquery import PyQuery as pq
     2 from requests import request
     3 import re
     4 import pymongo
     5 
     6 
     class SpiderDouBan2:
         """Scrape the Douban movie Top 250 list with pyquery and store it in MongoDB.

         Each movie becomes one document in the ``douban_movie_top250_2``
         collection of the ``spider_db`` database on the local MongoDB server.
         """

         # Text removed from the director/cast paragraph before it is split into
         # lines: non-breaking spaces, plus the literal three-character sequence
         # quote-space-quote that the original pattern also listed.
         _JUNK_RE = re.compile('\xa0|" "')

         def __init__(self):
             """Connect to MongoDB on localhost:27017 and select the collection."""
             # Keep a handle on the client so close() can release the connection.
             self.client = pymongo.MongoClient(host='localhost', port=27017)
             db = self.client['spider_db']
             self.collection = db['douban_movie_top250_2']

         def close(self):
             """Close the MongoDB client opened by ``__init__``."""
             self.client.close()

         def get_doc(self, url, headers=None):
             """Download one list page and parse it.

             :param url: address of the page to fetch
             :param headers: optional dict of HTTP request headers (for example a
                 ``User-Agent``); when omitted the page is fetched exactly as
                 ``pq(url)`` would
             :return: the parsed PyQuery document
             """
             if headers:
                 return pq(url, headers=headers)
             return pq(url)

         @classmethod
         def _split_info(cls, raw_text):
             """Split the info paragraph text into ``(movie_actor, movie_type)``.

             The first line is the director/cast line (a trailing ``/`` is
             removed); the second line is used as the type line.  A missing
             second line yields an empty string instead of raising IndexError.

             :param raw_text: text of the first ``<p>`` inside ``.bd``
             :return: two-item tuple of strings
             """
             lines = cls._JUNK_RE.sub('', raw_text).split('\n')
             actor = lines[0].rstrip('/')
             movie_type = lines[1] if len(lines) > 1 else ''
             return actor, movie_type

         def get_one_page(self, doc, order):
             """Extract every movie on one page and insert it into MongoDB.

             Fields are read per ``.info`` container rather than from page-wide
             parallel lists, so a movie without a short comment (``.inq``) gets an
             empty ``short_comment`` instead of shifting the comments of all the
             following movies or raising IndexError.
             NOTE(review): assumes each movie's title, rating, star and quote
             elements all live inside its ``.info`` element -- confirm against the
             live page markup.

             :param doc: parsed PyQuery document of one list page
             :param order: number of movies that precede this page (0, 25, ...)
             :return: None
             """
             rank = order
             for info in doc('.info').items():
                 titles = [title.text() for title in info.find('.title').items()]
                 # Titles containing "/" are skipped, as in the original filter;
                 # the first title without a slash is used as the movie name.
                 name = next((text for text in titles if '/' not in text), None)
                 if name is None:
                     # Not a movie entry (or nothing usable as a name): skip it.
                     continue
                 rank += 1
                 print(f'正在爬取第{rank}条数据...')
                 actor, movie_type = self._split_info(
                     info.find('.bd p:first-child').text()
                 )
                 rate = info.find('.rating_num').text()
                 movie_info = {
                     'order': f'No.{rank}',
                     'movie_name': name,
                     'movie_actor': actor,
                     'movie_type': movie_type,
                     'movie_rate': f'{rate}分',
                     'comment_num': info.find('.star span:last-child').text(),
                     # [:-1] drops the final character, as the original did
                     # (presumably the trailing full stop of the quote).
                     'short_comment': info.find('.inq').text()[:-1],
                 }
                 self.collection.insert_one(movie_info)

         def main(self, url, order):
             """Fetch one page and store all of its movies.

             :param url: address of the list page
             :param order: number of movies that precede this page
             :return: None
             """
             doc = self.get_doc(url)
             self.get_one_page(doc, order)
    59 
    60 
    if __name__ == '__main__':
        # Build the spider once so all ten pages share a single MongoDB client
        # instead of opening a new connection for every page.
        spider = SpiderDouBan2()
        # The list is paged 25 movies at a time; `offset` is both the `start`
        # query parameter and the number of movies that precede the page.
        for offset in range(0, 250, 25):
            url = f'https://movie.douban.com/top250?start={offset}'
            spider.main(url, offset)

    运行结果

     

  • 相关阅读:
    Prony算法
    基于WeifenLuo.WinFormsUI.Docking界面布局控件的Winform框架
    C# 在父容器中显示子窗体
    如何识别高级的验证码
    我 .北漂的 80后男孩
    c# 主机和网络字节序的转换
    电网割集能量算法
    项目管理心得:一个项目经理的个人体会、经验总结
    Qt 登陆界面实现
    [ lucene FAQ ] 如何避免lucene queryparser中文分词的缺陷?[转]
  • 原文地址:https://www.cnblogs.com/my_captain/p/11073763.html
Copyright © 2020-2023  润新知