• Scrapy Crawler Example | Storing Data in MongoDB


     

    Target: the Douban Movie Top 250 site, https://movie.douban.com/top250

    Requirements:

    1. Scrape the title, cast info, rating, and one-line quote of each Douban Top 250 movie

    2. Use a random User-Agent and a random proxy for requests

    3. Save the scraped data to a MongoDB database
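    The files below assume the standard layout generated by `scrapy startproject douban` (the package name `douban` matches the imports used throughout):

    douban/
        scrapy.cfg
        douban/
            items.py
            middlewares.py
            pipelines.py
            settings.py
            spiders/
                doubanmovie.py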

     items.py

    # -*- coding: utf-8 -*-
    
    import scrapy
    
    class DoubanItem(scrapy.Item):
        # define the fields for your item here like:
        # movie title
        title = scrapy.Field()
        # director/cast info block
        bd = scrapy.Field()
        # rating
        star = scrapy.Field()
        # one-line quote
        quote = scrapy.Field()
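    A scrapy.Item behaves like a dict, which is why the pipeline below can simply call dict(item) before inserting into MongoDB. An illustrative snippet (the values here are made up):

    item = DoubanItem(title="肖申克的救赎", star="9.7")
    print(item["title"])   # 肖申克的救赎
    print(dict(item))      # {'title': '肖申克的救赎', 'star': '9.7'}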

    doubanmovie.py

    # -*- coding: utf-8 -*-
    import scrapy
    from douban.items import DoubanItem

    class DoubanmovieSpider(scrapy.Spider):
        name = "doubanmovie"
        allowed_domains = ["movie.douban.com"]
        offset = 0
        url = "https://movie.douban.com/top250?start="
        start_urls = [url + str(offset)]

        def parse(self, response):
            movies = response.xpath("//div[@class='info']")

            for each in movies:
                # create a fresh item per movie; reusing a single instance
                # would leak fields (e.g. quote) from one movie to the next
                item = DoubanItem()
                # title
                item['title'] = each.xpath(".//span[@class='title'][1]/text()").extract_first()
                # director/cast info block
                item['bd'] = each.xpath(".//div[@class='bd']/p/text()").extract_first()
                # rating
                item['star'] = each.xpath(".//div[@class='star']/span[@class='rating_num']/text()").extract_first()
                # one-line quote (some movies have none)
                quote = each.xpath(".//p[@class='quote']/span/text()").extract_first()
                if quote:
                    item['quote'] = quote
                yield item

            # Top 250 = 10 pages of 25 results; request the next page until start=225
            if self.offset < 225:
                self.offset += 25
                yield scrapy.Request(self.url + str(self.offset), callback=self.parse)
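    The offset bookkeeping above works, but newer Scrapy versions (1.4+) can follow the "next page" link directly with response.follow, which also resolves relative URLs. A minimal alternative sketch, assuming the Top 250 page keeps its current markup:

    # inside parse(), after the for loop, instead of the offset counter:
    next_page = response.xpath("//span[@class='next']/a/@href").extract_first()
    if next_page:
        yield response.follow(next_page, callback=self.parse)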

     pipelines.py

    # -*- coding: utf-8 -*-

    import pymongo
    # scrapy.conf was removed in newer Scrapy releases; read the project
    # settings through the supported helper instead
    from scrapy.utils.project import get_project_settings

    class DoubanPipeline(object):
        def __init__(self):
            settings = get_project_settings()
            host = settings["MONGODB_HOST"]
            port = settings["MONGODB_PORT"]
            dbname = settings["MONGODB_DBNAME"]
            sheetname = settings["MONGODB_SHEETNAME"]

            # create the MongoDB client connection
            client = pymongo.MongoClient(host=host, port=port)
            # select the database
            mydb = client[dbname]
            # collection that will hold the scraped movies
            self.sheet = mydb[sheetname]

        def process_item(self, item, spider):
            data = dict(item)
            # insert() was removed in pymongo 3.x; use insert_one()
            self.sheet.insert_one(data)
            return item
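    Current Scrapy releases prefer building pipelines via from_crawler, which reads the same settings from the crawler object and gives clean hooks for opening and closing the MongoDB connection. A sketch of that variant (MongoPipeline is a hypothetical name, not part of this project):

    import pymongo

    class MongoPipeline(object):
        def __init__(self, host, port, dbname, sheetname):
            self.host, self.port = host, port
            self.dbname, self.sheetname = dbname, sheetname

        @classmethod
        def from_crawler(cls, crawler):
            s = crawler.settings
            return cls(s["MONGODB_HOST"], s.getint("MONGODB_PORT"),
                       s["MONGODB_DBNAME"], s["MONGODB_SHEETNAME"])

        def open_spider(self, spider):
            # connect once when the spider starts
            self.client = pymongo.MongoClient(self.host, self.port)
            self.sheet = self.client[self.dbname][self.sheetname]

        def close_spider(self, spider):
            self.client.close()

        def process_item(self, item, spider):
            self.sheet.insert_one(dict(item))
            return item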

    settings.py

    # throttle requests and disable cookies to reduce the chance of being blocked
    DOWNLOAD_DELAY = 2.5

    COOKIES_ENABLED = False
    
    DOWNLOADER_MIDDLEWARES = {
        'douban.middlewares.RandomUserAgent': 100,
        'douban.middlewares.RandomProxy': 200,
    }
    
    USER_AGENTS = [
        'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.0)',
        'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.2)',
        'Opera/9.27 (Windows NT 5.2; U; zh-cn)',
        'Opera/8.0 (Macintosh; PPC Mac OS X; U; en)',
        'Mozilla/5.0 (Macintosh; PPC Mac OS X; U; en) Opera 8.0',
        'Mozilla/5.0 (Linux; U; Android 4.0.3; zh-cn; M032 Build/IML74K) AppleWebKit/534.30 (KHTML, like Gecko) Version/4.0 Mobile Safari/534.30',
        'Mozilla/5.0 (Windows; U; Windows NT 5.2) AppleWebKit/525.13 (KHTML, like Gecko) Chrome/0.2.149.27 Safari/525.13'
    ]
    
    PROXIES = [
            {"ip_port": "121.42.140.113:16816", "user_passwd": "****"},
            #{"ip_port": "121.42.140.113:16816", "user_passwd": ""},
            #{"ip_port": "121.42.140.113:16816", "user_passwd": ""},
            #{"ip_port": "121.42.140.113:16816", "user_passwd": ""},
    ]
    
    
    ITEM_PIPELINES = {
        'douban.pipelines.DoubanPipeline': 300,
    }
    
    
    # MongoDB host
    MONGODB_HOST = "127.0.0.1"

    # MongoDB port
    MONGODB_PORT = 27017

    # database name
    MONGODB_DBNAME = "Douban"

    # collection that stores the scraped data
    MONGODB_SHEETNAME = "doubanmovies"
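    These MONGODB_* keys are plain custom settings; any component can read them through Scrapy's settings API. For example, when run from the project directory:

    from scrapy.utils.project import get_project_settings

    settings = get_project_settings()
    print(settings["MONGODB_DBNAME"])       # Douban
    print(settings.getint("MONGODB_PORT"))  # 27017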

    middlewares.py

    #!/usr/bin/env python
    # -*- coding:utf-8 -*-

    import random
    import base64

    from douban.settings import USER_AGENTS
    from douban.settings import PROXIES

    # pick a random User-Agent for every request
    class RandomUserAgent(object):
        def process_request(self, request, spider):
            useragent = random.choice(USER_AGENTS)
            request.headers.setdefault("User-Agent", useragent)

    # route every request through a randomly chosen proxy
    class RandomProxy(object):
        def process_request(self, request, spider):
            proxy = random.choice(PROXIES)

            # the sample PROXIES entries use empty strings, not None,
            # for proxies without credentials
            if not proxy.get('user_passwd'):
                # proxy without authentication
                request.meta['proxy'] = "http://" + proxy['ip_port']
            else:
                # base64-encode the credentials (encode/decode is required on Python 3)
                base64_userpasswd = base64.b64encode(proxy['user_passwd'].encode()).decode()
                # pass them in the Proxy-Authorization header the proxy expects
                request.headers['Proxy-Authorization'] = 'Basic ' + base64_userpasswd
                request.meta['proxy'] = "http://" + proxy['ip_port']
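    After a crawl (scrapy crawl doubanmovie), the stored data can be checked with a standalone pymongo script. A quick sketch, assuming pymongo 3.7+ for count_documents:

    import pymongo

    client = pymongo.MongoClient("127.0.0.1", 27017)
    sheet = client["Douban"]["doubanmovies"]
    # how many movies were stored, and a sample document
    print(sheet.count_documents({}))
    print(sheet.find_one({}, {"_id": 0, "title": 1, "star": 1}))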
     