• Scrapy Crawler Case Study | Storing Data in MongoDB


     

    Target: the Douban Movie Top 250 list at https://movie.douban.com/top250

    Requirements:

    1. Crawl the title, cast information, rating, and quote of each Douban Top 250 movie.

    2. Set a random User-Agent and proxy for each request.

    3. Save the scraped data to a MongoDB database.

     items.py

    # -*- coding: utf-8 -*-
    
    import scrapy
    
    class DoubanItem(scrapy.Item):
        # define the fields for your item here like:
        # movie title
        title = scrapy.Field()
        # director / cast info
        bd = scrapy.Field()
        # rating
        star = scrapy.Field()
        # one-line quote
        quote = scrapy.Field()
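
    A DoubanItem behaves like a dict that only accepts the declared fields, which is what lets the pipeline later convert it with dict(item). A quick sketch with made-up sample values:

    from douban.items import DoubanItem

    # items behave like dicts restricted to the declared fields
    item = DoubanItem(title="The Shawshank Redemption", star="9.7")
    item['quote'] = "Hope is a good thing."
    print(dict(item))         # plain dict, ready for MongoDB insertion
    # item['actors'] = "..."  # would raise KeyError: field is not declared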

    doubanmovie.py

    # -*- coding: utf-8 -*-
    import scrapy
    from douban.items import DoubanItem
    
    class DoubanmovieSpider(scrapy.Spider):
        name = "doubanmovie"
        allowed_domains = ["movie.douban.com"]
        offset = 0
        url = "https://movie.douban.com/top250?start="
        start_urls = (
                url + str(offset),
        )
    
        def parse(self, response):
            movies = response.xpath("//div[@class='info']")
    
            for each in movies:
                # create a fresh item per movie; reusing one instance would
                # carry stale fields (e.g. a previous movie's quote) forward
                item = DoubanItem()
                # movie title
                item['title'] = each.xpath(".//span[@class='title'][1]/text()").extract()[0]
                # director / cast info
                item['bd'] = each.xpath(".//div[@class='bd']/p/text()").extract()[0]
                # rating
                item['star'] = each.xpath(".//div[@class='star']/span[@class='rating_num']/text()").extract()[0]
                # one-line quote (some movies have none)
                quote = each.xpath(".//p[@class='quote']/span/text()").extract()
                if len(quote) != 0:
                    item['quote'] = quote[0]
                yield item
    
            # follow the remaining pages (25 movies per page, 250 in total)
            if self.offset < 225:
                self.offset += 25
                yield scrapy.Request(self.url + str(self.offset), callback=self.parse)
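
    Mutating the offset class attribute works here only because the pagination forms a single request chain. An alternative is to follow the page's own "next" link; a minimal sketch, assuming the Top 250 pager renders a span with class "next" (worth verifying against the live markup):

        def parse(self, response):
            # ... yield the items on the current page as above ...

            # follow the pager's own "next" link instead of counting offsets
            next_page = response.xpath("//span[@class='next']/a/@href").extract_first()
            if next_page:
                # response.follow resolves the relative href against response.url
                yield response.follow(next_page, callback=self.parse)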

     pipelines.py

    # -*- coding: utf-8 -*-
    
    import pymongo
    # scrapy.conf is deprecated; read the project settings this way instead
    from scrapy.utils.project import get_project_settings
    
    settings = get_project_settings()
    
    class DoubanPipeline(object):
        def __init__(self):
            host = settings["MONGODB_HOST"]
            port = settings["MONGODB_PORT"]
            dbname = settings["MONGODB_DBNAME"]
            sheetname = settings["MONGODB_SHEETNAME"]
    
            # create the MongoDB client
            client = pymongo.MongoClient(host=host, port=port)
            # select the database
            mydb = client[dbname]
            # collection that stores the scraped items
            self.sheet = mydb[sheetname]
    
        def process_item(self, item, spider):
            data = dict(item)
            # insert() is deprecated in pymongo 3.x; use insert_one()
            self.sheet.insert_one(data)
            return item
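
    A more idiomatic variant on recent Scrapy versions pulls the settings in through from_crawler and ties the Mongo client's lifetime to the spider; a sketch:

    import pymongo

    class DoubanPipeline(object):
        def __init__(self, host, port, dbname, sheetname):
            self.host = host
            self.port = port
            self.dbname = dbname
            self.sheetname = sheetname

        @classmethod
        def from_crawler(cls, crawler):
            # Scrapy calls this with the running crawler; crawler.settings
            # replaces the deprecated scrapy.conf module
            s = crawler.settings
            return cls(s["MONGODB_HOST"], s["MONGODB_PORT"],
                       s["MONGODB_DBNAME"], s["MONGODB_SHEETNAME"])

        def open_spider(self, spider):
            self.client = pymongo.MongoClient(host=self.host, port=self.port)
            self.sheet = self.client[self.dbname][self.sheetname]

        def close_spider(self, spider):
            self.client.close()

        def process_item(self, item, spider):
            self.sheet.insert_one(dict(item))
            return item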

    settings.py

    # throttle requests to avoid hammering the site
    DOWNLOAD_DELAY = 2.5
    
    # disable cookies so requests carry no session state
    COOKIES_ENABLED = False
    
    # register the custom middlewares (lower values run closer to the engine)
    DOWNLOADER_MIDDLEWARES = {
        'douban.middlewares.RandomUserAgent': 100,
        'douban.middlewares.RandomProxy': 200,
    }
    
    USER_AGENTS = [
        'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.0)',
        'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.2)',
        'Opera/9.27 (Windows NT 5.2; U; zh-cn)',
        'Opera/8.0 (Macintosh; PPC Mac OS X; U; en)',
        'Mozilla/5.0 (Macintosh; PPC Mac OS X; U; en) Opera 8.0',
        'Mozilla/5.0 (Linux; U; Android 4.0.3; zh-cn; M032 Build/IML74K) AppleWebKit/534.30 (KHTML, like Gecko) Version/4.0 Mobile Safari/534.30',
        'Mozilla/5.0 (Windows; U; Windows NT 5.2) AppleWebKit/525.13 (KHTML, like Gecko) Chrome/0.2.149.27 Safari/525.13'
    ]
    
    PROXIES = [
            {"ip_port": "121.42.140.113:16816", "user_passwd": "****"},
            #{"ip_port": "121.42.140.113:16816", "user_passwd": ""},
            #{"ip_port": "121.42.140.113:16816", "user_passwd": ""},
            #{"ip_port": "121.42.140.113:16816", "user_passwd": ""},
    ]
    
    
    ITEM_PIPELINES = {
        'douban.pipelines.DoubanPipeline': 300,
    }
    
    
    # MongoDB host
    MONGODB_HOST = "127.0.0.1"
    
    # MongoDB port
    MONGODB_PORT = 27017
    
    # database name
    MONGODB_DBNAME = "Douban"
    
    # collection name for the scraped data
    MONGODB_SHEETNAME = "doubanmovies"
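
    After a crawl, the stored data can be inspected directly with pymongo, using the same connection settings as above; a minimal sketch (count_documents requires pymongo 3.7+):

    import pymongo

    client = pymongo.MongoClient("127.0.0.1", 27017)
    sheet = client["Douban"]["doubanmovies"]

    # total number of stored movies (should reach 250 after a full crawl)
    print(sheet.count_documents({}))

    # peek at a few documents
    for doc in sheet.find().limit(3):
        print(doc.get("title"), doc.get("star"))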

    middlewares.py

    #!/usr/bin/env python
    # -*- coding:utf-8 -*-
    
    import random
    import base64
    
    # use absolute imports so this works under Python 3
    from douban.settings import USER_AGENTS
    from douban.settings import PROXIES
    
    # pick a random User-Agent for each request
    class RandomUserAgent(object):
        def process_request(self, request, spider):
            useragent = random.choice(USER_AGENTS)
            request.headers.setdefault("User-Agent", useragent)
    
    # pick a random proxy for each request
    class RandomProxy(object):
        def process_request(self, request, spider):
            proxy = random.choice(PROXIES)
    
            # entries without credentials use an empty user_passwd string
            if not proxy['user_passwd']:
                # proxy that requires no authentication
                request.meta['proxy'] = "http://" + proxy['ip_port']
            else:
                # base64-encode the credentials (b64encode takes bytes in Python 3)
                base64_userpasswd = base64.b64encode(proxy['user_passwd'].encode()).decode()
                # attach them in the format the proxy server expects
                request.headers['Proxy-Authorization'] = 'Basic ' + base64_userpasswd
    
                request.meta['proxy'] = "http://" + proxy['ip_port']
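
    With everything wired up, the crawl can be started with scrapy crawl doubanmovie from the project root, or from a standalone script; a minimal sketch, assuming the spider module sits at douban/spiders/doubanmovie.py:

    from scrapy.crawler import CrawlerProcess
    from scrapy.utils.project import get_project_settings

    from douban.spiders.doubanmovie import DoubanmovieSpider

    # load settings.py (middlewares, pipelines, MongoDB config) and run the spider
    process = CrawlerProcess(get_project_settings())
    process.crawl(DoubanmovieSpider)
    process.start()  # blocks until the crawl finishes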
     
• Original post: https://www.cnblogs.com/Summer-skr--blog/p/11495918.html