• Sina Weibo crawler project


    Using Scrapy together with proxy_pool and cookie_pool, this project crawls Sina Weibo user profiles, follow lists, fan lists and weibo posts, and saves everything to MongoDB. Starting from a few high-profile accounts (大V), it fetches each user's profile, fans, followees and weibos, then repeats the same steps for those fans and followees, and so on, crawling the social graph recursively.
    1. spider.py (the listing below is weibo.py): issues the requests. Anti-crawl problem: the desktop site is hard to scrape because its data is loaded by Ajax requests keyed on a containerid parameter.
    Solution: switch to the mobile site (m.weibo.cn/u/uid) and request its JSON API directly. The containerid prefix (230283/100505/107603/231051) decides what the API returns; in the spider below, 100505 selects user info, 107603 the weibo feed and 231051 the follow/fan lists. A quick probe of the user-info endpoint is sketched right after this list.
    2. middlewares.py: downloader middlewares that call proxy_pool and cookie_pool to fetch a random proxy or a random cookie set for each request.
    3. pipelines.py: items carry a 'crawled_at' field for the crawl time, which TimePipeline fills with the current time; weibo items carry a 'created_at' field holding the relative post time returned by the API (刚刚, N分钟前, ...), which WeiboPipeline converts to an absolute timestamp; MongoPipeline opens the MongoDB connection and stores the items.
    4. Stack: Scrapy, MongoDB, proxy_pool, cookie_pool, time.
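
    Item 1 hinges on the mobile-site JSON API. The snippet below is not part of the project; it is a minimal sketch of how to probe the user-info endpoint the spider uses (containerid prefix 100505), assuming one of the project's seed uids and the desktop User-Agent from settings.py. Depending on Weibo's current anti-crawl rules the call may require valid cookies, which is exactly what cookie_pool supplies.

    import requests

    uid = '3217179555'  # one of the seed accounts in start_users
    url = ('https://m.weibo.cn/api/container/getIndex'
           '?uid={uid}&type=uid&value={uid}&containerid=100505{uid}').format(uid=uid)
    headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 '
                             '(KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36'}

    data = requests.get(url, headers=headers, timeout=10).json()
    # 'ok' is 1 on success; parse_user reads the fields under data -> userInfo
    info = (data.get('data') or {}).get('userInfo') or {}
    print(data.get('ok'), info.get('screen_name'))
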
    Listings:
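
    Every file below does "from weibo.items import *", but items.py itself is not reproduced in the post. The sketch here is reconstructed from the fields the spider and pipelines actually use; the collection class attribute is what MongoPipeline uses as the MongoDB collection name, and the exact names ('users', 'weibos') are an assumption.

    items.py (reconstructed sketch, not from the original post)

    from scrapy import Item, Field


    class UserItem(Item):
        collection = 'users'  # MongoDB collection written by MongoPipeline
        id = Field()
        name = Field()
        avatar = Field()
        cover = Field()
        gender = Field()
        description = Field()
        fans_count = Field()
        follows_count = Field()
        weibos_count = Field()
        verified = Field()
        verified_reason = Field()
        verified_type = Field()
        crawled_at = Field()  # filled in by TimePipeline


    class UserRelationItem(Item):
        collection = 'users'  # relations are merged into the same user documents
        id = Field()
        follows = Field()
        fans = Field()


    class WeiboItem(Item):
        collection = 'weibos'
        id = Field()
        attitudes_count = Field()
        comments_count = Field()
        reposts_count = Field()
        picture = Field()
        pictures = Field()
        thumbnail = Field()
        created_at = Field()  # normalized by WeiboPipeline
        source = Field()
        text = Field()
        raw_text = Field()
        user = Field()
        crawled_at = Field()  # filled in by TimePipeline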

    weibo.py

    import json
    from scrapy import Request, Spider
    from weibo.items import *
    
    
    class WeiboSpider(Spider):
        name = 'weibocn'
        
        allowed_domains = ['m.weibo.cn']
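
        # Mobile JSON API templates; the containerid prefix selects what the endpoint returns:
        # 100505{uid} -> user info, 107603{uid} -> the user's weibo feed,
        # 231051_-_followers/_-_fans -> the follow and fan lists.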
        
        user_url = 'https://m.weibo.cn/api/container/getIndex?uid={uid}&type=uid&value={uid}&containerid=100505{uid}'
        
        follow_url = 'https://m.weibo.cn/api/container/getIndex?containerid=231051_-_followers_-_{uid}&page={page}'
        
        fan_url = 'https://m.weibo.cn/api/container/getIndex?containerid=231051_-_fans_-_{uid}&page={page}'
        
        weibo_url = 'https://m.weibo.cn/api/container/getIndex?uid={uid}&type=uid&page={page}&containerid=107603{uid}'
        
        start_users = ['3217179555', '1742566624', '2282991915', '1288739185', '3952070245', '5878659096']
        
        def start_requests(self):
            for uid in self.start_users:
                yield Request(self.user_url.format(uid=uid), callback=self.parse_user)
        
        def parse_user(self, response):
            """
            Parse a user's profile info.
            :param response: Response object
            """
            self.logger.debug(response)
            result = json.loads(response.text)
            if result.get('data').get('userInfo'):
                user_info = result.get('data').get('userInfo')
                user_item = UserItem()
                field_map = {
                    'id': 'id', 'name': 'screen_name', 'avatar': 'profile_image_url', 'cover': 'cover_image_phone',
                    'gender': 'gender', 'description': 'description', 'fans_count': 'followers_count',
                    'follows_count': 'follow_count', 'weibos_count': 'statuses_count', 'verified': 'verified',
                    'verified_reason': 'verified_reason', 'verified_type': 'verified_type'
                }
                for field, attr in field_map.items():
                    user_item[field] = user_info.get(attr)
                yield user_item
                # Follow list
                uid = user_info.get('id')
                yield Request(self.follow_url.format(uid=uid, page=1), callback=self.parse_follows,
                              meta={'page': 1, 'uid': uid})
                # Fan list
                yield Request(self.fan_url.format(uid=uid, page=1), callback=self.parse_fans,
                              meta={'page': 1, 'uid': uid})
                # Weibo list
                yield Request(self.weibo_url.format(uid=uid, page=1), callback=self.parse_weibos,
                              meta={'page': 1, 'uid': uid})
        
        def parse_follows(self, response):
            """
            Parse the accounts a user follows.
            :param response: Response object
            """
            result = json.loads(response.text)
            if result.get('ok') and result.get('data').get('cards') and len(result.get('data').get('cards')) \
                    and result.get('data').get('cards')[-1].get('card_group'):
                # Schedule the profiles of the followed users
                follows = result.get('data').get('cards')[-1].get('card_group')
                for follow in follows:
                    if follow.get('user'):
                        uid = follow.get('user').get('id')
                        yield Request(self.user_url.format(uid=uid), callback=self.parse_user)
                
                uid = response.meta.get('uid')
                # Follow relations for this user
                user_relation_item = UserRelationItem()
                follows = [{'id': follow.get('user').get('id'), 'name': follow.get('user').get('screen_name')} for follow in
                           follows]
                user_relation_item['id'] = uid
                user_relation_item['follows'] = follows
                user_relation_item['fans'] = []
                yield user_relation_item
                # Next page of follows
                page = response.meta.get('page') + 1
                yield Request(self.follow_url.format(uid=uid, page=page),
                              callback=self.parse_follows, meta={'page': page, 'uid': uid})
        
        def parse_fans(self, response):
            """
            Parse a user's fans (followers).
            :param response: Response object
            """
            result = json.loads(response.text)
            if result.get('ok') and result.get('data').get('cards') and len(result.get('data').get('cards')) \
                    and result.get('data').get('cards')[-1].get('card_group'):
                # Schedule the profiles of the fans
                fans = result.get('data').get('cards')[-1].get('card_group')
                for fan in fans:
                    if fan.get('user'):
                        uid = fan.get('user').get('id')
                        yield Request(self.user_url.format(uid=uid), callback=self.parse_user)
                
                uid = response.meta.get('uid')
                # Fan relations for this user
                user_relation_item = UserRelationItem()
                fans = [{'id': fan.get('user').get('id'), 'name': fan.get('user').get('screen_name')} for fan in
                        fans]
                user_relation_item['id'] = uid
                user_relation_item['fans'] = fans
                user_relation_item['follows'] = []
                yield user_relation_item
                # Next page of fans
                page = response.meta.get('page') + 1
                yield Request(self.fan_url.format(uid=uid, page=page),
                              callback=self.parse_fans, meta={'page': page, 'uid': uid})
        
        def parse_weibos(self, response):
            """
            Parse a user's weibo list.
            :param response: Response object
            """
            result = json.loads(response.text)
            if result.get('ok') and result.get('data').get('cards'):
                weibos = result.get('data').get('cards')
                for weibo in weibos:
                    mblog = weibo.get('mblog')
                    if mblog:
                        weibo_item = WeiboItem()
                        field_map = {
                            'id': 'id', 'attitudes_count': 'attitudes_count', 'comments_count': 'comments_count',
                            'reposts_count': 'reposts_count', 'picture': 'original_pic', 'pictures': 'pics',
                            'created_at': 'created_at', 'source': 'source', 'text': 'text', 'raw_text': 'raw_text',
                            'thumbnail': 'thumbnail_pic',
                        }
                        for field, attr in field_map.items():
                            weibo_item[field] = mblog.get(attr)
                        weibo_item['user'] = response.meta.get('uid')
                        yield weibo_item
                # Next page of weibos
                uid = response.meta.get('uid')
                page = response.meta.get('page') + 1
                yield Request(self.weibo_url.format(uid=uid, page=page), callback=self.parse_weibos,
                              meta={'uid': uid, 'page': page})

    middleware.py

    # -*- coding: utf-8 -*-
    
    # Define here the models for your spider middleware
    #
    # See documentation in:
    # http://doc.scrapy.org/en/latest/topics/spider-middleware.html
    import json
    import logging
    from scrapy import signals
    import requests
    
    
    class ProxyMiddleware():
        def __init__(self, proxy_url):
            self.logger = logging.getLogger(__name__)
            self.proxy_url = proxy_url
        
        def get_random_proxy(self):
            try:
                response = requests.get(self.proxy_url)
                if response.status_code == 200:
                    proxy = response.text
                    return proxy
            except requests.ConnectionError:
                return False
        
        def process_request(self, request, spider):
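            # Attach a proxy only when the request is a retry; Scrapy's RetryMiddleware
            # records the retry count in request.meta['retry_times'].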
            if request.meta.get('retry_times'):
                proxy = self.get_random_proxy()
                if proxy:
                    uri = 'https://{proxy}'.format(proxy=proxy)
                    self.logger.debug('Using proxy ' + proxy)
                    request.meta['proxy'] = uri
    
        @classmethod
        def from_crawler(cls, crawler):
            settings = crawler.settings
            return cls(
                proxy_url=settings.get('PROXY_URL')
            )
    
    
    class CookiesMiddleware():
        def __init__(self, cookies_url):
            self.logger = logging.getLogger(__name__)
            self.cookies_url = cookies_url
        
        def get_random_cookies(self):
            try:
                response = requests.get(self.cookies_url)
                if response.status_code == 200:
                    cookies = json.loads(response.text)
                    return cookies
            except requests.ConnectionError:
                return False
        
        def process_request(self, request, spider):
            self.logger.debug('Fetching cookies')
            cookies = self.get_random_cookies()
            if cookies:
                request.cookies = cookies
                self.logger.debug('Using cookies ' + json.dumps(cookies))
        
        @classmethod
        def from_crawler(cls, crawler):
            settings = crawler.settings
            return cls(
                cookies_url=settings.get('COOKIES_URL')
            )
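
    The two middlewares assume only a tiny HTTP interface from the pools: PROXY_URL returns a single proxy as plain text in host:port form, and COOKIES_URL returns one cookie set as a JSON object. A quick manual check of both endpoints, assuming the localhost ports configured in settings.py and that both pool services are already running, might look like this:

    import json

    import requests

    PROXY_URL = 'http://localhost:5555/random'          # from settings.py
    COOKIES_URL = 'http://localhost:5000/weibo/random'  # from settings.py

    proxy = requests.get(PROXY_URL, timeout=5).text.strip()
    print('proxy:', proxy)  # expected: plain text such as host:port

    cookies = json.loads(requests.get(COOKIES_URL, timeout=5).text)
    print('cookie keys:', list(cookies.keys()))  # expected: a dict of cookie name -> value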

    pipelines.py

    # -*- coding: utf-8 -*-
    
    # Define your item pipelines here
    #
    # Don't forget to add your pipeline to the ITEM_PIPELINES setting
    # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
    import re, time
    
    import logging
    import pymongo
    
    from weibo.items import *
    
    
    class TimePipeline():
        def process_item(self, item, spider):
            if isinstance(item, UserItem) or isinstance(item, WeiboItem):
                now = time.strftime('%Y-%m-%d %H:%M', time.localtime())
                item['crawled_at'] = now
            return item
    
    
    class WeiboPipeline():
        def parse_time(self, date):
            """
            Normalize the relative timestamps returned by the mobile API
            ('刚刚', 'N分钟前', 'N小时前', '昨天 HH:MM', 'MM-DD') into absolute ones.
            """
            if re.match('刚刚', date):
                date = time.strftime('%Y-%m-%d %H:%M', time.localtime(time.time()))
            if re.match(r'\d+分钟前', date):
                minute = re.match(r'(\d+)', date).group(1)
                date = time.strftime('%Y-%m-%d %H:%M', time.localtime(time.time() - float(minute) * 60))
            if re.match(r'\d+小时前', date):
                hour = re.match(r'(\d+)', date).group(1)
                date = time.strftime('%Y-%m-%d %H:%M', time.localtime(time.time() - float(hour) * 60 * 60))
            if re.match('昨天.*', date):
                date = re.match('昨天(.*)', date).group(1).strip()
                date = time.strftime('%Y-%m-%d', time.localtime(time.time() - 24 * 60 * 60)) + ' ' + date
            if re.match(r'\d{2}-\d{2}', date):
                date = time.strftime('%Y-', time.localtime()) + date + ' 00:00'
            return date
        
        def process_item(self, item, spider):
            if isinstance(item, WeiboItem):
                if item.get('created_at'):
                    item['created_at'] = item['created_at'].strip()
                    item['created_at'] = self.parse_time(item.get('created_at'))
                if item.get('pictures'):
                    item['pictures'] = [pic.get('url') for pic in item.get('pictures')]
            return item
    
    
    class MongoPipeline(object):
        def __init__(self, mongo_uri, mongo_db):
            self.mongo_uri = mongo_uri
            self.mongo_db = mongo_db
        
        @classmethod
        def from_crawler(cls, crawler):
            return cls(
                mongo_uri=crawler.settings.get('MONGO_URI'),
                mongo_db=crawler.settings.get('MONGO_DATABASE')
            )
        
        def open_spider(self, spider):
            self.client = pymongo.MongoClient(self.mongo_uri)
            self.db = self.client[self.mongo_db]
            self.db[UserItem.collection].create_index([('id', pymongo.ASCENDING)])
            self.db[WeiboItem.collection].create_index([('id', pymongo.ASCENDING)])
        
        def close_spider(self, spider):
            self.client.close()
        
        def process_item(self, item, spider):
            if isinstance(item, UserItem) or isinstance(item, WeiboItem):
                self.db[item.collection].update_one({'id': item.get('id')}, {'$set': dict(item)}, upsert=True)
            if isinstance(item, UserRelationItem):
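                # Merge this page of relations into the user's document;
                # $addToSet with $each skips entries that are already stored.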
                self.db[item.collection].update_one(
                    {'id': item.get('id')},
                    {'$addToSet':
                        {
                            'follows': {'$each': item['follows']},
                            'fans': {'$each': item['fans']}
                        }
                    }, upsert=True)
            return item
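
    Because users and weibos are upserted by id and relation pages are merged with $addToSet, re-crawling a user updates its existing documents instead of duplicating them. A quick way to inspect the stored data, assuming the MONGO_URI / MONGO_DATABASE values from settings.py and the collection names assumed in the items.py sketch above:

    import pymongo

    client = pymongo.MongoClient('localhost')  # MONGO_URI
    db = client['weibo']                       # MONGO_DATABASE

    # The API returns numeric ids, so match on an int (adjust if your data stores strings).
    user = db['users'].find_one({'id': 3217179555})
    if user:
        print(user.get('name'), user.get('fans_count'))
        print('follows stored:', len(user.get('follows', [])))
        print('fans stored:', len(user.get('fans', [])))
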
    settings.py
    # -*- coding: utf-8 -*-
    
    # Scrapy settings for weibo project
    #
    # For simplicity, this file contains only settings considered important or
    # commonly used. You can find more settings consulting the documentation:
    #
    #     http://doc.scrapy.org/en/latest/topics/settings.html
    #     http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
    #     http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
    
    BOT_NAME = 'weibo'
    
    SPIDER_MODULES = ['weibo.spiders']
    NEWSPIDER_MODULE = 'weibo.spiders'
    
    # Crawl responsibly by identifying yourself (and your website) on the user-agent
    # USER_AGENT = 'weibo (+http://www.yourdomain.com)'
    
    # Obey robots.txt rules
    ROBOTSTXT_OBEY = False
    
    DEFAULT_REQUEST_HEADERS = {
        'Accept': 'application/json, text/plain, */*',
        'Accept-Encoding': 'gzip, deflate, sdch',
        'Accept-Language': 'zh-CN,zh;q=0.8,en;q=0.6,ja;q=0.4,zh-TW;q=0.2,mt;q=0.2',
        'Connection': 'keep-alive',
        'Host': 'm.weibo.cn',
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36',
        'X-Requested-With': 'XMLHttpRequest',
    }
    
    # Configure maximum concurrent requests performed by Scrapy (default: 16)
    # CONCURRENT_REQUESTS = 32
    
    # Configure a delay for requests for the same website (default: 0)
    # See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
    # See also autothrottle settings and docs
    # DOWNLOAD_DELAY = 3
    # The download delay setting will honor only one of:
    # CONCURRENT_REQUESTS_PER_DOMAIN = 16
    # CONCURRENT_REQUESTS_PER_IP = 16
    
    # Disable cookies (enabled by default)
    # COOKIES_ENABLED = False
    
    # Disable Telnet Console (enabled by default)
    # TELNETCONSOLE_ENABLED = False
    
    # Override the default request headers:
    # DEFAULT_REQUEST_HEADERS = {
    #   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    #   'Accept-Language': 'en',
    # }
    
    # Enable or disable spider middlewares
    # See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
    # SPIDER_MIDDLEWARES = {
    #    'weibo.middlewares.WeiboSpiderMiddleware': 543,
    # }
    
    # Enable or disable downloader middlewares
    # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
    DOWNLOADER_MIDDLEWARES = {
        'weibo.middlewares.CookiesMiddleware': 554,
        'weibo.middlewares.ProxyMiddleware': 555,
    }
    
    # Enable or disable extensions
    # See http://scrapy.readthedocs.org/en/latest/topics/extensions.html
    # EXTENSIONS = {
    #    'scrapy.extensions.telnet.TelnetConsole': None,
    # }
    
    # Configure item pipelines
    # See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
    ITEM_PIPELINES = {
        'weibo.pipelines.TimePipeline': 300,
        'weibo.pipelines.WeiboPipeline': 301,
        'weibo.pipelines.MongoPipeline': 302,
    }
    
    # Enable and configure the AutoThrottle extension (disabled by default)
    # See http://doc.scrapy.org/en/latest/topics/autothrottle.html
    # AUTOTHROTTLE_ENABLED = True
    # The initial download delay
    # AUTOTHROTTLE_START_DELAY = 5
    # The maximum download delay to be set in case of high latencies
    # AUTOTHROTTLE_MAX_DELAY = 60
    # The average number of requests Scrapy should be sending in parallel to
    # each remote server
    # AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
    # Enable showing throttling stats for every response received:
    # AUTOTHROTTLE_DEBUG = False
    
    # Enable and configure HTTP caching (disabled by default)
    # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
    # HTTPCACHE_ENABLED = True
    # HTTPCACHE_EXPIRATION_SECS = 0
    # HTTPCACHE_DIR = 'httpcache'
    # HTTPCACHE_IGNORE_HTTP_CODES = []
    # HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
    
    
    MONGO_URI = 'localhost'
    
    MONGO_DATABASE = 'weibo'
    
    COOKIES_URL = 'http://localhost:5000/weibo/random'
    
    PROXY_URL = 'http://localhost:5555/random'
    Note on middleware priorities: Scrapy's built-in CookiesMiddleware sits at priority 700 and its built-in HttpProxyMiddleware at 750. The custom CookiesMiddleware (554) and ProxyMiddleware (555) use smaller numbers so that their process_request runs first and sets request.cookies and request.meta['proxy']; the built-ins then turn those into the actual Cookie header and proxy connection.
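
    To run the project: start proxy_pool (serving PROXY_URL on port 5555) and cookie_pool (serving COOKIES_URL on port 5000), make sure MongoDB is reachable at MONGO_URI, then run "scrapy crawl weibocn" from the project directory. If you prefer launching from a script, a minimal sketch using Scrapy's CrawlerProcess (run from inside the project so the settings module is found) would be:

    from scrapy.crawler import CrawlerProcess
    from scrapy.utils.project import get_project_settings

    # Picks up the project's settings.py (middlewares, pipelines, pool URLs) unchanged.
    process = CrawlerProcess(get_project_settings())
    process.crawl('weibocn')  # spider name defined in WeiboSpider
    process.start()           # blocks until the crawl stops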

     

  • Original post: https://www.cnblogs.com/nick477931661/p/9166120.html