Crawl Sina Weibo with scrapy, proxy_pool and cookie_pool: user profiles, follow lists, fan lists and weibo posts, all saved to MongoDB. Starting from a handful of high-profile ("big V") accounts, crawl each user's profile, fans, follows and weibos, then do the same for those fans and follows, and so on, expanding the crawl recursively.

1. spider.py: issues the requests. Anti-scraping notes: the desktop site is hard to work with because the data is loaded through Ajax requests keyed on a containerid parameter. Solution: switch to the mobile site (m.weibo.cn/u/uid) and request the JSON API directly; the containerid prefixes 230283/100505/107603/231051 select different kinds of data (see the sketch after this list).
2. middlewares: call proxy_pool and cookie_pool to fetch a random proxy or cookie set for requests.
3. Pipelines: items carrying a 'crawled_at' field record the crawl time, which TimePipeline sets to the current time; a weibo item's 'created_at' field holds the post time in relative form (刚刚 "just now", N分钟前 "N minutes ago", ...), which WeiboPipeline normalizes to an absolute timestamp; MongoPipeline connects to the database and stores the items.
4. Tech stack: scrapy, MongoDB, proxy_pool, cookie_pool, time
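A minimal sketch of the mobile JSON API call, assuming only the URL templates used in weibo.py below: the 100505 prefix returns the user profile, 107603 the weibo list, and 231051_-_followers_-_ / 231051_-_fans_-_ the follow and fan lists. The 230283 prefix is mentioned in the notes but not used by this spider, and the helper function here is hypothetical, not part of the project code.

import requests

# Hypothetical helper: fetch one page of the mobile JSON API for a given uid.
# The containerid prefix decides what comes back:
#   100505{uid}                  -> user profile
#   107603{uid}                  -> weibo list
#   231051_-_followers_-_{uid}   -> follow list
#   231051_-_fans_-_{uid}        -> fan list
# (prefixes taken from the URL templates in weibo.py; the spider's user_url
# additionally passes value={uid}, omitted here for brevity)
def fetch_container(uid, containerid, page=1):
    url = 'https://m.weibo.cn/api/container/getIndex'
    params = {'uid': uid, 'type': 'uid', 'page': page, 'containerid': containerid}
    headers = {'User-Agent': 'Mozilla/5.0', 'X-Requested-With': 'XMLHttpRequest'}
    resp = requests.get(url, params=params, headers=headers, timeout=10)
    resp.raise_for_status()
    return resp.json()

if __name__ == '__main__':
    uid = '1742566624'  # one of the start_users in weibo.py
    profile = fetch_container(uid, '100505' + uid)
    print(profile.get('data', {}).get('userInfo', {}).get('screen_name'))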
weibo.py
import json
from scrapy import Request, Spider
from weibo.items import *


class WeiboSpider(Spider):
    name = 'weibocn'
    allowed_domains = ['m.weibo.cn']
    user_url = 'https://m.weibo.cn/api/container/getIndex?uid={uid}&type=uid&value={uid}&containerid=100505{uid}'
    follow_url = 'https://m.weibo.cn/api/container/getIndex?containerid=231051_-_followers_-_{uid}&page={page}'
    fan_url = 'https://m.weibo.cn/api/container/getIndex?containerid=231051_-_fans_-_{uid}&page={page}'
    weibo_url = 'https://m.weibo.cn/api/container/getIndex?uid={uid}&type=uid&page={page}&containerid=107603{uid}'
    start_users = ['3217179555', '1742566624', '2282991915', '1288739185', '3952070245', '5878659096']

    def start_requests(self):
        for uid in self.start_users:
            yield Request(self.user_url.format(uid=uid), callback=self.parse_user)

    def parse_user(self, response):
        """
        Parse a user's profile.
        :param response: Response object
        """
        self.logger.debug(response)
        result = json.loads(response.text)
        if result.get('data').get('userInfo'):
            user_info = result.get('data').get('userInfo')
            user_item = UserItem()
            field_map = {
                'id': 'id', 'name': 'screen_name', 'avatar': 'profile_image_url', 'cover': 'cover_image_phone',
                'gender': 'gender', 'description': 'description', 'fans_count': 'followers_count',
                'follows_count': 'follow_count', 'weibos_count': 'statuses_count', 'verified': 'verified',
                'verified_reason': 'verified_reason', 'verified_type': 'verified_type'
            }
            for field, attr in field_map.items():
                user_item[field] = user_info.get(attr)
            yield user_item

            # Follows
            uid = user_info.get('id')
            yield Request(self.follow_url.format(uid=uid, page=1), callback=self.parse_follows,
                          meta={'page': 1, 'uid': uid})
            # Fans
            yield Request(self.fan_url.format(uid=uid, page=1), callback=self.parse_fans,
                          meta={'page': 1, 'uid': uid})
            # Weibos
            yield Request(self.weibo_url.format(uid=uid, page=1), callback=self.parse_weibos,
                          meta={'page': 1, 'uid': uid})

    def parse_follows(self, response):
        """
        Parse the users this user follows.
        :param response: Response object
        """
        result = json.loads(response.text)
        if result.get('ok') and result.get('data').get('cards') and len(result.get('data').get('cards')) and \
                result.get('data').get('cards')[-1].get('card_group'):
            # Crawl each followed user
            follows = result.get('data').get('cards')[-1].get('card_group')
            for follow in follows:
                if follow.get('user'):
                    uid = follow.get('user').get('id')
                    yield Request(self.user_url.format(uid=uid), callback=self.parse_user)

            uid = response.meta.get('uid')
            # Follow list
            user_relation_item = UserRelationItem()
            follows = [{'id': follow.get('user').get('id'), 'name': follow.get('user').get('screen_name')}
                       for follow in follows]
            user_relation_item['id'] = uid
            user_relation_item['follows'] = follows
            user_relation_item['fans'] = []
            yield user_relation_item

            # Next page of follows
            page = response.meta.get('page') + 1
            yield Request(self.follow_url.format(uid=uid, page=page), callback=self.parse_follows,
                          meta={'page': page, 'uid': uid})

    def parse_fans(self, response):
        """
        Parse this user's fans.
        :param response: Response object
        """
        result = json.loads(response.text)
        if result.get('ok') and result.get('data').get('cards') and len(result.get('data').get('cards')) and \
                result.get('data').get('cards')[-1].get('card_group'):
            # Crawl each fan
            fans = result.get('data').get('cards')[-1].get('card_group')
            for fan in fans:
                if fan.get('user'):
                    uid = fan.get('user').get('id')
                    yield Request(self.user_url.format(uid=uid), callback=self.parse_user)

            uid = response.meta.get('uid')
            # Fan list
            user_relation_item = UserRelationItem()
            fans = [{'id': fan.get('user').get('id'), 'name': fan.get('user').get('screen_name')} for fan in fans]
            user_relation_item['id'] = uid
            user_relation_item['fans'] = fans
            user_relation_item['follows'] = []
            yield user_relation_item

            # Next page of fans
            page = response.meta.get('page') + 1
            yield Request(self.fan_url.format(uid=uid, page=page), callback=self.parse_fans,
                          meta={'page': page, 'uid': uid})

    def parse_weibos(self, response):
        """
        Parse the weibo list.
        :param response: Response object
        """
        result = json.loads(response.text)
        if result.get('ok') and result.get('data').get('cards'):
            weibos = result.get('data').get('cards')
            for weibo in weibos:
                mblog = weibo.get('mblog')
                if mblog:
                    weibo_item = WeiboItem()
                    field_map = {
                        'id': 'id', 'attitudes_count': 'attitudes_count', 'comments_count': 'comments_count',
                        'reposts_count': 'reposts_count', 'picture': 'original_pic', 'pictures': 'pics',
                        'created_at': 'created_at', 'source': 'source', 'text': 'text', 'raw_text': 'raw_text',
                        'thumbnail': 'thumbnail_pic',
                    }
                    for field, attr in field_map.items():
                        weibo_item[field] = mblog.get(attr)
                    weibo_item['user'] = response.meta.get('uid')
                    yield weibo_item

            # Next page of weibos
            uid = response.meta.get('uid')
            page = response.meta.get('page') + 1
            yield Request(self.weibo_url.format(uid=uid, page=page), callback=self.parse_weibos,
                          meta={'uid': uid, 'page': page})
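The spider and pipelines import UserItem, WeiboItem and UserRelationItem from weibo.items, which these notes do not reproduce. A minimal sketch, assuming field names taken from the field_map dictionaries above and a class-level collection attribute as used by MongoPipeline; the collection names 'users', 'weibos' and 'relations' are assumptions, not taken from the notes.

from scrapy import Item, Field


class UserItem(Item):
    collection = 'users'  # assumed MongoDB collection name
    id = Field()
    name = Field()
    avatar = Field()
    cover = Field()
    gender = Field()
    description = Field()
    fans_count = Field()
    follows_count = Field()
    weibos_count = Field()
    verified = Field()
    verified_reason = Field()
    verified_type = Field()
    crawled_at = Field()  # set by TimePipeline


class UserRelationItem(Item):
    collection = 'relations'  # assumed
    id = Field()
    follows = Field()
    fans = Field()


class WeiboItem(Item):
    collection = 'weibos'  # assumed
    id = Field()
    attitudes_count = Field()
    comments_count = Field()
    reposts_count = Field()
    picture = Field()
    pictures = Field()
    thumbnail = Field()
    created_at = Field()
    source = Field()
    text = Field()
    raw_text = Field()
    user = Field()       # uid of the posting user, set in parse_weibos
    crawled_at = Field()  # set by TimePipeline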
middleware.py
# -*- coding: utf-8 -*-

# Define here the models for your spider middleware
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/spider-middleware.html

import json
import logging
from scrapy import signals
import requests


class ProxyMiddleware():
    def __init__(self, proxy_url):
        self.logger = logging.getLogger(__name__)
        self.proxy_url = proxy_url

    def get_random_proxy(self):
        # Ask the proxy pool API for one random proxy, returned as plain text
        try:
            response = requests.get(self.proxy_url)
            if response.status_code == 200:
                proxy = response.text
                return proxy
        except requests.ConnectionError:
            return False

    def process_request(self, request, spider):
        # Only switch to a proxy when the request is being retried
        if request.meta.get('retry_times'):
            proxy = self.get_random_proxy()
            if proxy:
                uri = 'https://{proxy}'.format(proxy=proxy)
                self.logger.debug('Using proxy ' + proxy)
                request.meta['proxy'] = uri

    @classmethod
    def from_crawler(cls, crawler):
        settings = crawler.settings
        return cls(
            proxy_url=settings.get('PROXY_URL')
        )


class CookiesMiddleware():
    def __init__(self, cookies_url):
        self.logger = logging.getLogger(__name__)
        self.cookies_url = cookies_url

    def get_random_cookies(self):
        # Ask the cookie pool API for one random cookie set, returned as JSON
        try:
            response = requests.get(self.cookies_url)
            if response.status_code == 200:
                cookies = json.loads(response.text)
                return cookies
        except requests.ConnectionError:
            return False

    def process_request(self, request, spider):
        self.logger.debug('Fetching cookies')
        cookies = self.get_random_cookies()
        if cookies:
            request.cookies = cookies
            self.logger.debug('Using cookies ' + json.dumps(cookies))

    @classmethod
    def from_crawler(cls, crawler):
        settings = crawler.settings
        return cls(
            cookies_url=settings.get('COOKIES_URL')
        )
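The two middlewares only assume that PROXY_URL returns a single proxy as plain text and that COOKIES_URL returns one cookie set as a JSON object. A minimal sketch for checking both pool endpoints before starting a crawl; the URLs are the defaults from settings.py below, and the "host:port" format of the proxy is an assumption about how proxy_pool responds.

import json
import requests

PROXY_URL = 'http://localhost:5555/random'          # proxy_pool endpoint (plain text)
COOKIES_URL = 'http://localhost:5000/weibo/random'  # cookie_pool endpoint (JSON dict)

def check_pools():
    proxy = requests.get(PROXY_URL, timeout=5).text.strip()
    print('random proxy:', proxy)                   # e.g. "host:port" (format assumed)
    cookies = json.loads(requests.get(COOKIES_URL, timeout=5).text)
    print('random cookies keys:', list(cookies.keys()))

if __name__ == '__main__':
    check_pools()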
pipelines.py
# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html

import re, time
import logging
import pymongo
from weibo.items import *


class TimePipeline():
    def process_item(self, item, spider):
        # Stamp users and weibos with the time they were crawled
        if isinstance(item, UserItem) or isinstance(item, WeiboItem):
            now = time.strftime('%Y-%m-%d %H:%M', time.localtime())
            item['crawled_at'] = now
        return item


class WeiboPipeline():
    def parse_time(self, date):
        # Convert Weibo's relative timestamps (刚刚 / N分钟前 / N小时前 / 昨天 / MM-DD)
        # into absolute '%Y-%m-%d %H:%M' strings
        if re.match('刚刚', date):
            date = time.strftime('%Y-%m-%d %H:%M', time.localtime(time.time()))
        if re.match(r'\d+分钟前', date):
            minute = re.match(r'(\d+)', date).group(1)
            date = time.strftime('%Y-%m-%d %H:%M', time.localtime(time.time() - float(minute) * 60))
        if re.match(r'\d+小时前', date):
            hour = re.match(r'(\d+)', date).group(1)
            date = time.strftime('%Y-%m-%d %H:%M', time.localtime(time.time() - float(hour) * 60 * 60))
        if re.match('昨天.*', date):
            date = re.match('昨天(.*)', date).group(1).strip()
            date = time.strftime('%Y-%m-%d', time.localtime(time.time() - 24 * 60 * 60)) + ' ' + date
        if re.match(r'\d{2}-\d{2}', date):
            date = time.strftime('%Y-', time.localtime()) + date + ' 00:00'
        return date

    def process_item(self, item, spider):
        if isinstance(item, WeiboItem):
            if item.get('created_at'):
                item['created_at'] = item['created_at'].strip()
                item['created_at'] = self.parse_time(item.get('created_at'))
            if item.get('pictures'):
                item['pictures'] = [pic.get('url') for pic in item.get('pictures')]
        return item


class MongoPipeline(object):
    def __init__(self, mongo_uri, mongo_db):
        self.mongo_uri = mongo_uri
        self.mongo_db = mongo_db

    @classmethod
    def from_crawler(cls, crawler):
        return cls(
            mongo_uri=crawler.settings.get('MONGO_URI'),
            mongo_db=crawler.settings.get('MONGO_DATABASE')
        )

    def open_spider(self, spider):
        self.client = pymongo.MongoClient(self.mongo_uri)
        self.db = self.client[self.mongo_db]
        self.db[UserItem.collection].create_index([('id', pymongo.ASCENDING)])
        self.db[WeiboItem.collection].create_index([('id', pymongo.ASCENDING)])

    def close_spider(self, spider):
        self.client.close()

    def process_item(self, item, spider):
        # Upsert users and weibos by id; merge relation lists with $addToSet
        if isinstance(item, UserItem) or isinstance(item, WeiboItem):
            self.db[item.collection].update({'id': item.get('id')}, {'$set': item}, True)
        if isinstance(item, UserRelationItem):
            self.db[item.collection].update(
                {'id': item.get('id')},
                {'$addToSet': {
                    'follows': {'$each': item['follows']},
                    'fans': {'$each': item['fans']}
                }}, True)
        return item
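A quick, hypothetical check of how WeiboPipeline.parse_time normalizes the relative timestamps, assuming it is run from the project root where the weibo package is importable; the sample inputs are illustrative, and the outputs depend on when the snippet is run.

from weibo.pipelines import WeiboPipeline

pipeline = WeiboPipeline()
for raw in ['刚刚', '5分钟前', '3小时前', '昨天 20:15', '07-21']:
    print(raw, '->', pipeline.parse_time(raw))
# Each relative form is rewritten to an absolute 'YYYY-MM-DD HH:MM' string,
# e.g. '5分钟前' becomes the timestamp five minutes before the current time
# and '07-21' becomes July 21 of the current year at 00:00.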
settings.py
# -*- coding: utf-8 -*-

# Scrapy settings for weibo project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     http://doc.scrapy.org/en/latest/topics/settings.html
#     http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
#     http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'weibo'

SPIDER_MODULES = ['weibo.spiders']
NEWSPIDER_MODULE = 'weibo.spiders'

# Crawl responsibly by identifying yourself (and your website) on the user-agent
# USER_AGENT = 'weibo (+http://www.yourdomain.com)'

# Obey robots.txt rules
ROBOTSTXT_OBEY = False

DEFAULT_REQUEST_HEADERS = {
    'Accept': 'application/json, text/plain, */*',
    'Accept-Encoding': 'gzip, deflate, sdch',
    'Accept-Language': 'zh-CN,zh;q=0.8,en;q=0.6,ja;q=0.4,zh-TW;q=0.2,mt;q=0.2',
    'Connection': 'keep-alive',
    'Host': 'm.weibo.cn',
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36',
    'X-Requested-With': 'XMLHttpRequest',
}

# Configure maximum concurrent requests performed by Scrapy (default: 16)
# CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
# DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
# CONCURRENT_REQUESTS_PER_DOMAIN = 16
# CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
# COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
# TELNETCONSOLE_ENABLED = False

# Override the default request headers:
# DEFAULT_REQUEST_HEADERS = {
#     'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
#     'Accept-Language': 'en',
# }

# Enable or disable spider middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
# SPIDER_MIDDLEWARES = {
#     'weibo.middlewares.WeiboSpiderMiddleware': 543,
# }

# Enable or disable downloader middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
DOWNLOADER_MIDDLEWARES = {
    'weibo.middlewares.CookiesMiddleware': 554,
    'weibo.middlewares.ProxyMiddleware': 555,
}

# Enable or disable extensions
# See http://scrapy.readthedocs.org/en/latest/topics/extensions.html
# EXTENSIONS = {
#     'scrapy.extensions.telnet.TelnetConsole': None,
# }

# Configure item pipelines
# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    'weibo.pipelines.TimePipeline': 300,
    'weibo.pipelines.WeiboPipeline': 301,
    'weibo.pipelines.MongoPipeline': 302,
}

# Enable and configure the AutoThrottle extension (disabled by default)
# See http://doc.scrapy.org/en/latest/topics/autothrottle.html
# AUTOTHROTTLE_ENABLED = True
# The initial download delay
# AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
# AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
# AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
# AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
# HTTPCACHE_ENABLED = True
# HTTPCACHE_EXPIRATION_SECS = 0
# HTTPCACHE_DIR = 'httpcache'
# HTTPCACHE_IGNORE_HTTP_CODES = []
# HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'

MONGO_URI = 'localhost'
MONGO_DATABASE = 'weibo'

COOKIES_URL = 'http://localhost:5000/weibo/random'
PROXY_URL = 'http://localhost:5555/random'
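Once proxy_pool, cookie_pool and MongoDB are up, the spider is launched with "scrapy crawl weibocn" (weibocn being the name defined in weibo.py). Below is a small, hypothetical check of what landed in MongoDB; the connection settings come from settings.py above, while the collection names follow the assumed items.py sketch earlier and should be adjusted to match your actual items.py.

import pymongo

client = pymongo.MongoClient('localhost')  # MONGO_URI from settings.py
db = client['weibo']                       # MONGO_DATABASE from settings.py

# 'users' and 'weibos' are the collection names assumed in the items.py sketch
print('users stored: ', db['users'].count_documents({}))
print('weibos stored:', db['weibos'].count_documents({}))
sample = db['users'].find_one({}, {'_id': 0, 'name': 1, 'fans_count': 1, 'crawled_at': 1})
print('sample user:  ', sample)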
Scrapy's built-in CookiesMiddleware has priority 700 and its built-in HttpProxyMiddleware has priority 750. Registering the custom CookiesMiddleware and ProxyMiddleware at 554 and 555 places them just before the built-ins, so their process_request runs first and the cookies and proxy they set are then honored by the built-in middlewares.