1. First, define the item container with the fields to be scraped, in items.py.
import scrapy


class CDErshouFang(scrapy.Item):
    """Beike second-hand housing listing."""
    house_name = scrapy.Field()            # community name
    house_address = scrapy.Field()         # community address
    house_info = scrapy.Field()            # house info: floor, year built, layout, floor area, orientation
    release_time = scrapy.Field()          # listing release time
    price = scrapy.Field()                 # average (unit) price
    total_price = scrapy.Field()           # total price
    house_tags = scrapy.Field()            # tags
    details = scrapy.Field()               # detail page - district the listing is in
    trading_ownership = scrapy.Field()     # trading ownership
    commodity_use = scrapy.Field()         # property usage
    house_year = scrapy.Field()            # years of ownership
    property = scrapy.Field()              # property rights owner
    mortgage_information = scrapy.Field()  # mortgage information
    room_spare = scrapy.Field()            # housing certificate documents
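As a quick sanity check of the container, the short sketch below (an illustration only, not part of the project files) fills a CDErshouFang item and converts it to a dict the way the pipelines later do; assigning a key that was not declared as a Field raises a KeyError.

# Hypothetical usage sketch of the item container
from scrapystudy.items import CDErshouFang

item = CDErshouFang()
item["house_name"] = "Example Community"   # only declared Fields may be assigned
item["total_price"] = "200万"
print(dict(item))                          # {'house_name': 'Example Community', 'total_price': '200万'}
# item["not_a_field"] = 1                  # would raise KeyError: CDErshouFang does not support field: not_a_field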
2. Write the spider, under the spiders directory, that parses the pages fetched by the downloader.
import scrapy
from scrapystudy.items import CDErshouFang


class CdErshoufangSpider(scrapy.Spider):
    name = 'cd_ershoufang'
    allowed_domains = ['cd.ke.com']
    start_urls = ['https://cd.ke.com/ershoufang/']

    def start_requests(self):
        "Override start_requests; the crawl starts from the URLs yielded here"
        for page in range(1, 100):
            url = self.start_urls[0] + 'pg' + str(page) + '/'
            yield scrapy.Request(url=url, callback=self.parse, dont_filter=True)

    def parse(self, response):
        SET_SELECT = response.css('.info')  # NOTE: if this locator is wrong, none of the fields below can be extracted
        for cle in SET_SELECT:
            item = CDErshouFang()
            # the selectors below are relative to the current listing node `cle`, not to the whole response
            house_name = cle.css('.title a::text').extract_first()
            house_address = cle.css('.positionInfo a::text').extract_first()
            house_info = cle.css('.houseInfo::text').extract()[1].replace(' ', '').replace('\n', '')
            release_time = cle.css('.followInfo::text').extract()[1].replace(' ', '').replace('\n', '')
            price_total = cle.css('.priceInfo .totalPrice span::text').extract_first()
            if price_total is not None:
                price_total = price_total + '万'
            price = cle.css('.unitPrice span::text').extract_first()
            # house_tags = cle.css('.info .address .tag span::text').extract()
            item["house_name"] = house_name
            item["house_address"] = house_address
            item["house_info"] = house_info
            item["release_time"] = release_time
            item["total_price"] = price_total
            item["price"] = price
            # item["house_tags"] = house_tags
            details_page_url = cle.css('.title a::attr(href)').extract_first()  # link to the detail page
            # meta: a dict attached to the Request that carries data over to the next callback (read via response.meta)
            yield scrapy.Request(url=details_page_url, callback=self.details, meta={'item': item})

    def details(self, response):
        "Extract data from the detail page"
        area = response.xpath('//span[@class="info"]/a[1]/text()').extract_first()        # district
        details = response.xpath('//span[@class="info"]/a[last()]/text()').extract_first()
        if area is not None and details is not None:  # both parts must exist before concatenating
            details = area + ' ' + details
        trading_ownership = response.xpath('//div[@class="transaction"]/div[@class="content"]/ul/li[2]/text()').extract_first(default='').strip()            # trading ownership
        commodity_use = response.xpath('//div[@class="transaction"]/div[@class="content"]/ul/li[4]/text()').extract_first(default='').strip()                # property usage
        house_year = response.xpath('//div[@class="transaction"]/div[@class="content"]/ul/li[5]/text()').extract_first(default='').strip()                   # years of ownership
        property = response.xpath('//div[@class="transaction"]/div[@class="content"]/ul/li[6]/text()').extract_first(default='').strip()                     # property rights owner
        mortgage_information = response.xpath('//div[@class="transaction"]/div[@class="content"]/ul/li[7]/span[2]/text()').extract_first(default='').strip() # mortgage information
        room_spare = response.xpath('//div[@class="transaction"]/div[@class="content"]/ul/li[8]/text()').extract_first(default='').strip()                   # housing certificate documents
        item = response.meta['item']  # retrieve the item built on the listing page
        item["details"] = details
        item["trading_ownership"] = trading_ownership
        item["commodity_use"] = commodity_use
        item["house_year"] = house_year
        item["property"] = property
        item["mortgage_information"] = mortgage_information
        item["room_spare"] = room_spare
        yield item
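The crawl itself is started with scrapy crawl cd_ershoufang. Because the comment above warns that everything hinges on the '.info' locator, it can help to replay the same CSS expressions offline against a saved copy of a listing page. The sketch below is only a debugging aid, not part of the spider, and listing_page.html is an assumed local file name.

# Selector debugging sketch (listing_page.html is an assumed local copy of a cd.ke.com listing page)
from scrapy import Selector

with open('listing_page.html', encoding='utf-8') as fp:
    sel = Selector(text=fp.read())

for node in sel.css('.info'):                       # same locator the spider relies on
    print(node.css('.title a::text').extract_first(),
          node.css('.priceInfo .totalPrice span::text').extract_first())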
3. Process the scraped data in pipelines.py, storing it in MongoDB or in YAML/JSON files.
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html


# useful for handling different item types with a single interface
from itemadapter import ItemAdapter
from scrapy.exceptions import DropItem
import pymongo

class TextPipeline:
    "A pipeline must be registered before use: add it to ITEM_PIPELINES in settings.py"

    def __init__(self):
        self.limit = 50

    def process_item(self, item, spider):
        "Trim over-long titles and drop items that have none"
        if item['title']:
            if len(item['title']) > self.limit:
                item['title'] = item['title'][0:self.limit].rstrip() + '...'
            return item
        else:
            raise DropItem('Missing Text')  # raise (not return) DropItem so Scrapy discards the item

class MongoPipeline(object):
    "Store scraped items in MongoDB"

    def __init__(self, mongo_url, mongo_db):
        self.mongo_url = mongo_url
        self.mongo_db = mongo_db

    @classmethod
    def from_crawler(cls, crawler):
        "Pull the global configuration from settings.py"
        return cls(
            mongo_url=crawler.settings.get('MONGO_URL'),
            mongo_db=crawler.settings.get('MONGO_DB')
        )

    def open_spider(self, spider):
        self.client = pymongo.MongoClient(self.mongo_url)
        self.db = self.client[self.mongo_db]

    def process_item(self, item, spider):
        name = item.__class__.__name__        # the collection is named after the item class
        self.db[name].insert_one(dict(item))  # insert() is deprecated in pymongo; use insert_one()
        return item

    def close_spider(self, spider):
        self.client.close()

import os
import time
import logging
import yaml

logger = logging.getLogger(__name__)

class SaveBeikePipeline(object):
    "Save scraped data to a YAML file"

    def open_spider(self, spider):
        "Called when the spider is opened; open or create the output file here"
        filetime = time.strftime("%Y%m%d")
        filepath = os.path.join(os.path.dirname(__file__), 'spiderSaveFile')
        if not os.path.exists(filepath):
            os.mkdir(filepath)
        # spider_file = os.path.join(filepath, '{}.yaml'.format(self.__class__.__name__))  # self.__class__.__name__ is the class name
        spider_file = os.path.join(filepath, '{}.yaml'.format(filetime))
        try:
            self.f = open(spider_file, mode='w', encoding='utf-8')
        except Exception as e:
            logger.error(e)

    def process_item(self, item, spider):
        "Write one item to the file"
        data = dict()
        data["小区名称"] = item["house_name"]
        data["在售状态"] = item["on_sale"]
        data["房屋类型"] = item["house_type"]
        data["小区地址"] = item["address"]
        data["房屋户型"] = item["door_module"]
        data["建筑面积"] = item["area"]
        data["价格"] = item["price"]
        data["总价/套"] = item["total_price"]
        data["附近设施"] = item["tags"]
        # self.f.write(str(data) + '\n')
        spider_data = yaml.dump(data, allow_unicode=True, width=1000, sort_keys=False)  # sort_keys=False keeps the original key order
        self.f.write(spider_data + '*'.center(50, '-') + '\n')
        return item

    def close_spider(self, spider):
        "Called when the spider closes; close the file"
        self.f.close()

class SaveCDershouFangPipeline(object):
    "Save scraped data to a YAML file"

    def open_spider(self, spider):
        "Called when the spider is opened; open or create the output file here"
        filetime = time.strftime("%Y%m%d")
        filepath = os.path.join(os.path.dirname(__file__), 'spiderSaveFile')
        if not os.path.exists(filepath):
            os.mkdir(filepath)
        spider_file = os.path.join(filepath, 'cd_ershoufang{}.yaml'.format(filetime))
        try:
            self.f = open(spider_file, mode='w', encoding='utf-8')
        except Exception as e:
            logger.error(e)

    def process_item(self, item, spider):
        "Write one item to the file"
        data = dict()
        data["小区名称"] = item["house_name"]
        data["小区地址"] = item["house_address"]
        data["房子信息"] = item["house_info"]
        data["发布时间"] = item["release_time"]
        data["总价/套"] = item["total_price"]
        data["均价"] = item["price"]
        # data["标签"] = item["house_tags"]
        data["所在区域"] = item["details"]
        data["交易权属"] = item["trading_ownership"]
        data["房屋用途"] = item["commodity_use"]
        data["房屋年限"] = item["house_year"]
        data["产权所属"] = item["property"]
        data["抵押信息"] = item["mortgage_information"]
        data["房本备件"] = item["room_spare"]
        spider_data = yaml.dump(data, allow_unicode=True, width=1000, sort_keys=False)  # sort_keys=False keeps the original key order (the default is True)
        self.f.write(spider_data + '*'.center(60, '-') + '\n')
        return item

    def close_spider(self, spider):
        "Called when the spider closes; close the file"
        self.f.close()
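If MongoPipeline is enabled instead of (or alongside) the YAML pipelines, the stored documents can be checked with a few lines of pymongo run outside Scrapy. The sketch below assumes the MONGO_URL/MONGO_DB values configured later in settings.py (localhost / mydb); the collection name is the item class name, as derived in process_item above.

# Verification sketch: inspect what MongoPipeline wrote (run separately from the crawl)
import pymongo

client = pymongo.MongoClient("localhost")
db = client["mydb"]
print(db["CDErshouFang"].count_documents({}))   # number of stored listings
print(db["CDErshouFang"].find_one())            # look at one document
client.close()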
4. Some paginated pages cannot be fetched directly, so Scrapy is combined with Selenium: the page source rendered by Selenium is wrapped into a Response object and handed back to the spider for parsing, in middlewares.py.
# Define here the models for your spider middleware
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/spider-middleware.html

from scrapy import signals

# useful for handling different item types with a single interface
from itemadapter import is_item, ItemAdapter


class ScrapystudySpiderMiddleware:
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the spider middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_spider_input(self, response, spider):
        # Called for each response that goes through the spider
        # middleware and into the spider.

        # Should return None or raise an exception.
        return None

    def process_spider_output(self, response, result, spider):
        # Called with the results returned from the Spider, after
        # it has processed the response.

        # Must return an iterable of Request, or item objects.
        for i in result:
            yield i

    def process_spider_exception(self, response, exception, spider):
        # Called when a spider or process_spider_input() method
        # (from other spider middleware) raises an exception.

        # Should return either None or an iterable of Request or item objects.
        pass

    def process_start_requests(self, start_requests, spider):
        # Called with the start requests of the spider, and works
        # similarly to the process_spider_output() method, except
        # that it doesn’t have a response associated.

        # Must return only requests (not items).
        for r in start_requests:
            yield r

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)


class ScrapystudyDownloaderMiddleware:
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the downloader middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_request(self, request, spider):
        # Called for each request that goes through the downloader
        # middleware.
        # request.cookie = {
        # "Cookie":"__mta=108386109.1609123577452.1610351007435.1610351353409.13; __mta=108386109.1609123577452.1610351353409.1610362706394.14; uuid_n_v=v1; _lxsdk_cuid=176a73d3e42c8-057a36937583e8-c791039-149c48-176a73d3e42c8; _lx_utm=utm_source%3DBaidu%26utm_medium%3Dorganic; uuid=DF86446053FA11EBBFA05D0E1C80A5E52BD1299115184C8C837F6324366BFFA0; _csrf=1d012800348e02304158b04bcaacdb15959e3482e6847893721b340ca6f29323; lt=8kvWp1o5sQYEgkrZTHbti6H0uI8AAAAAhgwAADxF8ufwXVyR4TU3_BGMHAKsB_TA6toYFjxg-m34Z43vNJlCb9Bv05PqTeelhSHITw; lt.sig=iPSGNXFnd3jV3SEy7wzqa0L_QOw; uid=2829236546; uid.sig=fiHM__7YgLUMEaZ05TkEQaVApbs; _lxsdk=DF86446053FA11EBBFA05D0E1C80A5E52BD1299115184C8C837F6324366BFFA0; Hm_lvt_703e94591e87be68cc8da0da7cbd0be2=1609123577,1609148969,1610350992,1610362253; __mta=108386109.1609123577452.1610362628562.1610362689900.15; Hm_lpvt_703e94591e87be68cc8da0da7cbd0be2=1610362706; _lxsdk_s=176f0edcffa-620-f33-c24%7C%7C53",
        # "User-Agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36"
        # }

        # Must either:
        # - return None: continue processing this request
        # - or return a Response object
        # - or return a Request object
        # - or raise IgnoreRequest: process_exception() methods of
        #   installed downloader middleware will be called
        return None

    def process_response(self, request, response, spider):
        # Called with the response returned from the downloader.

        # Must either;
        # - return a Response object
        # - return a Request object
        # - or raise IgnoreRequest
        return response

    def process_exception(self, request, exception, spider):
        # Called when a download handler or a process_request()
        # (from other downloader middleware) raises an exception.

        # Must either:
        # - return None: continue processing this exception
        # - return a Response object: stops process_exception() chain
        # - return a Request object: stops process_exception() chain
        pass

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)

# import logging
# class ProxyMiddleware(object):
#     "Set a proxy for outgoing requests"
#     logger = logging.getLogger(__name__)
#     def process_request(self, request, spider):
#         self.logger.debug("Using Proxy")
#         request.meta["proxy"] = "http://125.87.105.4:49713"

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
from scrapy.http import HtmlResponse
from selenium.webdriver.chrome.options import Options
import logging
import time

logger = logging.getLogger(__name__)

class SeleniumMiddleware(object):  # TODO: how can several HtmlResponse objects be handed back to the spider for parsing?

    def process_request(self, request, spider):
        url = request.url
        opt = Options()
        opt.add_argument('--headless')
        # create the headless Chrome browser (the options must be passed in, otherwise --headless has no effect)
        browser = webdriver.Chrome(options=opt)
        wait = WebDriverWait(browser, 10)
        browser.get(url)
        htmls = []  # collected page sources (currently unused, see the TODO above)
        for page in range(2, 3):
            try:
                next_page = wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, "body > div.page-container.clearfix > div.page-box > a.next")))
                next_page.click()
                # wait until the active page number matches the page we just clicked to
                wait.until(EC.text_to_be_present_in_element((By.CSS_SELECTOR, "body > div.page-container.clearfix > div.page-box > span.active"), str(page)))
            except TimeoutException:
                continue
            browser.execute_script('window.scrollTo(0,document.body.scrollHeight)')
            time.sleep(2)
        html = browser.page_source  # page source of the last page reached
        logger.info("Fetched URL: " + request.url)
        browser.quit()  # release the browser process once the source has been captured
        return HtmlResponse(url=request.url, body=html, request=request, encoding='utf-8')
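Since, per the heading of this step, only some pages need the browser, one possible refinement is to gate the middleware on a request.meta flag so ordinary requests still go through Scrapy's own downloader (returning None from process_request keeps normal processing). The sketch below is an assumption, not part of the project: the flag name use_selenium is invented for illustration, and the spider would set it with meta={'use_selenium': True}.

# Hypothetical gating variant: render with Selenium only when request.meta['use_selenium'] is set
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from scrapy.http import HtmlResponse


class SelectiveSeleniumMiddleware(object):

    def process_request(self, request, spider):
        if not request.meta.get('use_selenium'):
            return None                                  # normal requests use Scrapy's downloader
        opt = Options()
        opt.add_argument('--headless')
        browser = webdriver.Chrome(options=opt)
        try:
            browser.get(request.url)
            html = browser.page_source
        finally:
            browser.quit()                               # always release the browser process
        return HtmlResponse(url=request.url, body=html, request=request, encoding='utf-8')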
5. Activate the item pipelines and the middlewares in settings.py.
# Scrapy settings for scrapystudy project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     https://docs.scrapy.org/en/latest/topics/settings.html
#     https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
#     https://docs.scrapy.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'scrapystudy'

SPIDER_MODULES = ['scrapystudy.spiders']
NEWSPIDER_MODULE = 'scrapystudy.spiders'

MONGO_URL = "localhost"
MONGO_DB = "mydb"


# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'scrapystudy (+http://www.yourdomain.com)'

# Obey robots.txt rules
ROBOTSTXT_OBEY = False

# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
# COOKIES_ENABLED = True

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
# DEFAULT_REQUEST_HEADERS = {
#     'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
#     'Accept-Language': 'en',
#     'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.141 Safari/537.36'
# }

# Enable or disable spider middlewares
# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
# SPIDER_MIDDLEWARES = {
#     'scrapystudy.middlewares.MyFirstSpiderMiddleware': 543,
# }

# Enable or disable downloader middlewares
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
# DOWNLOADER_MIDDLEWARES = {
#     'scrapystudy.middlewares.SeleniumMiddleware': 543,
# }

# Enable or disable extensions
# See https://docs.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
#}

# Configure item pipelines
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
# The number (300, 400, ...) sets the execution order: the smaller the value, the higher the priority and the earlier the pipeline runs
ITEM_PIPELINES = {
    'scrapystudy.pipelines.SaveCDershouFangPipeline': 600,
    # 'scrapystudy.pipelines.TextPipeline': 300,
    # 'scrapystudy.pipelines.MongoPipeline': 400,
    # 'scrapystudy.pipelines.SaveBeikePipeline': 500,
}

# Enable and configure the AutoThrottle extension (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
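With the pipeline registered above, the project is normally run with scrapy crawl cd_ershoufang from the project directory. A crawl can also be started programmatically; the sketch below is an optional alternative that uses Scrapy's CrawlerProcess together with the project settings.

# Optional programmatic run (equivalent to `scrapy crawl cd_ershoufang` on the command line)
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

process = CrawlerProcess(get_project_settings())   # picks up ITEM_PIPELINES, MONGO_URL, etc. from settings.py
process.crawl('cd_ershoufang')                      # spider name defined in CdErshoufangSpider
process.start()                                     # blocks until the crawl finishes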