前面的章节一直在讲ES的相关知识点,现在来看如何把爬取到的数据写入ES。首先得知道ES的Python高级接口叫elasticsearch-dsl
链接:https://github.com/elastic/elasticsearch-dsl-py
什么是elasticsearch dsl:
Elasticsearch DSL是一个高级库,其目的是帮助编写和运行针对Elasticsearch的查询
安装:
pip install elasticsearch-dsl
首先我们在项目目录中新建一个名为models的包,然后在包里新建一个名为es_types.py的文件(文件名不能写成es.types.py,否则无法通过 from models.es_types import ... 导入),用于定义写入ES的数据类型(mapping)
#!/usr/bin/env python
# -*- coding:utf-8 -*-
"""Elasticsearch document definition for the crawled articles."""
from datetime import datetime

from elasticsearch_dsl import (
    Completion,
    Date,
    DocType,
    Integer,
    Keyword,
    Text,
    analyzer,
)
from elasticsearch_dsl.connections import connections

# Register the default connection at import time so that ``init()`` and
# ``save()`` on the document class below have a client to talk to.
connections.create_connection(hosts=["localhost"])


class ArticleType(DocType):
    """Document type for an article (originally commented as "Jobbole article type").

    The class was originally spelled ``ActicleType``, but the other snippets
    import ``ArticleType`` from this module; the name is corrected here and
    the old spelling is kept as an alias below.

    NOTE(review): ``DocType`` and the inner ``Meta`` class are the API of the
    elasticsearch-dsl release this code was written for; later releases
    rename them -- confirm against the installed version before upgrading.
    """

    # Text fields are analyzed with the analyzer named "ik_max_word";
    # presumably supplied by the IK analysis plugin -- verify it is installed.
    title = Text(analyzer="ik_max_word")
    create_date = Date()
    # Keyword fields are stored as exact, un-analyzed values.
    url = Keyword()
    url_object_id = Keyword()
    front_image_url = Keyword()
    front_image_path = Keyword()
    praise_nums = Integer()
    comment_nums = Integer()
    fav_nums = Integer()
    tags = Text(analyzer="ik_max_word")
    content = Text(analyzer="ik_max_word")

    class Meta:
        # Target index and document type name used for this mapping.
        index = "jobbile"
        doc_type = "article"


# Backward-compatible alias for the original, misspelled class name.
ActicleType = ArticleType


if __name__ == "__main__":
    # Running this module directly creates the mapping in Elasticsearch.
    ArticleType.init()
然后在items中编写如下代码(save_to_es 需要定义为对应Item类的方法,因为pipeline中是通过 item.save_to_es() 调用它的):
#!/usr/bin/env python
# -*- coding:utf-8 -*-
from w3lib.html import remove_tags

from models.es_types import ArticleType


def save_to_es(self):
    """Copy this item's fields into an ``ArticleType`` document and save it.

    Meant to live on the item class: the pipeline invokes it as
    ``item.save_to_es()``, so ``self`` is the item being processed and is
    read with dict-style access.

    Returns:
        None. The document is persisted via ``ArticleType.save()``.

    Raises:
        KeyError: if a required key is missing from the item (only
            ``front_image_path`` is treated as optional).
    """
    article = ArticleType()
    article.title = self['title']
    # The mapping field is ``create_date``; the original assigned to the
    # misspelled attribute ``cteate_date``, which does not match the mapping.
    # NOTE(review): the item key 'cteate_date' is kept as-is because the item
    # definition is not visible here -- confirm it is not a typo as well.
    article.create_date = self['cteate_date']
    # Strip HTML markup so only the text of the content is indexed.
    article.content = remove_tags(self['content'])
    article.front_image_url = self['front_image']
    # The image path is optional; only copy it when the item carries one.
    if "front_image_path" in self:
        article.front_image_path = self['front_image_path']
    article.praise_nums = self['praise_nums']
    article.fav_nums = self['fav_nums']
    article.comment_nums = self['comment_nums']
    article.url = self['url']
    article.tags = self['tags']
    # Use the URL object id as the document id.
    article.meta.id = self['url_object_id']
    article.save()
然后在pipelines中编写如下代码:
#!/usr/bin/env python
# -*- coding:utf-8 -*-
from models.es_types import ArticleType
from w3lib.html import remove_tags


class ElasticsearchPipeline(object):
    """Item pipeline that writes data into Elasticsearch."""

    def process_item(self, item, spider):
        """Store ``item`` in Elasticsearch and hand it back unchanged.

        The conversion to an ES document and the actual write are delegated
        to the item's own ``save_to_es()`` method; ``spider`` is not used.
        """
        item.save_to_es()
        return item
最后在settings中添加如下配置,启用上面的pipeline:
#!/usr/bin/env python
# -*- coding:utf-8 -*-

# Enable the Elasticsearch pipeline by its dotted path; 300 is the integer
# associated with it (presumably its ordering value among pipelines --
# see the Scrapy ITEM_PIPELINES documentation).
ITEM_PIPELINES = {"ArticleSpider.pipelines.ElasticsearchPipeline": 300}